// go-watchdog/watchdog.go

package watchdog
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net"
	"net/http"
	"net/url"
	"os/exec"
	"strings"
	"time"
)

// Status describes whether a watched target was last seen up or down.
type Status int

// Known Status values. StatusDown is the zero value of Status.
const (
	StatusDown Status = iota
	StatusUp
)

// String returns "up" or "down" for the declared constants and
// "[[internal error]]" for any other value.
func (s Status) String() string {
	if s == StatusUp {
		return "up"
	}
	if s == StatusDown {
		return "down"
	}
	return "[[internal error]]"
}
// Message texts handed to notify. Before being substituted for the
// "{{ .Message }}" placeholder they are passed through Dog.localize.
const (
	// MessageUp is sent when the target passes a check again.
	MessageUp = "came back up"
	// MessageDown is sent when the target is still failing after a recovery attempt.
	MessageDown = "went down"
	// MessageHiccup is sent when a failure cleared on a quick re-check.
	MessageHiccup = "hiccupped"
)
type Dog struct {
Watchdog string
Name string
CheckURL string
Keywords string
Badwords string
Localizations map[string]string
Recover string
Webhooks []string
AllWebhooks map[string]Webhook
Logger chan string
status Status
changed bool
error error
2019-07-04 19:54:51 -06:00
//failures int
//passes int
//lastFailed time.Time
//lastPassed time.Time
//lastNotified time.Time
2019-06-07 20:37:53 -06:00
}
func New(d *Dog) *Dog {
2019-07-04 19:54:51 -06:00
//d.lastPassed = time.Now().Add(-5 * time.Minute)
d.status = StatusUp
d.changed = false
2019-06-08 03:11:42 -06:00
return d
2019-06-07 20:37:53 -06:00
}
func (d *Dog) Watch() {
d.watch()
2019-06-07 20:37:53 -06:00
for {
// TODO set cancellable callback ?
time.Sleep(5 * time.Minute)
2019-06-08 03:11:42 -06:00
d.watch()
2019-06-07 20:37:53 -06:00
}
}
// Now that I've added the ability to notify when a server is back up
// this definitely needs some refactoring. It's bad now.
2019-06-08 03:11:42 -06:00
func (d *Dog) watch() {
d.Logger <- fmt.Sprintf("Check: '%s'", d.Name)
2019-06-07 20:37:53 -06:00
2019-07-04 19:54:51 -06:00
// This may be up or down
err := d.hardcheck()
2019-06-07 20:37:53 -06:00
if nil == err {
2019-07-04 19:54:51 -06:00
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
// if it's down, coming up, notify
if d.changed {
2019-07-04 19:54:51 -06:00
d.notify(MessageUp)
}
2019-06-07 20:37:53 -06:00
return
}
2019-07-04 19:54:51 -06:00
// If being down is a change, check to see if it's just a hiccup
if d.changed {
time.Sleep(time.Duration(5) * time.Second)
err2 := d.softcheck()
if nil != err2 {
// it's really down
d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2)
} else {
// it's not really down, so reset the change info
d.changed = false
d.status = StatusUp
// and notify of the hiccup
d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err)
d.notify(MessageHiccup)
return
}
2019-06-20 17:36:33 -06:00
}
2019-07-04 19:54:51 -06:00
// TODO what if the server is flip-flopping rapidly?
// how to rate limit?
// "{{ .Server }} is on cooldown for 30 minutes"
// * We've had success since the last notification
// * It's been at least 5 minutes since the last notification
//fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) {
//}
2019-06-07 20:37:53 -06:00
t := 10
for {
2019-07-04 19:54:51 -06:00
// try to recover, then backoff exponentially
2019-06-08 03:11:42 -06:00
d.recover()
2019-06-07 20:37:53 -06:00
time.Sleep(time.Duration(t) * time.Second)
t *= 2
2019-07-04 19:54:51 -06:00
if t > 120 {
t = 120
2019-06-07 20:37:53 -06:00
}
2019-06-12 00:52:33 -06:00
2019-07-04 19:54:51 -06:00
err := d.softcheck()
2019-06-07 20:37:53 -06:00
if nil != err {
2019-07-04 19:54:51 -06:00
// this is down, and we know it's down
d.status = StatusDown
2019-07-04 19:54:51 -06:00
d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err)
if d.changed {
d.changed = false
d.notify(MessageDown)
}
2019-06-07 20:37:53 -06:00
} else {
2019-07-04 19:54:51 -06:00
// it came back up
d.status = StatusUp
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
2019-07-04 19:54:51 -06:00
if d.changed {
// and the downtime was short - just a recovery
d.notify(MessageHiccup)
} else {
// and the downtime was some time
d.notify(MessageUp)
}
d.changed = false
2019-07-04 19:54:51 -06:00
break
2019-06-07 20:37:53 -06:00
}
2019-07-04 19:54:51 -06:00
}
}
func (d *Dog) softcheck() error {
2019-06-07 20:37:53 -06:00
client := NewHTTPClient()
2019-06-08 03:11:42 -06:00
response, err := client.Get(d.CheckURL)
2019-06-07 20:37:53 -06:00
if nil != err {
2019-06-08 03:11:42 -06:00
d.error = fmt.Errorf("Connection Failure: " + err.Error())
2019-06-07 20:37:53 -06:00
return err
}
b, err := ioutil.ReadAll(response.Body)
if nil != err {
2019-06-08 03:11:42 -06:00
d.error = fmt.Errorf("Network Failure: " + err.Error())
2019-06-07 20:37:53 -06:00
return err
}
// Note: empty matches empty as true, so this works for checking redirects
2019-06-08 03:11:42 -06:00
if !bytes.Contains(b, []byte(d.Keywords)) {
err = fmt.Errorf("Down: '%s' Not Found for '%s'", d.Keywords, d.Name)
d.Logger <- fmt.Sprintf("%s", err)
2019-06-08 03:11:42 -06:00
d.error = err
2019-06-07 20:37:53 -06:00
return err
}
if "" != d.Badwords {
2019-07-04 19:19:14 -06:00
if bytes.Contains(b, []byte(d.Badwords)) {
err = fmt.Errorf("Down: '%s' Found for '%s'", d.Badwords, d.Name)
d.Logger <- fmt.Sprintf("%s", err)
d.error = err
return err
}
2019-06-07 20:37:53 -06:00
}
return nil
}
// hardcheck runs softcheck and commits the outcome: d.status becomes
// StatusUp or StatusDown, and d.changed reports whether that differs from
// the status held before the call. The softcheck error is returned as-is.
func (d *Dog) hardcheck() error {
	before := d.status
	err := d.softcheck()

	// Are we up, or down?
	// (failure/pass counters and timestamps are not tracked yet)
	now := StatusUp
	if err != nil {
		now = StatusDown
	}
	d.status = now

	// Has that changed?
	d.changed = before != now

	return err
}
func (d *Dog) recover() {
if "" == d.Recover {
return
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
cmd := exec.CommandContext(ctx, "bash")
pipe, err := cmd.StdinPipe()
pipe.Write([]byte(d.Recover))
if nil != err {
d.Logger <- fmt.Sprintf("[Recover] Could not write to bash '%s': %s", d.Recover, err)
2019-06-08 03:11:42 -06:00
}
err = cmd.Start()
if nil != err {
d.Logger <- fmt.Sprintf("[Recover] Could not start '%s': %s", d.Recover, err)
2019-06-08 03:11:42 -06:00
}
err = pipe.Close()
if nil != err {
d.Logger <- fmt.Sprintf("[Recover] Could not close '%s': %s", d.Recover, err)
2019-06-08 03:11:42 -06:00
}
err = cmd.Wait()
cancel()
if nil != err {
d.Logger <- fmt.Sprintf("[Recover] '%s' failed for '%s': %s", d.Recover, d.Name, err)
2019-06-08 03:11:42 -06:00
}
}
func (d *Dog) notify(msg string) {
d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name)
2019-07-04 19:54:51 -06:00
//d.lastNotified = time.Now()
2019-06-08 03:11:42 -06:00
for i := range d.Webhooks {
name := d.Webhooks[i]
if "" == name {
continue
}
h, ok := d.AllWebhooks[name]
if !ok {
// TODO check in main when config is read
d.Webhooks[i] = ""
d.Logger <- fmt.Sprintf("[Warning] Could not find webhook '%s' for '%s'", name, h.Name)
2019-06-08 03:11:42 -06:00
continue
}
d.notifyOne(h, msg)
2019-06-24 20:00:01 -06:00
}
}
func (d *Dog) notifyOne(h Webhook, msg string) {
2019-06-24 20:00:01 -06:00
// TODO do this in main on config init
if "" == h.Method {
h.Method = "POST"
}
2019-06-08 03:11:42 -06:00
2019-06-24 20:00:01 -06:00
var body *strings.Reader
var err error
// TODO real templates
if 0 != len(h.Form) {
form := url.Values{}
for k := range h.Form {
v := h.Form[k]
// because `{{` gets urlencoded
//k = strings.Replace(k, "{{ .Name }}", d.Name, -1)
v = strings.Replace(v, "{{ .Watchdog }}", d.Watchdog, -1)
2019-06-24 20:00:01 -06:00
v = strings.Replace(v, "{{ .Name }}", d.Name, -1)
v = strings.Replace(v, "{{ .Status }}", d.localize(d.status.String()), -1)
v = strings.Replace(v, "{{ .Message }}", d.localize(msg), -1)
2019-06-24 20:00:01 -06:00
d.Logger <- fmt.Sprintf("[HEADER] %s: %s", k, v)
form.Set(k, v)
}
body = strings.NewReader(form.Encode())
} else if 0 != len(h.JSON) {
bodyBuf, err := json.Marshal(h.JSON)
2019-06-08 03:11:42 -06:00
if nil != err {
2019-06-24 20:00:01 -06:00
d.Logger <- fmt.Sprintf("[Notify] JSON Marshal Error for '%s': %s", h.Name, err)
return
2019-06-08 03:11:42 -06:00
}
2019-06-24 20:00:01 -06:00
// `{{` should be left alone
v := string(bodyBuf)
v = strings.Replace(v, "{{ .Watchdog }}", d.Watchdog, -1)
v = strings.Replace(v, "{{ .Name }}", d.Name, -1)
v = strings.Replace(v, "{{ .Status }}", d.localize(d.status.String()), -1)
v = strings.Replace(v, "{{ .Message }}", d.localize(msg), -1)
body = strings.NewReader(v)
2019-06-24 20:00:01 -06:00
}
2019-06-08 03:11:42 -06:00
2019-06-24 20:00:01 -06:00
client := NewHTTPClient()
req, err := http.NewRequest(h.Method, h.URL, body)
if nil != err {
d.Logger <- fmt.Sprintf("[Notify] HTTP Client Network Error for '%s': %s", h.Name, err)
return
}
2019-06-08 03:11:42 -06:00
2019-06-24 20:00:01 -06:00
if 0 != len(h.Form) {
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
} else if 0 != len(h.JSON) {
req.Header.Set("Content-Type", "application/json")
}
2019-06-08 03:11:42 -06:00
2019-06-24 20:00:01 -06:00
if 0 != len(h.Auth) {
user := h.Auth["user"]
if "" == user {
user = h.Auth["username"]
}
2019-06-24 20:00:01 -06:00
pass := h.Auth["pass"]
if "" == user {
pass = h.Auth["password"]
2019-06-08 03:11:42 -06:00
}
2019-06-24 20:00:01 -06:00
req.SetBasicAuth(user, pass)
}
2019-06-08 03:11:42 -06:00
2019-06-24 20:00:01 -06:00
req.Header.Set("User-Agent", "Watchdog/1.0")
for k := range h.Headers {
req.Header.Set(k, h.Headers[k])
}
2019-06-08 03:11:42 -06:00
2019-06-24 20:00:01 -06:00
resp, err := client.Do(req)
if nil != err {
d.Logger <- fmt.Sprintf("[Notify] HTTP Client Error for '%s': %s", h.Name, err)
return
}
if !(resp.StatusCode >= 200 && resp.StatusCode < 300) {
d.Logger <- fmt.Sprintf("[Notify] Response Error for '%s': %s", h.Name, resp.Status)
return
}
2019-06-07 20:37:53 -06:00
2019-06-24 20:00:01 -06:00
// TODO json vs xml vs txt
var data map[string]interface{}
req.Header.Add("Accept", "application/json")
decoder := json.NewDecoder(resp.Body)
err = decoder.Decode(&data)
if err != nil {
d.Logger <- fmt.Sprintf("[Notify] Response Body Error for '%s': %s", h.Name, resp.Status)
return
2019-06-08 03:11:42 -06:00
}
2019-06-24 20:00:01 -06:00
// TODO some sort of way to determine if data is successful (keywords)
d.Logger <- fmt.Sprintf("[Notify] Success? %#v", data)
2019-06-07 20:37:53 -06:00
}
// localize returns the entry of d.Localizations keyed by msg, or msg
// itself when there is no such entry.
func (d *Dog) localize(msg string) string {
	if translated, found := d.Localizations[msg]; found {
		return translated
	}
	return msg
}
type Config struct {
Watchdog string `json:"watchdog"`
Watches []ConfigWatch `json:"watches"`
Webhooks []Webhook `json:"webhooks"`
Localizations map[string]string `json:"localizations"`
2019-06-07 20:37:53 -06:00
}
// ConfigWatch is the JSON shape of one watched target.
// NOTE(review): the fields look like they map onto Dog (URL -> CheckURL,
// RecoverScript -> Recover); that mapping is done outside this file, so
// confirm it against the caller.
type ConfigWatch struct {
	Name          string   `json:"name"`
	URL           string   `json:"url"`
	Keywords      string   `json:"keywords"`
	Badwords      string   `json:"badwords"`
	Webhooks      []string `json:"webhooks"`
	RecoverScript string   `json:"recover_script"`
}
// Webhook describes one HTTP endpoint to call when a status changes.
type Webhook struct {
	// Name is the key that watches use to refer to this webhook.
	Name string `json:"name"`
	// Method is the HTTP method; notifyOne falls back to POST when empty.
	Method string `json:"method"`
	// URL is the endpoint to call.
	URL string `json:"url"`
	// Auth holds basic-auth credentials under "user" or "username" and
	// "pass" or "password".
	Auth map[string]string `json:"auth"`
	// Headers are set on the request after the built-in headers.
	Headers map[string]string `json:"headers"`
	// Form, when non-empty, is sent URL-encoded and takes precedence over JSON.
	Form map[string]string `json:"form"`
	// JSON, when non-empty and Form is empty, is sent as a JSON object.
	JSON map[string]string `json:"json"`
	// Config and Configs are not read anywhere in this file.
	Config  map[string]string   `json:"config"`
	Configs []map[string]string `json:"configs"`
}
// NewHTTPClient returns an *http.Client with explicit timeouts.
//
// The default http client uses unsafe defaults (no timeouts at all). Here
// dialing is limited to 10 seconds, the TLS handshake to 5 seconds, and
// the whole request, including reading the body, to 5 seconds.
func NewHTTPClient() *http.Client {
	dialer := &net.Dialer{
		Timeout: 10 * time.Second,
	}
	transport := &http.Transport{
		// DialContext supersedes the deprecated Dial field and lets the
		// client's timeout cancel an in-flight dial.
		DialContext:         dialer.DialContext,
		TLSHandshakeTimeout: 5 * time.Second,
	}
	return &http.Client{
		Timeout:   5 * time.Second,
		Transport: transport,
	}
}