From e2c90975850b645c447b586ee570a2ee050965dc Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 8 Dec 2013 17:54:12 +0100 Subject: [PATCH] Reorganize some files, add support for units in alarm rules, support network blip detection (including more leniency for heartbeat failures), fix bug in alarm matching, remove debug output, add auto-start mechanism and killscript to cstatsd, fix deps.sh --- .gitignore | 2 ++ alert/alert | 20 +++++++----- ccollectd/ccollectd | 31 +++++++++++++------ cstatsd/config/cstatsd.yaml.example | 5 +++ cstatsd/cstatsd | 18 ++++++++++- cstatsd/kill-stats | 5 +++ cstatsd/{stats-machine.py => stats-machine} | 2 +- cstatsd/{stats-ports.py => stats-ports} | 0 .../{stats-processes.py => stats-processes} | 0 cstatsd/{stats-tahoe.py => stats-tahoe} | 0 deps.sh | 4 ++- 11 files changed, 67 insertions(+), 20 deletions(-) create mode 100755 cstatsd/kill-stats rename cstatsd/{stats-machine.py => stats-machine} (99%) rename cstatsd/{stats-ports.py => stats-ports} (100%) rename cstatsd/{stats-processes.py => stats-processes} (100%) rename cstatsd/{stats-tahoe.py => stats-tahoe} (100%) diff --git a/.gitignore b/.gitignore index 20a01bb..c13cc10 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ ccollectd/pubkey.dat ccollectd/privkey.dat cstatsd/pubkey.dat cstatsd/privkey.dat +cstatsd/cstatsd.pid +alert/rules.pickle diff --git a/alert/alert b/alert/alert index 4ab63a3..195c582 100755 --- a/alert/alert +++ b/alert/alert @@ -120,6 +120,12 @@ class Bot(object): target, rel, value = args[1:4] target = self.parse_target(target) + if value[-1].lower() in ("k", "m", "g", "t"): + unit = value[-1].lower() + value = value[:-1] + value = float(value) + value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1)) + rule_id = uuid.uuid4() rules[rule_id] = { "target": target, @@ -187,6 +193,8 @@ class Bot(object): elif data["msg_type"] == "down": self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time() self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data) + elif data["msg_type"] == "blip": + self.send_all("\x02\x030,4 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data) elif data["msg_type"] == "value": for rule_id, rule in rules.iteritems(): check_vals = { @@ -196,10 +204,14 @@ class Bot(object): "unit": [data["unit"]] } + failed = False for segment in ("host", "service", "resource", "unit"): for val in check_vals[segment]: if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]): - continue + failed = True + break + if failed: + continue # Skip to next # We haven't broken out in the past bit of code, so we're still matching the pattern... eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])] @@ -223,8 +235,6 @@ class Bot(object): alarm = (value != rule_value) else: alarm = False - - print value, operator, rule_value, alarm self.trigger_alarm(rule_id, data, alarm, value, key) @@ -234,21 +244,17 @@ class Bot(object): if key not in self.known_alarms: if active: self.transmit_alarm(rule_id, data, active, offending_value, offending_key) - print "ALARM ACTIVE, UNKNOWN ENTRY" self.known_alarms[key] = time.time() else: self.known_alarms[key] = False - print "ALARM STOP, UNKNOWN ENTRY" else: if self.known_alarms[key] == False and active: # Alarm activated self.transmit_alarm(rule_id, data, active, offending_value, offending_key) - print "ALARM ACTIVE, EXISTING ENTRY" self.known_alarms[key] = time.time() elif self.known_alarms[key] != False and not active: # Alarm deactivated self.transmit_alarm(rule_id, data, active, offending_value, offending_key) - print "ALARM STOP, EXISTING ENTRY" self.known_alarms[key] = False def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None): diff --git a/ccollectd/ccollectd b/ccollectd/ccollectd index 9a0c58d..51acdf6 100755 --- a/ccollectd/ccollectd +++ b/ccollectd/ccollectd @@ -1,6 +1,6 @@ #!/usr/bin/env python2 -import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys +import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket from nacl.public import PublicKey, PrivateKey, Box ctx = zmq.Context() @@ -23,15 +23,19 @@ boxes = {} def heartbeat(): for hostname, node in nodes.iteritems(): - try: - s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - s.settimeout(config["heartbeat"]["timeout"]) - s.connect((node["ip"], node["port"])) - s.shutdown(socket.SHUT_RDWR) - s.close() - up = True - except socket.error, e: - up = False + retries = 0 + while retries < config["heartbeat"]["attempts"]: + try: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1)) + s.connect((node["ip"], node["port"])) + s.shutdown(socket.SHUT_RDWR) + s.close() + up = True + break + except socket.error, e: + up = False + retries += 1 try: status_changed = (up != last_node_status[hostname]) @@ -45,9 +49,16 @@ def heartbeat(): if status_changed: if up: msg_type = "up" + send_message = True else: msg_type = "down" + send_message = True + else: + if retries > 0: + msg_type = "blip" + send_message = True + if send_message: distributor.send(msgpack.packb({ "host": config["hostname"], "message": { diff --git a/cstatsd/config/cstatsd.yaml.example b/cstatsd/config/cstatsd.yaml.example index 1f05c0f..7dba2ef 100644 --- a/cstatsd/config/cstatsd.yaml.example +++ b/cstatsd/config/cstatsd.yaml.example @@ -1,2 +1,7 @@ endpoint: tcp://*:6543 pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65 + +autostart: + - stats-processes + - stats-ports + - stats-machines diff --git a/cstatsd/cstatsd b/cstatsd/cstatsd index acaaa61..1242585 100755 --- a/cstatsd/cstatsd +++ b/cstatsd/cstatsd @@ -1,8 +1,13 @@ #!/usr/bin/env python2 -import zmq, yaml, binascii, nacl +import zmq, yaml, binascii, nacl, sys, subprocess, os from nacl.public import PublicKey, PrivateKey, Box +basedir = os.path.dirname(os.path.realpath(__file__)) + +with open("cstatsd.pid", "w") as pidfile: + pidfile.write(str(os.getpid())) + ctx = zmq.Context() with open("config/cstatsd.yaml", "r") as cfile: @@ -21,6 +26,17 @@ collector.bind("ipc:///tmp/cstatsd") shipper = ctx.socket(zmq.PUSH) shipper.bind(config["endpoint"]) +try: + disable_autostart = (sys.argv[1] == "--disable-autostart") +except: + disable_autostart = False + +if disable_autostart == False: + with open("/dev/null", "w+") as stfu: + for script in config["autostart"]: + print os.path.join(basedir, script) + subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu) + while True: message = collector.recv() nonce = nacl.utils.random(Box.NONCE_SIZE) diff --git a/cstatsd/kill-stats b/cstatsd/kill-stats new file mode 100755 index 0000000..2894cd5 --- /dev/null +++ b/cstatsd/kill-stats @@ -0,0 +1,5 @@ +#!/bin/bash + +PID=`cat cstatsd.pid` +pkill -P $PID +kill $PID diff --git a/cstatsd/stats-machine.py b/cstatsd/stats-machine similarity index 99% rename from cstatsd/stats-machine.py rename to cstatsd/stats-machine index 83f9d05..c01a557 100755 --- a/cstatsd/stats-machine.py +++ b/cstatsd/stats-machine @@ -103,7 +103,7 @@ while True: } })) - swap_data = psutil.virtual_memory() + swap_data = psutil.swap_memory() sock.send(msgpack.packb({ "service": "machine", "msg_type": "value", diff --git a/cstatsd/stats-ports.py b/cstatsd/stats-ports similarity index 100% rename from cstatsd/stats-ports.py rename to cstatsd/stats-ports diff --git a/cstatsd/stats-processes.py b/cstatsd/stats-processes similarity index 100% rename from cstatsd/stats-processes.py rename to cstatsd/stats-processes diff --git a/cstatsd/stats-tahoe.py b/cstatsd/stats-tahoe similarity index 100% rename from cstatsd/stats-tahoe.py rename to cstatsd/stats-tahoe diff --git a/deps.sh b/deps.sh index ce54a73..6f5ca12 100755 --- a/deps.sh +++ b/deps.sh @@ -1,2 +1,4 @@ -apt-get install -y libzmq-dev +#!/bin/bash +# You need squeeze-backports if you run this on squeeze! +apt-get install -y libzmq-dev libffi-dev pip install pyzmq msgpack-python pynacl pyyaml