Reorganize some files, add support for units in alarm rules, support network blip detection (including more leniency for heartbeat failures), fix bug in alarm matching, remove debug output, add auto-start mechanism and killscript to cstatsd, fix deps.sh

develop
Sven Slootweg 10 years ago
parent 4cf0601b05
commit e2c9097585

2
.gitignore vendored

@ -6,3 +6,5 @@ ccollectd/pubkey.dat
ccollectd/privkey.dat
cstatsd/pubkey.dat
cstatsd/privkey.dat
cstatsd/cstatsd.pid
alert/rules.pickle

@ -120,6 +120,12 @@ class Bot(object):
target, rel, value = args[1:4]
target = self.parse_target(target)
if value[-1].lower() in ("k", "m", "g", "t"):
unit = value[-1].lower()
value = value[:-1]
value = float(value)
value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
rule_id = uuid.uuid4()
rules[rule_id] = {
"target": target,
@ -187,6 +193,8 @@ class Bot(object):
elif data["msg_type"] == "down":
self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
elif data["msg_type"] == "blip":
self.send_all("\x02\x030,4 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
elif data["msg_type"] == "value":
for rule_id, rule in rules.iteritems():
check_vals = {
@ -196,10 +204,14 @@ class Bot(object):
"unit": [data["unit"]]
}
failed = False
for segment in ("host", "service", "resource", "unit"):
for val in check_vals[segment]:
if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
continue
failed = True
break
if failed:
continue # Skip to next
# We haven't broken out in the past bit of code, so we're still matching the pattern...
eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
@ -224,8 +236,6 @@ class Bot(object):
else:
alarm = False
print value, operator, rule_value, alarm
self.trigger_alarm(rule_id, data, alarm, value, key)
def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
@ -234,21 +244,17 @@ class Bot(object):
if key not in self.known_alarms:
if active:
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM ACTIVE, UNKNOWN ENTRY"
self.known_alarms[key] = time.time()
else:
self.known_alarms[key] = False
print "ALARM STOP, UNKNOWN ENTRY"
else:
if self.known_alarms[key] == False and active:
# Alarm activated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM ACTIVE, EXISTING ENTRY"
self.known_alarms[key] = time.time()
elif self.known_alarms[key] != False and not active:
# Alarm deactivated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM STOP, EXISTING ENTRY"
self.known_alarms[key] = False
def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):

@ -1,6 +1,6 @@
#!/usr/bin/env python2
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
from nacl.public import PublicKey, PrivateKey, Box
ctx = zmq.Context()
@ -23,15 +23,19 @@ boxes = {}
def heartbeat():
for hostname, node in nodes.iteritems():
retries = 0
while retries < config["heartbeat"]["attempts"]:
try:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(config["heartbeat"]["timeout"])
s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1))
s.connect((node["ip"], node["port"]))
s.shutdown(socket.SHUT_RDWR)
s.close()
up = True
break
except socket.error, e:
up = False
retries += 1
try:
status_changed = (up != last_node_status[hostname])
@ -45,9 +49,16 @@ def heartbeat():
if status_changed:
if up:
msg_type = "up"
send_message = True
else:
msg_type = "down"
send_message = True
else:
if retries > 0:
msg_type = "blip"
send_message = True
if send_message:
distributor.send(msgpack.packb({
"host": config["hostname"],
"message": {

@ -1,2 +1,7 @@
# Address the daemon's outgoing (shipper) socket binds to.
# NOTE(review): assumes this is the config file cstatsd loads — confirm.
endpoint: tcp://*:6543
# Public key, written as 64 hex characters.
# NOTE(review): presumably the key of the peer that messages are encrypted for — confirm against the code that reads it.
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
# Scripts launched at daemon startup; each name is joined onto the directory
# containing the daemon script. Not launched when the daemon is started with
# --disable-autostart as its first argument.
autostart:
- stats-processes
- stats-ports
- stats-machines

@ -1,8 +1,13 @@
#!/usr/bin/env python2
import zmq, yaml, binascii, nacl
import zmq, yaml, binascii, nacl, sys, subprocess, os
from nacl.public import PublicKey, PrivateKey, Box
basedir = os.path.dirname(os.path.realpath(__file__))
with open("cstatsd.pid", "w") as pidfile:
pidfile.write(str(os.getpid()))
ctx = zmq.Context()
with open("config/cstatsd.yaml", "r") as cfile:
@ -21,6 +26,17 @@ collector.bind("ipc:///tmp/cstatsd")
shipper = ctx.socket(zmq.PUSH)
shipper.bind(config["endpoint"])
# Autostart is skipped only when the FIRST command-line argument is
# "--disable-autostart". Slicing argv yields an empty list when no argument
# was given, so no catch-all `except` is needed to handle that case.
disable_autostart = "--disable-autostart" in sys.argv[1:2]

if not disable_autostart:
    # The child processes inherit this handle as stdout/stderr, which
    # discards their output. os.devnull avoids hard-coding "/dev/null".
    with open(os.devnull, "w+") as stfu:
        # A missing or empty "autostart" key means there is nothing to launch.
        for script in config.get("autostart") or []:
            # Script names are resolved relative to this file's directory.
            script_path = os.path.join(basedir, script)
            # Parenthesised single-argument print behaves the same on
            # Python 2 (which this script targets) and Python 3.
            print(script_path)
            subprocess.Popen([script_path], stdout=stfu, stderr=stfu)
while True:
message = collector.recv()
nonce = nacl.utils.random(Box.NONCE_SIZE)

@ -0,0 +1,5 @@
#!/bin/bash
# Stop cstatsd and its child processes.
# Reads the daemon's PID from cstatsd.pid in the current working directory.
PIDFILE="cstatsd.pid"

if [ ! -r "$PIDFILE" ]; then
    echo "Cannot read $PIDFILE; run this from the directory cstatsd was started in." >&2
    exit 1
fi

PID=$(cat "$PIDFILE")

# Refuse to continue on an empty or non-numeric PID, so pkill/kill are never
# invoked with a missing or bogus argument.
case "$PID" in
    ''|*[!0-9]*)
        echo "$PIDFILE does not contain a valid PID." >&2
        exit 1
        ;;
esac

# Terminate the children first (pkill -P matches on parent PID), then the
# daemon itself.
pkill -P "$PID"
kill "$PID"

@ -103,7 +103,7 @@ while True:
}
}))
swap_data = psutil.virtual_memory()
swap_data = psutil.swap_memory()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",

@ -1,2 +1,4 @@
apt-get install -y libzmq-dev
#!/bin/bash
# You need squeeze-backports if you run this on squeeze!
apt-get install -y libzmq-dev libffi-dev
pip install pyzmq msgpack-python pynacl pyyaml

Loading…
Cancel
Save