Reorganize some files, add support for units in alarm rules, support network blip detection (including more leniency for heartbeat failures), fix bug in alarm matching, remove debug output, add auto-start mechanism and killscript to cstatsd, fix deps.sh

develop
Sven Slootweg 10 years ago
parent 4cf0601b05
commit e2c9097585

2
.gitignore vendored

@ -6,3 +6,5 @@ ccollectd/pubkey.dat
ccollectd/privkey.dat ccollectd/privkey.dat
cstatsd/pubkey.dat cstatsd/pubkey.dat
cstatsd/privkey.dat cstatsd/privkey.dat
cstatsd/cstatsd.pid
alert/rules.pickle

@ -120,6 +120,12 @@ class Bot(object):
target, rel, value = args[1:4] target, rel, value = args[1:4]
target = self.parse_target(target) target = self.parse_target(target)
if value[-1].lower() in ("k", "m", "g", "t"):
unit = value[-1].lower()
value = value[:-1]
value = float(value)
value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
rule_id = uuid.uuid4() rule_id = uuid.uuid4()
rules[rule_id] = { rules[rule_id] = {
"target": target, "target": target,
@ -187,6 +193,8 @@ class Bot(object):
elif data["msg_type"] == "down": elif data["msg_type"] == "down":
self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time() self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data) self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
elif data["msg_type"] == "blip":
self.send_all("\x02\x030,4 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
elif data["msg_type"] == "value": elif data["msg_type"] == "value":
for rule_id, rule in rules.iteritems(): for rule_id, rule in rules.iteritems():
check_vals = { check_vals = {
@ -196,10 +204,14 @@ class Bot(object):
"unit": [data["unit"]] "unit": [data["unit"]]
} }
failed = False
for segment in ("host", "service", "resource", "unit"): for segment in ("host", "service", "resource", "unit"):
for val in check_vals[segment]: for val in check_vals[segment]:
if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]): if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
continue failed = True
break
if failed:
continue # Skip to next
# We haven't broken out in the past bit of code, so we're still matching the pattern... # We haven't broken out in the past bit of code, so we're still matching the pattern...
eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])] eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
@ -223,8 +235,6 @@ class Bot(object):
alarm = (value != rule_value) alarm = (value != rule_value)
else: else:
alarm = False alarm = False
print value, operator, rule_value, alarm
self.trigger_alarm(rule_id, data, alarm, value, key) self.trigger_alarm(rule_id, data, alarm, value, key)
@ -234,21 +244,17 @@ class Bot(object):
if key not in self.known_alarms: if key not in self.known_alarms:
if active: if active:
self.transmit_alarm(rule_id, data, active, offending_value, offending_key) self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM ACTIVE, UNKNOWN ENTRY"
self.known_alarms[key] = time.time() self.known_alarms[key] = time.time()
else: else:
self.known_alarms[key] = False self.known_alarms[key] = False
print "ALARM STOP, UNKNOWN ENTRY"
else: else:
if self.known_alarms[key] == False and active: if self.known_alarms[key] == False and active:
# Alarm activated # Alarm activated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key) self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM ACTIVE, EXISTING ENTRY"
self.known_alarms[key] = time.time() self.known_alarms[key] = time.time()
elif self.known_alarms[key] != False and not active: elif self.known_alarms[key] != False and not active:
# Alarm deactivated # Alarm deactivated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key) self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM STOP, EXISTING ENTRY"
self.known_alarms[key] = False self.known_alarms[key] = False
def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None): def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):

@ -1,6 +1,6 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
from nacl.public import PublicKey, PrivateKey, Box from nacl.public import PublicKey, PrivateKey, Box
ctx = zmq.Context() ctx = zmq.Context()
@ -23,15 +23,19 @@ boxes = {}
def heartbeat(): def heartbeat():
for hostname, node in nodes.iteritems(): for hostname, node in nodes.iteritems():
try: retries = 0
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) while retries < config["heartbeat"]["attempts"]:
s.settimeout(config["heartbeat"]["timeout"]) try:
s.connect((node["ip"], node["port"])) s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.shutdown(socket.SHUT_RDWR) s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1))
s.close() s.connect((node["ip"], node["port"]))
up = True s.shutdown(socket.SHUT_RDWR)
except socket.error, e: s.close()
up = False up = True
break
except socket.error, e:
up = False
retries += 1
try: try:
status_changed = (up != last_node_status[hostname]) status_changed = (up != last_node_status[hostname])
@ -45,9 +49,16 @@ def heartbeat():
if status_changed: if status_changed:
if up: if up:
msg_type = "up" msg_type = "up"
send_message = True
else: else:
msg_type = "down" msg_type = "down"
send_message = True
else:
if retries > 0:
msg_type = "blip"
send_message = True
if send_message:
distributor.send(msgpack.packb({ distributor.send(msgpack.packb({
"host": config["hostname"], "host": config["hostname"],
"message": { "message": {

@ -1,2 +1,7 @@
endpoint: tcp://*:6543 endpoint: tcp://*:6543
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65 pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
autostart:
- stats-processes
- stats-ports
- stats-machines

@ -1,8 +1,13 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import zmq, yaml, binascii, nacl import zmq, yaml, binascii, nacl, sys, subprocess, os
from nacl.public import PublicKey, PrivateKey, Box from nacl.public import PublicKey, PrivateKey, Box
basedir = os.path.dirname(os.path.realpath(__file__))
with open("cstatsd.pid", "w") as pidfile:
pidfile.write(str(os.getpid()))
ctx = zmq.Context() ctx = zmq.Context()
with open("config/cstatsd.yaml", "r") as cfile: with open("config/cstatsd.yaml", "r") as cfile:
@ -21,6 +26,17 @@ collector.bind("ipc:///tmp/cstatsd")
shipper = ctx.socket(zmq.PUSH) shipper = ctx.socket(zmq.PUSH)
shipper.bind(config["endpoint"]) shipper.bind(config["endpoint"])
try:
disable_autostart = (sys.argv[1] == "--disable-autostart")
except:
disable_autostart = False
if disable_autostart == False:
with open("/dev/null", "w+") as stfu:
for script in config["autostart"]:
print os.path.join(basedir, script)
subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)
while True: while True:
message = collector.recv() message = collector.recv()
nonce = nacl.utils.random(Box.NONCE_SIZE) nonce = nacl.utils.random(Box.NONCE_SIZE)

@ -0,0 +1,5 @@
#!/bin/bash
# Stop a running cstatsd daemon together with its child processes.
#
# cstatsd writes its PID to "cstatsd.pid" relative to its working directory
# at startup, so run this script from that same directory.

PIDFILE="cstatsd.pid"

if [ ! -r "$PIDFILE" ]; then
    echo "Cannot read $PIDFILE; is cstatsd running from this directory?" >&2
    exit 1
fi

PID=$(cat "$PIDFILE")

# Refuse to continue on an empty or non-numeric PID: with an empty value,
# pkill and kill would be invoked without a target.
case "$PID" in
    ''|*[!0-9]*)
        echo "Invalid PID in $PIDFILE: '$PID'" >&2
        exit 1
        ;;
esac

# Terminate the children first (pkill -P matches on parent PID), then the
# daemon itself.
pkill -P "$PID"
kill "$PID"

@ -103,7 +103,7 @@ while True:
} }
})) }))
swap_data = psutil.virtual_memory() swap_data = psutil.swap_memory()
sock.send(msgpack.packb({ sock.send(msgpack.packb({
"service": "machine", "service": "machine",
"msg_type": "value", "msg_type": "value",

@ -1,2 +1,4 @@
apt-get install -y libzmq-dev #!/bin/bash
# You need squeeze-backports if you run this on squeeze!
apt-get install -y libzmq-dev libffi-dev
pip install pyzmq msgpack-python pynacl pyyaml pip install pyzmq msgpack-python pynacl pyyaml

Loading…
Cancel
Save