Browse Source

Reorganize some files, add support for units in alarm rules, support network blip detection (including more leniency for heartbeat failures), fix bug in alarm matching, remove debug output, add auto-start mechanism and killscript to cstatsd, fix deps.sh

develop
Sven Slootweg 10 years ago
parent
commit
e2c9097585
  1. 2
      .gitignore
  2. 20
      alert/alert
  3. 31
      ccollectd/ccollectd
  4. 5
      cstatsd/config/cstatsd.yaml.example
  5. 18
      cstatsd/cstatsd
  6. 5
      cstatsd/kill-stats
  7. 2
      cstatsd/stats-machine
  8. 0
      cstatsd/stats-ports
  9. 0
      cstatsd/stats-processes
  10. 0
      cstatsd/stats-tahoe
  11. 4
      deps.sh

2
.gitignore

@ -6,3 +6,5 @@ ccollectd/pubkey.dat
ccollectd/privkey.dat
cstatsd/pubkey.dat
cstatsd/privkey.dat
cstatsd/cstatsd.pid
alert/rules.pickle

20
alert/alert

@ -120,6 +120,12 @@ class Bot(object):
target, rel, value = args[1:4]
target = self.parse_target(target)
if value[-1].lower() in ("k", "m", "g", "t"):
unit = value[-1].lower()
value = value[:-1]
value = float(value)
value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
rule_id = uuid.uuid4()
rules[rule_id] = {
"target": target,
@ -187,6 +193,8 @@ class Bot(object):
elif data["msg_type"] == "down":
self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
elif data["msg_type"] == "blip":
self.send_all("\x02\x030,4 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
elif data["msg_type"] == "value":
for rule_id, rule in rules.iteritems():
check_vals = {
@ -196,10 +204,14 @@ class Bot(object):
"unit": [data["unit"]]
}
failed = False
for segment in ("host", "service", "resource", "unit"):
for val in check_vals[segment]:
if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
continue
failed = True
break
if failed:
continue # Skip to next
# We haven't broken out in the past bit of code, so we're still matching the pattern...
eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
@ -223,8 +235,6 @@ class Bot(object):
alarm = (value != rule_value)
else:
alarm = False
print value, operator, rule_value, alarm
self.trigger_alarm(rule_id, data, alarm, value, key)
@ -234,21 +244,17 @@ class Bot(object):
if key not in self.known_alarms:
if active:
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM ACTIVE, UNKNOWN ENTRY"
self.known_alarms[key] = time.time()
else:
self.known_alarms[key] = False
print "ALARM STOP, UNKNOWN ENTRY"
else:
if self.known_alarms[key] == False and active:
# Alarm activated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM ACTIVE, EXISTING ENTRY"
self.known_alarms[key] = time.time()
elif self.known_alarms[key] != False and not active:
# Alarm deactivated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
print "ALARM STOP, EXISTING ENTRY"
self.known_alarms[key] = False
def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):

31
ccollectd/ccollectd

@ -1,6 +1,6 @@
#!/usr/bin/env python2
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
from nacl.public import PublicKey, PrivateKey, Box
ctx = zmq.Context()
@ -23,15 +23,19 @@ boxes = {}
def heartbeat():
for hostname, node in nodes.iteritems():
try:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(config["heartbeat"]["timeout"])
s.connect((node["ip"], node["port"]))
s.shutdown(socket.SHUT_RDWR)
s.close()
up = True
except socket.error, e:
up = False
retries = 0
while retries < config["heartbeat"]["attempts"]:
try:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1))
s.connect((node["ip"], node["port"]))
s.shutdown(socket.SHUT_RDWR)
s.close()
up = True
break
except socket.error, e:
up = False
retries += 1
try:
status_changed = (up != last_node_status[hostname])
@ -45,9 +49,16 @@ def heartbeat():
if status_changed:
if up:
msg_type = "up"
send_message = True
else:
msg_type = "down"
send_message = True
else:
if retries > 0:
msg_type = "blip"
send_message = True
if send_message:
distributor.send(msgpack.packb({
"host": config["hostname"],
"message": {

5
cstatsd/config/cstatsd.yaml.example

@ -1,2 +1,7 @@
endpoint: tcp://*:6543
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
autostart:
- stats-processes
- stats-ports
- stats-machine

18
cstatsd/cstatsd

@ -1,8 +1,13 @@
#!/usr/bin/env python2
import zmq, yaml, binascii, nacl
import zmq, yaml, binascii, nacl, sys, subprocess, os
from nacl.public import PublicKey, PrivateKey, Box
basedir = os.path.dirname(os.path.realpath(__file__))
with open("cstatsd.pid", "w") as pidfile:
pidfile.write(str(os.getpid()))
ctx = zmq.Context()
with open("config/cstatsd.yaml", "r") as cfile:
@ -21,6 +26,17 @@ collector.bind("ipc:///tmp/cstatsd")
shipper = ctx.socket(zmq.PUSH)
shipper.bind(config["endpoint"])
try:
disable_autostart = (sys.argv[1] == "--disable-autostart")
except:
disable_autostart = False
if disable_autostart == False:
with open("/dev/null", "w+") as stfu:
for script in config["autostart"]:
print os.path.join(basedir, script)
subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)
while True:
message = collector.recv()
nonce = nacl.utils.random(Box.NONCE_SIZE)

5
cstatsd/kill-stats

@ -0,0 +1,5 @@
#!/bin/bash
# Stop a running cstatsd daemon together with the child processes it spawned.
# The daemon's PID is read from cstatsd.pid in the current working directory.
PIDFILE="cstatsd.pid"

# Without a usable PID file there is nothing to signal; bail out instead of
# invoking pkill/kill with an empty argument.
if [ ! -s "$PIDFILE" ]; then
    echo "kill-stats: $PIDFILE is missing or empty" >&2
    exit 1
fi

PID=$(cat "$PIDFILE")

# Terminate the children first (matched by parent PID), then the daemon itself.
pkill -P "$PID"
kill "$PID"

2
cstatsd/stats-machine.py → cstatsd/stats-machine

@ -103,7 +103,7 @@ while True:
}
}))
swap_data = psutil.virtual_memory()
swap_data = psutil.swap_memory()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",

0
cstatsd/stats-ports.py → cstatsd/stats-ports

0
cstatsd/stats-processes.py → cstatsd/stats-processes

0
cstatsd/stats-tahoe.py → cstatsd/stats-tahoe

4
deps.sh

@ -1,2 +1,4 @@
apt-get install -y libzmq-dev
#!/bin/bash
# You need squeeze-backports if you run this on squeeze!
apt-get install -y libzmq-dev libffi-dev
pip install pyzmq msgpack-python pynacl pyyaml

Loading…
Cancel
Save