Compare commits
33 Commits
Author | SHA1 | Date |
---|---|---|
Sven Slootweg | 7efb3b3dd2 | 11 years ago |
Sven Slootweg | 59ac723188 | 11 years ago |
Sven Slootweg | d6b0fc2ad4 | 11 years ago |
Sven Slootweg | a796725057 | 11 years ago |
Sven Slootweg | 0c87ee058c | 11 years ago |
Sven Slootweg | 570d8f3b85 | 11 years ago |
Sven Slootweg | 57c015f161 | 11 years ago |
Sven Slootweg | e87d048ee9 | 11 years ago |
Sven Slootweg | b67fdc8ca3 | 11 years ago |
Sven Slootweg | 0dde712144 | 11 years ago |
Sven Slootweg | 4dbe92396c | 11 years ago |
Sven Slootweg | 7dca261010 | 11 years ago |
Sven Slootweg | 5424432ddb | 11 years ago |
Sven Slootweg | 971c5ccce3 | 11 years ago |
Sven Slootweg | 86b013a0b7 | 11 years ago |
Sven Slootweg | 5a7e3815ca | 11 years ago |
Sven Slootweg | ff98278520 | 11 years ago |
Sven Slootweg | 762d74d477 | 11 years ago |
Sven Slootweg | 1df76daa0d | 11 years ago |
Sven Slootweg | 96aa2b020f | 11 years ago |
Sven Slootweg | 5eb46c13e8 | 11 years ago |
Sven Slootweg | e04448dcd8 | 11 years ago |
Sven Slootweg | 11eb813164 | 11 years ago |
Sven Slootweg | e2c9097585 | 11 years ago |
Sven Slootweg | 4cf0601b05 | 11 years ago |
Sven Slootweg | 5db77ab87c | 11 years ago |
Sven Slootweg | ae552ac0f9 | 11 years ago |
Sven Slootweg | 20eaf50791 | 11 years ago |
Sven Slootweg | 97290dbb1c | 11 years ago |
Sven Slootweg | 56ae6b5305 | 11 years ago |
Sven Slootweg | 258f62af22 | 11 years ago |
Sven Slootweg | 4550e0a425 | 11 years ago |
Sven Slootweg | d310a95e7a | 11 years ago |
@ -1 +1,10 @@
|
|||||||
cstatsd/config/*.yaml
|
cstatsd/config/*.yaml
|
||||||
|
ccollectd/config.yaml
|
||||||
|
alert/config.yaml
|
||||||
|
*.pyc
|
||||||
|
ccollectd/pubkey.dat
|
||||||
|
ccollectd/privkey.dat
|
||||||
|
cstatsd/pubkey.dat
|
||||||
|
cstatsd/privkey.dat
|
||||||
|
cstatsd/cstatsd.pid
|
||||||
|
alert/rules.pickle
|
||||||
|
@ -0,0 +1,304 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch
|
||||||
|
import cPickle as pickle
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
with open("config.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open("rules.pickle", "r") as pfile:
|
||||||
|
rules = pickle.load(pfile)
|
||||||
|
except IOError, e:
|
||||||
|
rules = {}
|
||||||
|
|
||||||
|
fetcher = ctx.socket(zmq.SUB)
|
||||||
|
fetcher.setsockopt(zmq.SUBSCRIBE, "")
|
||||||
|
fetcher.connect("tcp://127.0.0.1:8998")
|
||||||
|
|
||||||
|
class Bot(object):
|
||||||
|
def __init__(self, hosts, port, nickname, realname, channels, admins, subsock):
|
||||||
|
self.hosts = hosts
|
||||||
|
self.port = port
|
||||||
|
self.nickname = nickname
|
||||||
|
self.realname = realname
|
||||||
|
self.channels = channels
|
||||||
|
self.admins = admins
|
||||||
|
self.subsock = subsock
|
||||||
|
self.connected = False
|
||||||
|
self.last_down = {}
|
||||||
|
self.known_alarms = {}
|
||||||
|
|
||||||
|
self.command_map = {
|
||||||
|
"422": self.join_all_channels,
|
||||||
|
"376": self.join_all_channels,
|
||||||
|
"PRIVMSG": self.receive_message
|
||||||
|
}
|
||||||
|
|
||||||
|
def split_irc(self, message):
    """Split a raw IRC line into its component tokens.

    A leading ':' marks a prefix token; the first ':' in the remainder
    starts a trailing argument that may contain spaces and is kept as a
    single token. The prefix marker is re-attached to the first token.
    """
    prefix = ":" if message.startswith(":") else ""
    if prefix:
        message = message[1:]

    if ":" in message:
        head, trailing = message.split(":", 1)
        tokens = head.strip().split()
        tokens.append(trailing)
    else:
        tokens = message.split()

    tokens[0] = prefix + tokens[0]
    return tokens
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
while True: # Connect loop
|
||||||
|
host = random.choice(self.hosts)
|
||||||
|
|
||||||
|
self.sock = socket.socket()
|
||||||
|
try:
|
||||||
|
self.sock.connect((host, self.port))
|
||||||
|
except socket.error, e:
|
||||||
|
continue # Reconnect
|
||||||
|
self.send_raw("NICK %s" % self.nickname)
|
||||||
|
self.sock.send("USER %s 0 0 :%s\r\n" % (self.nickname, self.realname))
|
||||||
|
|
||||||
|
buff = ""
|
||||||
|
while True: # Read loop
|
||||||
|
r, w, x = zmq.select([self.sock, self.subsock], [], [])
|
||||||
|
|
||||||
|
for s in r:
|
||||||
|
if s == self.sock.fileno():
|
||||||
|
try:
|
||||||
|
recvdata = self.sock.recv(1024)
|
||||||
|
except socket.error, e:
|
||||||
|
break # Something went wrong, reconnect...
|
||||||
|
|
||||||
|
if len(recvdata) == 0:
|
||||||
|
break # We have disconnected...
|
||||||
|
|
||||||
|
buff += recvdata
|
||||||
|
messages = buff.split("\n")
|
||||||
|
buff = messages.pop()
|
||||||
|
|
||||||
|
for message in messages:
|
||||||
|
self.process_message(self.split_irc(message.strip("\r")))
|
||||||
|
elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
|
||||||
|
# Process incoming data from the subscribe socket...
|
||||||
|
message = msgpack.unpackb(s.recv())
|
||||||
|
self.process_stats(message)
|
||||||
|
|
||||||
|
def send_raw(self, message):
|
||||||
|
self.sock.send("%s\r\n" % message)
|
||||||
|
|
||||||
|
def send_message(self, recipient, message):
|
||||||
|
if self.connected == True:
|
||||||
|
self.send_raw("PRIVMSG %s :%s" % (recipient, message))
|
||||||
|
|
||||||
|
def send_all(self, message):
|
||||||
|
for channel in self.channels:
|
||||||
|
self.send_message(channel, message)
|
||||||
|
|
||||||
|
def join(self, channel):
|
||||||
|
self.send_raw("JOIN %s" % channel)
|
||||||
|
|
||||||
|
def join_all_channels(self, message):
|
||||||
|
self.connected = True
|
||||||
|
for channel in self.channels:
|
||||||
|
self.join(channel)
|
||||||
|
|
||||||
|
def receive_message(self, message):
|
||||||
|
args = message[3].split()
|
||||||
|
sender = message[0][1:].split("!", 1)[0]
|
||||||
|
channel = message[2]
|
||||||
|
|
||||||
|
try:
|
||||||
|
if sender in self.admins:
|
||||||
|
if args[0] == "!addrule":
|
||||||
|
target, rel, value = args[1:4]
|
||||||
|
target = self.parse_target(target)
|
||||||
|
|
||||||
|
if value[-1].lower() in ("k", "m", "g", "t"):
|
||||||
|
unit = value[-1].lower()
|
||||||
|
value = value[:-1]
|
||||||
|
value = float(value)
|
||||||
|
value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
|
||||||
|
|
||||||
|
rule_id = uuid.uuid4()
|
||||||
|
rules[rule_id] = {
|
||||||
|
"target": target,
|
||||||
|
"operator": rel,
|
||||||
|
"value": value
|
||||||
|
}
|
||||||
|
|
||||||
|
with open("rules.pickle", "w") as pfile:
|
||||||
|
pickle.dump(rules, pfile)
|
||||||
|
|
||||||
|
self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id))
|
||||||
|
except Exception, e:
|
||||||
|
self.send_message(channel, str(e))
|
||||||
|
|
||||||
|
def parse_target(self, target):
    """Parse a rule target of the form host!service.resource:unit.attribute.

    Any segment given as the wildcard '*' is normalized to True so that
    matching code can short-circuit on it. Returns a dict with the five
    named segments.
    """
    host, remainder = target.split("!", 1)
    service, remainder = remainder.split(".", 1)
    resource, remainder = remainder.split(":", 1)
    unit, attribute = remainder.split(".", 1)
    # TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments

    def normalize(segment):
        # '*' means "match anything"; represented as True.
        return True if segment == "*" else segment

    return {
        "host": normalize(host),
        "service": normalize(service),
        "resource": normalize(resource),
        "unit": normalize(unit),
        "attribute": normalize(attribute)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def format_time_duration(self, seconds):
    """Render a duration in seconds as e.g. '1 days, 2 hours, 5 seconds'.

    Zero-valued magnitudes are omitted, and the seconds component is
    clamped up to 1 so the result is never an empty string.
    Based on http://stackoverflow.com/a/20222351/1332715
    """
    days, remainder = divmod(seconds, 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds = divmod(remainder, 60)
    if seconds < 1:
        seconds = 1

    components = (
        ("days", days),
        ("hours", hours),
        ("minutes", minutes),
        ("seconds", seconds),
    )
    return ", ".join("%d %s" % (int(amount), name) for name, amount in components if amount)
|
||||||
|
|
||||||
|
def process_stats(self, message):
|
||||||
|
data = message["message"]
|
||||||
|
data["host"] = message["host"]
|
||||||
|
|
||||||
|
if data["msg_type"] == "up" and data["initial"] == True:
|
||||||
|
return # We don't need to say what is up, initially...
|
||||||
|
|
||||||
|
# TODO: Duration
|
||||||
|
if data["msg_type"] == "up":
|
||||||
|
try:
|
||||||
|
data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data])
|
||||||
|
except KeyError, e:
|
||||||
|
data["duration"] = "0 seconds"
|
||||||
|
self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data)
|
||||||
|
elif data["msg_type"] == "down":
|
||||||
|
self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
|
||||||
|
self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
|
||||||
|
elif data["msg_type"] == "blip":
|
||||||
|
self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
|
||||||
|
elif data["msg_type"] == "value":
|
||||||
|
for rule_id, rule in rules.iteritems():
|
||||||
|
check_vals = {
|
||||||
|
"host": [data["host"]],
|
||||||
|
"service": [data["service"]],
|
||||||
|
"resource": [data["resource_type"]],
|
||||||
|
"unit": [data["unit"]]
|
||||||
|
}
|
||||||
|
|
||||||
|
failed = False
|
||||||
|
for segment in ("host", "service", "resource", "unit"):
|
||||||
|
for val in check_vals[segment]:
|
||||||
|
if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
|
||||||
|
failed = True
|
||||||
|
break
|
||||||
|
if failed:
|
||||||
|
continue # Skip to next
|
||||||
|
|
||||||
|
# We haven't broken out in the past bit of code, so we're still matching the pattern...
|
||||||
|
eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
|
||||||
|
|
||||||
|
for key in eligible_keys:
|
||||||
|
value = data["values"][key]
|
||||||
|
rule_value = float(rule["value"])
|
||||||
|
operator = rule["operator"]
|
||||||
|
|
||||||
|
if operator == "=":
|
||||||
|
alarm = (value == rule_value)
|
||||||
|
elif operator == ">":
|
||||||
|
alarm = (value > rule_value)
|
||||||
|
elif operator == "<":
|
||||||
|
alarm = (value < rule_value)
|
||||||
|
elif operator == ">=":
|
||||||
|
alarm = (value >= rule_value)
|
||||||
|
elif operator == "<=":
|
||||||
|
alarm = (value <= rule_value)
|
||||||
|
elif operator == "!=":
|
||||||
|
alarm = (value != rule_value)
|
||||||
|
else:
|
||||||
|
alarm = False
|
||||||
|
|
||||||
|
self.trigger_alarm(rule_id, data, alarm, value, key)
|
||||||
|
|
||||||
|
def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
    """Track alarm state and notify only on state transitions.

    An alarm is identified by rule, host, unit and offending attribute.
    The stored state is either False (inactive) or the activation
    timestamp. transmit_alarm is invoked only when the alarm flips
    between active and inactive, never while the state is unchanged.
    """
    alarm_key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)

    if alarm_key not in self.known_alarms:
        # First sighting of this alarm; only an active state is announced.
        if active:
            self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
            self.known_alarms[alarm_key] = time.time()
        else:
            self.known_alarms[alarm_key] = False
    elif self.known_alarms[alarm_key] == False and active:
        # Edge: inactive -> active.
        self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
        self.known_alarms[alarm_key] = time.time()
    elif self.known_alarms[alarm_key] != False and not active:
        # Edge: active -> inactive.
        self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
        self.known_alarms[alarm_key] = False
|
||||||
|
|
||||||
|
def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
|
||||||
|
# At this point, we're sure that we want to notify...
|
||||||
|
rule_target = rules[rule_id]["target"].copy()
|
||||||
|
for k, v in rule_target.iteritems():
|
||||||
|
if v is True:
|
||||||
|
rule_target[k] = "*"
|
||||||
|
|
||||||
|
rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target
|
||||||
|
|
||||||
|
info = {
|
||||||
|
"host": data["host"],
|
||||||
|
"rule_id": rule_id,
|
||||||
|
"rule_pattern": rule_pattern
|
||||||
|
}
|
||||||
|
|
||||||
|
if not active:
|
||||||
|
key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
|
||||||
|
try:
|
||||||
|
info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key])
|
||||||
|
except KeyError, e:
|
||||||
|
info["duration"] = "0 seconds"
|
||||||
|
info["unit"] = data["unit"]
|
||||||
|
info["attribute"] = offending_key
|
||||||
|
|
||||||
|
self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info)
|
||||||
|
else:
|
||||||
|
info["value"] = offending_value
|
||||||
|
info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"])
|
||||||
|
info["unit"] = data["unit"]
|
||||||
|
info["attribute"] = offending_key
|
||||||
|
|
||||||
|
self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info)
|
||||||
|
|
||||||
|
def process_message(self, message):
    """Dispatch one parsed IRC message (a token list from split_irc).

    PING is answered directly with a PONG; any other message is routed
    through self.command_map keyed on the upper-cased command token
    (message[1]). Messages with no registered handler are ignored.
    """
    if message[0].upper() == "PING":
        self.send_raw("PONG %s" % message[1])
        return

    # Look the handler up explicitly instead of wrapping the *call* in a
    # KeyError catch: the old form also silenced KeyErrors raised inside
    # the handler itself, hiding real bugs in the handlers.
    handler = self.command_map.get(message[1].upper())
    if handler is not None:
        handler(message)
|
||||||
|
|
||||||
|
|
||||||
|
bot = Bot(config["irc"]["hosts"], config["irc"]["port"], config["irc"]["nickname"], config["irc"]["realname"], config["irc"]["channels"], config["irc"]["admins"], fetcher)
|
||||||
|
bot.run()
|
@ -0,0 +1,12 @@
|
|||||||
|
irc:
|
||||||
|
hosts:
|
||||||
|
- kerpia.cryto.net
|
||||||
|
- box.cryto.net
|
||||||
|
- arvel.cryto.net
|
||||||
|
port: 6667
|
||||||
|
nickname: StatusBot
|
||||||
|
realname: Cryto System Monitoring Service
|
||||||
|
admins:
|
||||||
|
- joepie91
|
||||||
|
channels:
|
||||||
|
- "#test"
|
@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
|
||||||
|
from nacl.public import PublicKey, PrivateKey, Box
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
distributor = ctx.socket(zmq.PUB)
|
||||||
|
distributor.bind("tcp://127.0.0.1:8998")
|
||||||
|
|
||||||
|
poller = zmq.Poller()
|
||||||
|
|
||||||
|
with open("config.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
with open("privkey.dat", "r") as f:
|
||||||
|
privkey = PrivateKey(binascii.unhexlify(f.read()))
|
||||||
|
|
||||||
|
nodes = config["nodes"]
|
||||||
|
last_node_status = {}
|
||||||
|
socket_map = {}
|
||||||
|
boxes = {}
|
||||||
|
|
||||||
|
def heartbeat():
    """Probe every configured node over TCP and publish status changes.

    For each node a plain TCP connect is attempted up to
    config["heartbeat"]["attempts"] times, with a timeout that shrinks on
    each retry. State transitions ("up"/"down") and a recovery within a
    single cycle ("blip") are published on the distributor socket; the
    very first observation of a node is flagged as initial.
    """
    for hostname, node in nodes.items():
        attempts = 0
        while attempts < config["heartbeat"]["attempts"]:
            try:
                probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                # Give later retries progressively less time to respond.
                probe.settimeout(float(config["heartbeat"]["timeout"]) / (attempts + 1))
                probe.connect((node["ip"], node["port"]))
                probe.shutdown(socket.SHUT_RDWR)
                probe.close()
                up = True
                break
            except socket.error:
                up = False
                attempts += 1

        if hostname in last_node_status:
            status_changed = (up != last_node_status[hostname])
            initial = False
        else:
            # Never seen this node before: always record, flag as initial.
            status_changed = True
            initial = True

        last_node_status[hostname] = up

        msg_type = None
        if status_changed:
            msg_type = "up" if up else "down"
        elif up and attempts > 0:
            # Still up overall, but at least one probe failed this cycle.
            msg_type = "blip"

        if msg_type is not None:
            distributor.send(msgpack.packb({
                "host": config["hostname"],
                "message": {
                    "service": "heartbeat",
                    "msg_type": msg_type,
                    "unit": hostname,
                    "initial": initial
                }
            }))
|
||||||
|
|
||||||
|
timers = zmqtimer.ZmqTimerManager()
|
||||||
|
timers.add_timer(zmqtimer.ZmqTimer(config["heartbeat"]["interval"], heartbeat))
|
||||||
|
|
||||||
|
for hostname, node in config["nodes"].iteritems():
|
||||||
|
boxes[hostname] = Box(privkey, PublicKey(binascii.unhexlify(node["pubkey"])))
|
||||||
|
grabber = ctx.socket(zmq.SUB)
|
||||||
|
grabber.setsockopt(zmq.SUBSCRIBE, "")
|
||||||
|
grabber.connect(node["endpoint"])
|
||||||
|
socket_map[grabber] = hostname
|
||||||
|
poller.register(grabber, zmq.POLLIN)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
timers.check()
|
||||||
|
socks = dict(poller.poll(timers.get_next_interval()))
|
||||||
|
|
||||||
|
for sock in socks:
|
||||||
|
if socks[sock] == zmq.POLLIN:
|
||||||
|
host = socket_map[sock]
|
||||||
|
try:
|
||||||
|
message = msgpack.unpackb(boxes[host].decrypt(sock.recv()))
|
||||||
|
except nacl.exceptions.CryptoError, e:
|
||||||
|
# Probably a spoofed message... skip to next socket
|
||||||
|
sys.stderr.write("Ignoring message... spoofed? (origin: %s)\n" % host) # FIXME: Use logging module...
|
||||||
|
continue
|
||||||
|
except Exception, e:
|
||||||
|
sys.stderr.write(repr(e) + "\n")
|
||||||
|
continue
|
||||||
|
distributor.send(msgpack.packb({
|
||||||
|
"host": host,
|
||||||
|
"message": message
|
||||||
|
}))
|
||||||
|
|
@ -0,0 +1,13 @@
|
|||||||
|
hostname: monitoring.cryto.net
|
||||||
|
|
||||||
|
heartbeat:
|
||||||
|
interval: 5
|
||||||
|
timeout: 1
|
||||||
|
attempts: 3
|
||||||
|
|
||||||
|
nodes:
|
||||||
|
localhost:
|
||||||
|
ip: 127.0.0.1
|
||||||
|
port: 6543
|
||||||
|
endpoint: tcp://127.0.0.1:6543
|
||||||
|
pubkey: bd784ef4065c9bd31627106dc55e26764605a144c6fc45ce93f33cbd19dd7333
|
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env python2
"""Generate a NaCl keypair and store it hex-encoded on disk.

Writes privkey.dat and pubkey.dat in the working directory, then locks
the private key file down to owner read/write only.
"""

import yaml, os, stat, binascii
from nacl.public import PrivateKey

keypair = PrivateKey.generate()

# Hex-encode the raw key bytes so the files are plain text.
with open("privkey.dat", "w") as keyfile:
    keyfile.write(binascii.hexlify(str(keypair)))

with open("pubkey.dat", "w") as keyfile:
    keyfile.write(binascii.hexlify(str(keypair.public_key)))

# The private key must only be readable/writable by its owner.
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
import zmq, msgpack
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
fetcher = ctx.socket(zmq.SUB)
|
||||||
|
fetcher.setsockopt(zmq.SUBSCRIBE, "")
|
||||||
|
fetcher.connect("tcp://127.0.0.1:8998")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
message = msgpack.unpackb(fetcher.recv())
|
||||||
|
print message
|
@ -0,0 +1,41 @@
|
|||||||
|
import time
|
||||||
|
|
||||||
|
class ZmqTimerManager(object):
    """Tracks a set of ZmqTimer objects and computes zmq poll intervals.

    next_call caches the earliest upcoming timer deadline (epoch seconds)
    so that check() can cheaply skip work until something is due.
    """

    def __init__(self):
        self.timers = []
        # 0 means "no deadline cached yet", so the first check() always runs.
        self.next_call = 0

    def add_timer(self, timer):
        """Register a timer to be serviced by check()."""
        self.timers.append(timer)

    def check(self):
        """Run every timer's check() once the cached deadline has passed."""
        if time.time() > self.next_call:
            for timer in self.timers:
                timer.check()

    def get_next_interval(self):
        """Return milliseconds until the next timer is due (0 if overdue).

        Recomputes the cached deadline from all timers whenever the
        cached one is already in the past.
        """
        if time.time() >= self.next_call:
            self.next_call = min(timer.get_next_call() for timer in self.timers)
        remaining = (self.next_call - time.time()) * 1000
        return 0 if remaining < 0 else remaining
|
||||||
|
|
||||||
|
class ZmqTimer(object):
    """Invokes `callback` whenever more than `interval` seconds elapse."""

    def __init__(self, interval, callback):
        self.interval = interval
        self.callback = callback
        # Epoch of the most recent firing; 0 means "never fired yet".
        self.last_call = 0

    def check(self):
        """Fire the callback if the interval has elapsed since the last call."""
        if time.time() > self.last_call + self.interval:
            self.callback()
            self.last_call = time.time()

    def get_next_call(self):
        """Return the epoch timestamp at which this timer is next due."""
        return self.last_call + self.interval
|
@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
echo "Generating keypair..."
|
||||||
|
./genkey 2>/dev/null
|
||||||
|
./bootstrap-config
|
||||||
|
echo "Your public key: `cat pubkey.dat`"
|
||||||
|
echo "Server IP: `curl -s http://wtfismyip.com/text`" 2>/dev/null
|
@ -0,0 +1,86 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import yaml, sys
|
||||||
|
|
||||||
|
master_pubkey = raw_input("Public key of the master server: ")
|
||||||
|
|
||||||
|
print "You'll now be asked to configure ports to check. If you don't want to configure any ports, just hit enter without entering any information."
|
||||||
|
|
||||||
|
ports = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
port = raw_input("Port number: ")
|
||||||
|
if port.strip() == "":
|
||||||
|
break
|
||||||
|
service_name = raw_input("Service name for port %s: " % port)
|
||||||
|
ports[int(port)] = service_name
|
||||||
|
|
||||||
|
print "The same thing, except now for processes to check. Just hit enter without entering any information when you're done; the same goes for the argument list. As a wildcard, you can use *"
|
||||||
|
|
||||||
|
services = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
service_name = raw_input("Service name: ")
|
||||||
|
|
||||||
|
if service_name.strip() == "":
|
||||||
|
break
|
||||||
|
|
||||||
|
process_name = raw_input("Process name: ")
|
||||||
|
|
||||||
|
args = {}
|
||||||
|
argnum = 1
|
||||||
|
while True:
|
||||||
|
arg = raw_input("Argument %d: " % argnum)
|
||||||
|
if arg.strip() == "":
|
||||||
|
break
|
||||||
|
args[argnum] = arg
|
||||||
|
argnum += 1
|
||||||
|
|
||||||
|
services[service_name] = {
|
||||||
|
"name": process_name,
|
||||||
|
"args": args
|
||||||
|
}
|
||||||
|
|
||||||
|
print "Now enter any disk devices you wish to monitor. Leave empty and hit enter when done."
|
||||||
|
|
||||||
|
disks = []
|
||||||
|
|
||||||
|
while True:
|
||||||
|
device_name = raw_input("Device name: ")
|
||||||
|
if device_name.strip() == "":
|
||||||
|
break
|
||||||
|
disks.append(device_name)
|
||||||
|
|
||||||
|
# Write config files...
|
||||||
|
|
||||||
|
modules = []
|
||||||
|
|
||||||
|
modules.append("stats-machine")
|
||||||
|
with open("config/machine.yaml.example", "r") as ef:
|
||||||
|
with open("config/machine.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["drives"] = disks
|
||||||
|
ff.write(yaml.dump(data))
|
||||||
|
|
||||||
|
if len(ports) > 0:
|
||||||
|
modules.append("stats-ports")
|
||||||
|
with open("config/ports.yaml.example", "r") as ef:
|
||||||
|
with open("config/ports.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["ports"] = ports
|
||||||
|
ff.write(yaml.dump(data))
|
||||||
|
|
||||||
|
if len(services) > 0:
|
||||||
|
modules.append("stats-processes")
|
||||||
|
with open("config/processes.yaml.example", "r") as ef:
|
||||||
|
with open("config/processes.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["processes"] = services
|
||||||
|
ff.write(yaml.dump(data))
|
||||||
|
|
||||||
|
with open("config/cstatsd.yaml.example", "r") as ef:
|
||||||
|
with open("config/cstatsd.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["pubkey"] = master_pubkey
|
||||||
|
data["autostart"] = modules
|
||||||
|
ff.write(yaml.dump(data))
|
@ -0,0 +1,7 @@
|
|||||||
|
endpoint: tcp://*:6543
|
||||||
|
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
|
||||||
|
|
||||||
|
autostart:
|
||||||
|
- stats-processes
|
||||||
|
- stats-ports
|
||||||
|
- stats-machine
|
@ -1,4 +1,5 @@
|
|||||||
interval: 1
|
interval: 1
|
||||||
|
|
||||||
drives:
|
drives:
|
||||||
- /
|
- /dev/sda1
|
||||||
|
- /dev/sdb1
|
||||||
|
@ -0,0 +1,15 @@
|
|||||||
|
interval: 5
|
||||||
|
|
||||||
|
processes:
|
||||||
|
radiotray:
|
||||||
|
name: '*python*'
|
||||||
|
args:
|
||||||
|
1: /usr/bin/radiotray
|
||||||
|
|
||||||
|
guake:
|
||||||
|
name: '*python*'
|
||||||
|
args:
|
||||||
|
1: /usr/local/bin/guake
|
||||||
|
|
||||||
|
keepassx:
|
||||||
|
name: keepassx
|
@ -1,12 +1,43 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
import zmq, msgpack
|
import zmq, yaml, binascii, nacl, sys, subprocess, os
|
||||||
|
from nacl.public import PublicKey, PrivateKey, Box
|
||||||
|
|
||||||
|
basedir = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
|
||||||
|
with open("cstatsd.pid", "w") as pidfile:
|
||||||
|
pidfile.write(str(os.getpid()))
|
||||||
|
|
||||||
ctx = zmq.Context()
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
with open("config/cstatsd.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
pubkey = PublicKey(binascii.unhexlify(config["pubkey"]))
|
||||||
|
|
||||||
|
with open("privkey.dat", "r") as f:
|
||||||
|
privkey = PrivateKey(binascii.unhexlify(f.read()))
|
||||||
|
|
||||||
|
box = Box(privkey, pubkey)
|
||||||
|
|
||||||
collector = ctx.socket(zmq.PULL)
|
collector = ctx.socket(zmq.PULL)
|
||||||
collector.bind("ipc:///tmp/cstatsd")
|
collector.bind("ipc:///tmp/cstatsd")
|
||||||
|
|
||||||
|
shipper = ctx.socket(zmq.PUB)
|
||||||
|
shipper.bind(config["endpoint"])
|
||||||
|
|
||||||
|
try:
|
||||||
|
disable_autostart = (sys.argv[1] == "--disable-autostart")
|
||||||
|
except:
|
||||||
|
disable_autostart = False
|
||||||
|
|
||||||
|
if disable_autostart == False:
|
||||||
|
with open("/dev/null", "w+") as stfu:
|
||||||
|
for script in config["autostart"]:
|
||||||
|
print os.path.join(basedir, script)
|
||||||
|
subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
message = msgpack.unpackb(collector.recv())
|
message = collector.recv()
|
||||||
print message
|
nonce = nacl.utils.random(Box.NONCE_SIZE)
|
||||||
|
shipper.send(box.encrypt(message, nonce))
|
||||||
|
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import yaml, os, stat, binascii
|
||||||
|
from nacl.public import PrivateKey
|
||||||
|
|
||||||
|
privkey = PrivateKey.generate()
|
||||||
|
pubkey = privkey.public_key
|
||||||
|
|
||||||
|
with open("privkey.dat", "w") as f:
|
||||||
|
f.write(binascii.hexlify(str(privkey)))
|
||||||
|
|
||||||
|
with open("pubkey.dat", "w") as f:
|
||||||
|
f.write(binascii.hexlify(str(pubkey)))
|
||||||
|
|
||||||
|
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
PID=`cat cstatsd.pid`
|
||||||
|
pkill -P $PID
|
||||||
|
kill $PID
|
@ -0,0 +1,224 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import zmq, msgpack, time, psutil, yaml, os, subprocess
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
# Horrible hack to make check_output exist in 2.6
# http://stackoverflow.com/a/13160748/1332715
def _check_output_compat(*popenargs, **kwargs):
    """Minimal backport of subprocess.check_output for Python 2.6.

    Runs the command, captures stdout, and raises CalledProcessError on a
    non-zero exit status, mirroring the 2.7 stdlib behavior.
    """
    if 'stdout' in kwargs:
        raise ValueError('stdout argument not allowed, it will be overridden.')
    process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
    output, unused_err = process.communicate()
    retcode = process.poll()
    if retcode:
        command = kwargs.get("args")
        if command is None:
            command = popenargs[0]
        raise subprocess.CalledProcessError(retcode, command)
    return output

if "check_output" not in dir(subprocess):  # duck punch it in!
    subprocess.check_output = _check_output_compat
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
sock = ctx.socket(zmq.PUSH)
|
||||||
|
sock.connect("ipc:///tmp/cstatsd")
|
||||||
|
|
||||||
|
with open("config/machine.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
interval = config["interval"]
|
||||||
|
old_net_data = {}
|
||||||
|
|
||||||
|
disk_map = {}
|
||||||
|
last_io_data = {}
|
||||||
|
|
||||||
|
if os.path.exists("/proc/user_beancounters") and not os.path.exists("/proc/vz/vestat"):
|
||||||
|
openvz_burst = True
|
||||||
|
FakeRam = namedtuple("FakeRam", ["total", "used", "available", "percent", "buffers", "cached"])
|
||||||
|
else:
|
||||||
|
openvz_burst = False
|
||||||
|
|
||||||
|
for disk in psutil.disk_partitions():
|
||||||
|
disk_map[disk.device] = disk
|
||||||
|
|
||||||
|
if len(disk_map) == 0:
|
||||||
|
# We're probably on OpenVZ, so /proc/partitions doesn't exist. Fall back to 'df'.
|
||||||
|
FakeDisk = namedtuple("FakeDisk", ["device", "mountpoint"])
|
||||||
|
for line in subprocess.check_output(["df"]).splitlines()[1:]:
|
||||||
|
device, _, _, _, _, mountpoint = line.split()
|
||||||
|
disk_map[device] = FakeDisk(device, mountpoint)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
load_avgs = os.getloadavg()
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "load_average",
|
||||||
|
"unit": "",
|
||||||
|
"values": {
|
||||||
|
"1m": load_avgs[0],
|
||||||
|
"5m": load_avgs[1],
|
||||||
|
"15m": load_avgs[2]
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
cpu_loads = psutil.cpu_percent(percpu=True)
|
||||||
|
|
||||||
|
for i in xrange(0, len(cpu_loads)):
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "cpu",
|
||||||
|
"unit": "core%d" % (i + 1),
|
||||||
|
"values": {
|
||||||
|
"load": cpu_loads[i]
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
try:
|
||||||
|
io_counters = psutil.disk_io_counters(perdisk=True)
|
||||||
|
except IOError, e:
|
||||||
|
io_counters = {} # OpenVZ...
|
||||||
|
|
||||||
|
for drive in config["drives"]:
|
||||||
|
drive_data = psutil.disk_usage(disk_map[drive].mountpoint)
|
||||||
|
io_data = None
|
||||||
|
|
||||||
|
for diskname, data in io_counters.iteritems():
|
||||||
|
if drive.endswith(diskname):
|
||||||
|
io_data = data
|
||||||
|
|
||||||
|
if io_data is None or drive not in last_io_data:
|
||||||
|
read_bps = 0
|
||||||
|
write_bps = 0
|
||||||
|
read_iops = 0
|
||||||
|
write_iops = 0
|
||||||
|
else:
|
||||||
|
read_bps = (io_data.read_bytes - last_io_data[drive].read_bytes) / interval
|
||||||
|
write_bps = (io_data.write_bytes - last_io_data[drive].write_bytes) / interval
|
||||||
|
read_iops = (io_data.read_count - last_io_data[drive].read_count) / interval
|
||||||
|
write_iops = (io_data.write_count - last_io_data[drive].write_count) / interval
|
||||||
|
|
||||||
|
if io_data is not None:
|
||||||
|
last_io_data[drive] = io_data
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "disk",
|
||||||
|
"unit": drive,
|
||||||
|
"values": {
|
||||||
|
"total": drive_data.total,
|
||||||
|
"used": drive_data.used,
|
||||||
|
"free": drive_data.free,
|
||||||
|
"used_percentage": drive_data.percent,
|
||||||
|
"bps_read": read_bps,
|
||||||
|
"bps_write": write_bps,
|
||||||
|
"iops_read": read_iops,
|
||||||
|
"iops_write": write_iops,
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
if openvz_burst:
|
||||||
|
# Sigh, OpenVZ... let's use 'free', since that apparently -does- understand OpenVZ.
|
||||||
|
lines = subprocess.check_output(["free", "-b"]).splitlines()
|
||||||
|
_, ram_total, ram_used, ram_free, _, ram_buffers, ram_cached = lines[1].split()
|
||||||
|
_, _, _, ram_available = lines[2].split()
|
||||||
|
ram_total = int(ram_total)
|
||||||
|
ram_free = int(ram_free)
|
||||||
|
ram_buffers = int(ram_buffers)
|
||||||
|
ram_cached = int(ram_cached)
|
||||||
|
ram_available = int(ram_available)
|
||||||
|
ram_used = int(ram_used)
|
||||||
|
ram_percent = 1.0 * (ram_total - ram_available) / ram_total * 100
|
||||||
|
ram_data = FakeRam(ram_total, ram_used, ram_available, ram_percent, ram_buffers, ram_cached)
|
||||||
|
else:
|
||||||
|
ram_data = psutil.virtual_memory()
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "memory",
|
||||||
|
"unit": "physical",
|
||||||
|
"values": {
|
||||||
|
"total": ram_data.total,
|
||||||
|
"used": ram_data.used,
|
||||||
|
"free": ram_data.available,
|
||||||
|
"used_percentage": ram_data.percent,
|
||||||
|
"buffers": ram_data.buffers,
|
||||||
|
"cache": ram_data.cached
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
swap_data = psutil.swap_memory()
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "memory",
|
||||||
|
"unit": "swap",
|
||||||
|
"values": {
|
||||||
|
"total": swap_data.total,
|
||||||
|
"used": swap_data.used,
|
||||||
|
"free": swap_data.free,
|
||||||
|
"used_percentage": swap_data.percent
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
net_data = psutil.net_io_counters(pernic=True)
|
||||||
|
for nic, data in net_data.iteritems():
|
||||||
|
try:
|
||||||
|
old_in_b = old_net_data[nic].bytes_recv
|
||||||
|
old_out_b = old_net_data[nic].bytes_sent
|
||||||
|
old_in_p = old_net_data[nic].packets_recv
|
||||||
|
old_out_p = old_net_data[nic].packets_sent
|
||||||
|
except KeyError, e:
|
||||||
|
# No old data yet, first run? Save and skip to next...
|
||||||
|
old_net_data[nic] = data
|
||||||
|
continue
|
||||||
|
|
||||||
|
diff_in_b = data.bytes_recv - old_in_b
|
||||||
|
diff_out_b = data.bytes_sent - old_out_b
|
||||||
|
diff_in_p = data.packets_recv - old_in_p
|
||||||
|
diff_out_p = data.packets_sent - old_out_p
|
||||||
|
|
||||||
|
if diff_in_b < 0:
|
||||||
|
diff_in_b = 0
|
||||||
|
|
||||||
|
if diff_out_b < 0:
|
||||||
|
diff_out_b = 0
|
||||||
|
|
||||||
|
if diff_in_p < 0:
|
||||||
|
diff_in_p = 0
|
||||||
|
|
||||||
|
if diff_out_p < 0:
|
||||||
|
diff_out_p = 0
|
||||||
|
|
||||||
|
old_net_data[nic] = data
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "network",
|
||||||
|
"unit": nic,
|
||||||
|
"values": {
|
||||||
|
"bps_in": diff_in_b / interval,
|
||||||
|
"bps_out": diff_out_b / interval,
|
||||||
|
"pps_in": diff_in_p / interval,
|
||||||
|
"pps_out": diff_out_p / interval
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "uptime",
|
||||||
|
"unit": "",
|
||||||
|
"values": {
|
||||||
|
"uptime": time.time() - psutil.get_boot_time()
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
time.sleep(interval)
|
||||||
|
|
@ -1,143 +0,0 @@
|
|||||||
#!/usr/bin/env python2
|
|
||||||
|
|
||||||
import zmq, msgpack, time, psutil, yaml, os
|
|
||||||
|
|
||||||
ctx = zmq.Context()
|
|
||||||
|
|
||||||
sock = ctx.socket(zmq.PUSH)
|
|
||||||
sock.connect("ipc:///tmp/cstatsd")
|
|
||||||
|
|
||||||
with open("config/machine.yaml", "r") as cfile:
|
|
||||||
config = yaml.safe_load(cfile)
|
|
||||||
|
|
||||||
interval = config["interval"]
|
|
||||||
old_net_data = {}
|
|
||||||
|
|
||||||
while True:
|
|
||||||
load_avgs = os.getloadavg()
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "load_average",
|
|
||||||
"unit": "",
|
|
||||||
"values": {
|
|
||||||
"1m": load_avgs[0],
|
|
||||||
"5m": load_avgs[1],
|
|
||||||
"15m": load_avgs[2]
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
cpu_loads = psutil.cpu_percent(percpu=True)
|
|
||||||
|
|
||||||
for i in xrange(0, len(cpu_loads)):
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "cpu",
|
|
||||||
"unit": "core%d" % (i + 1),
|
|
||||||
"values": {
|
|
||||||
"load": cpu_loads[i]
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
for drive in config["drives"]:
|
|
||||||
drive_data = psutil.disk_usage(drive)
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "disk",
|
|
||||||
"unit": drive,
|
|
||||||
"values": {
|
|
||||||
"total": drive_data.total,
|
|
||||||
"used": drive_data.used,
|
|
||||||
"free": drive_data.free,
|
|
||||||
"used_percentage": drive_data.percent
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
ram_data = psutil.virtual_memory()
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "memory",
|
|
||||||
"unit": "physical",
|
|
||||||
"values": {
|
|
||||||
"total": ram_data.total,
|
|
||||||
"used": ram_data.used,
|
|
||||||
"free": ram_data.available,
|
|
||||||
"used_percentage": ram_data.percent,
|
|
||||||
"buffers": ram_data.buffers,
|
|
||||||
"cache": ram_data.cached
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
swap_data = psutil.virtual_memory()
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "memory",
|
|
||||||
"unit": "swap",
|
|
||||||
"values": {
|
|
||||||
"total": swap_data.total,
|
|
||||||
"used": swap_data.used,
|
|
||||||
"free": swap_data.free,
|
|
||||||
"used_percentage": swap_data.percent
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
net_data = psutil.net_io_counters(pernic=True)
|
|
||||||
for nic, data in net_data.iteritems():
|
|
||||||
try:
|
|
||||||
old_in_b = old_net_data[nic].bytes_recv
|
|
||||||
old_out_b = old_net_data[nic].bytes_sent
|
|
||||||
old_in_p = old_net_data[nic].packets_recv
|
|
||||||
old_out_p = old_net_data[nic].packets_sent
|
|
||||||
except KeyError, e:
|
|
||||||
# No old data yet, first run? Save and skip to next...
|
|
||||||
old_net_data[nic] = data
|
|
||||||
continue
|
|
||||||
|
|
||||||
diff_in_b = data.bytes_recv - old_in_b
|
|
||||||
diff_out_b = data.bytes_sent - old_out_b
|
|
||||||
diff_in_p = data.packets_recv - old_in_p
|
|
||||||
diff_out_p = data.packets_sent - old_out_p
|
|
||||||
|
|
||||||
if diff_in_b < 0:
|
|
||||||
diff_in_b = 0
|
|
||||||
|
|
||||||
if diff_out_b < 0:
|
|
||||||
diff_out_b = 0
|
|
||||||
|
|
||||||
if diff_in_p < 0:
|
|
||||||
diff_in_p = 0
|
|
||||||
|
|
||||||
if diff_out_p < 0:
|
|
||||||
diff_out_p = 0
|
|
||||||
|
|
||||||
old_net_data[nic] = data
|
|
||||||
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "network",
|
|
||||||
"unit": nic,
|
|
||||||
"values": {
|
|
||||||
"bps_in": diff_in_b / interval,
|
|
||||||
"bps_out": diff_out_b / interval,
|
|
||||||
"pps_in": diff_in_p / interval,
|
|
||||||
"pps_out": diff_out_p / interval
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "uptime",
|
|
||||||
"unit": "",
|
|
||||||
"values": {
|
|
||||||
"uptime": time.time() - psutil.get_boot_time()
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
time.sleep(interval)
|
|
||||||
|
|
@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import zmq, msgpack, time, yaml, psutil, fnmatch
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
sock = ctx.socket(zmq.PUSH)
|
||||||
|
sock.connect("ipc:///tmp/cstatsd")
|
||||||
|
|
||||||
|
with open("config/processes.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
interval = config["interval"]
|
||||||
|
|
||||||
|
old_status = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
all_procs = psutil.get_process_list()
|
||||||
|
|
||||||
|
for service_name, patterns in config["processes"].iteritems():
|
||||||
|
matching = []
|
||||||
|
for proc in all_procs: # Can't use filter() because of exceptions...
|
||||||
|
try:
|
||||||
|
if len(proc.cmdline) > 0 and fnmatch.fnmatch(proc.cmdline[0], patterns["name"]):
|
||||||
|
failed = False
|
||||||
|
try:
|
||||||
|
for arg, pattern in patterns["args"].iteritems():
|
||||||
|
try:
|
||||||
|
if len(proc.cmdline) < (arg + 1) or not fnmatch.fnmatch(proc.cmdline[arg], pattern):
|
||||||
|
failed = True
|
||||||
|
except KeyError, e:
|
||||||
|
pass
|
||||||
|
except KeyError, e:
|
||||||
|
pass
|
||||||
|
if failed == False:
|
||||||
|
matching.append(proc)
|
||||||
|
except psutil._error.NoSuchProcess, e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if len(matching) > 0:
|
||||||
|
up = True
|
||||||
|
else:
|
||||||
|
up = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
if up == old_status[service_name]:
|
||||||
|
send_notice = False
|
||||||
|
else:
|
||||||
|
send_notice = True
|
||||||
|
initial = False
|
||||||
|
except KeyError, e:
|
||||||
|
send_notice = True
|
||||||
|
initial = True
|
||||||
|
|
||||||
|
old_status[service_name] = up
|
||||||
|
|
||||||
|
if send_notice:
|
||||||
|
if up:
|
||||||
|
msg_type = "up"
|
||||||
|
else:
|
||||||
|
msg_type = "down"
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "process",
|
||||||
|
"msg_type": msg_type,
|
||||||
|
"unit": service_name,
|
||||||
|
"initial": initial
|
||||||
|
}))
|
||||||
|
|
||||||
|
|
||||||
|
time.sleep(interval)
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# You need squeeze-backports if you run this on squeeze!
|
||||||
|
apt-get install -y libzmq-dev libffi-dev build-essential python python-dev
|
||||||
|
pip install pyzmq msgpack-python pynacl pyyaml psutil
|
@ -0,0 +1,4 @@
|
|||||||
|
# Backports: echo "deb http://backports.debian.org/debian-backports squeeze-backports main" >> /etc/apt/sources.list && apt-get update && apt-get upgrade
|
||||||
|
apt-get install -y python python-dev && wget cryto.net/~joepie91/pipfix.sh && chmod +x pipfix.sh && ./pipfix.sh
|
||||||
|
adduser --system --shell /bin/bash --group monitor && apt-get install -y git; su -c "cd ~; git clone https://github.com/joepie91/cryto-status.git" monitor && /home/monitor/cryto-status/deps.sh
|
||||||
|
# Replace libzmq with a manually compiled version...: wget http://download.zeromq.org/zeromq-4.0.3.tar.gz; tar -xzvf zeromq-4.0.3.tar.gz; cd zeromq-4.0.3; ./configure; make; make install; ldconfig
|
@ -0,0 +1,54 @@
|
|||||||
|
* allow comments in (parentheses) in units, and ignore these when matching against an alarm pattern...
|
||||||
|
* web interface (angularjs)
|
||||||
|
* separate alarm and IRC logic
|
||||||
|
* monitor inodes
|
||||||
|
* watchdog on slave and master -> should send WARN notifications
|
||||||
|
* notifications (text, arbitrary-serialized-data as attachment, DEBUG/INFO/WARN/ERR/CRIT)
|
||||||
|
* consider redundancy - can already connect multiple masters through pubsub, how to deal with duplicate processing checking?
|
||||||
|
|
||||||
|
cprocessd:
|
||||||
|
-> subscribe to ccollectd
|
||||||
|
-> debug switch for outputting all to terminal
|
||||||
|
-> keep up/down state
|
||||||
|
-> keep last-value state (resource usage)
|
||||||
|
-> keep track of persistent downtimes (down for more than X time, as configured in config file)
|
||||||
|
-> alarms (move this from the IRC bot to cprocessd)
|
||||||
|
-> classify message importance
|
||||||
|
-> cprocessd-stream socket, PUB that just streams processed data
|
||||||
|
-> cprocessd-query socket, REP that responds to queries
|
||||||
|
-> server-status
|
||||||
|
-> down-list
|
||||||
|
-> last-value
|
||||||
|
-> server-list
|
||||||
|
-> service-list
|
||||||
|
|
||||||
|
cmaild:
|
||||||
|
-> use marrow.mailer
|
||||||
|
-> receives data from cprocessd-stream
|
||||||
|
-> sends e-mails for configured importance levels
|
||||||
|
|
||||||
|
cbotd:
|
||||||
|
-> currently named 'alert'
|
||||||
|
-> receives data from cprocessd-stream
|
||||||
|
-> IRC bot
|
||||||
|
-> posts alerts to specified IRC channels, depending on minimum severity level configured for that channel (ie. INFO for #cryto-network but ERR for #crytocc)
|
||||||
|
|
||||||
|
csmsd:
|
||||||
|
-> sends SMS for (critical) alerts
|
||||||
|
-> receives data from cprocessd-stream
|
||||||
|
-> Twilio? does a provider-neutral API exist? might need an extra abstraction...
|
||||||
|
|
||||||
|
cwebd:
|
||||||
|
-> offers web interface with streaming status data
|
||||||
|
-> publicly accessible and password-protected
|
||||||
|
-> streaming data from cprocessd-stream
|
||||||
|
-> on-pageload state from cprocessd-query (including 'current downtimes')
|
||||||
|
-> tornado+zmq ioloop, http://zeromq.github.io/pyzmq/eventloop.html
|
||||||
|
-> web dashboard
|
||||||
|
-> AngularJS
|
||||||
|
-> fancy graphs (via AngularJS? idk if a directive exists for this)
|
||||||
|
-> show downtimes as well as live per-machine stats
|
||||||
|
-> also show overview of all machines in a grid, color-coded for average load of all resources
|
||||||
|
-> historical up/down data
|
||||||
|
-> sqlite storage? single concurrent write, so should work
|
||||||
|
-> perhaps letting people sign up for e-mail alerts is an option? to-inbox will be tricky here
|
Loading…
Reference in New Issue