Update todo

Spec out components
Update todo
25 changed files with 1040 additions and 148 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,10 @@
 cstatsd/config/*.yaml
+ccollectd/config.yaml
+alert/config.yaml
+*.pyc
+ccollectd/pubkey.dat
+ccollectd/privkey.dat
+cstatsd/pubkey.dat
+cstatsd/privkey.dat
+cstatsd/cstatsd.pid
+alert/rules.pickle
--- a/alert/alert
+++ b/alert/alert
@ -0,0 +1,304 @@
+#!/usr/bin/env python2
+
+import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch
+import cPickle as pickle
+
+# Module-level setup: one ZeroMQ context, the YAML configuration, the persisted
+# alarm rules and the subscriber socket are created once and shared with Bot.
+ctx = zmq.Context()
+
+# config.yaml is resolved relative to the current working directory.
+with open("config.yaml", "r") as cfile:
+	config = yaml.safe_load(cfile)
+
+# Rules are persisted with pickle by the !addrule handler; when the file cannot
+# be opened (e.g. on the very first run) we start with an empty rule set.
+# NOTE(review): pickle must only ever be loaded from a trusted, locally written file.
+try:
+	with open("rules.pickle", "r") as pfile:
+		rules = pickle.load(pfile)
+except IOError, e:
+	rules = {}
+
+# Subscribe to every message published on the local endpoint (empty prefix
+# filter); ccollectd binds its PUB socket to this same address.
+fetcher = ctx.socket(zmq.SUB)
+fetcher.setsockopt(zmq.SUBSCRIBE, "")
+fetcher.connect("tcp://127.0.0.1:8998")
+
+class Bot(object):
+	"""IRC bot that relays monitoring events and rule-based alarms to channels.
+
+	The bot multiplexes two inputs in a single loop: the IRC connection and a
+	ZeroMQ SUB socket carrying msgpack-encoded monitoring messages. Admins can
+	add threshold rules over IRC with ``!addrule <target> <operator> <value>``.
+	"""
+
+	# Comparison functions for the operators accepted by !addrule.
+	COMPARATORS = {
+		"=": lambda value, limit: value == limit,
+		">": lambda value, limit: value > limit,
+		"<": lambda value, limit: value < limit,
+		">=": lambda value, limit: value >= limit,
+		"<=": lambda value, limit: value <= limit,
+		"!=": lambda value, limit: value != limit
+	}
+
+	# Size suffixes accepted on rule values, as ascending powers of 1024.
+	UNIT_SUFFIXES = ("k", "m", "g", "t")
+
+	def __init__(self, hosts, port, nickname, realname, channels, admins, subsock):
+		"""Store the connection settings; no network activity happens here.
+
+		hosts is a list of IRC servers to pick from at random, admins the list
+		of nicknames allowed to add rules, subsock the ZeroMQ SUB socket that
+		delivers monitoring messages.
+		"""
+		self.hosts = hosts
+		self.port = port
+		self.nickname = nickname
+		self.realname = realname
+		self.channels = channels
+		self.admins = admins
+		self.subsock = subsock
+		self.connected = False
+		self.last_down = {}  # "host!service.unit" -> time the unit went down
+		self.known_alarms = {}  # alarm key -> activation time, or False when inactive
+
+		# 376 (end of MOTD) and 422 (no MOTD) both mean registration is done.
+		self.command_map = {
+			"422": self.join_all_channels,
+			"376": self.join_all_channels,
+			"PRIVMSG": self.receive_message
+		}
+
+	def split_irc(self, message):
+		"""Split one raw IRC line into [prefix-or-command, params..., trailing].
+
+		A leading ":" is kept on the first element so callers can tell a prefix
+		from a command. Returns an empty list for an empty line.
+		"""
+		if not message:
+			return []
+
+		if message[0] == ":":
+			prefix = ":"
+			message = message[1:]
+		else:
+			prefix = ""
+
+		# The trailing parameter starts at the first " :"; a bare ":" may
+		# legitimately occur earlier, e.g. inside an IPv6 hostmask.
+		if " :" in message:
+			rest, last = message.split(" :", 1)
+			parts = rest.split() + [last]
+		else:
+			parts = message.split()
+
+		if not parts:
+			return []
+
+		parts[0] = prefix + parts[0]
+		return parts
+
+	def run(self):
+		"""Connect to a random host and serve until disconnected, forever."""
+		while True:  # Connect loop
+			host = random.choice(self.hosts)
+
+			self.sock = socket.socket()
+			try:
+				self.sock.connect((host, self.port))
+				self.send_raw("NICK %s" % self.nickname)
+				self.send_raw("USER %s 0 0 :%s" % (self.nickname, self.realname))
+				self._read_until_disconnected()
+			except socket.error:
+				pass  # Connecting, sending or receiving failed; reconnect below.
+
+			# Drop the dead connection before trying the next host, and pause
+			# briefly so unreachable servers are not hammered in a tight loop.
+			self.connected = False
+			self.sock.close()
+			time.sleep(1)
+
+	def _read_until_disconnected(self):
+		"""Serve both sockets until the IRC server closes the connection.
+
+		socket.error from the IRC socket propagates to run(), which reconnects.
+		"""
+		buff = ""
+		while True:  # Read loop
+			r, w, x = zmq.select([self.sock, self.subsock], [], [])
+
+			for s in r:
+				if s == self.sock.fileno():
+					recvdata = self.sock.recv(1024)
+
+					if len(recvdata) == 0:
+						return  # We have disconnected...
+
+					# Keep the incomplete last line in the buffer for the next read.
+					buff += recvdata
+					messages = buff.split("\n")
+					buff = messages.pop()
+
+					for message in messages:
+						self.process_message(self.split_irc(message.strip("\r")))
+				elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
+					# Process incoming data from the subscribe socket...
+					message = msgpack.unpackb(s.recv())
+					self.process_stats(message)
+
+	def send_raw(self, message):
+		"""Send one IRC line, appending the CRLF terminator."""
+		self.sock.send("%s\r\n" % message)
+
+	def send_message(self, recipient, message):
+		"""Send a PRIVMSG; silently dropped until registration has completed."""
+		if self.connected == True:
+			self.send_raw("PRIVMSG %s :%s" % (recipient, message))
+
+	def send_all(self, message):
+		"""Send a message to every configured channel."""
+		for channel in self.channels:
+			self.send_message(channel, message)
+
+	def join(self, channel):
+		"""Join a single channel."""
+		self.send_raw("JOIN %s" % channel)
+
+	def join_all_channels(self, message):
+		"""Mark the bot as registered and join all configured channels."""
+		self.connected = True
+		for channel in self.channels:
+			self.join(channel)
+
+	def receive_message(self, message):
+		"""Handle a PRIVMSG; only ``!addrule`` from an admin has any effect.
+
+		Errors while adding a rule are reported back to the originating channel.
+		"""
+		if len(message) < 4:
+			return  # PRIVMSG without text; nothing to parse.
+
+		args = message[3].split()
+		sender = message[0][1:].split("!", 1)[0]
+		channel = message[2]
+
+		if sender not in self.admins or not args or args[0] != "!addrule":
+			return
+
+		try:
+			target, rel, value = args[1:4]
+			target = self.parse_target(target)
+
+			# Reject bad input now: an invalid rule stored here would otherwise
+			# only fail later, inside process_stats, and take the bot down.
+			if rel not in self.COMPARATORS:
+				raise ValueError("Unknown operator '%s'." % rel)
+
+			multiplier = 1
+			unit = value[-1].lower()
+			if unit in self.UNIT_SUFFIXES:
+				multiplier = 1024 ** (self.UNIT_SUFFIXES.index(unit) + 1)
+				value = value[:-1]
+			value = float(value) * multiplier
+
+			rule_id = uuid.uuid4()
+			rules[rule_id] = {
+				"target": target,
+				"operator": rel,
+				"value": value
+			}
+
+			with open("rules.pickle", "w") as pfile:
+				pickle.dump(rules, pfile)
+
+			self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id))
+		except Exception as e:
+			self.send_message(channel, str(e))
+
+	def parse_target(self, target):
+		"""Parse ``host!service.resource:unit.attribute`` into a dict.
+
+		A segment that is exactly "*" is stored as True (match anything); other
+		segments are kept as fnmatch patterns. Raises ValueError when a
+		separator is missing.
+		"""
+		host, rest = target.split("!", 1)
+		service, rest = rest.split(".", 1)
+		resource, rest = rest.split(":", 1)
+		unit, attribute = rest.split(".", 1)
+		# TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments
+		segments = {
+			"host": host,
+			"service": service,
+			"resource": resource,
+			"unit": unit,
+			"attribute": attribute
+		}
+		for name, pattern in segments.items():
+			if pattern == "*":
+				segments[name] = True
+		return segments
+
+	def format_time_duration(self, seconds):
+		"""Format a duration in seconds as e.g. "1 hours, 2 minutes, 5 seconds".
+
+		Zero-valued magnitudes are omitted; anything below one second is
+		reported as one second.
+		"""
+		# http://stackoverflow.com/a/20222351/1332715
+		days, rem = divmod(seconds, 86400)
+		hours, rem = divmod(rem, 3600)
+		minutes, seconds = divmod(rem, 60)
+		if seconds < 1:
+			seconds = 1
+		magnitudes = (("days", days), ("hours", hours), ("minutes", minutes), ("seconds", seconds))
+		return ", ".join("%d %s" % (int(amount), name) for name, amount in magnitudes if amount)
+
+	def process_stats(self, message):
+		"""Handle one monitoring message from the subscribe socket.
+
+		up/down/blip messages are announced directly; value messages are
+		checked against every stored rule.
+		"""
+		data = message["message"]
+		data["host"] = message["host"]
+		msg_type = data["msg_type"]
+
+		if msg_type == "up":
+			if data.get("initial") == True:
+				return # We don't need to say what is up, initially...
+
+			try:
+				data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data])
+			except KeyError:
+				data["duration"] = "0 seconds"  # We never saw it go down.
+			self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data)
+		elif msg_type == "down":
+			self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
+			self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
+		elif msg_type == "blip":
+			self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
+		elif msg_type == "value":
+			self._check_rules(data)
+
+	def _matches(self, value, pattern):
+		"""Return True when pattern is the wildcard (True) or fnmatch-es value."""
+		return pattern is True or fnmatch.fnmatch(value, pattern)
+
+	def _check_rules(self, data):
+		"""Evaluate every stored rule against one "value" message."""
+		observed = {
+			"host": data["host"],
+			"service": data["service"],
+			"resource": data["resource_type"],
+			"unit": data["unit"]
+		}
+
+		for rule_id, rule in rules.iteritems():
+			target = rule["target"]
+
+			if not all(self._matches(observed[segment], target[segment]) for segment in observed):
+				continue # Skip to next
+
+			compare = self.COMPARATORS.get(rule["operator"])
+			rule_value = float(rule["value"])
+
+			for key, value in data["values"].items():
+				# The attribute may be the True wildcard, which fnmatch cannot handle.
+				if not self._matches(key, target["attribute"]):
+					continue
+
+				# An unknown operator never raises an alarm.
+				alarm = compare(value, rule_value) if compare is not None else False
+				self.trigger_alarm(rule_id, data, alarm, value, key)
+
+	def _alarm_key(self, rule_id, data, offending_key):
+		"""Build the known_alarms key for one rule/host/unit/attribute."""
+		return "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
+
+	def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
+		"""Record the alarm state and notify only when it changes."""
+		key = self._alarm_key(rule_id, data, offending_key)
+		previous = self.known_alarms.get(key, False)
+
+		if active and previous is False:
+			# Alarm activated
+			self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
+			self.known_alarms[key] = time.time()
+		elif not active and previous is not False:
+			# Alarm deactivated; transmit first, it still needs the start time.
+			self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
+			self.known_alarms[key] = False
+		elif key not in self.known_alarms:
+			self.known_alarms[key] = False
+
+	def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
+		"""Announce an alarm (active) or its resolution (not active) to all channels."""
+		# At this point, we're sure that we want to notify...
+		rule_target = rules[rule_id]["target"].copy()
+		for k, v in rule_target.items():
+			if v is True:
+				rule_target[k] = "*"  # Show wildcards the way they were entered.
+
+		rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target
+
+		info = {
+			"host": data["host"],
+			"rule_id": rule_id,
+			"rule_pattern": rule_pattern,
+			"unit": data["unit"],
+			"attribute": offending_key
+		}
+
+		if not active:
+			key = self._alarm_key(rule_id, data, offending_key)
+			try:
+				info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key])
+			except KeyError:
+				info["duration"] = "0 seconds"
+
+			self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info)
+		else:
+			info["value"] = offending_value
+			info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"])
+
+			self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info)
+
+	def process_message(self, message):
+		"""Dispatch one parsed IRC line: answer PING, route known commands."""
+		if not message:
+			return  # Empty line.
+
+		if message[0].upper() == "PING":
+			if len(message) > 1:
+				self.send_raw("PONG %s" % message[1])
+		elif len(message) > 1:
+			handler = self.command_map.get(message[1].upper())
+			if handler is not None:
+				handler(message)
+		
+
+# Build the bot from the "irc" section of the configuration and run it forever.
+irc_config = config["irc"]
+bot = Bot(
+	irc_config["hosts"],
+	irc_config["port"],
+	irc_config["nickname"],
+	irc_config["realname"],
+	irc_config["channels"],
+	irc_config["admins"],
+	fetcher
+)
+bot.run()
--- a/alert/config.yaml.example
+++ b/alert/config.yaml.example
@ -0,0 +1,12 @@
+irc:
+        hosts:
+                - kerpia.cryto.net
+                - box.cryto.net
+                - arvel.cryto.net
+        port: 6667
+        nickname: StatusBot
+        realname: Cryto System Monitoring Service
+        admins:
+                - joepie91
+        channels:
+                - "#test"
--- a/ccollectd/ccollectd
+++ b/ccollectd/ccollectd
@ -0,0 +1,104 @@
+#!/usr/bin/env python2
+
+import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
+from nacl.public import PublicKey, PrivateKey, Box
+
+# Module-level setup for the collector daemon.
+ctx = zmq.Context()
+
+# Every decrypted node message and every heartbeat event is republished on
+# this local PUB socket.
+distributor = ctx.socket(zmq.PUB)
+distributor.bind("tcp://127.0.0.1:8998")
+
+poller = zmq.Poller()
+
+# config.yaml and privkey.dat are resolved relative to the working directory.
+with open("config.yaml", "r") as cfile:
+	config = yaml.safe_load(cfile)
+
+# The private key is stored hex-encoded (see genkey).
+with open("privkey.dat", "r") as f:
+	privkey = PrivateKey(binascii.unhexlify(f.read()))
+
+nodes = config["nodes"]
+last_node_status = {}  # hostname -> result of the previous heartbeat (True = up)
+socket_map = {}  # SUB socket -> hostname it is connected to
+boxes = {}  # hostname -> Box used to decrypt that node's messages
+
+def heartbeat():
+	"""Probe every configured node over TCP and publish status changes.
+
+	Each node gets up to config["heartbeat"]["attempts"] connection attempts,
+	with the timeout shrinking on every retry. An "up" or "down" message is
+	published when the status differs from the previous run (or on the first
+	run, flagged as initial); a "blip" is published when a node is still up
+	but needed at least one retry.
+	"""
+	attempts = config["heartbeat"]["attempts"]
+	timeout = float(config["heartbeat"]["timeout"])
+
+	for hostname, node in nodes.iteritems():
+		retries = 0
+		up = False  # Also the result when no attempt is configured at all.
+		while retries < attempts:
+			s = None
+			try:
+				s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+				s.settimeout(timeout / (retries + 1))
+				s.connect((node["ip"], node["port"]))
+				s.shutdown(socket.SHUT_RDWR)
+				up = True
+				break
+			except socket.error:
+				up = False
+				retries += 1
+			finally:
+				# Always release the descriptor, also after a failed attempt.
+				if s is not None:
+					s.close()
+
+		initial = hostname not in last_node_status
+		status_changed = initial or (up != last_node_status[hostname])
+		last_node_status[hostname] = up
+
+		if status_changed:
+			msg_type = "up" if up else "down"
+		elif up and retries > 0:
+			msg_type = "blip"
+		else:
+			continue  # Nothing worth reporting for this node.
+
+		distributor.send(msgpack.packb({
+			"host": config["hostname"],
+			"message": {
+				"service": "heartbeat",
+				"msg_type": msg_type,
+				"unit": hostname,
+				"initial": initial
+			}
+		}))
+
+# Run heartbeat() every config["heartbeat"]["interval"] seconds from the main loop.
+timers = zmqtimer.ZmqTimerManager()
+timers.add_timer(zmqtimer.ZmqTimer(config["heartbeat"]["interval"], heartbeat))
+
+# One SUB socket and one decryption Box per node. The Box pairs our private
+# key with the node's hex-encoded public key from the configuration.
+for hostname, node in config["nodes"].iteritems():
+	boxes[hostname] = Box(privkey, PublicKey(binascii.unhexlify(node["pubkey"])))
+	grabber = ctx.socket(zmq.SUB)
+	grabber.setsockopt(zmq.SUBSCRIBE, "")
+	grabber.connect(node["endpoint"])
+	socket_map[grabber] = hostname
+	poller.register(grabber, zmq.POLLIN)
+
+# Main loop: run due timers, then wait for node messages until the next timer
+# is due, decrypt them and republish them tagged with the originating host.
+while True:
+	timers.check()
+	# The poll timeout is whatever the timer manager reports as the next interval.
+	socks = dict(poller.poll(timers.get_next_interval()))
+	
+	for sock in socks:
+		if socks[sock] == zmq.POLLIN:
+			host = socket_map[sock]
+			try:
+				# Decryption uses the Box of the host this socket is connected to.
+				message = msgpack.unpackb(boxes[host].decrypt(sock.recv()))
+			except nacl.exceptions.CryptoError, e:
+				# Probably a spoofed message... skip to next socket
+				sys.stderr.write("Ignoring message... spoofed? (origin: %s)\n" % host) # FIXME: Use logging module...
+				continue
+			except Exception, e:
+				# Any other failure (e.g. while unpacking) is logged and the message dropped.
+				sys.stderr.write(repr(e) + "\n")
+				continue
+			distributor.send(msgpack.packb({
+				"host": host,
+				"message": message
+			}))
+		
--- a/ccollectd/config.yaml.example
+++ b/ccollectd/config.yaml.example
@ -0,0 +1,13 @@
+hostname: monitoring.cryto.net
+
+heartbeat:
+        interval: 5
+        timeout: 1
+        attempts: 3
+
+nodes:
+        localhost:
+                ip: 127.0.0.1
+                port: 6543
+                endpoint: tcp://127.0.0.1:6543
+                pubkey: bd784ef4065c9bd31627106dc55e26764605a144c6fc45ce93f33cbd19dd7333
--- a/ccollectd/genkey
+++ b/ccollectd/genkey
@ -0,0 +1,15 @@
+#!/usr/bin/env python2
+
+import yaml, os, stat, binascii
+from nacl.public import PrivateKey
+
+# Generate a fresh keypair and store both halves hex-encoded in the current
+# working directory.
+privkey = PrivateKey.generate()
+pubkey = privkey.public_key
+
+OWNER_ONLY = stat.S_IRUSR | stat.S_IWUSR
+
+# Create the private key file owner-only from the start, so the key is never
+# readable by other users between writing it and restricting its mode.
+fd = os.open("privkey.dat", os.O_WRONLY | os.O_CREAT | os.O_TRUNC, OWNER_ONLY)
+with os.fdopen(fd, "w") as f:
+	f.write(binascii.hexlify(str(privkey)))
+
+with open("pubkey.dat", "w") as f:
+	f.write(binascii.hexlify(str(pubkey)))
+
+# The mode passed to os.open only applies to newly created files; this also
+# tightens a privkey.dat that already existed with looser permissions.
+os.chmod("privkey.dat", OWNER_ONLY)
--- a/ccollectd/listen
+++ b/ccollectd/listen
@ -0,0 +1,12 @@
+#!/usr/bin/env python2
+import zmq, msgpack
+
+# Debug helper: print every message published on the local distributor endpoint.
+ctx = zmq.Context()
+
+fetcher = ctx.socket(zmq.SUB)
+fetcher.setsockopt(zmq.SUBSCRIBE, "")  # Empty prefix: receive everything.
+fetcher.connect("tcp://127.0.0.1:8998")
+
+while True:
+	raw = fetcher.recv()
+	message = msgpack.unpackb(raw)
+	print message
--- a/ccollectd/zmqtimer.py
+++ b/ccollectd/zmqtimer.py
@ -0,0 +1,41 @@
+import time
+
+class ZmqTimerManager(object):
+	"""Runs a set of timers from a poll loop.
+
+	Call check() on every loop iteration and use get_next_interval() as the
+	poll timeout so the loop wakes up when the next timer is due.
+	"""
+
+	def __init__(self):
+		self.timers = []
+		self.next_call = 0  # Cached earliest due time (epoch seconds).
+
+	def add_timer(self, timer):
+		"""Register a timer (anything with check() and get_next_call())."""
+		self.timers.append(timer)
+		# Drop the cached deadline so the new timer is taken into account
+		# right away instead of after the previously cached deadline.
+		self.next_call = 0
+
+	def check(self):
+		"""Give every timer a chance to fire once the cached deadline passed."""
+		if time.time() > self.next_call:
+			for timer in self.timers:
+				timer.check()
+
+	def get_next_interval(self):
+		"""Return the milliseconds until the next timer is due.
+
+		Returns 0 when a timer is already due and None when no timers are
+		registered (a poll timeout of None means "wait indefinitely").
+		"""
+		if not self.timers:
+			return None
+
+		now = time.time()
+		if now >= self.next_call:
+			self.next_call = min(timer.get_next_call() for timer in self.timers)
+
+		# Never return a negative value: a negative poll timeout would block
+		# forever instead of returning immediately.
+		return max(0, (self.next_call - now) * 1000)
+	
+class ZmqTimer(object):
+	"""Calls callback at most once per interval seconds, driven by check()."""
+
+	def __init__(self, interval, callback):
+		self.callback = callback
+		self.interval = interval
+		self.last_call = 0  # Epoch 0, so the first check() fires immediately.
+
+	def get_next_call(self):
+		"""Return the time (epoch seconds) at which the timer is due next."""
+		return self.last_call + self.interval
+
+	def check(self):
+		"""Fire the callback when the interval has elapsed since the last call."""
+		if time.time() <= self.get_next_call():
+			return
+		self.callback()
+		# Measured after the callback, so its run time delays the next call.
+		self.last_call = time.time()
--- a/cstatsd/bootstrap
+++ b/cstatsd/bootstrap
@ -0,0 +1,6 @@
+#!/bin/bash
+# Generate the node keypair, interactively write the config files, then print
+# what the operator of the master server needs: our public key and our IP.
+echo "Generating keypair..."
+./genkey 2>/dev/null
+./bootstrap-config
+echo "Your public key: `cat pubkey.dat`"
+# The redirect belongs to curl (inside the backticks), not to echo.
+echo "Server IP: `curl -s http://wtfismyip.com/text 2>/dev/null`"
--- a/cstatsd/bootstrap-config
+++ b/cstatsd/bootstrap-config
@ -0,0 +1,86 @@
+#!/usr/bin/env python2
+
+import yaml, sys
+
+master_pubkey = raw_input("Public key of the master server: ")
+
+print "You'll now be asked to configure ports to check. If you don't want to configure any ports, just hit enter without entering any information."
+
+# Maps port number -> service name, as written to config/ports.yaml.
+ports = {}
+
+while True:
+	port = raw_input("Port number: ")
+	if port.strip() == "":
+		break
+	# Ask again on invalid input instead of aborting the whole bootstrap.
+	try:
+		port_number = int(port)
+	except ValueError:
+		print "'%s' is not a valid port number, please try again." % port.strip()
+		continue
+	service_name = raw_input("Service name for port %s: " % port)
+	ports[port_number] = service_name
+
+print "The same thing, except now for processes to check. Just hit enter without entering any information when you're done; the same goes for the argument list. As a wildcard, you can use *"
+
+# Maps service name -> {"name": process name pattern, "args": {position: pattern}}.
+services = {}
+
+service_name = raw_input("Service name: ")
+while service_name.strip() != "":
+	process_name = raw_input("Process name: ")
+
+	# Argument patterns are keyed by their 1-based position on the command line.
+	arg_patterns = {}
+	position = 1
+	pattern = raw_input("Argument %d: " % position)
+	while pattern.strip() != "":
+		arg_patterns[position] = pattern
+		position += 1
+		pattern = raw_input("Argument %d: " % position)
+
+	services[service_name] = {
+		"name": process_name,
+		"args": arg_patterns
+	}
+	service_name = raw_input("Service name: ")
+		
+print "Now enter any disk devices you wish to monitor. Leave empty and hit enter when done."
+
+# Device names are stored exactly as entered; a blank entry ends the list.
+disks = []
+
+device_name = raw_input("Device name: ")
+while device_name.strip() != "":
+	disks.append(device_name)
+	device_name = raw_input("Device name: ")
+	
+# Write config files...
+
+def write_config(example_path, target_path, overrides):
+	"""Copy an example YAML config to its real location with some keys replaced.
+
+	overrides is a list of (key, value) pairs set on the loaded document.
+	"""
+	with open(example_path, "r") as ef:
+		with open(target_path, "w") as ff:
+			data = yaml.safe_load(ef.read())
+			for key, value in overrides:
+				data[key] = value
+			ff.write(yaml.dump(data))
+
+# The machine module is always enabled; ports and processes only when
+# something was configured for them.
+modules = ["stats-machine"]
+write_config("config/machine.yaml.example", "config/machine.yaml", [("drives", disks)])
+
+if ports:
+	modules.append("stats-ports")
+	write_config("config/ports.yaml.example", "config/ports.yaml", [("ports", ports)])
+
+if services:
+	modules.append("stats-processes")
+	write_config("config/processes.yaml.example", "config/processes.yaml", [("processes", services)])
+
+write_config("config/cstatsd.yaml.example", "config/cstatsd.yaml", [("pubkey", master_pubkey), ("autostart", modules)])
--- a/cstatsd/config/cstatsd.yaml.example
+++ b/cstatsd/config/cstatsd.yaml.example
@ -0,0 +1,7 @@
+endpoint: tcp://*:6543
+pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
+
+autostart:
+        - stats-processes
+        - stats-ports
+        - stats-machine
--- a/cstatsd/config/machine.yaml.example
+++ b/cstatsd/config/machine.yaml.example
@ -1,4 +1,5 @@
 interval: 1

 drives:
-        - /
+        - /dev/sda1
+        - /dev/sdb1
--- a/cstatsd/config/ports.yaml.example
+++ b/cstatsd/config/ports.yaml.example
@ -1,4 +1,4 @@
-interval: 5
+interval: 1

 ports:
        6667: UnrealIRCd
--- a/cstatsd/config/processes.yaml.example
+++ b/cstatsd/config/processes.yaml.example
@ -0,0 +1,15 @@
+interval: 5
+
+processes:
+        radiotray:
+                name: '*python*'
+                args:
+                        1: /usr/bin/radiotray
+                        
+        guake:
+                name: '*python*'
+                args:
+                        1: /usr/local/bin/guake
+        
+        keepassx:
+                name: keepassx
--- a/cstatsd/cstatsd
+++ b/cstatsd/cstatsd
@ -1,12 +1,43 @@
 #!/usr/bin/env python2

-import zmq, msgpack
+import zmq, yaml, binascii, nacl, sys, subprocess, os
+from nacl.public import PublicKey, PrivateKey, Box
+
+# Directory containing this script; used below to locate the autostart modules.
+basedir = os.path.dirname(os.path.realpath(__file__))
+
+# Record our PID so kill-stats can stop the daemon and its children.
+# NOTE(review): the pidfile (like the config and key files below) is resolved
+# relative to the working directory, not basedir - confirm that is intended.
+with open("cstatsd.pid", "w") as pidfile:
+	pidfile.write(str(os.getpid()))

 ctx = zmq.Context()

+with open("config/cstatsd.yaml", "r") as cfile:
+	config = yaml.safe_load(cfile)
+
+# Hex-encoded public key from the config; bootstrap-config stores the master
+# server's key here.
+pubkey = PublicKey(binascii.unhexlify(config["pubkey"]))
+
+# Our own hex-encoded private key, as written by genkey.
+with open("privkey.dat", "r") as f:
+	privkey = PrivateKey(binascii.unhexlify(f.read()))
+
+# Box combining our private key with the configured public key; used to
+# encrypt every outgoing message.
+box = Box(privkey, pubkey)
+
 collector = ctx.socket(zmq.PULL)
 collector.bind("ipc:///tmp/cstatsd")
 
+# Encrypted messages are published on the configured endpoint.
+shipper = ctx.socket(zmq.PUB)
+shipper.bind(config["endpoint"])
+
+# Pass --disable-autostart as the first argument to run without spawning the
+# stats modules. Checked explicitly rather than through a bare except, which
+# would also swallow unrelated errors such as KeyboardInterrupt.
+disable_autostart = (len(sys.argv) > 1 and sys.argv[1] == "--disable-autostart")
+
+if not disable_autostart:
+	# The modules are started as child processes with their output discarded.
+	with open("/dev/null", "w+") as stfu:
+		for script in config["autostart"]:
+			script_path = os.path.join(basedir, script)
+			print script_path
+			subprocess.Popen([script_path], stdout=stfu, stderr=stfu)
+
 while True:
-	message = msgpack.unpackb(collector.recv())
-	print message
+	# Forward each collected message as-is (it is not unpacked here),
+	# encrypted with a fresh random nonce per message.
+	message = collector.recv()
+	nonce = nacl.utils.random(Box.NONCE_SIZE)
+	shipper.send(box.encrypt(message, nonce))
--- a/cstatsd/genkey
+++ b/cstatsd/genkey
@ -0,0 +1,15 @@
+#!/usr/bin/env python2
+
+import yaml, os, stat, binascii
+from nacl.public import PrivateKey
+
+# Generate a fresh keypair and store both halves hex-encoded in the current
+# working directory.
+privkey = PrivateKey.generate()
+pubkey = privkey.public_key
+
+OWNER_ONLY = stat.S_IRUSR | stat.S_IWUSR
+
+# Create the private key file owner-only from the start, so the key is never
+# readable by other users between writing it and restricting its mode.
+fd = os.open("privkey.dat", os.O_WRONLY | os.O_CREAT | os.O_TRUNC, OWNER_ONLY)
+with os.fdopen(fd, "w") as f:
+	f.write(binascii.hexlify(str(privkey)))
+
+with open("pubkey.dat", "w") as f:
+	f.write(binascii.hexlify(str(pubkey)))
+
+# The mode passed to os.open only applies to newly created files; this also
+# tightens a privkey.dat that already existed with looser permissions.
+os.chmod("privkey.dat", OWNER_ONLY)
--- a/cstatsd/kill-stats
+++ b/cstatsd/kill-stats
@ -0,0 +1,5 @@
+#!/bin/bash
+
+# Stop cstatsd and the stats modules it spawned, using the PID it recorded.
+PID=`cat cstatsd.pid 2>/dev/null`
+if [ -z "$PID" ]; then
+	# Without a PID, pkill/kill would be called with a missing argument.
+	echo "cstatsd.pid is missing or empty; is cstatsd running?" >&2
+	exit 1
+fi
+# Children first, while the parent PID can still be used to find them.
+pkill -P "$PID"
+kill "$PID"
--- a/cstatsd/stats-machine
+++ b/cstatsd/stats-machine
@ -0,0 +1,224 @@
+#!/usr/bin/env python2
+
+import zmq, msgpack, time, psutil, yaml, os, subprocess
+from collections import namedtuple
+
+# Horrible hack to make check_output exist in 2.6
+# http://stackoverflow.com/a/13160748/1332715
+if not hasattr(subprocess, "check_output"): # duck punch it in!
+	def f(*popenargs, **kwargs):
+		# Same contract as subprocess.check_output: run the command, return
+		# its stdout, raise CalledProcessError on a non-zero exit status.
+		if 'stdout' in kwargs:
+			raise ValueError('stdout argument not allowed, it will be overridden.')
+		process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
+		output = process.communicate()[0]
+		retcode = process.poll()
+		if not retcode:
+			return output
+		# Report the command as given, whether passed by keyword or position.
+		cmd = kwargs.get("args")
+		if cmd is None:
+			cmd = popenargs[0]
+		raise subprocess.CalledProcessError(retcode, cmd)
+	subprocess.check_output = f
+
+# Module-level setup: measurements are pushed to the local cstatsd collector.
+ctx = zmq.Context()
+
+sock = ctx.socket(zmq.PUSH)
+sock.connect("ipc:///tmp/cstatsd")
+
+with open("config/machine.yaml", "r") as cfile:
+	config = yaml.safe_load(cfile)
+	
+# Seconds between measurement rounds; also the divisor for all rate values.
+interval = config["interval"]
+old_net_data = {}  # NIC name -> network counters from the previous round
+
+disk_map = {}  # device name -> partition entry (needs .device and .mountpoint)
+last_io_data = {}  # configured drive -> I/O counters from the previous round
+
+# Decide whether memory figures must come from 'free' instead of psutil.
+# NOTE(review): presumably this file combination identifies OpenVZ containers
+# with burstable RAM - confirm.
+if os.path.exists("/proc/user_beancounters") and not os.path.exists("/proc/vz/vestat"):
+	openvz_burst = True
+	# Stand-in exposing the same fields the main loop reads from psutil's result.
+	FakeRam = namedtuple("FakeRam", ["total", "used", "available", "percent", "buffers", "cached"])
+else:
+	openvz_burst = False
+
+for disk in psutil.disk_partitions():
+	disk_map[disk.device] = disk
+	
+if len(disk_map) == 0:
+	# We're probably on OpenVZ, so /proc/partitions doesn't exist. Fall back to 'df'.
+	FakeDisk = namedtuple("FakeDisk", ["device", "mountpoint"])
+	# Skip the header line; every remaining line must split into exactly six fields.
+	# NOTE(review): a mountpoint containing whitespace would break this unpacking.
+	for line in subprocess.check_output(["df"]).splitlines()[1:]:
+		device, _, _, _, _, mountpoint = line.split()
+		disk_map[device] = FakeDisk(device, mountpoint)
+
+# Main loop: every `interval` seconds, push one msgpack message per measured
+# resource (load average, CPU cores, disks, memory, swap, NICs, uptime).
+# Every message carries service/msg_type/resource_type/unit and a "values" dict.
+while True:
+	load_avgs = os.getloadavg()
+	sock.send(msgpack.packb({
+		"service": "machine",
+		"msg_type": "value",
+		"resource_type": "load_average",
+		"unit": "",
+		"values": {
+			"1m": load_avgs[0],
+			"5m": load_avgs[1],
+			"15m": load_avgs[2]
+		}
+	}))
+	
+	cpu_loads = psutil.cpu_percent(percpu=True)
+	
+	# One message per core; units are numbered from core1.
+	for i in xrange(0, len(cpu_loads)):
+		sock.send(msgpack.packb({
+			"service": "machine",
+			"msg_type": "value",
+			"resource_type": "cpu",
+			"unit": "core%d" % (i + 1),
+			"values": {
+				"load": cpu_loads[i]
+			}
+		}))
+	
+	try:
+		io_counters = psutil.disk_io_counters(perdisk=True)
+	except IOError, e:
+		io_counters = {} # OpenVZ...
+	
+	for drive in config["drives"]:
+		# NOTE(review): a configured drive that is missing from disk_map raises
+		# KeyError here and stops the module.
+		drive_data = psutil.disk_usage(disk_map[drive].mountpoint)
+		io_data = None
+		
+		# Match the configured device path to a counter name by suffix; when
+		# several names match, the last one iterated wins.
+		for diskname, data in io_counters.iteritems():
+			if drive.endswith(diskname):
+				io_data = data
+				
+		# No rates without both a current and a previous sample.
+		if io_data is None or drive not in last_io_data:
+			read_bps = 0
+			write_bps = 0
+			read_iops = 0
+			write_iops = 0
+		else:
+			# Rates assume exactly `interval` seconds passed since the last sample.
+			read_bps = (io_data.read_bytes - last_io_data[drive].read_bytes) / interval
+			write_bps = (io_data.write_bytes - last_io_data[drive].write_bytes) / interval
+			read_iops = (io_data.read_count - last_io_data[drive].read_count) / interval
+			write_iops = (io_data.write_count - last_io_data[drive].write_count) / interval
+			
+		if io_data is not None:
+			last_io_data[drive] = io_data
+			
+		sock.send(msgpack.packb({
+			"service": "machine",
+			"msg_type": "value",
+			"resource_type": "disk",
+			"unit": drive,
+			"values": {
+				"total": drive_data.total,
+				"used": drive_data.used,
+				"free": drive_data.free,
+				"used_percentage": drive_data.percent,
+				"bps_read": read_bps,
+				"bps_write": write_bps,
+				"iops_read": read_iops,
+				"iops_write": write_iops,
+			}
+		}))
+		
+	if openvz_burst:
+		# Sigh, OpenVZ... let's use 'free', since that apparently -does- understand OpenVZ.
+		# Expects seven fields on the second output line and four on the third.
+		# NOTE(review): this depends on the exact column layout of the installed 'free'.
+		lines = subprocess.check_output(["free", "-b"]).splitlines()
+		_, ram_total, ram_used, ram_free, _, ram_buffers, ram_cached = lines[1].split()
+		_, _, _, ram_available = lines[2].split()
+		ram_total = int(ram_total)
+		ram_free = int(ram_free)
+		ram_buffers = int(ram_buffers)
+		ram_cached = int(ram_cached)
+		ram_available = int(ram_available)
+		ram_used = int(ram_used)
+		ram_percent = 1.0 * (ram_total - ram_available) / ram_total * 100
+		ram_data = FakeRam(ram_total, ram_used, ram_available, ram_percent, ram_buffers, ram_cached)
+	else:
+		ram_data = psutil.virtual_memory()
+		
+	# Note that "free" is filled from the available figure.
+	sock.send(msgpack.packb({
+		"service": "machine",
+		"msg_type": "value",
+		"resource_type": "memory",
+		"unit": "physical",
+		"values": {
+			"total": ram_data.total,
+			"used": ram_data.used,
+			"free": ram_data.available,
+			"used_percentage": ram_data.percent,
+			"buffers": ram_data.buffers,
+			"cache": ram_data.cached
+		}
+	}))
+		
+	swap_data = psutil.swap_memory()
+	sock.send(msgpack.packb({
+		"service": "machine",
+		"msg_type": "value",
+		"resource_type": "memory",
+		"unit": "swap",
+		"values": {
+			"total": swap_data.total,
+			"used": swap_data.used,
+			"free": swap_data.free,
+			"used_percentage": swap_data.percent
+		}
+	}))
+	
+	net_data = psutil.net_io_counters(pernic=True)
+	for nic, data in net_data.iteritems():
+		try:
+			old_in_b = old_net_data[nic].bytes_recv
+			old_out_b = old_net_data[nic].bytes_sent
+			old_in_p = old_net_data[nic].packets_recv
+			old_out_p = old_net_data[nic].packets_sent
+		except KeyError, e:
+			# No old data yet, first run? Save and skip to next...
+			old_net_data[nic] = data
+			continue
+		
+		diff_in_b = data.bytes_recv - old_in_b
+		diff_out_b = data.bytes_sent - old_out_b
+		diff_in_p = data.packets_recv - old_in_p
+		diff_out_p = data.packets_sent - old_out_p
+		
+		# A counter lower than its previous sample is reported as zero traffic.
+		if diff_in_b < 0:
+			diff_in_b = 0
+		
+		if diff_out_b < 0:
+			diff_out_b = 0
+		
+		if diff_in_p < 0:
+			diff_in_p = 0
+		
+		if diff_out_p < 0:
+			diff_out_p = 0
+			
+		old_net_data[nic] = data
+		
+		# NOTE(review): with integer operands these are Python 2 floor divisions.
+		sock.send(msgpack.packb({
+			"service": "machine",
+			"msg_type": "value",
+			"resource_type": "network",
+			"unit": nic,
+			"values": {
+				"bps_in": diff_in_b / interval,
+				"bps_out": diff_out_b / interval,
+				"pps_in": diff_in_p / interval,
+				"pps_out": diff_out_p / interval
+			}
+		}))
+		
+	sock.send(msgpack.packb({
+		"service": "machine",
+		"msg_type": "value",
+		"resource_type": "uptime",
+		"unit": "",
+		"values": {
+			"uptime": time.time() - psutil.get_boot_time()
+		}
+	}))
+	
+	time.sleep(interval)
+
--- a/cstatsd/stats-machine.py
+++ b/cstatsd/stats-machine.py
@ -1,143 +0,0 @@
-#!/usr/bin/env python2
-
-import zmq, msgpack, time, psutil, yaml, os
-
-ctx = zmq.Context()
-
-sock = ctx.socket(zmq.PUSH)
-sock.connect("ipc:///tmp/cstatsd")
-
-with open("config/machine.yaml", "r") as cfile:
-	config = yaml.safe_load(cfile)
-	
-interval = config["interval"]
-old_net_data = {}
-
-while True:
-	load_avgs = os.getloadavg()
-	sock.send(msgpack.packb({
-		"service": "machine",
-		"msg_type": "value",
-		"resource_type": "load_average",
-		"unit": "",
-		"values": {
-			"1m": load_avgs[0],
-			"5m": load_avgs[1],
-			"15m": load_avgs[2]
-		}
-	}))
-	
-	cpu_loads = psutil.cpu_percent(percpu=True)
-	
-	for i in xrange(0, len(cpu_loads)):
-		sock.send(msgpack.packb({
-			"service": "machine",
-			"msg_type": "value",
-			"resource_type": "cpu",
-			"unit": "core%d" % (i + 1),
-			"values": {
-				"load": cpu_loads[i]
-			}
-		}))
-	
-	for drive in config["drives"]:
-		drive_data = psutil.disk_usage(drive)
-		sock.send(msgpack.packb({
-			"service": "machine",
-			"msg_type": "value",
-			"resource_type": "disk",
-			"unit": drive,
-			"values": {
-				"total": drive_data.total,
-				"used": drive_data.used,
-				"free": drive_data.free,
-				"used_percentage": drive_data.percent
-			}
-		}))
-		
-	ram_data = psutil.virtual_memory()
-	sock.send(msgpack.packb({
-		"service": "machine",
-		"msg_type": "value",
-		"resource_type": "memory",
-		"unit": "physical",
-		"values": {
-			"total": ram_data.total,
-			"used": ram_data.used,
-			"free": ram_data.available,
-			"used_percentage": ram_data.percent,
-			"buffers": ram_data.buffers,
-			"cache": ram_data.cached
-		}
-	}))
-		
-	swap_data = psutil.virtual_memory()
-	sock.send(msgpack.packb({
-		"service": "machine",
-		"msg_type": "value",
-		"resource_type": "memory",
-		"unit": "swap",
-		"values": {
-			"total": swap_data.total,
-			"used": swap_data.used,
-			"free": swap_data.free,
-			"used_percentage": swap_data.percent
-		}
-	}))
-	
-	net_data = psutil.net_io_counters(pernic=True)
-	for nic, data in net_data.iteritems():
-		try:
-			old_in_b = old_net_data[nic].bytes_recv
-			old_out_b = old_net_data[nic].bytes_sent
-			old_in_p = old_net_data[nic].packets_recv
-			old_out_p = old_net_data[nic].packets_sent
-		except KeyError, e:
-			# No old data yet, first run? Save and skip to next...
-			old_net_data[nic] = data
-			continue
-		
-		diff_in_b = data.bytes_recv - old_in_b
-		diff_out_b = data.bytes_sent - old_out_b
-		diff_in_p = data.packets_recv - old_in_p
-		diff_out_p = data.packets_sent - old_out_p
-		
-		if diff_in_b < 0:
-			diff_in_b = 0
-		
-		if diff_out_b < 0:
-			diff_out_b = 0
-		
-		if diff_in_p < 0:
-			diff_in_p = 0
-		
-		if diff_out_p < 0:
-			diff_out_p = 0
-			
-		old_net_data[nic] = data
-		
-		sock.send(msgpack.packb({
-			"service": "machine",
-			"msg_type": "value",
-			"resource_type": "network",
-			"unit": nic,
-			"values": {
-				"bps_in": diff_in_b / interval,
-				"bps_out": diff_out_b / interval,
-				"pps_in": diff_in_p / interval,
-				"pps_out": diff_out_p / interval
-			}
-		}))
-		
-	sock.send(msgpack.packb({
-		"service": "machine",
-		"msg_type": "value",
-		"resource_type": "uptime",
-		"unit": "",
-		"values": {
-			"uptime": time.time() - psutil.get_boot_time()
-		}
-	}))
-	
-	time.sleep(interval)
-
--- a/cstatsd/stats-ports.py
+++ b/cstatsd/stats-ports.py
@ -20,6 +20,7 @@ while True:
 			s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 			s.settimeout(0.5)
 			s.connect(("127.0.0.1", port))
+			s.shutdown(socket.SHUT_RDWR)
 			s.close()
 			up = True
 		except socket.error, e:
--- a/cstatsd/stats-processes
+++ b/cstatsd/stats-processes
@ -0,0 +1,72 @@
+#!/usr/bin/env python2
+
+import zmq, msgpack, time, yaml, psutil, fnmatch
+
+ctx = zmq.Context()
+
+sock = ctx.socket(zmq.PUSH)  # every status message below is pushed through this socket
+sock.connect("ipc:///tmp/cstatsd")
+
+with open("config/processes.yaml", "r") as cfile:
+	config = yaml.safe_load(cfile)
+
+interval = config["interval"]  # handed to time.sleep() after each pass, i.e. seconds between polls
+
+old_status = {}  # service name -> up/down state seen on the previous pass
+
+while True:  # poll forever: one pass over all configured services, then sleep
+	all_procs = psutil.get_process_list()  # fetched once per pass and reused for every service
+	
+	for service_name, patterns in config["processes"].iteritems():  # patterns: dict with a "name" glob and optional "args"
+		matching = []
+		for proc in all_procs:  # Can't use filter() because of exceptions...
+			try:
+				if len(proc.cmdline) > 0 and fnmatch.fnmatch(proc.cmdline[0], patterns["name"]):  # cmdline[0] must match the "name" glob
+					failed = False
+					try:
+						for arg, pattern in patterns["args"].iteritems():  # "args" maps an argument index to a glob pattern
+							try:
+								if len(proc.cmdline) < (arg + 1) or not fnmatch.fnmatch(proc.cmdline[arg], pattern):  # argument absent or not matching
+									failed = True
+							except KeyError, e:  # NOTE(review): nothing in this inner try looks able to raise KeyError - confirm
+								pass
+					except KeyError, e:  # no "args" key configured: the name match alone decides
+						pass
+					if failed == False:
+						matching.append(proc)
+			except psutil._error.NoSuchProcess, e:  # presumably the process exited while being inspected; it is skipped
+				pass
+			
+		if len(matching) > 0:  # the service counts as up when at least one process matched
+			up = True
+		else:
+			up = False
+			
+		try:
+			if up == old_status[service_name]:  # state unchanged since the previous pass: stay silent
+				send_notice = False
+			else:
+				send_notice = True
+				initial = False
+		except KeyError, e:  # no previous state recorded: first observation of this service
+			send_notice = True
+			initial = True
+			
+		old_status[service_name] = up  # remember the state for the next pass
+			
+		if send_notice:
+			if up:
+				msg_type = "up"
+			else:
+				msg_type = "down"
+				
+			sock.send(msgpack.packb({  # one msgpack-encoded dict per state change (or first observation)
+				"service": "process",
+				"msg_type": msg_type,
+				"unit": service_name,
+				"initial": initial
+			}))
+			
+			
+	time.sleep(interval)
+	
--- a/cstatsd/stats-tahoe.py
+++ b/cstatsd/stats-tahoe.py
--- a/deps.sh
+++ b/deps.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+# You need squeeze-backports if you run this on squeeze!
+apt-get install -y libzmq-dev libffi-dev build-essential python python-dev  # system packages; presumably build prerequisites for the pip packages below - confirm
+pip install pyzmq msgpack-python pynacl pyyaml psutil  # assumes pip is already available; the apt-get line above does not install it
--- a/install-steps.txt
+++ b/install-steps.txt
@ -0,0 +1,4 @@
+# Backports: echo "deb http://backports.debian.org/debian-backports squeeze-backports main" >> /etc/apt/sources.list && apt-get update && apt-get upgrade
+apt-get install -y python python-dev && wget cryto.net/~joepie91/pipfix.sh && chmod +x pipfix.sh && ./pipfix.sh
+adduser --system --shell /bin/bash --group monitor && apt-get install -y git; su -c "cd ~; git clone https://github.com/joepie91/cryto-status.git" monitor && /home/monitor/cryto-status/deps.sh
+# Replace libzmq with a manually compiled version...: wget http://download.zeromq.org/zeromq-4.0.3.tar.gz; tar -xzvf zeromq-4.0.3.tar.gz; cd zeromq-4.0.3; ./configure; make; make install; ldconfig
--- a/todo.txt
+++ b/todo.txt
@ -0,0 +1,54 @@
+* allow comments in (parentheses) in units, and ignore these when matching against an alarm pattern...
+* web interface (angularjs)
+* separate alarm and IRC logic
+* monitor inodes
+* watchdog on slave and master -> should send WARN notifications
+* notifications (text, arbitrary-serialized-data as attachment, DEBUG/INFO/WARN/ERR/CRIT)
+* consider redundancy - can already connect multiple masters through pubsub, how to deal with duplicate processing checking?
+
+cprocessd:
+ -> subscribe to ccollectd
+ -> debug switch for outputting all to terminal
+ -> keep up/down state
+ -> keep last-value state (resource usage)
+ -> keep track of persistent downtimes (down for more than X time, as configured in config file)
+ -> alarms (move this from the IRC bot to cprocessd)
+ -> classify message importance
+ -> cprocessd-stream socket, PUB that just streams processed data
+ -> cprocessd-query socket, REP that responds to queries
+	-> server-status
+	-> down-list
+	-> last-value
+	-> server-list
+	-> service-list
+
+cmaild:
+ -> use marrow.mailer
+ -> receives data from cprocessd-stream
+ -> sends e-mails for configured importance levels
+
+cbotd:
+ -> currently named 'alert'
+ -> receives data from cprocessd-stream
+ -> IRC bot
+ -> posts alerts to specified IRC channels, depending on minimum severity level configured for that channel (e.g. INFO for #cryto-network but ERR for #crytocc)
+ 
+csmsd:
+ -> sends SMS for (critical) alerts
+ -> receives data from cprocessd-stream
+ -> Twilio? does a provider-neutral API exist? might need an extra abstraction...
+
+cwebd:
+ -> offers web interface with streaming status data
+ -> publicly accessible and password-protected
+ -> streaming data from cprocessd-stream
+ -> on-pageload state from cprocessd-query (including 'current downtimes')
+ -> tornado+zmq ioloop, http://zeromq.github.io/pyzmq/eventloop.html
+ -> web dashboard
+	-> AngularJS
+	-> fancy graphs (via AngularJS? idk if a directive exists for this)
+	-> show downtimes as well as live per-machine stats
+	-> also show overview of all machines in a grid, color-coded for average load of all resources
+	-> historical up/down data
+	-> sqlite storage? single concurrent write, so should work
+	-> perhaps letting people sign up for e-mail alerts is an option? to-inbox will be tricky here
Author	SHA1	Message	Date
Sven Slootweg	7efb3b3dd2	Update todo	11 years ago
Sven Slootweg	59ac723188	Spec out components	11 years ago
Sven Slootweg	d6b0fc2ad4	Update todo	11 years ago
Sven Slootweg	a796725057	Switch to pubsub for over-the-wire communication, to prevent memory leaks	11 years ago
Sven Slootweg	0c87ee058c	Update todo	11 years ago
Sven Slootweg	570d8f3b85	Cast to int properly	11 years ago
Sven Slootweg	57c015f161	Add bootstrap script	11 years ago
Sven Slootweg	e87d048ee9	Hack hack hackity hack hack - we now have check_output in 2.6!	11 years ago
Sven Slootweg	b67fdc8ca3	More fixes	11 years ago
Sven Slootweg	0dde712144	Forgot another split...	11 years ago
Sven Slootweg	4dbe92396c	Oops, forgot a split	11 years ago
Sven Slootweg	7dca261010	Attempt to fix memory accounting in OpenVZ....	11 years ago
Sven Slootweg	5424432ddb	Patch for disk stats on OpenVZ...	11 years ago
Sven Slootweg	971c5ccce3	Fixes and docs	11 years ago
Sven Slootweg	86b013a0b7	Be more lenient towards receiving errors, and update install/todo notes	11 years ago
Sven Slootweg	5a7e3815ca	Update install stuff	11 years ago
Sven Slootweg	ff98278520	Try using TCP instead...	11 years ago
Sven Slootweg	762d74d477	A few attempts later...	11 years ago
Sven Slootweg	1df76daa0d	Fix perms	11 years ago
Sven Slootweg	96aa2b020f	Add collectd listening script	11 years ago
Sven Slootweg	5eb46c13e8	(Probably) fix the blip bug	11 years ago
Sven Slootweg	e04448dcd8	Fix warning color for bot when blips are announced	11 years ago
Sven Slootweg	11eb813164	Installation steps and bugfixes	11 years ago
Sven Slootweg	e2c9097585	Reorganize some files, add support for units in alarm rules, support network blip detection (including more leniency for heartbeat failures), fix bug in alarm matching, remove debug output, add auto-start mechanism and killscript to cstatsd, fix deps.sh	11 years ago
Sven Slootweg	4cf0601b05	Fixes	11 years ago
Sven Slootweg	5db77ab87c	In theory, we should now have heartbeating...	11 years ago
Sven Slootweg	ae552ac0f9	Add todo list	11 years ago
Sven Slootweg	20eaf50791	Shut down socket connections properly after testing...	11 years ago
Sven Slootweg	97290dbb1c	Set up publish/subscribe mechanism, add example configurations, write IRC bot and alarm management mechanism	11 years ago
Sven Slootweg	56ae6b5305	Implement crypto, add disk i/o stats to machine statistics (and change config format), proper error handling for process monitoring	11 years ago
Sven Slootweg	258f62af22	Add key generation script	11 years ago
Sven Slootweg	4550e0a425	Add process watch, write code for ccollectd, implement ZeroMQ timer class, add dep installation script	11 years ago
Sven Slootweg	d310a95e7a	Change default interval for port polling to 1	11 years ago