#!/usr/bin/env python2 import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch import cPickle as pickle ctx = zmq.Context() with open("config.yaml", "r") as cfile: config = yaml.safe_load(cfile) try: with open("rules.pickle", "r") as pfile: rules = pickle.load(pfile) except IOError, e: rules = {} fetcher = ctx.socket(zmq.SUB) fetcher.setsockopt(zmq.SUBSCRIBE, "") fetcher.connect("tcp://127.0.0.1:8998") class Bot(object): def __init__(self, hosts, port, nickname, realname, channels, admins, subsock): self.hosts = hosts self.port = port self.nickname = nickname self.realname = realname self.channels = channels self.admins = admins self.subsock = subsock self.connected = False self.last_down = {} self.known_alarms = {} self.command_map = { "422": self.join_all_channels, "376": self.join_all_channels, "PRIVMSG": self.receive_message } def split_irc(self, message): if message[0] == ":": prefix = ":" message = message[1:] else: prefix = "" if ":" in message: rest, last = message.split(":", 1) parts = rest.strip().split() + [last] else: parts = message.split() parts[0] = prefix + parts[0] return parts def run(self): while True: # Connect loop host = random.choice(self.hosts) self.sock = socket.socket() try: self.sock.connect((host, self.port)) except socket.error, e: continue # Reconnect self.send_raw("NICK %s" % self.nickname) self.sock.send("USER %s 0 0 :%s\r\n" % (self.nickname, self.realname)) buff = "" while True: # Read loop r, w, x = zmq.select([self.sock, self.subsock], [], []) for s in r: if s == self.sock.fileno(): try: recvdata = self.sock.recv(1024) except socket.error, e: break # Something went wrong, reconnect... if len(recvdata) == 0: break # We have disconnected... buff += recvdata messages = buff.split("\n") buff = messages.pop() for message in messages: self.process_message(self.split_irc(message.strip("\r"))) elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0: # Process incoming data from the subscribe socket... message = msgpack.unpackb(s.recv()) self.process_stats(message) def send_raw(self, message): self.sock.send("%s\r\n" % message) def send_message(self, recipient, message): if self.connected == True: self.send_raw("PRIVMSG %s :%s" % (recipient, message)) def send_all(self, message): for channel in self.channels: self.send_message(channel, message) def join(self, channel): self.send_raw("JOIN %s" % channel) def join_all_channels(self, message): self.connected = True for channel in self.channels: self.join(channel) def receive_message(self, message): args = message[3].split() sender = message[0][1:].split("!", 1)[0] channel = message[2] try: if sender in self.admins: if args[0] == "!addrule": target, rel, value = args[1:4] target = self.parse_target(target) if value[-1].lower() in ("k", "m", "g", "t"): unit = value[-1].lower() value = value[:-1] value = float(value) value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1)) rule_id = uuid.uuid4() rules[rule_id] = { "target": target, "operator": rel, "value": value } with open("rules.pickle", "w") as pfile: pickle.dump(rules, pfile) self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id)) except Exception, e: self.send_message(channel, str(e)) def parse_target(self, target): host, rest = target.split("!", 1) service, rest = rest.split(".", 1) resource, rest = rest.split(":", 1) unit, attribute = rest.split(".", 1) # TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments if host == "*": host = True if service == "*": service = True if attribute == "*": attribute = True if resource == "*": resource = True if unit == "*": unit = True return { "host": host, "service": service, "resource": resource, "unit": unit, "attribute": attribute } def format_time_duration(self, seconds): # http://stackoverflow.com/a/20222351/1332715 days, rem = divmod(seconds, 86400) hours, rem = divmod(rem, 3600) minutes, seconds = divmod(rem, 60) if seconds < 1: seconds = 1 locals_ = locals() magnitudes_str = ("{n} {magnitude}".format(n=int(locals_[magnitude]), magnitude=magnitude) for magnitude in ("days", "hours", "minutes", "seconds") if locals_[magnitude]) return ", ".join(magnitudes_str) def process_stats(self, message): data = message["message"] data["host"] = message["host"] if data["msg_type"] == "up" and data["initial"] == True: return # We don't need to say what is up, initially... # TODO: Duration if data["msg_type"] == "up": try: data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data]) except KeyError, e: data["duration"] = "0 seconds" self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data) elif data["msg_type"] == "down": self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time() self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data) elif data["msg_type"] == "blip": self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data) elif data["msg_type"] == "value": for rule_id, rule in rules.iteritems(): check_vals = { "host": [data["host"]], "service": [data["service"]], "resource": [data["resource_type"]], "unit": [data["unit"]] } failed = False for segment in ("host", "service", "resource", "unit"): for val in check_vals[segment]: if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]): failed = True break if failed: continue # Skip to next # We haven't broken out in the past bit of code, so we're still matching the pattern... eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])] for key in eligible_keys: value = data["values"][key] rule_value = float(rule["value"]) operator = rule["operator"] if operator == "=": alarm = (value == rule_value) elif operator == ">": alarm = (value > rule_value) elif operator == "<": alarm = (value < rule_value) elif operator == ">=": alarm = (value >= rule_value) elif operator == "<=": alarm = (value <= rule_value) elif operator == "!=": alarm = (value != rule_value) else: alarm = False self.trigger_alarm(rule_id, data, alarm, value, key) def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None): key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key) if key not in self.known_alarms: if active: self.transmit_alarm(rule_id, data, active, offending_value, offending_key) self.known_alarms[key] = time.time() else: self.known_alarms[key] = False else: if self.known_alarms[key] == False and active: # Alarm activated self.transmit_alarm(rule_id, data, active, offending_value, offending_key) self.known_alarms[key] = time.time() elif self.known_alarms[key] != False and not active: # Alarm deactivated self.transmit_alarm(rule_id, data, active, offending_value, offending_key) self.known_alarms[key] = False def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None): # At this point, we're sure that we want to notify... rule_target = rules[rule_id]["target"].copy() for k, v in rule_target.iteritems(): if v is True: rule_target[k] = "*" rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target info = { "host": data["host"], "rule_id": rule_id, "rule_pattern": rule_pattern } if not active: key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key) try: info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key]) except KeyError, e: info["duration"] = "0 seconds" info["unit"] = data["unit"] info["attribute"] = offending_key self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info) else: info["value"] = offending_value info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"]) info["unit"] = data["unit"] info["attribute"] = offending_key self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info) def process_message(self, message): if message[0].upper() == "PING": self.send_raw("PONG %s" % message[1]) else: try: self.command_map[message[1].upper()](message) except KeyError, e: pass bot = Bot(config["irc"]["hosts"], config["irc"]["port"], config["irc"]["nickname"], config["irc"]["realname"], config["irc"]["channels"], config["irc"]["admins"], fetcher) bot.run()