Compare commits
33 Commits
Author | SHA1 | Date |
---|---|---|
Sven Slootweg | 7efb3b3dd2 | 11 years ago |
Sven Slootweg | 59ac723188 | 11 years ago |
Sven Slootweg | d6b0fc2ad4 | 11 years ago |
Sven Slootweg | a796725057 | 11 years ago |
Sven Slootweg | 0c87ee058c | 11 years ago |
Sven Slootweg | 570d8f3b85 | 11 years ago |
Sven Slootweg | 57c015f161 | 11 years ago |
Sven Slootweg | e87d048ee9 | 11 years ago |
Sven Slootweg | b67fdc8ca3 | 11 years ago |
Sven Slootweg | 0dde712144 | 11 years ago |
Sven Slootweg | 4dbe92396c | 11 years ago |
Sven Slootweg | 7dca261010 | 11 years ago |
Sven Slootweg | 5424432ddb | 11 years ago |
Sven Slootweg | 971c5ccce3 | 11 years ago |
Sven Slootweg | 86b013a0b7 | 11 years ago |
Sven Slootweg | 5a7e3815ca | 11 years ago |
Sven Slootweg | ff98278520 | 11 years ago |
Sven Slootweg | 762d74d477 | 11 years ago |
Sven Slootweg | 1df76daa0d | 11 years ago |
Sven Slootweg | 96aa2b020f | 11 years ago |
Sven Slootweg | 5eb46c13e8 | 11 years ago |
Sven Slootweg | e04448dcd8 | 11 years ago |
Sven Slootweg | 11eb813164 | 11 years ago |
Sven Slootweg | e2c9097585 | 11 years ago |
Sven Slootweg | 4cf0601b05 | 11 years ago |
Sven Slootweg | 5db77ab87c | 11 years ago |
Sven Slootweg | ae552ac0f9 | 11 years ago |
Sven Slootweg | 20eaf50791 | 11 years ago |
Sven Slootweg | 97290dbb1c | 11 years ago |
Sven Slootweg | 56ae6b5305 | 11 years ago |
Sven Slootweg | 258f62af22 | 11 years ago |
Sven Slootweg | 4550e0a425 | 11 years ago |
Sven Slootweg | d310a95e7a | 11 years ago |
@ -1 +1,10 @@
|
||||
cstatsd/config/*.yaml
|
||||
ccollectd/config.yaml
|
||||
alert/config.yaml
|
||||
*.pyc
|
||||
ccollectd/pubkey.dat
|
||||
ccollectd/privkey.dat
|
||||
cstatsd/pubkey.dat
|
||||
cstatsd/privkey.dat
|
||||
cstatsd/cstatsd.pid
|
||||
alert/rules.pickle
|
||||
|
@ -0,0 +1,304 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch
|
||||
import cPickle as pickle
|
||||
|
||||
ctx = zmq.Context()
|
||||
|
||||
with open("config.yaml", "r") as cfile:
|
||||
config = yaml.safe_load(cfile)
|
||||
|
||||
try:
|
||||
with open("rules.pickle", "r") as pfile:
|
||||
rules = pickle.load(pfile)
|
||||
except IOError, e:
|
||||
rules = {}
|
||||
|
||||
fetcher = ctx.socket(zmq.SUB)
|
||||
fetcher.setsockopt(zmq.SUBSCRIBE, "")
|
||||
fetcher.connect("tcp://127.0.0.1:8998")
|
||||
|
||||
class Bot(object):
|
||||
def __init__(self, hosts, port, nickname, realname, channels, admins, subsock):
|
||||
self.hosts = hosts
|
||||
self.port = port
|
||||
self.nickname = nickname
|
||||
self.realname = realname
|
||||
self.channels = channels
|
||||
self.admins = admins
|
||||
self.subsock = subsock
|
||||
self.connected = False
|
||||
self.last_down = {}
|
||||
self.known_alarms = {}
|
||||
|
||||
self.command_map = {
|
||||
"422": self.join_all_channels,
|
||||
"376": self.join_all_channels,
|
||||
"PRIVMSG": self.receive_message
|
||||
}
|
||||
|
||||
def split_irc(self, message):
|
||||
if message[0] == ":":
|
||||
prefix = ":"
|
||||
message = message[1:]
|
||||
else:
|
||||
prefix = ""
|
||||
|
||||
if ":" in message:
|
||||
rest, last = message.split(":", 1)
|
||||
parts = rest.strip().split() + [last]
|
||||
else:
|
||||
parts = message.split()
|
||||
|
||||
parts[0] = prefix + parts[0]
|
||||
return parts
|
||||
|
||||
def run(self):
|
||||
while True: # Connect loop
|
||||
host = random.choice(self.hosts)
|
||||
|
||||
self.sock = socket.socket()
|
||||
try:
|
||||
self.sock.connect((host, self.port))
|
||||
except socket.error, e:
|
||||
continue # Reconnect
|
||||
self.send_raw("NICK %s" % self.nickname)
|
||||
self.sock.send("USER %s 0 0 :%s\r\n" % (self.nickname, self.realname))
|
||||
|
||||
buff = ""
|
||||
while True: # Read loop
|
||||
r, w, x = zmq.select([self.sock, self.subsock], [], [])
|
||||
|
||||
for s in r:
|
||||
if s == self.sock.fileno():
|
||||
try:
|
||||
recvdata = self.sock.recv(1024)
|
||||
except socket.error, e:
|
||||
break # Something went wrong, reconnect...
|
||||
|
||||
if len(recvdata) == 0:
|
||||
break # We have disconnected...
|
||||
|
||||
buff += recvdata
|
||||
messages = buff.split("\n")
|
||||
buff = messages.pop()
|
||||
|
||||
for message in messages:
|
||||
self.process_message(self.split_irc(message.strip("\r")))
|
||||
elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
|
||||
# Process incoming data from the subscribe socket...
|
||||
message = msgpack.unpackb(s.recv())
|
||||
self.process_stats(message)
|
||||
|
||||
def send_raw(self, message):
|
||||
self.sock.send("%s\r\n" % message)
|
||||
|
||||
def send_message(self, recipient, message):
|
||||
if self.connected == True:
|
||||
self.send_raw("PRIVMSG %s :%s" % (recipient, message))
|
||||
|
||||
def send_all(self, message):
|
||||
for channel in self.channels:
|
||||
self.send_message(channel, message)
|
||||
|
||||
def join(self, channel):
|
||||
self.send_raw("JOIN %s" % channel)
|
||||
|
||||
def join_all_channels(self, message):
|
||||
self.connected = True
|
||||
for channel in self.channels:
|
||||
self.join(channel)
|
||||
|
||||
def receive_message(self, message):
|
||||
args = message[3].split()
|
||||
sender = message[0][1:].split("!", 1)[0]
|
||||
channel = message[2]
|
||||
|
||||
try:
|
||||
if sender in self.admins:
|
||||
if args[0] == "!addrule":
|
||||
target, rel, value = args[1:4]
|
||||
target = self.parse_target(target)
|
||||
|
||||
if value[-1].lower() in ("k", "m", "g", "t"):
|
||||
unit = value[-1].lower()
|
||||
value = value[:-1]
|
||||
value = float(value)
|
||||
value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
|
||||
|
||||
rule_id = uuid.uuid4()
|
||||
rules[rule_id] = {
|
||||
"target": target,
|
||||
"operator": rel,
|
||||
"value": value
|
||||
}
|
||||
|
||||
with open("rules.pickle", "w") as pfile:
|
||||
pickle.dump(rules, pfile)
|
||||
|
||||
self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id))
|
||||
except Exception, e:
|
||||
self.send_message(channel, str(e))
|
||||
|
||||
def parse_target(self, target):
|
||||
host, rest = target.split("!", 1)
|
||||
service, rest = rest.split(".", 1)
|
||||
resource, rest = rest.split(":", 1)
|
||||
unit, attribute = rest.split(".", 1)
|
||||
# TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments
|
||||
if host == "*":
|
||||
host = True
|
||||
if service == "*":
|
||||
service = True
|
||||
if attribute == "*":
|
||||
attribute = True
|
||||
if resource == "*":
|
||||
resource = True
|
||||
if unit == "*":
|
||||
unit = True
|
||||
return {
|
||||
"host": host,
|
||||
"service": service,
|
||||
"resource": resource,
|
||||
"unit": unit,
|
||||
"attribute": attribute
|
||||
}
|
||||
|
||||
|
||||
def format_time_duration(self, seconds):
|
||||
# http://stackoverflow.com/a/20222351/1332715
|
||||
days, rem = divmod(seconds, 86400)
|
||||
hours, rem = divmod(rem, 3600)
|
||||
minutes, seconds = divmod(rem, 60)
|
||||
if seconds < 1:
|
||||
seconds = 1
|
||||
locals_ = locals()
|
||||
magnitudes_str = ("{n} {magnitude}".format(n=int(locals_[magnitude]), magnitude=magnitude) for magnitude in ("days", "hours", "minutes", "seconds") if locals_[magnitude])
|
||||
return ", ".join(magnitudes_str)
|
||||
|
||||
def process_stats(self, message):
|
||||
data = message["message"]
|
||||
data["host"] = message["host"]
|
||||
|
||||
if data["msg_type"] == "up" and data["initial"] == True:
|
||||
return # We don't need to say what is up, initially...
|
||||
|
||||
# TODO: Duration
|
||||
if data["msg_type"] == "up":
|
||||
try:
|
||||
data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data])
|
||||
except KeyError, e:
|
||||
data["duration"] = "0 seconds"
|
||||
self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data)
|
||||
elif data["msg_type"] == "down":
|
||||
self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
|
||||
self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
|
||||
elif data["msg_type"] == "blip":
|
||||
self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
|
||||
elif data["msg_type"] == "value":
|
||||
for rule_id, rule in rules.iteritems():
|
||||
check_vals = {
|
||||
"host": [data["host"]],
|
||||
"service": [data["service"]],
|
||||
"resource": [data["resource_type"]],
|
||||
"unit": [data["unit"]]
|
||||
}
|
||||
|
||||
failed = False
|
||||
for segment in ("host", "service", "resource", "unit"):
|
||||
for val in check_vals[segment]:
|
||||
if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
|
||||
failed = True
|
||||
break
|
||||
if failed:
|
||||
continue # Skip to next
|
||||
|
||||
# We haven't broken out in the past bit of code, so we're still matching the pattern...
|
||||
eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
|
||||
|
||||
for key in eligible_keys:
|
||||
value = data["values"][key]
|
||||
rule_value = float(rule["value"])
|
||||
operator = rule["operator"]
|
||||
|
||||
if operator == "=":
|
||||
alarm = (value == rule_value)
|
||||
elif operator == ">":
|
||||
alarm = (value > rule_value)
|
||||
elif operator == "<":
|
||||
alarm = (value < rule_value)
|
||||
elif operator == ">=":
|
||||
alarm = (value >= rule_value)
|
||||
elif operator == "<=":
|
||||
alarm = (value <= rule_value)
|
||||
elif operator == "!=":
|
||||
alarm = (value != rule_value)
|
||||
else:
|
||||
alarm = False
|
||||
|
||||
self.trigger_alarm(rule_id, data, alarm, value, key)
|
||||
|
||||
def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
|
||||
key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
|
||||
|
||||
if key not in self.known_alarms:
|
||||
if active:
|
||||
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
|
||||
self.known_alarms[key] = time.time()
|
||||
else:
|
||||
self.known_alarms[key] = False
|
||||
else:
|
||||
if self.known_alarms[key] == False and active:
|
||||
# Alarm activated
|
||||
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
|
||||
self.known_alarms[key] = time.time()
|
||||
elif self.known_alarms[key] != False and not active:
|
||||
# Alarm deactivated
|
||||
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
|
||||
self.known_alarms[key] = False
|
||||
|
||||
def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
|
||||
# At this point, we're sure that we want to notify...
|
||||
rule_target = rules[rule_id]["target"].copy()
|
||||
for k, v in rule_target.iteritems():
|
||||
if v is True:
|
||||
rule_target[k] = "*"
|
||||
|
||||
rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target
|
||||
|
||||
info = {
|
||||
"host": data["host"],
|
||||
"rule_id": rule_id,
|
||||
"rule_pattern": rule_pattern
|
||||
}
|
||||
|
||||
if not active:
|
||||
key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
|
||||
try:
|
||||
info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key])
|
||||
except KeyError, e:
|
||||
info["duration"] = "0 seconds"
|
||||
info["unit"] = data["unit"]
|
||||
info["attribute"] = offending_key
|
||||
|
||||
self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info)
|
||||
else:
|
||||
info["value"] = offending_value
|
||||
info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"])
|
||||
info["unit"] = data["unit"]
|
||||
info["attribute"] = offending_key
|
||||
|
||||
self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info)
|
||||
|
||||
def process_message(self, message):
|
||||
if message[0].upper() == "PING":
|
||||
self.send_raw("PONG %s" % message[1])
|
||||
else:
|
||||
try:
|
||||
self.command_map[message[1].upper()](message)
|
||||
except KeyError, e:
|
||||
pass
|
||||
|
||||
|
||||
bot = Bot(config["irc"]["hosts"], config["irc"]["port"], config["irc"]["nickname"], config["irc"]["realname"], config["irc"]["channels"], config["irc"]["admins"], fetcher)
|
||||
bot.run()
|
@ -0,0 +1,12 @@
|
||||
irc:
|
||||
hosts:
|
||||
- kerpia.cryto.net
|
||||
- box.cryto.net
|
||||
- arvel.cryto.net
|
||||
port: 6667
|
||||
nickname: StatusBot
|
||||
realname: Cryto System Monitoring Service
|
||||
admins:
|
||||
- joepie91
|
||||
channels:
|
||||
- "#test"
|
@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
|
||||
from nacl.public import PublicKey, PrivateKey, Box
|
||||
|
||||
ctx = zmq.Context()
|
||||
|
||||
distributor = ctx.socket(zmq.PUB)
|
||||
distributor.bind("tcp://127.0.0.1:8998")
|
||||
|
||||
poller = zmq.Poller()
|
||||
|
||||
with open("config.yaml", "r") as cfile:
|
||||
config = yaml.safe_load(cfile)
|
||||
|
||||
with open("privkey.dat", "r") as f:
|
||||
privkey = PrivateKey(binascii.unhexlify(f.read()))
|
||||
|
||||
nodes = config["nodes"]
|
||||
last_node_status = {}
|
||||
socket_map = {}
|
||||
boxes = {}
|
||||
|
||||
def heartbeat():
|
||||
for hostname, node in nodes.iteritems():
|
||||
retries = 0
|
||||
while retries < config["heartbeat"]["attempts"]:
|
||||
try:
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1))
|
||||
s.connect((node["ip"], node["port"]))
|
||||
s.shutdown(socket.SHUT_RDWR)
|
||||
s.close()
|
||||
up = True
|
||||
break
|
||||
except socket.error, e:
|
||||
up = False
|
||||
retries += 1
|
||||
|
||||
try:
|
||||
status_changed = (up != last_node_status[hostname])
|
||||
initial = False
|
||||
except KeyError, e:
|
||||
status_changed = True
|
||||
initial = True
|
||||
|
||||
last_node_status[hostname] = up
|
||||
|
||||
send_message = False
|
||||
if status_changed:
|
||||
if up:
|
||||
msg_type = "up"
|
||||
send_message = True
|
||||
else:
|
||||
msg_type = "down"
|
||||
send_message = True
|
||||
else:
|
||||
if up and retries > 0:
|
||||
msg_type = "blip"
|
||||
send_message = True
|
||||
|
||||
if send_message:
|
||||
distributor.send(msgpack.packb({
|
||||
"host": config["hostname"],
|
||||
"message": {
|
||||
"service": "heartbeat",
|
||||
"msg_type": msg_type,
|
||||
"unit": hostname,
|
||||
"initial": initial
|
||||
}
|
||||
}))
|
||||
|
||||
timers = zmqtimer.ZmqTimerManager()
|
||||
timers.add_timer(zmqtimer.ZmqTimer(config["heartbeat"]["interval"], heartbeat))
|
||||
|
||||
for hostname, node in config["nodes"].iteritems():
|
||||
boxes[hostname] = Box(privkey, PublicKey(binascii.unhexlify(node["pubkey"])))
|
||||
grabber = ctx.socket(zmq.SUB)
|
||||
grabber.setsockopt(zmq.SUBSCRIBE, "")
|
||||
grabber.connect(node["endpoint"])
|
||||
socket_map[grabber] = hostname
|
||||
poller.register(grabber, zmq.POLLIN)
|
||||
|
||||
while True:
|
||||
timers.check()
|
||||
socks = dict(poller.poll(timers.get_next_interval()))
|
||||
|
||||
for sock in socks:
|
||||
if socks[sock] == zmq.POLLIN:
|
||||
host = socket_map[sock]
|
||||
try:
|
||||
message = msgpack.unpackb(boxes[host].decrypt(sock.recv()))
|
||||
except nacl.exceptions.CryptoError, e:
|
||||
# Probably a spoofed message... skip to next socket
|
||||
sys.stderr.write("Ignoring message... spoofed? (origin: %s)\n" % host) # FIXME: Use logging module...
|
||||
continue
|
||||
except Exception, e:
|
||||
sys.stderr.write(repr(e) + "\n")
|
||||
continue
|
||||
distributor.send(msgpack.packb({
|
||||
"host": host,
|
||||
"message": message
|
||||
}))
|
||||
|
@ -0,0 +1,13 @@
|
||||
hostname: monitoring.cryto.net
|
||||
|
||||
heartbeat:
|
||||
interval: 5
|
||||
timeout: 1
|
||||
attempts: 3
|
||||
|
||||
nodes:
|
||||
localhost:
|
||||
ip: 127.0.0.1
|
||||
port: 6543
|
||||
endpoint: tcp://127.0.0.1:6543
|
||||
pubkey: bd784ef4065c9bd31627106dc55e26764605a144c6fc45ce93f33cbd19dd7333
|
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import yaml, os, stat, binascii
|
||||
from nacl.public import PrivateKey
|
||||
|
||||
privkey = PrivateKey.generate()
|
||||
pubkey = privkey.public_key
|
||||
|
||||
with open("privkey.dat", "w") as f:
|
||||
f.write(binascii.hexlify(str(privkey)))
|
||||
|
||||
with open("pubkey.dat", "w") as f:
|
||||
f.write(binascii.hexlify(str(pubkey)))
|
||||
|
||||
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env python2
|
||||
import zmq, msgpack
|
||||
|
||||
ctx = zmq.Context()
|
||||
|
||||
fetcher = ctx.socket(zmq.SUB)
|
||||
fetcher.setsockopt(zmq.SUBSCRIBE, "")
|
||||
fetcher.connect("tcp://127.0.0.1:8998")
|
||||
|
||||
while True:
|
||||
message = msgpack.unpackb(fetcher.recv())
|
||||
print message
|
@ -0,0 +1,41 @@
|
||||
import time
|
||||
|
||||
class ZmqTimerManager(object):
|
||||
def __init__(self):
|
||||
self.timers = []
|
||||
self.next_call = 0
|
||||
|
||||
def add_timer(self, timer):
|
||||
self.timers.append(timer)
|
||||
|
||||
def check(self):
|
||||
if time.time() > self.next_call:
|
||||
for timer in self.timers:
|
||||
timer.check()
|
||||
|
||||
def get_next_interval(self):
|
||||
if time.time() >= self.next_call:
|
||||
call_times = []
|
||||
for timer in self.timers:
|
||||
call_times.append(timer.get_next_call())
|
||||
self.next_call = min(call_times)
|
||||
if self.next_call < time.time():
|
||||
return 0
|
||||
else:
|
||||
return (self.next_call - time.time()) * 1000
|
||||
else:
|
||||
return (self.next_call - time.time()) * 1000
|
||||
|
||||
class ZmqTimer(object):
|
||||
def __init__(self, interval, callback):
|
||||
self.interval = interval
|
||||
self.callback = callback
|
||||
self.last_call = 0
|
||||
|
||||
def check(self):
|
||||
if time.time() > (self.interval + self.last_call):
|
||||
self.callback()
|
||||
self.last_call = time.time()
|
||||
|
||||
def get_next_call(self):
|
||||
return self.last_call + self.interval
|
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
echo "Generating keypair..."
|
||||
./genkey 2>/dev/null
|
||||
./bootstrap-config
|
||||
echo "Your public key: `cat pubkey.dat`"
|
||||
echo "Server IP: `curl -s http://wtfismyip.com/text`" 2>/dev/null
|
@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import yaml, sys
|
||||
|
||||
master_pubkey = raw_input("Public key of the master server: ")
|
||||
|
||||
print "You'll now be asked to configure ports to check. If you don't want to configure any ports, just hit enter without entering any information."
|
||||
|
||||
ports = {}
|
||||
|
||||
while True:
|
||||
port = raw_input("Port number: ")
|
||||
if port.strip() == "":
|
||||
break
|
||||
service_name = raw_input("Service name for port %s: " % port)
|
||||
ports[int(port)] = service_name
|
||||
|
||||
print "The same thing, except now for processes to check. Just hit enter without entering any information when you're done; the same goes for the argument list. As a wildcard, you can use *"
|
||||
|
||||
services = {}
|
||||
|
||||
while True:
|
||||
service_name = raw_input("Service name: ")
|
||||
|
||||
if service_name.strip() == "":
|
||||
break
|
||||
|
||||
process_name = raw_input("Process name: ")
|
||||
|
||||
args = {}
|
||||
argnum = 1
|
||||
while True:
|
||||
arg = raw_input("Argument %d: " % argnum)
|
||||
if arg.strip() == "":
|
||||
break
|
||||
args[argnum] = arg
|
||||
argnum += 1
|
||||
|
||||
services[service_name] = {
|
||||
"name": process_name,
|
||||
"args": args
|
||||
}
|
||||
|
||||
print "Now enter any disk devices you wish to monitor. Leave empty and hit enter when done."
|
||||
|
||||
disks = []
|
||||
|
||||
while True:
|
||||
device_name = raw_input("Device name: ")
|
||||
if device_name.strip() == "":
|
||||
break
|
||||
disks.append(device_name)
|
||||
|
||||
# Write config files...
|
||||
|
||||
modules = []
|
||||
|
||||
modules.append("stats-machine")
|
||||
with open("config/machine.yaml.example", "r") as ef:
|
||||
with open("config/machine.yaml", "w") as ff:
|
||||
data = yaml.safe_load(ef.read())
|
||||
data["drives"] = disks
|
||||
ff.write(yaml.dump(data))
|
||||
|
||||
if len(ports) > 0:
|
||||
modules.append("stats-ports")
|
||||
with open("config/ports.yaml.example", "r") as ef:
|
||||
with open("config/ports.yaml", "w") as ff:
|
||||
data = yaml.safe_load(ef.read())
|
||||
data["ports"] = ports
|
||||
ff.write(yaml.dump(data))
|
||||
|
||||
if len(services) > 0:
|
||||
modules.append("stats-processes")
|
||||
with open("config/processes.yaml.example", "r") as ef:
|
||||
with open("config/processes.yaml", "w") as ff:
|
||||
data = yaml.safe_load(ef.read())
|
||||
data["processes"] = services
|
||||
ff.write(yaml.dump(data))
|
||||
|
||||
with open("config/cstatsd.yaml.example", "r") as ef:
|
||||
with open("config/cstatsd.yaml", "w") as ff:
|
||||
data = yaml.safe_load(ef.read())
|
||||
data["pubkey"] = master_pubkey
|
||||
data["autostart"] = modules
|
||||
ff.write(yaml.dump(data))
|
@ -0,0 +1,7 @@
|
||||
endpoint: tcp://*:6543
|
||||
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
|
||||
|
||||
autostart:
|
||||
- stats-processes
|
||||
- stats-ports
|
||||
- stats-machine
|
@ -1,4 +1,5 @@
|
||||
interval: 1
|
||||
|
||||
drives:
|
||||
- /
|
||||
- /dev/sda1
|
||||
- /dev/sdb1
|
||||
|
@ -0,0 +1,15 @@
|
||||
interval: 5
|
||||
|
||||
processes:
|
||||
radiotray:
|
||||
name: '*python*'
|
||||
args:
|
||||
1: /usr/bin/radiotray
|
||||
|
||||
guake:
|
||||
name: '*python*'
|
||||
args:
|
||||
1: /usr/local/bin/guake
|
||||
|
||||
keepassx:
|
||||
name: keepassx
|
@ -1,12 +1,43 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import zmq, msgpack
|
||||
import zmq, yaml, binascii, nacl, sys, subprocess, os
|
||||
from nacl.public import PublicKey, PrivateKey, Box
|
||||
|
||||
basedir = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
with open("cstatsd.pid", "w") as pidfile:
|
||||
pidfile.write(str(os.getpid()))
|
||||
|
||||
ctx = zmq.Context()
|
||||
|
||||
with open("config/cstatsd.yaml", "r") as cfile:
|
||||
config = yaml.safe_load(cfile)
|
||||
|
||||
pubkey = PublicKey(binascii.unhexlify(config["pubkey"]))
|
||||
|
||||
with open("privkey.dat", "r") as f:
|
||||
privkey = PrivateKey(binascii.unhexlify(f.read()))
|
||||
|
||||
box = Box(privkey, pubkey)
|
||||
|
||||
collector = ctx.socket(zmq.PULL)
|
||||
collector.bind("ipc:///tmp/cstatsd")
|
||||
|
||||
shipper = ctx.socket(zmq.PUB)
|
||||
shipper.bind(config["endpoint"])
|
||||
|
||||
try:
|
||||
disable_autostart = (sys.argv[1] == "--disable-autostart")
|
||||
except:
|
||||
disable_autostart = False
|
||||
|
||||
if disable_autostart == False:
|
||||
with open("/dev/null", "w+") as stfu:
|
||||
for script in config["autostart"]:
|
||||
print os.path.join(basedir, script)
|
||||
subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)
|
||||
|
||||
while True:
|
||||
message = msgpack.unpackb(collector.recv())
|
||||
print message
|
||||
message = collector.recv()
|
||||
nonce = nacl.utils.random(Box.NONCE_SIZE)
|
||||
shipper.send(box.encrypt(message, nonce))
|
||||
|
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import yaml, os, stat, binascii
|
||||
from nacl.public import PrivateKey
|
||||
|
||||
privkey = PrivateKey.generate()
|
||||
pubkey = privkey.public_key
|
||||
|
||||
with open("privkey.dat", "w") as f:
|
||||
f.write(binascii.hexlify(str(privkey)))
|
||||
|
||||
with open("pubkey.dat", "w") as f:
|
||||
f.write(binascii.hexlify(str(pubkey)))
|
||||
|
||||
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
PID=`cat cstatsd.pid`
|
||||
pkill -P $PID
|
||||
kill $PID
|
@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import zmq, msgpack, time, psutil, yaml, os, subprocess
|
||||
from collections import namedtuple
|
||||
|
||||
# Horrible hack to make check_output exist in 2.6
|
||||
# http://stackoverflow.com/a/13160748/1332715
|
||||
if "check_output" not in dir( subprocess ): # duck punch it in!
|
||||
def f(*popenargs, **kwargs):
|
||||
if 'stdout' in kwargs:
|
||||
raise ValueError('stdout argument not allowed, it will be overridden.')
|
||||
process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
|
||||
output, unused_err = process.communicate()
|
||||
retcode = process.poll()
|
||||
if retcode:
|
||||
cmd = kwargs.get("args")
|
||||
if cmd is None:
|
||||
cmd = popenargs[0]
|
||||
raise subprocess.CalledProcessError(retcode, cmd)
|
||||
return output
|
||||
subprocess.check_output = f
|
||||
|
||||
ctx = zmq.Context()
|
||||
|
||||
sock = ctx.socket(zmq.PUSH)
|
||||
sock.connect("ipc:///tmp/cstatsd")
|
||||
|
||||
with open("config/machine.yaml", "r") as cfile:
|
||||
config = yaml.safe_load(cfile)
|
||||
|
||||
interval = config["interval"]
|
||||
old_net_data = {}
|
||||
|
||||
disk_map = {}
|
||||
last_io_data = {}
|
||||
|
||||
if os.path.exists("/proc/user_beancounters") and not os.path.exists("/proc/vz/vestat"):
|
||||
openvz_burst = True
|
||||
FakeRam = namedtuple("FakeRam", ["total", "used", "available", "percent", "buffers", "cached"])
|
||||
else:
|
||||
openvz_burst = False
|
||||
|
||||
for disk in psutil.disk_partitions():
|
||||
disk_map[disk.device] = disk
|
||||
|
||||
if len(disk_map) == 0:
|
||||
# We're probably on OpenVZ, so /proc/partitions doesn't exist. Fall back to 'df'.
|
||||
FakeDisk = namedtuple("FakeDisk", ["device", "mountpoint"])
|
||||
for line in subprocess.check_output(["df"]).splitlines()[1:]:
|
||||
device, _, _, _, _, mountpoint = line.split()
|
||||
disk_map[device] = FakeDisk(device, mountpoint)
|
||||
|
||||
while True:
|
||||
load_avgs = os.getloadavg()
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "load_average",
|
||||
"unit": "",
|
||||
"values": {
|
||||
"1m": load_avgs[0],
|
||||
"5m": load_avgs[1],
|
||||
"15m": load_avgs[2]
|
||||
}
|
||||
}))
|
||||
|
||||
cpu_loads = psutil.cpu_percent(percpu=True)
|
||||
|
||||
for i in xrange(0, len(cpu_loads)):
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "cpu",
|
||||
"unit": "core%d" % (i + 1),
|
||||
"values": {
|
||||
"load": cpu_loads[i]
|
||||
}
|
||||
}))
|
||||
|
||||
try:
|
||||
io_counters = psutil.disk_io_counters(perdisk=True)
|
||||
except IOError, e:
|
||||
io_counters = {} # OpenVZ...
|
||||
|
||||
for drive in config["drives"]:
|
||||
drive_data = psutil.disk_usage(disk_map[drive].mountpoint)
|
||||
io_data = None
|
||||
|
||||
for diskname, data in io_counters.iteritems():
|
||||
if drive.endswith(diskname):
|
||||
io_data = data
|
||||
|
||||
if io_data is None or drive not in last_io_data:
|
||||
read_bps = 0
|
||||
write_bps = 0
|
||||
read_iops = 0
|
||||
write_iops = 0
|
||||
else:
|
||||
read_bps = (io_data.read_bytes - last_io_data[drive].read_bytes) / interval
|
||||
write_bps = (io_data.write_bytes - last_io_data[drive].write_bytes) / interval
|
||||
read_iops = (io_data.read_count - last_io_data[drive].read_count) / interval
|
||||
write_iops = (io_data.write_count - last_io_data[drive].write_count) / interval
|
||||
|
||||
if io_data is not None:
|
||||
last_io_data[drive] = io_data
|
||||
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "disk",
|
||||
"unit": drive,
|
||||
"values": {
|
||||
"total": drive_data.total,
|
||||
"used": drive_data.used,
|
||||
"free": drive_data.free,
|
||||
"used_percentage": drive_data.percent,
|
||||
"bps_read": read_bps,
|
||||
"bps_write": write_bps,
|
||||
"iops_read": read_iops,
|
||||
"iops_write": write_iops,
|
||||
}
|
||||
}))
|
||||
|
||||
if openvz_burst:
|
||||
# Sigh, OpenVZ... let's use 'free', since that apparently -does- understand OpenVZ.
|
||||
lines = subprocess.check_output(["free", "-b"]).splitlines()
|
||||
_, ram_total, ram_used, ram_free, _, ram_buffers, ram_cached = lines[1].split()
|
||||
_, _, _, ram_available = lines[2].split()
|
||||
ram_total = int(ram_total)
|
||||
ram_free = int(ram_free)
|
||||
ram_buffers = int(ram_buffers)
|
||||
ram_cached = int(ram_cached)
|
||||
ram_available = int(ram_available)
|
||||
ram_used = int(ram_used)
|
||||
ram_percent = 1.0 * (ram_total - ram_available) / ram_total * 100
|
||||
ram_data = FakeRam(ram_total, ram_used, ram_available, ram_percent, ram_buffers, ram_cached)
|
||||
else:
|
||||
ram_data = psutil.virtual_memory()
|
||||
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "memory",
|
||||
"unit": "physical",
|
||||
"values": {
|
||||
"total": ram_data.total,
|
||||
"used": ram_data.used,
|
||||
"free": ram_data.available,
|
||||
"used_percentage": ram_data.percent,
|
||||
"buffers": ram_data.buffers,
|
||||
"cache": ram_data.cached
|
||||
}
|
||||
}))
|
||||
|
||||
swap_data = psutil.swap_memory()
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "memory",
|
||||
"unit": "swap",
|
||||
"values": {
|
||||
"total": swap_data.total,
|
||||
"used": swap_data.used,
|
||||
"free": swap_data.free,
|
||||
"used_percentage": swap_data.percent
|
||||
}
|
||||
}))
|
||||
|
||||
net_data = psutil.net_io_counters(pernic=True)
|
||||
for nic, data in net_data.iteritems():
|
||||
try:
|
||||
old_in_b = old_net_data[nic].bytes_recv
|
||||
old_out_b = old_net_data[nic].bytes_sent
|
||||
old_in_p = old_net_data[nic].packets_recv
|
||||
old_out_p = old_net_data[nic].packets_sent
|
||||
except KeyError, e:
|
||||
# No old data yet, first run? Save and skip to next...
|
||||
old_net_data[nic] = data
|
||||
continue
|
||||
|
||||
diff_in_b = data.bytes_recv - old_in_b
|
||||
diff_out_b = data.bytes_sent - old_out_b
|
||||
diff_in_p = data.packets_recv - old_in_p
|
||||
diff_out_p = data.packets_sent - old_out_p
|
||||
|
||||
if diff_in_b < 0:
|
||||
diff_in_b = 0
|
||||
|
||||
if diff_out_b < 0:
|
||||
diff_out_b = 0
|
||||
|
||||
if diff_in_p < 0:
|
||||
diff_in_p = 0
|
||||
|
||||
if diff_out_p < 0:
|
||||
diff_out_p = 0
|
||||
|
||||
old_net_data[nic] = data
|
||||
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "network",
|
||||
"unit": nic,
|
||||
"values": {
|
||||
"bps_in": diff_in_b / interval,
|
||||
"bps_out": diff_out_b / interval,
|
||||
"pps_in": diff_in_p / interval,
|
||||
"pps_out": diff_out_p / interval
|
||||
}
|
||||
}))
|
||||
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "uptime",
|
||||
"unit": "",
|
||||
"values": {
|
||||
"uptime": time.time() - psutil.get_boot_time()
|
||||
}
|
||||
}))
|
||||
|
||||
time.sleep(interval)
|
||||
|
@ -1,143 +0,0 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import zmq, msgpack, time, psutil, yaml, os
|
||||
|
||||
ctx = zmq.Context()
|
||||
|
||||
sock = ctx.socket(zmq.PUSH)
|
||||
sock.connect("ipc:///tmp/cstatsd")
|
||||
|
||||
with open("config/machine.yaml", "r") as cfile:
|
||||
config = yaml.safe_load(cfile)
|
||||
|
||||
interval = config["interval"]
|
||||
old_net_data = {}
|
||||
|
||||
while True:
|
||||
load_avgs = os.getloadavg()
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "load_average",
|
||||
"unit": "",
|
||||
"values": {
|
||||
"1m": load_avgs[0],
|
||||
"5m": load_avgs[1],
|
||||
"15m": load_avgs[2]
|
||||
}
|
||||
}))
|
||||
|
||||
cpu_loads = psutil.cpu_percent(percpu=True)
|
||||
|
||||
for i in xrange(0, len(cpu_loads)):
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "cpu",
|
||||
"unit": "core%d" % (i + 1),
|
||||
"values": {
|
||||
"load": cpu_loads[i]
|
||||
}
|
||||
}))
|
||||
|
||||
for drive in config["drives"]:
|
||||
drive_data = psutil.disk_usage(drive)
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "disk",
|
||||
"unit": drive,
|
||||
"values": {
|
||||
"total": drive_data.total,
|
||||
"used": drive_data.used,
|
||||
"free": drive_data.free,
|
||||
"used_percentage": drive_data.percent
|
||||
}
|
||||
}))
|
||||
|
||||
ram_data = psutil.virtual_memory()
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "memory",
|
||||
"unit": "physical",
|
||||
"values": {
|
||||
"total": ram_data.total,
|
||||
"used": ram_data.used,
|
||||
"free": ram_data.available,
|
||||
"used_percentage": ram_data.percent,
|
||||
"buffers": ram_data.buffers,
|
||||
"cache": ram_data.cached
|
||||
}
|
||||
}))
|
||||
|
||||
swap_data = psutil.virtual_memory()
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "memory",
|
||||
"unit": "swap",
|
||||
"values": {
|
||||
"total": swap_data.total,
|
||||
"used": swap_data.used,
|
||||
"free": swap_data.free,
|
||||
"used_percentage": swap_data.percent
|
||||
}
|
||||
}))
|
||||
|
||||
net_data = psutil.net_io_counters(pernic=True)
|
||||
for nic, data in net_data.iteritems():
|
||||
try:
|
||||
old_in_b = old_net_data[nic].bytes_recv
|
||||
old_out_b = old_net_data[nic].bytes_sent
|
||||
old_in_p = old_net_data[nic].packets_recv
|
||||
old_out_p = old_net_data[nic].packets_sent
|
||||
except KeyError, e:
|
||||
# No old data yet, first run? Save and skip to next...
|
||||
old_net_data[nic] = data
|
||||
continue
|
||||
|
||||
diff_in_b = data.bytes_recv - old_in_b
|
||||
diff_out_b = data.bytes_sent - old_out_b
|
||||
diff_in_p = data.packets_recv - old_in_p
|
||||
diff_out_p = data.packets_sent - old_out_p
|
||||
|
||||
if diff_in_b < 0:
|
||||
diff_in_b = 0
|
||||
|
||||
if diff_out_b < 0:
|
||||
diff_out_b = 0
|
||||
|
||||
if diff_in_p < 0:
|
||||
diff_in_p = 0
|
||||
|
||||
if diff_out_p < 0:
|
||||
diff_out_p = 0
|
||||
|
||||
old_net_data[nic] = data
|
||||
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "network",
|
||||
"unit": nic,
|
||||
"values": {
|
||||
"bps_in": diff_in_b / interval,
|
||||
"bps_out": diff_out_b / interval,
|
||||
"pps_in": diff_in_p / interval,
|
||||
"pps_out": diff_out_p / interval
|
||||
}
|
||||
}))
|
||||
|
||||
sock.send(msgpack.packb({
|
||||
"service": "machine",
|
||||
"msg_type": "value",
|
||||
"resource_type": "uptime",
|
||||
"unit": "",
|
||||
"values": {
|
||||
"uptime": time.time() - psutil.get_boot_time()
|
||||
}
|
||||
}))
|
||||
|
||||
time.sleep(interval)
|
||||
|
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
import zmq, msgpack, time, yaml, psutil, fnmatch
|
||||
|
||||
ctx = zmq.Context()
|
||||
|
||||
sock = ctx.socket(zmq.PUSH)
|
||||
sock.connect("ipc:///tmp/cstatsd")
|
||||
|
||||
with open("config/processes.yaml", "r") as cfile:
|
||||
config = yaml.safe_load(cfile)
|
||||
|
||||
interval = config["interval"]
|
||||
|
||||
old_status = {}
|
||||
|
||||
while True:
|
||||
all_procs = psutil.get_process_list()
|
||||
|
||||
for service_name, patterns in config["processes"].iteritems():
|
||||
matching = []
|
||||
for proc in all_procs: # Can't use filter() because of exceptions...
|
||||
try:
|
||||
if len(proc.cmdline) > 0 and fnmatch.fnmatch(proc.cmdline[0], patterns["name"]):
|
||||
failed = False
|
||||
try:
|
||||
for arg, pattern in patterns["args"].iteritems():
|
||||
try:
|
||||
if len(proc.cmdline) < (arg + 1) or not fnmatch.fnmatch(proc.cmdline[arg], pattern):
|
||||
failed = True
|
||||
except KeyError, e:
|
||||
pass
|
||||
except KeyError, e:
|
||||
pass
|
||||
if failed == False:
|
||||
matching.append(proc)
|
||||
except psutil._error.NoSuchProcess, e:
|
||||
pass
|
||||
|
||||
if len(matching) > 0:
|
||||
up = True
|
||||
else:
|
||||
up = False
|
||||
|
||||
try:
|
||||
if up == old_status[service_name]:
|
||||
send_notice = False
|
||||
else:
|
||||
send_notice = True
|
||||
initial = False
|
||||
except KeyError, e:
|
||||
send_notice = True
|
||||
initial = True
|
||||
|
||||
old_status[service_name] = up
|
||||
|
||||
if send_notice:
|
||||
if up:
|
||||
msg_type = "up"
|
||||
else:
|
||||
msg_type = "down"
|
||||
|
||||
sock.send(msgpack.packb({
|
||||
"service": "process",
|
||||
"msg_type": msg_type,
|
||||
"unit": service_name,
|
||||
"initial": initial
|
||||
}))
|
||||
|
||||
|
||||
time.sleep(interval)
|
||||
|
@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
# You need squeeze-backports if you run this on squeeze!
|
||||
apt-get install -y libzmq-dev libffi-dev build-essential python python-dev
|
||||
pip install pyzmq msgpack-python pynacl pyyaml psutil
|
@ -0,0 +1,4 @@
|
||||
# Backports: echo "deb http://backports.debian.org/debian-backports squeeze-backports main" >> /etc/apt/sources.list && apt-get update && apt-get upgrade
|
||||
apt-get install -y python python-dev && wget cryto.net/~joepie91/pipfix.sh && chmod +x pipfix.sh && ./pipfix.sh
|
||||
adduser --system --shell /bin/bash --group monitor && apt-get install -y git; su -c "cd ~; git clone https://github.com/joepie91/cryto-status.git" monitor && /home/monitor/cryto-status/deps.sh
|
||||
# Replace libzmq with a manually compiled version...: wget http://download.zeromq.org/zeromq-4.0.3.tar.gz; tar -xzvf zeromq-4.0.3.tar.gz; cd zeromq-4.0.3; ./configure; make; make install; ldconfig
|
@ -0,0 +1,54 @@
|
||||
* allow comments in (parentheses) in units, and ignore these when matching against an alarm pattern...
|
||||
* web interface (angularjs)
|
||||
* separate alarm and IRC logic
|
||||
* monitor inodes
|
||||
* watchdog on slave and master -> should send WARN notifications
|
||||
* notifications (text, arbitrary-serialized-data as attachment, DEBUG/INFO/WARN/ERR/CRIT)
|
||||
* consider redundancy - can already connect multiple masters through pubsub, how to deal with duplicate processing checking?
|
||||
|
||||
cprocessd:
|
||||
-> subscribe to ccollectd
|
||||
-> debug switch for outputting all to terminal
|
||||
-> keep up/down state
|
||||
-> keep last-value state (resource usage)
|
||||
-> keep track of persistent downtimes (down for more than X time, as configured in config file)
|
||||
-> alarms (move this from the IRC bot to cprocessd)
|
||||
-> classify message importance
|
||||
-> cprocessd-stream socket, PUB that just streams processed data
|
||||
-> cprocessd-query socket, REP that responds to queries
|
||||
-> server-status
|
||||
-> down-list
|
||||
-> last-value
|
||||
-> server-list
|
||||
-> service-list
|
||||
|
||||
cmaild:
|
||||
-> use marrow.mailer
|
||||
-> receives data from cprocessd-stream
|
||||
-> sends e-mails for configured importance levels
|
||||
|
||||
cbotd:
|
||||
-> currently named 'alert'
|
||||
-> receives data from cprocessd-stream
|
||||
-> IRC bot
|
||||
-> posts alerts to specified IRC channels, depending on minimum severity level configured for that channel (ie. INFO for #cryto-network but ERR for #crytocc)
|
||||
|
||||
csmsd:
|
||||
-> sends SMS for (critical) alerts
|
||||
-> receives data from cprocessd-stream
|
||||
-> Twilio? does a provider-neutral API exist? might need an extra abstraction...
|
||||
|
||||
cwebd:
|
||||
-> offers web interface with streaming status data
|
||||
-> publicly accessible and password-protected
|
||||
-> streaming data from cprocessd-stream
|
||||
-> on-pageload state from cprocessd-query (including 'current downtimes')
|
||||
-> tornado+zmq ioloop, http://zeromq.github.io/pyzmq/eventloop.html
|
||||
-> web dashboard
|
||||
-> AngularJS
|
||||
-> fancy graphs (via AngularJS? idk if a directive exists for this)
|
||||
-> show downtimes as well as live per-machine stats
|
||||
-> also show overview of all machines in a grid, color-coded for average load of all resources
|
||||
-> historical up/down data
|
||||
-> sqlite storage? single concurrent write, so should work
|
||||
-> perhaps letting people sign up for e-mail alerts is an option? to-inbox will be tricky here
|
Loading…
Reference in New Issue