Compare commits


No commits in common. 'develop' and 'master' have entirely different histories.

.gitignore

@@ -1,10 +1 @@
cstatsd/config/*.yaml
ccollectd/config.yaml
alert/config.yaml
*.pyc
ccollectd/pubkey.dat
ccollectd/privkey.dat
cstatsd/pubkey.dat
cstatsd/privkey.dat
cstatsd/cstatsd.pid
alert/rules.pickle

@@ -1,304 +0,0 @@
#!/usr/bin/env python2
import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch
import cPickle as pickle

ctx = zmq.Context()

with open("config.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

try:
    with open("rules.pickle", "r") as pfile:
        rules = pickle.load(pfile)
except IOError, e:
    rules = {}

fetcher = ctx.socket(zmq.SUB)
fetcher.setsockopt(zmq.SUBSCRIBE, "")
fetcher.connect("tcp://127.0.0.1:8998")

class Bot(object):
    def __init__(self, hosts, port, nickname, realname, channels, admins, subsock):
        self.hosts = hosts
        self.port = port
        self.nickname = nickname
        self.realname = realname
        self.channels = channels
        self.admins = admins
        self.subsock = subsock
        self.connected = False
        self.last_down = {}
        self.known_alarms = {}
        self.command_map = {
            "422": self.join_all_channels,
            "376": self.join_all_channels,
            "PRIVMSG": self.receive_message
        }

    def split_irc(self, message):
        if message[0] == ":":
            prefix = ":"
            message = message[1:]
        else:
            prefix = ""
        if ":" in message:
            rest, last = message.split(":", 1)
            parts = rest.strip().split() + [last]
        else:
            parts = message.split()
        parts[0] = prefix + parts[0]
        return parts

    def run(self):
        while True:  # Connect loop
            host = random.choice(self.hosts)
            self.sock = socket.socket()
            try:
                self.sock.connect((host, self.port))
            except socket.error, e:
                continue  # Reconnect
            self.send_raw("NICK %s" % self.nickname)
            self.sock.send("USER %s 0 0 :%s\r\n" % (self.nickname, self.realname))
            buff = ""
            while True:  # Read loop
                r, w, x = zmq.select([self.sock, self.subsock], [], [])
                for s in r:
                    if s == self.sock.fileno():
                        try:
                            recvdata = self.sock.recv(1024)
                        except socket.error, e:
                            break  # Something went wrong, reconnect...
                        if len(recvdata) == 0:
                            break  # We have disconnected...
                        buff += recvdata
                        messages = buff.split("\n")
                        buff = messages.pop()
                        for message in messages:
                            self.process_message(self.split_irc(message.strip("\r")))
                    elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
                        # Process incoming data from the subscribe socket...
                        message = msgpack.unpackb(s.recv())
                        self.process_stats(message)

    def send_raw(self, message):
        self.sock.send("%s\r\n" % message)

    def send_message(self, recipient, message):
        if self.connected == True:
            self.send_raw("PRIVMSG %s :%s" % (recipient, message))

    def send_all(self, message):
        for channel in self.channels:
            self.send_message(channel, message)

    def join(self, channel):
        self.send_raw("JOIN %s" % channel)

    def join_all_channels(self, message):
        self.connected = True
        for channel in self.channels:
            self.join(channel)

    def receive_message(self, message):
        args = message[3].split()
        sender = message[0][1:].split("!", 1)[0]
        channel = message[2]
        try:
            if sender in self.admins:
                if args[0] == "!addrule":
                    target, rel, value = args[1:4]
                    target = self.parse_target(target)
                    if value[-1].lower() in ("k", "m", "g", "t"):
                        unit = value[-1].lower()
                        value = value[:-1]
                        value = float(value)
                        value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
                    rule_id = uuid.uuid4()
                    rules[rule_id] = {
                        "target": target,
                        "operator": rel,
                        "value": value
                    }
                    with open("rules.pickle", "w") as pfile:
                        pickle.dump(rules, pfile)
                    self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id))
        except Exception, e:
            self.send_message(channel, str(e))

    def parse_target(self, target):
        host, rest = target.split("!", 1)
        service, rest = rest.split(".", 1)
        resource, rest = rest.split(":", 1)
        unit, attribute = rest.split(".", 1)
        # TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments
        if host == "*":
            host = True
        if service == "*":
            service = True
        if attribute == "*":
            attribute = True
        if resource == "*":
            resource = True
        if unit == "*":
            unit = True
        return {
            "host": host,
            "service": service,
            "resource": resource,
            "unit": unit,
            "attribute": attribute
        }

    def format_time_duration(self, seconds):
        # http://stackoverflow.com/a/20222351/1332715
        days, rem = divmod(seconds, 86400)
        hours, rem = divmod(rem, 3600)
        minutes, seconds = divmod(rem, 60)
        if seconds < 1:
            seconds = 1
        locals_ = locals()
        magnitudes_str = ("{n} {magnitude}".format(n=int(locals_[magnitude]), magnitude=magnitude) for magnitude in ("days", "hours", "minutes", "seconds") if locals_[magnitude])
        return ", ".join(magnitudes_str)

    def process_stats(self, message):
        data = message["message"]
        data["host"] = message["host"]
        if data["msg_type"] == "up" and data["initial"] == True:
            return  # We don't need to say what is up, initially...
        # TODO: Duration
        if data["msg_type"] == "up":
            try:
                data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data])
            except KeyError, e:
                data["duration"] = "0 seconds"
            self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data)
        elif data["msg_type"] == "down":
            self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
            self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
        elif data["msg_type"] == "blip":
            self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
        elif data["msg_type"] == "value":
            for rule_id, rule in rules.iteritems():
                check_vals = {
                    "host": [data["host"]],
                    "service": [data["service"]],
                    "resource": [data["resource_type"]],
                    "unit": [data["unit"]]
                }
                failed = False
                for segment in ("host", "service", "resource", "unit"):
                    for val in check_vals[segment]:
                        if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
                            failed = True
                            break
                if failed:
                    continue  # Skip to next
                # We haven't broken out in the past bit of code, so we're still matching the pattern...
                eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
                for key in eligible_keys:
                    value = data["values"][key]
                    rule_value = float(rule["value"])
                    operator = rule["operator"]
                    if operator == "=":
                        alarm = (value == rule_value)
                    elif operator == ">":
                        alarm = (value > rule_value)
                    elif operator == "<":
                        alarm = (value < rule_value)
                    elif operator == ">=":
                        alarm = (value >= rule_value)
                    elif operator == "<=":
                        alarm = (value <= rule_value)
                    elif operator == "!=":
                        alarm = (value != rule_value)
                    else:
                        alarm = False
                    self.trigger_alarm(rule_id, data, alarm, value, key)

    def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
        key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
        if key not in self.known_alarms:
            if active:
                self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
                self.known_alarms[key] = time.time()
            else:
                self.known_alarms[key] = False
        else:
            if self.known_alarms[key] == False and active:
                # Alarm activated
                self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
                self.known_alarms[key] = time.time()
            elif self.known_alarms[key] != False and not active:
                # Alarm deactivated
                self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
                self.known_alarms[key] = False

    def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
        # At this point, we're sure that we want to notify...
        rule_target = rules[rule_id]["target"].copy()
        for k, v in rule_target.iteritems():
            if v is True:
                rule_target[k] = "*"
        rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target
        info = {
            "host": data["host"],
            "rule_id": rule_id,
            "rule_pattern": rule_pattern
        }
        if not active:
            key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
            try:
                info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key])
            except KeyError, e:
                info["duration"] = "0 seconds"
            info["unit"] = data["unit"]
            info["attribute"] = offending_key
            self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info)
        else:
            info["value"] = offending_value
            info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"])
            info["unit"] = data["unit"]
            info["attribute"] = offending_key
            self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info)

    def process_message(self, message):
        if message[0].upper() == "PING":
            self.send_raw("PONG %s" % message[1])
        else:
            try:
                self.command_map[message[1].upper()](message)
            except KeyError, e:
                pass

bot = Bot(config["irc"]["hosts"], config["irc"]["port"], config["irc"]["nickname"], config["irc"]["realname"], config["irc"]["channels"], config["irc"]["admins"], fetcher)
bot.run()
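The TODO comment in parse_target above hints at allowing parenthesized comments in unit names. A minimal sketch of what that helper might look like (hypothetical, not part of this diff):

def strip_unit_comment(unit):
    # Hypothetical helper: turns "sda1 (primary disk)" into "sda1", so that
    # parenthesized comments are ignored when matching against alarm patterns.
    return unit.split("(", 1)[0].strip()

print strip_unit_comment("sda1 (primary disk)")  # prints "sda1"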

@@ -1,12 +0,0 @@
irc:
  hosts:
    - kerpia.cryto.net
    - box.cryto.net
    - arvel.cryto.net
  port: 6667
  nickname: StatusBot
  realname: Cryto System Monitoring Service
  admins:
    - joepie91
  channels:
    - "#test"

@@ -1,104 +0,0 @@
#!/usr/bin/env python2
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
from nacl.public import PublicKey, PrivateKey, Box

ctx = zmq.Context()
distributor = ctx.socket(zmq.PUB)
distributor.bind("tcp://127.0.0.1:8998")
poller = zmq.Poller()

with open("config.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

with open("privkey.dat", "r") as f:
    privkey = PrivateKey(binascii.unhexlify(f.read()))

nodes = config["nodes"]
last_node_status = {}
socket_map = {}
boxes = {}

def heartbeat():
    for hostname, node in nodes.iteritems():
        retries = 0
        while retries < config["heartbeat"]["attempts"]:
            try:
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1))
                s.connect((node["ip"], node["port"]))
                s.shutdown(socket.SHUT_RDWR)
                s.close()
                up = True
                break
            except socket.error, e:
                up = False
                retries += 1
        try:
            status_changed = (up != last_node_status[hostname])
            initial = False
        except KeyError, e:
            status_changed = True
            initial = True
        last_node_status[hostname] = up
        send_message = False
        if status_changed:
            if up:
                msg_type = "up"
                send_message = True
            else:
                msg_type = "down"
                send_message = True
        else:
            if up and retries > 0:
                msg_type = "blip"
                send_message = True
        if send_message:
            distributor.send(msgpack.packb({
                "host": config["hostname"],
                "message": {
                    "service": "heartbeat",
                    "msg_type": msg_type,
                    "unit": hostname,
                    "initial": initial
                }
            }))

timers = zmqtimer.ZmqTimerManager()
timers.add_timer(zmqtimer.ZmqTimer(config["heartbeat"]["interval"], heartbeat))

for hostname, node in config["nodes"].iteritems():
    boxes[hostname] = Box(privkey, PublicKey(binascii.unhexlify(node["pubkey"])))
    grabber = ctx.socket(zmq.SUB)
    grabber.setsockopt(zmq.SUBSCRIBE, "")
    grabber.connect(node["endpoint"])
    socket_map[grabber] = hostname
    poller.register(grabber, zmq.POLLIN)

while True:
    timers.check()
    socks = dict(poller.poll(timers.get_next_interval()))
    for sock in socks:
        if socks[sock] == zmq.POLLIN:
            host = socket_map[sock]
            try:
                message = msgpack.unpackb(boxes[host].decrypt(sock.recv()))
            except nacl.exceptions.CryptoError, e:
                # Probably a spoofed message... skip to next socket
                sys.stderr.write("Ignoring message... spoofed? (origin: %s)\n" % host)  # FIXME: Use logging module...
                continue
            except Exception, e:
                sys.stderr.write(repr(e) + "\n")
                continue
            distributor.send(msgpack.packb({
                "host": host,
                "message": message
            }))

@@ -1,13 +0,0 @@
hostname: monitoring.cryto.net
heartbeat:
  interval: 5
  timeout: 1
  attempts: 3
nodes:
  localhost:
    ip: 127.0.0.1
    port: 6543
    endpoint: tcp://127.0.0.1:6543
    pubkey: bd784ef4065c9bd31627106dc55e26764605a144c6fc45ce93f33cbd19dd7333

@@ -1,15 +0,0 @@
#!/usr/bin/env python2
import yaml, os, stat, binascii
from nacl.public import PrivateKey

privkey = PrivateKey.generate()
pubkey = privkey.public_key

with open("privkey.dat", "w") as f:
    f.write(binascii.hexlify(str(privkey)))

with open("pubkey.dat", "w") as f:
    f.write(binascii.hexlify(str(pubkey)))

os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)

@@ -1,12 +0,0 @@
#!/usr/bin/env python2
import zmq, msgpack

ctx = zmq.Context()
fetcher = ctx.socket(zmq.SUB)
fetcher.setsockopt(zmq.SUBSCRIBE, "")
fetcher.connect("tcp://127.0.0.1:8998")

while True:
    message = msgpack.unpackb(fetcher.recv())
    print message

@@ -1,41 +0,0 @@
import time

class ZmqTimerManager(object):
    def __init__(self):
        self.timers = []
        self.next_call = 0

    def add_timer(self, timer):
        self.timers.append(timer)

    def check(self):
        if time.time() > self.next_call:
            for timer in self.timers:
                timer.check()

    def get_next_interval(self):
        if time.time() >= self.next_call:
            call_times = []
            for timer in self.timers:
                call_times.append(timer.get_next_call())
            self.next_call = min(call_times)
            if self.next_call < time.time():
                return 0
            else:
                return (self.next_call - time.time()) * 1000
        else:
            return (self.next_call - time.time()) * 1000

class ZmqTimer(object):
    def __init__(self, interval, callback):
        self.interval = interval
        self.callback = callback
        self.last_call = 0

    def check(self):
        if time.time() > (self.interval + self.last_call):
            self.callback()
            self.last_call = time.time()

    def get_next_call(self):
        return self.last_call + self.interval

@@ -1,6 +0,0 @@
#!/bin/bash
echo "Generating keypair..."
./genkey 2>/dev/null
./bootstrap-config
echo "Your public key: `cat pubkey.dat`"
echo "Server IP: `curl -s http://wtfismyip.com/text`" 2>/dev/null

@@ -1,86 +0,0 @@
#!/usr/bin/env python2
import yaml, sys

master_pubkey = raw_input("Public key of the master server: ")

print "You'll now be asked to configure ports to check. If you don't want to configure any ports, just hit enter without entering any information."
ports = {}
while True:
    port = raw_input("Port number: ")
    if port.strip() == "":
        break
    service_name = raw_input("Service name for port %s: " % port)
    ports[int(port)] = service_name

print "The same thing, except now for processes to check. Just hit enter without entering any information when you're done; the same goes for the argument list. As a wildcard, you can use *"
services = {}
while True:
    service_name = raw_input("Service name: ")
    if service_name.strip() == "":
        break
    process_name = raw_input("Process name: ")
    args = {}
    argnum = 1
    while True:
        arg = raw_input("Argument %d: " % argnum)
        if arg.strip() == "":
            break
        args[argnum] = arg
        argnum += 1
    services[service_name] = {
        "name": process_name,
        "args": args
    }

print "Now enter any disk devices you wish to monitor. Leave empty and hit enter when done."
disks = []
while True:
    device_name = raw_input("Device name: ")
    if device_name.strip() == "":
        break
    disks.append(device_name)

# Write config files...
modules = []
modules.append("stats-machine")

with open("config/machine.yaml.example", "r") as ef:
    with open("config/machine.yaml", "w") as ff:
        data = yaml.safe_load(ef.read())
        data["drives"] = disks
        ff.write(yaml.dump(data))

if len(ports) > 0:
    modules.append("stats-ports")
    with open("config/ports.yaml.example", "r") as ef:
        with open("config/ports.yaml", "w") as ff:
            data = yaml.safe_load(ef.read())
            data["ports"] = ports
            ff.write(yaml.dump(data))

if len(services) > 0:
    modules.append("stats-processes")
    with open("config/processes.yaml.example", "r") as ef:
        with open("config/processes.yaml", "w") as ff:
            data = yaml.safe_load(ef.read())
            data["processes"] = services
            ff.write(yaml.dump(data))

with open("config/cstatsd.yaml.example", "r") as ef:
    with open("config/cstatsd.yaml", "w") as ff:
        data = yaml.safe_load(ef.read())
        data["pubkey"] = master_pubkey
        data["autostart"] = modules
        ff.write(yaml.dump(data))

@@ -1,7 +0,0 @@
endpoint: tcp://*:6543
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
autostart:
- stats-processes
- stats-ports
- stats-machine

@@ -1,5 +1,4 @@
interval: 1
drives:
- /dev/sda1
- /dev/sdb1
- /

@@ -1,4 +1,4 @@
interval: 1
interval: 5
ports:
  6667: UnrealIRCd

@@ -1,15 +0,0 @@
interval: 5
processes:
  radiotray:
    name: '*python*'
    args:
      1: /usr/bin/radiotray
  guake:
    name: '*python*'
    args:
      1: /usr/local/bin/guake
  keepassx:
    name: keepassx

@@ -1,43 +1,12 @@
#!/usr/bin/env python2
import zmq, yaml, binascii, nacl, sys, subprocess, os
from nacl.public import PublicKey, PrivateKey, Box

basedir = os.path.dirname(os.path.realpath(__file__))

with open("cstatsd.pid", "w") as pidfile:
    pidfile.write(str(os.getpid()))

import zmq, msgpack
ctx = zmq.Context()

with open("config/cstatsd.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

pubkey = PublicKey(binascii.unhexlify(config["pubkey"]))

with open("privkey.dat", "r") as f:
    privkey = PrivateKey(binascii.unhexlify(f.read()))

box = Box(privkey, pubkey)

collector = ctx.socket(zmq.PULL)
collector.bind("ipc:///tmp/cstatsd")

shipper = ctx.socket(zmq.PUB)
shipper.bind(config["endpoint"])

try:
    disable_autostart = (sys.argv[1] == "--disable-autostart")
except:
    disable_autostart = False

if disable_autostart == False:
    with open("/dev/null", "w+") as stfu:
        for script in config["autostart"]:
            print os.path.join(basedir, script)
            subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)

while True:
    message = collector.recv()
    nonce = nacl.utils.random(Box.NONCE_SIZE)
    shipper.send(box.encrypt(message, nonce))
    message = msgpack.unpackb(collector.recv())
    print message

@@ -1,15 +0,0 @@
#!/usr/bin/env python2
import yaml, os, stat, binascii
from nacl.public import PrivateKey

privkey = PrivateKey.generate()
pubkey = privkey.public_key

with open("privkey.dat", "w") as f:
    f.write(binascii.hexlify(str(privkey)))

with open("pubkey.dat", "w") as f:
    f.write(binascii.hexlify(str(pubkey)))

os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)

@@ -1,5 +0,0 @@
#!/bin/bash
PID=`cat cstatsd.pid`
pkill -P $PID
kill $PID

@@ -1,224 +0,0 @@
#!/usr/bin/env python2
import zmq, msgpack, time, psutil, yaml, os, subprocess
from collections import namedtuple

# Horrible hack to make check_output exist in 2.6
# http://stackoverflow.com/a/13160748/1332715
if "check_output" not in dir(subprocess):  # duck punch it in!
    def f(*popenargs, **kwargs):
        if 'stdout' in kwargs:
            raise ValueError('stdout argument not allowed, it will be overridden.')
        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            raise subprocess.CalledProcessError(retcode, cmd)
        return output
    subprocess.check_output = f

ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")

with open("config/machine.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

interval = config["interval"]
old_net_data = {}
disk_map = {}
last_io_data = {}

if os.path.exists("/proc/user_beancounters") and not os.path.exists("/proc/vz/vestat"):
    openvz_burst = True
    FakeRam = namedtuple("FakeRam", ["total", "used", "available", "percent", "buffers", "cached"])
else:
    openvz_burst = False

for disk in psutil.disk_partitions():
    disk_map[disk.device] = disk

if len(disk_map) == 0:
    # We're probably on OpenVZ, so /proc/partitions doesn't exist. Fall back to 'df'.
    FakeDisk = namedtuple("FakeDisk", ["device", "mountpoint"])
    for line in subprocess.check_output(["df"]).splitlines()[1:]:
        device, _, _, _, _, mountpoint = line.split()
        disk_map[device] = FakeDisk(device, mountpoint)

while True:
    load_avgs = os.getloadavg()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "load_average",
        "unit": "",
        "values": {
            "1m": load_avgs[0],
            "5m": load_avgs[1],
            "15m": load_avgs[2]
        }
    }))
    cpu_loads = psutil.cpu_percent(percpu=True)
    for i in xrange(0, len(cpu_loads)):
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "cpu",
            "unit": "core%d" % (i + 1),
            "values": {
                "load": cpu_loads[i]
            }
        }))
    try:
        io_counters = psutil.disk_io_counters(perdisk=True)
    except IOError, e:
        io_counters = {}  # OpenVZ...
    for drive in config["drives"]:
        drive_data = psutil.disk_usage(disk_map[drive].mountpoint)
        io_data = None
        for diskname, data in io_counters.iteritems():
            if drive.endswith(diskname):
                io_data = data
        if io_data is None or drive not in last_io_data:
            read_bps = 0
            write_bps = 0
            read_iops = 0
            write_iops = 0
        else:
            read_bps = (io_data.read_bytes - last_io_data[drive].read_bytes) / interval
            write_bps = (io_data.write_bytes - last_io_data[drive].write_bytes) / interval
            read_iops = (io_data.read_count - last_io_data[drive].read_count) / interval
            write_iops = (io_data.write_count - last_io_data[drive].write_count) / interval
        if io_data is not None:
            last_io_data[drive] = io_data
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "disk",
            "unit": drive,
            "values": {
                "total": drive_data.total,
                "used": drive_data.used,
                "free": drive_data.free,
                "used_percentage": drive_data.percent,
                "bps_read": read_bps,
                "bps_write": write_bps,
                "iops_read": read_iops,
                "iops_write": write_iops,
            }
        }))
    if openvz_burst:
        # Sigh, OpenVZ... let's use 'free', since that apparently -does- understand OpenVZ.
        lines = subprocess.check_output(["free", "-b"]).splitlines()
        _, ram_total, ram_used, ram_free, _, ram_buffers, ram_cached = lines[1].split()
        _, _, _, ram_available = lines[2].split()
        ram_total = int(ram_total)
        ram_free = int(ram_free)
        ram_buffers = int(ram_buffers)
        ram_cached = int(ram_cached)
        ram_available = int(ram_available)
        ram_used = int(ram_used)
        ram_percent = 1.0 * (ram_total - ram_available) / ram_total * 100
        ram_data = FakeRam(ram_total, ram_used, ram_available, ram_percent, ram_buffers, ram_cached)
    else:
        ram_data = psutil.virtual_memory()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "physical",
        "values": {
            "total": ram_data.total,
            "used": ram_data.used,
            "free": ram_data.available,
            "used_percentage": ram_data.percent,
            "buffers": ram_data.buffers,
            "cache": ram_data.cached
        }
    }))
    swap_data = psutil.swap_memory()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "swap",
        "values": {
            "total": swap_data.total,
            "used": swap_data.used,
            "free": swap_data.free,
            "used_percentage": swap_data.percent
        }
    }))
    net_data = psutil.net_io_counters(pernic=True)
    for nic, data in net_data.iteritems():
        try:
            old_in_b = old_net_data[nic].bytes_recv
            old_out_b = old_net_data[nic].bytes_sent
            old_in_p = old_net_data[nic].packets_recv
            old_out_p = old_net_data[nic].packets_sent
        except KeyError, e:
            # No old data yet, first run? Save and skip to next...
            old_net_data[nic] = data
            continue
        diff_in_b = data.bytes_recv - old_in_b
        diff_out_b = data.bytes_sent - old_out_b
        diff_in_p = data.packets_recv - old_in_p
        diff_out_p = data.packets_sent - old_out_p
        if diff_in_b < 0:
            diff_in_b = 0
        if diff_out_b < 0:
            diff_out_b = 0
        if diff_in_p < 0:
            diff_in_p = 0
        if diff_out_p < 0:
            diff_out_p = 0
        old_net_data[nic] = data
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "network",
            "unit": nic,
            "values": {
                "bps_in": diff_in_b / interval,
                "bps_out": diff_out_b / interval,
                "pps_in": diff_in_p / interval,
                "pps_out": diff_out_p / interval
            }
        }))
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "uptime",
        "unit": "",
        "values": {
            "uptime": time.time() - psutil.get_boot_time()
        }
    }))
    time.sleep(interval)

@@ -0,0 +1,143 @@
#!/usr/bin/env python2
import zmq, msgpack, time, psutil, yaml, os

ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")

with open("config/machine.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

interval = config["interval"]
old_net_data = {}

while True:
    load_avgs = os.getloadavg()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "load_average",
        "unit": "",
        "values": {
            "1m": load_avgs[0],
            "5m": load_avgs[1],
            "15m": load_avgs[2]
        }
    }))
    cpu_loads = psutil.cpu_percent(percpu=True)
    for i in xrange(0, len(cpu_loads)):
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "cpu",
            "unit": "core%d" % (i + 1),
            "values": {
                "load": cpu_loads[i]
            }
        }))
    for drive in config["drives"]:
        drive_data = psutil.disk_usage(drive)
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "disk",
            "unit": drive,
            "values": {
                "total": drive_data.total,
                "used": drive_data.used,
                "free": drive_data.free,
                "used_percentage": drive_data.percent
            }
        }))
    ram_data = psutil.virtual_memory()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "physical",
        "values": {
            "total": ram_data.total,
            "used": ram_data.used,
            "free": ram_data.available,
            "used_percentage": ram_data.percent,
            "buffers": ram_data.buffers,
            "cache": ram_data.cached
        }
    }))
    swap_data = psutil.swap_memory()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "swap",
        "values": {
            "total": swap_data.total,
            "used": swap_data.used,
            "free": swap_data.free,
            "used_percentage": swap_data.percent
        }
    }))
    net_data = psutil.net_io_counters(pernic=True)
    for nic, data in net_data.iteritems():
        try:
            old_in_b = old_net_data[nic].bytes_recv
            old_out_b = old_net_data[nic].bytes_sent
            old_in_p = old_net_data[nic].packets_recv
            old_out_p = old_net_data[nic].packets_sent
        except KeyError, e:
            # No old data yet, first run? Save and skip to next...
            old_net_data[nic] = data
            continue
        diff_in_b = data.bytes_recv - old_in_b
        diff_out_b = data.bytes_sent - old_out_b
        diff_in_p = data.packets_recv - old_in_p
        diff_out_p = data.packets_sent - old_out_p
        if diff_in_b < 0:
            diff_in_b = 0
        if diff_out_b < 0:
            diff_out_b = 0
        if diff_in_p < 0:
            diff_in_p = 0
        if diff_out_p < 0:
            diff_out_p = 0
        old_net_data[nic] = data
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "network",
            "unit": nic,
            "values": {
                "bps_in": diff_in_b / interval,
                "bps_out": diff_out_b / interval,
                "pps_in": diff_in_p / interval,
                "pps_out": diff_out_p / interval
            }
        }))
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "uptime",
        "unit": "",
        "values": {
            "uptime": time.time() - psutil.get_boot_time()
        }
    }))
    time.sleep(interval)

@@ -20,7 +20,6 @@ while True:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(0.5)
            s.connect(("127.0.0.1", port))
            s.shutdown(socket.SHUT_RDWR)
            s.close()
            up = True
        except socket.error, e:

@@ -1,72 +0,0 @@
#!/usr/bin/env python2
import zmq, msgpack, time, yaml, psutil, fnmatch

ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")

with open("config/processes.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

interval = config["interval"]
old_status = {}

while True:
    all_procs = psutil.get_process_list()
    for service_name, patterns in config["processes"].iteritems():
        matching = []
        for proc in all_procs:  # Can't use filter() because of exceptions...
            try:
                if len(proc.cmdline) > 0 and fnmatch.fnmatch(proc.cmdline[0], patterns["name"]):
                    failed = False
                    try:
                        for arg, pattern in patterns["args"].iteritems():
                            try:
                                if len(proc.cmdline) < (arg + 1) or not fnmatch.fnmatch(proc.cmdline[arg], pattern):
                                    failed = True
                            except KeyError, e:
                                pass
                    except KeyError, e:
                        pass
                    if failed == False:
                        matching.append(proc)
            except psutil._error.NoSuchProcess, e:
                pass
        if len(matching) > 0:
            up = True
        else:
            up = False
        try:
            if up == old_status[service_name]:
                send_notice = False
            else:
                send_notice = True
            initial = False
        except KeyError, e:
            send_notice = True
            initial = True
        old_status[service_name] = up
        if send_notice:
            if up:
                msg_type = "up"
            else:
                msg_type = "down"
            sock.send(msgpack.packb({
                "service": "process",
                "msg_type": msg_type,
                "unit": service_name,
                "initial": initial
            }))
    time.sleep(interval)

@@ -1,4 +0,0 @@
#!/bin/bash
# You need squeeze-backports if you run this on squeeze!
apt-get install -y libzmq-dev libffi-dev build-essential python python-dev
pip install pyzmq msgpack-python pynacl pyyaml psutil

@@ -1,4 +0,0 @@
# Backports: echo "deb http://backports.debian.org/debian-backports squeeze-backports main" >> /etc/apt/sources.list && apt-get update && apt-get upgrade
apt-get install -y python python-dev && wget cryto.net/~joepie91/pipfix.sh && chmod +x pipfix.sh && ./pipfix.sh
adduser --system --shell /bin/bash --group monitor && apt-get install -y git; su -c "cd ~; git clone https://github.com/joepie91/cryto-status.git" monitor && /home/monitor/cryto-status/deps.sh
# Replace libzmq with a manually compiled version...: wget http://download.zeromq.org/zeromq-4.0.3.tar.gz; tar -xzvf zeromq-4.0.3.tar.gz; cd zeromq-4.0.3; ./configure; make; make install; ldconfig

@@ -1,54 +0,0 @@
* allow comments in (parentheses) in units, and ignore these when matching against an alarm pattern...
* web interface (angularjs)
* separate alarm and IRC logic
* monitor inodes
* watchdog on slave and master -> should send WARN notifications
* notifications (text, arbitrary-serialized-data as attachment, DEBUG/INFO/WARN/ERR/CRIT)
* consider redundancy - can already connect multiple masters through pubsub; how to deal with duplicate processing/checking?

cprocessd:
    -> subscribe to ccollectd
    -> debug switch for outputting all to terminal
    -> keep up/down state
    -> keep last-value state (resource usage)
    -> keep track of persistent downtimes (down for more than X time, as configured in config file)
    -> alarms (move this from the IRC bot to cprocessd)
    -> classify message importance
    -> cprocessd-stream socket, PUB that just streams processed data
    -> cprocessd-query socket, REP that responds to queries (see the sketch after this list)
        -> server-status
        -> down-list
        -> last-value
        -> server-list
        -> service-list
cmaild:
    -> use marrow.mailer
    -> receives data from cprocessd-stream
    -> sends e-mails for configured importance levels
cbotd:
    -> currently named 'alert'
    -> receives data from cprocessd-stream
    -> IRC bot
    -> posts alerts to specified IRC channels, depending on minimum severity level configured for that channel (i.e. INFO for #cryto-network but ERR for #crytocc)
csmsd:
    -> sends SMS for (critical) alerts
    -> receives data from cprocessd-stream
    -> Twilio? does a provider-neutral API exist? might need an extra abstraction...
cwebd:
    -> offers web interface with streaming status data
    -> publicly accessible and password-protected
    -> streaming data from cprocessd-stream
    -> on-pageload state from cprocessd-query (including 'current downtimes')
    -> tornado+zmq ioloop, http://zeromq.github.io/pyzmq/eventloop.html
    -> web dashboard
        -> AngularJS
        -> fancy graphs (via AngularJS? idk if a directive exists for this)
        -> show downtimes as well as live per-machine stats
        -> also show overview of all machines in a grid, color-coded for average load of all resources
    -> historical up/down data
        -> sqlite storage? single concurrent write, so should work
    -> perhaps letting people sign up for e-mail alerts is an option? to-inbox will be tricky here
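For the cprocessd-query socket planned above, a minimal sketch of the intended request/reply loop might look like this; the endpoint, request shape, and state layout are assumptions, not part of this diff:

import zmq, msgpack

ctx = zmq.Context()
query = ctx.socket(zmq.REP)
query.bind("ipc:///tmp/cprocessd-query")  # assumed endpoint, mirroring ipc:///tmp/cstatsd

# State that the processing loop would keep up to date (layout assumed).
state = {
    "server-status": {},
    "down-list": [],
    "last-value": {},
    "server-list": [],
    "service-list": []
}

while True:
    request = msgpack.unpackb(query.recv())
    # Each query type from the list above maps to one state entry;
    # unknown query types get an error reply instead.
    response = state.get(request.get("query"), {"error": "unknown query"})
    query.send(msgpack.packb(response))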