Compare commits

..

33 Commits

Author SHA1 Message Date
Sven Slootweg 7efb3b3dd2 Update todo 11 years ago
Sven Slootweg 59ac723188 Spec out components 11 years ago
Sven Slootweg d6b0fc2ad4 Update todo 11 years ago
Sven Slootweg a796725057 Switch to pubsub for over-the-wire communication, to prevent memory leaks 11 years ago
Sven Slootweg 0c87ee058c Update todo 11 years ago
Sven Slootweg 570d8f3b85 Cast to int properly 11 years ago
Sven Slootweg 57c015f161 Add bootstrap script 11 years ago
Sven Slootweg e87d048ee9 Hack hack hackity hack hack - we now have check_output in 2.6! 11 years ago
Sven Slootweg b67fdc8ca3 More fixes 11 years ago
Sven Slootweg 0dde712144 Forgot another split... 11 years ago
Sven Slootweg 4dbe92396c Oops, forgot a split 11 years ago
Sven Slootweg 7dca261010 Attempt to fix memory accounting in OpenVZ.... 11 years ago
Sven Slootweg 5424432ddb Patch for disk stats on OpenVZ... 11 years ago
Sven Slootweg 971c5ccce3 Fixes and docs 11 years ago
Sven Slootweg 86b013a0b7 Be more lenient towards receiving errors, and update install/todo notes 11 years ago
Sven Slootweg 5a7e3815ca Update install stuff 11 years ago
Sven Slootweg ff98278520 Try using TCP instead... 11 years ago
Sven Slootweg 762d74d477 A few attempts later... 11 years ago
Sven Slootweg 1df76daa0d Fix perms 11 years ago
Sven Slootweg 96aa2b020f Add collectd listening script 11 years ago
Sven Slootweg 5eb46c13e8 (Probably) fix the blip bug 11 years ago
Sven Slootweg e04448dcd8 Fix warning color for bot when blips are announced 11 years ago
Sven Slootweg 11eb813164 Installation steps and bugfixes 11 years ago
Sven Slootweg e2c9097585 Reorganize some files, add support for units in alarm rules, support network blip detection (including more leniency for heartbeat failures), fix bug in alarm matching, remove debug output, add auto-start mechanism and killscript to cstatsd, fix deps.sh 11 years ago
Sven Slootweg 4cf0601b05 Fixes 11 years ago
Sven Slootweg 5db77ab87c In theory, we should now have heartbeating... 11 years ago
Sven Slootweg ae552ac0f9 Add todo list 11 years ago
Sven Slootweg 20eaf50791 Shut down socket connections properly after testing... 11 years ago
Sven Slootweg 97290dbb1c Set up publish/subscribe mechanism, add example configurations, write IRC bot and alarm management mechanism 11 years ago
Sven Slootweg 56ae6b5305 Implement crypto, add disk i/o stats to machine statistics (and change config format), proper error handling for process monitoring 11 years ago
Sven Slootweg 258f62af22 Add key generation script 11 years ago
Sven Slootweg 4550e0a425 Add process watch, write code for ccollectd, implement ZeroMQ timer class, add dep installation script 11 years ago
Sven Slootweg d310a95e7a Change default interval for port polling to 1 11 years ago

9
.gitignore vendored

@ -1 +1,10 @@
cstatsd/config/*.yaml cstatsd/config/*.yaml
ccollectd/config.yaml
alert/config.yaml
*.pyc
ccollectd/pubkey.dat
ccollectd/privkey.dat
cstatsd/pubkey.dat
cstatsd/privkey.dat
cstatsd/cstatsd.pid
alert/rules.pickle

@ -0,0 +1,304 @@
#!/usr/bin/env python2
import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch
import cPickle as pickle
ctx = zmq.Context()
with open("config.yaml", "r") as cfile:
config = yaml.safe_load(cfile)
try:
with open("rules.pickle", "r") as pfile:
rules = pickle.load(pfile)
except IOError, e:
rules = {}
fetcher = ctx.socket(zmq.SUB)
fetcher.setsockopt(zmq.SUBSCRIBE, "")
fetcher.connect("tcp://127.0.0.1:8998")
class Bot(object):
def __init__(self, hosts, port, nickname, realname, channels, admins, subsock):
self.hosts = hosts
self.port = port
self.nickname = nickname
self.realname = realname
self.channels = channels
self.admins = admins
self.subsock = subsock
self.connected = False
self.last_down = {}
self.known_alarms = {}
self.command_map = {
"422": self.join_all_channels,
"376": self.join_all_channels,
"PRIVMSG": self.receive_message
}
def split_irc(self, message):
if message[0] == ":":
prefix = ":"
message = message[1:]
else:
prefix = ""
if ":" in message:
rest, last = message.split(":", 1)
parts = rest.strip().split() + [last]
else:
parts = message.split()
parts[0] = prefix + parts[0]
return parts
def run(self):
while True: # Connect loop
host = random.choice(self.hosts)
self.sock = socket.socket()
try:
self.sock.connect((host, self.port))
except socket.error, e:
continue # Reconnect
self.send_raw("NICK %s" % self.nickname)
self.sock.send("USER %s 0 0 :%s\r\n" % (self.nickname, self.realname))
buff = ""
while True: # Read loop
r, w, x = zmq.select([self.sock, self.subsock], [], [])
for s in r:
if s == self.sock.fileno():
try:
recvdata = self.sock.recv(1024)
except socket.error, e:
break # Something went wrong, reconnect...
if len(recvdata) == 0:
break # We have disconnected...
buff += recvdata
messages = buff.split("\n")
buff = messages.pop()
for message in messages:
self.process_message(self.split_irc(message.strip("\r")))
elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
# Process incoming data from the subscribe socket...
message = msgpack.unpackb(s.recv())
self.process_stats(message)
def send_raw(self, message):
self.sock.send("%s\r\n" % message)
def send_message(self, recipient, message):
if self.connected == True:
self.send_raw("PRIVMSG %s :%s" % (recipient, message))
def send_all(self, message):
for channel in self.channels:
self.send_message(channel, message)
def join(self, channel):
self.send_raw("JOIN %s" % channel)
def join_all_channels(self, message):
self.connected = True
for channel in self.channels:
self.join(channel)
def receive_message(self, message):
args = message[3].split()
sender = message[0][1:].split("!", 1)[0]
channel = message[2]
try:
if sender in self.admins:
if args[0] == "!addrule":
target, rel, value = args[1:4]
target = self.parse_target(target)
if value[-1].lower() in ("k", "m", "g", "t"):
unit = value[-1].lower()
value = value[:-1]
value = float(value)
value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
rule_id = uuid.uuid4()
rules[rule_id] = {
"target": target,
"operator": rel,
"value": value
}
with open("rules.pickle", "w") as pfile:
pickle.dump(rules, pfile)
self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id))
except Exception, e:
self.send_message(channel, str(e))
def parse_target(self, target):
host, rest = target.split("!", 1)
service, rest = rest.split(".", 1)
resource, rest = rest.split(":", 1)
unit, attribute = rest.split(".", 1)
# TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments
if host == "*":
host = True
if service == "*":
service = True
if attribute == "*":
attribute = True
if resource == "*":
resource = True
if unit == "*":
unit = True
return {
"host": host,
"service": service,
"resource": resource,
"unit": unit,
"attribute": attribute
}
def format_time_duration(self, seconds):
# http://stackoverflow.com/a/20222351/1332715
days, rem = divmod(seconds, 86400)
hours, rem = divmod(rem, 3600)
minutes, seconds = divmod(rem, 60)
if seconds < 1:
seconds = 1
locals_ = locals()
magnitudes_str = ("{n} {magnitude}".format(n=int(locals_[magnitude]), magnitude=magnitude) for magnitude in ("days", "hours", "minutes", "seconds") if locals_[magnitude])
return ", ".join(magnitudes_str)
def process_stats(self, message):
data = message["message"]
data["host"] = message["host"]
if data["msg_type"] == "up" and data["initial"] == True:
return # We don't need to say what is up, initially...
# TODO: Duration
if data["msg_type"] == "up":
try:
data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data])
except KeyError, e:
data["duration"] = "0 seconds"
self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data)
elif data["msg_type"] == "down":
self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
elif data["msg_type"] == "blip":
self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
elif data["msg_type"] == "value":
for rule_id, rule in rules.iteritems():
check_vals = {
"host": [data["host"]],
"service": [data["service"]],
"resource": [data["resource_type"]],
"unit": [data["unit"]]
}
failed = False
for segment in ("host", "service", "resource", "unit"):
for val in check_vals[segment]:
if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
failed = True
break
if failed:
continue # Skip to next
# We haven't broken out in the past bit of code, so we're still matching the pattern...
eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
for key in eligible_keys:
value = data["values"][key]
rule_value = float(rule["value"])
operator = rule["operator"]
if operator == "=":
alarm = (value == rule_value)
elif operator == ">":
alarm = (value > rule_value)
elif operator == "<":
alarm = (value < rule_value)
elif operator == ">=":
alarm = (value >= rule_value)
elif operator == "<=":
alarm = (value <= rule_value)
elif operator == "!=":
alarm = (value != rule_value)
else:
alarm = False
self.trigger_alarm(rule_id, data, alarm, value, key)
def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
if key not in self.known_alarms:
if active:
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
self.known_alarms[key] = time.time()
else:
self.known_alarms[key] = False
else:
if self.known_alarms[key] == False and active:
# Alarm activated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
self.known_alarms[key] = time.time()
elif self.known_alarms[key] != False and not active:
# Alarm deactivated
self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
self.known_alarms[key] = False
def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
# At this point, we're sure that we want to notify...
rule_target = rules[rule_id]["target"].copy()
for k, v in rule_target.iteritems():
if v is True:
rule_target[k] = "*"
rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target
info = {
"host": data["host"],
"rule_id": rule_id,
"rule_pattern": rule_pattern
}
if not active:
key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
try:
info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key])
except KeyError, e:
info["duration"] = "0 seconds"
info["unit"] = data["unit"]
info["attribute"] = offending_key
self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info)
else:
info["value"] = offending_value
info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"])
info["unit"] = data["unit"]
info["attribute"] = offending_key
self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info)
def process_message(self, message):
if message[0].upper() == "PING":
self.send_raw("PONG %s" % message[1])
else:
try:
self.command_map[message[1].upper()](message)
except KeyError, e:
pass
bot = Bot(config["irc"]["hosts"], config["irc"]["port"], config["irc"]["nickname"], config["irc"]["realname"], config["irc"]["channels"], config["irc"]["admins"], fetcher)
bot.run()

@ -0,0 +1,12 @@
irc:
hosts:
- kerpia.cryto.net
- box.cryto.net
- arvel.cryto.net
port: 6667
nickname: StatusBot
realname: Cryto System Monitoring Service
admins:
- joepie91
channels:
- "#test"

@ -0,0 +1,104 @@
#!/usr/bin/env python2
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
from nacl.public import PublicKey, PrivateKey, Box
ctx = zmq.Context()
distributor = ctx.socket(zmq.PUB)
distributor.bind("tcp://127.0.0.1:8998")
poller = zmq.Poller()
with open("config.yaml", "r") as cfile:
config = yaml.safe_load(cfile)
with open("privkey.dat", "r") as f:
privkey = PrivateKey(binascii.unhexlify(f.read()))
nodes = config["nodes"]
last_node_status = {}
socket_map = {}
boxes = {}
def heartbeat():
for hostname, node in nodes.iteritems():
retries = 0
while retries < config["heartbeat"]["attempts"]:
try:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1))
s.connect((node["ip"], node["port"]))
s.shutdown(socket.SHUT_RDWR)
s.close()
up = True
break
except socket.error, e:
up = False
retries += 1
try:
status_changed = (up != last_node_status[hostname])
initial = False
except KeyError, e:
status_changed = True
initial = True
last_node_status[hostname] = up
send_message = False
if status_changed:
if up:
msg_type = "up"
send_message = True
else:
msg_type = "down"
send_message = True
else:
if up and retries > 0:
msg_type = "blip"
send_message = True
if send_message:
distributor.send(msgpack.packb({
"host": config["hostname"],
"message": {
"service": "heartbeat",
"msg_type": msg_type,
"unit": hostname,
"initial": initial
}
}))
timers = zmqtimer.ZmqTimerManager()
timers.add_timer(zmqtimer.ZmqTimer(config["heartbeat"]["interval"], heartbeat))
for hostname, node in config["nodes"].iteritems():
boxes[hostname] = Box(privkey, PublicKey(binascii.unhexlify(node["pubkey"])))
grabber = ctx.socket(zmq.SUB)
grabber.setsockopt(zmq.SUBSCRIBE, "")
grabber.connect(node["endpoint"])
socket_map[grabber] = hostname
poller.register(grabber, zmq.POLLIN)
while True:
timers.check()
socks = dict(poller.poll(timers.get_next_interval()))
for sock in socks:
if socks[sock] == zmq.POLLIN:
host = socket_map[sock]
try:
message = msgpack.unpackb(boxes[host].decrypt(sock.recv()))
except nacl.exceptions.CryptoError, e:
# Probably a spoofed message... skip to next socket
sys.stderr.write("Ignoring message... spoofed? (origin: %s)\n" % host) # FIXME: Use logging module...
continue
except Exception, e:
sys.stderr.write(repr(e) + "\n")
continue
distributor.send(msgpack.packb({
"host": host,
"message": message
}))

@ -0,0 +1,13 @@
hostname: monitoring.cryto.net
heartbeat:
interval: 5
timeout: 1
attempts: 3
nodes:
localhost:
ip: 127.0.0.1
port: 6543
endpoint: tcp://127.0.0.1:6543
pubkey: bd784ef4065c9bd31627106dc55e26764605a144c6fc45ce93f33cbd19dd7333

@ -0,0 +1,15 @@
#!/usr/bin/env python2
import yaml, os, stat, binascii
from nacl.public import PrivateKey
privkey = PrivateKey.generate()
pubkey = privkey.public_key
with open("privkey.dat", "w") as f:
f.write(binascii.hexlify(str(privkey)))
with open("pubkey.dat", "w") as f:
f.write(binascii.hexlify(str(pubkey)))
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)

@ -0,0 +1,12 @@
#!/usr/bin/env python2
import zmq, msgpack
ctx = zmq.Context()
fetcher = ctx.socket(zmq.SUB)
fetcher.setsockopt(zmq.SUBSCRIBE, "")
fetcher.connect("tcp://127.0.0.1:8998")
while True:
message = msgpack.unpackb(fetcher.recv())
print message

@ -0,0 +1,41 @@
import time
class ZmqTimerManager(object):
def __init__(self):
self.timers = []
self.next_call = 0
def add_timer(self, timer):
self.timers.append(timer)
def check(self):
if time.time() > self.next_call:
for timer in self.timers:
timer.check()
def get_next_interval(self):
if time.time() >= self.next_call:
call_times = []
for timer in self.timers:
call_times.append(timer.get_next_call())
self.next_call = min(call_times)
if self.next_call < time.time():
return 0
else:
return (self.next_call - time.time()) * 1000
else:
return (self.next_call - time.time()) * 1000
class ZmqTimer(object):
def __init__(self, interval, callback):
self.interval = interval
self.callback = callback
self.last_call = 0
def check(self):
if time.time() > (self.interval + self.last_call):
self.callback()
self.last_call = time.time()
def get_next_call(self):
return self.last_call + self.interval

@ -0,0 +1,6 @@
#!/bin/bash
echo "Generating keypair..."
./genkey 2>/dev/null
./bootstrap-config
echo "Your public key: `cat pubkey.dat`"
echo "Server IP: `curl -s http://wtfismyip.com/text`" 2>/dev/null

@ -0,0 +1,86 @@
#!/usr/bin/env python2
import yaml, sys
master_pubkey = raw_input("Public key of the master server: ")
print "You'll now be asked to configure ports to check. If you don't want to configure any ports, just hit enter without entering any information."
ports = {}
while True:
port = raw_input("Port number: ")
if port.strip() == "":
break
service_name = raw_input("Service name for port %s: " % port)
ports[int(port)] = service_name
print "The same thing, except now for processes to check. Just hit enter without entering any information when you're done; the same goes for the argument list. As a wildcard, you can use *"
services = {}
while True:
service_name = raw_input("Service name: ")
if service_name.strip() == "":
break
process_name = raw_input("Process name: ")
args = {}
argnum = 1
while True:
arg = raw_input("Argument %d: " % argnum)
if arg.strip() == "":
break
args[argnum] = arg
argnum += 1
services[service_name] = {
"name": process_name,
"args": args
}
print "Now enter any disk devices you wish to monitor. Leave empty and hit enter when done."
disks = []
while True:
device_name = raw_input("Device name: ")
if device_name.strip() == "":
break
disks.append(device_name)
# Write config files...
modules = []
modules.append("stats-machine")
with open("config/machine.yaml.example", "r") as ef:
with open("config/machine.yaml", "w") as ff:
data = yaml.safe_load(ef.read())
data["drives"] = disks
ff.write(yaml.dump(data))
if len(ports) > 0:
modules.append("stats-ports")
with open("config/ports.yaml.example", "r") as ef:
with open("config/ports.yaml", "w") as ff:
data = yaml.safe_load(ef.read())
data["ports"] = ports
ff.write(yaml.dump(data))
if len(services) > 0:
modules.append("stats-processes")
with open("config/processes.yaml.example", "r") as ef:
with open("config/processes.yaml", "w") as ff:
data = yaml.safe_load(ef.read())
data["processes"] = services
ff.write(yaml.dump(data))
with open("config/cstatsd.yaml.example", "r") as ef:
with open("config/cstatsd.yaml", "w") as ff:
data = yaml.safe_load(ef.read())
data["pubkey"] = master_pubkey
data["autostart"] = modules
ff.write(yaml.dump(data))

@ -0,0 +1,7 @@
endpoint: tcp://*:6543
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
autostart:
- stats-processes
- stats-ports
- stats-machine

@ -1,4 +1,5 @@
interval: 1 interval: 1
drives: drives:
- / - /dev/sda1
- /dev/sdb1

@ -1,4 +1,4 @@
interval: 5 interval: 1
ports: ports:
6667: UnrealIRCd 6667: UnrealIRCd

@ -0,0 +1,15 @@
interval: 5
processes:
radiotray:
name: '*python*'
args:
1: /usr/bin/radiotray
guake:
name: '*python*'
args:
1: /usr/local/bin/guake
keepassx:
name: keepassx

@ -1,12 +1,43 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import zmq, msgpack import zmq, yaml, binascii, nacl, sys, subprocess, os
from nacl.public import PublicKey, PrivateKey, Box
basedir = os.path.dirname(os.path.realpath(__file__))
with open("cstatsd.pid", "w") as pidfile:
pidfile.write(str(os.getpid()))
ctx = zmq.Context() ctx = zmq.Context()
with open("config/cstatsd.yaml", "r") as cfile:
config = yaml.safe_load(cfile)
pubkey = PublicKey(binascii.unhexlify(config["pubkey"]))
with open("privkey.dat", "r") as f:
privkey = PrivateKey(binascii.unhexlify(f.read()))
box = Box(privkey, pubkey)
collector = ctx.socket(zmq.PULL) collector = ctx.socket(zmq.PULL)
collector.bind("ipc:///tmp/cstatsd") collector.bind("ipc:///tmp/cstatsd")
shipper = ctx.socket(zmq.PUB)
shipper.bind(config["endpoint"])
try:
disable_autostart = (sys.argv[1] == "--disable-autostart")
except:
disable_autostart = False
if disable_autostart == False:
with open("/dev/null", "w+") as stfu:
for script in config["autostart"]:
print os.path.join(basedir, script)
subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)
while True: while True:
message = msgpack.unpackb(collector.recv()) message = collector.recv()
print message nonce = nacl.utils.random(Box.NONCE_SIZE)
shipper.send(box.encrypt(message, nonce))

@ -0,0 +1,15 @@
#!/usr/bin/env python2
import yaml, os, stat, binascii
from nacl.public import PrivateKey
privkey = PrivateKey.generate()
pubkey = privkey.public_key
with open("privkey.dat", "w") as f:
f.write(binascii.hexlify(str(privkey)))
with open("pubkey.dat", "w") as f:
f.write(binascii.hexlify(str(pubkey)))
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)

@ -0,0 +1,5 @@
#!/bin/bash
PID=`cat cstatsd.pid`
pkill -P $PID
kill $PID

@ -0,0 +1,224 @@
#!/usr/bin/env python2
import zmq, msgpack, time, psutil, yaml, os, subprocess
from collections import namedtuple
# Horrible hack to make check_output exist in 2.6
# http://stackoverflow.com/a/13160748/1332715
if "check_output" not in dir( subprocess ): # duck punch it in!
def f(*popenargs, **kwargs):
if 'stdout' in kwargs:
raise ValueError('stdout argument not allowed, it will be overridden.')
process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
output, unused_err = process.communicate()
retcode = process.poll()
if retcode:
cmd = kwargs.get("args")
if cmd is None:
cmd = popenargs[0]
raise subprocess.CalledProcessError(retcode, cmd)
return output
subprocess.check_output = f
ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")
with open("config/machine.yaml", "r") as cfile:
config = yaml.safe_load(cfile)
interval = config["interval"]
old_net_data = {}
disk_map = {}
last_io_data = {}
if os.path.exists("/proc/user_beancounters") and not os.path.exists("/proc/vz/vestat"):
openvz_burst = True
FakeRam = namedtuple("FakeRam", ["total", "used", "available", "percent", "buffers", "cached"])
else:
openvz_burst = False
for disk in psutil.disk_partitions():
disk_map[disk.device] = disk
if len(disk_map) == 0:
# We're probably on OpenVZ, so /proc/partitions doesn't exist. Fall back to 'df'.
FakeDisk = namedtuple("FakeDisk", ["device", "mountpoint"])
for line in subprocess.check_output(["df"]).splitlines()[1:]:
device, _, _, _, _, mountpoint = line.split()
disk_map[device] = FakeDisk(device, mountpoint)
while True:
load_avgs = os.getloadavg()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "load_average",
"unit": "",
"values": {
"1m": load_avgs[0],
"5m": load_avgs[1],
"15m": load_avgs[2]
}
}))
cpu_loads = psutil.cpu_percent(percpu=True)
for i in xrange(0, len(cpu_loads)):
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "cpu",
"unit": "core%d" % (i + 1),
"values": {
"load": cpu_loads[i]
}
}))
try:
io_counters = psutil.disk_io_counters(perdisk=True)
except IOError, e:
io_counters = {} # OpenVZ...
for drive in config["drives"]:
drive_data = psutil.disk_usage(disk_map[drive].mountpoint)
io_data = None
for diskname, data in io_counters.iteritems():
if drive.endswith(diskname):
io_data = data
if io_data is None or drive not in last_io_data:
read_bps = 0
write_bps = 0
read_iops = 0
write_iops = 0
else:
read_bps = (io_data.read_bytes - last_io_data[drive].read_bytes) / interval
write_bps = (io_data.write_bytes - last_io_data[drive].write_bytes) / interval
read_iops = (io_data.read_count - last_io_data[drive].read_count) / interval
write_iops = (io_data.write_count - last_io_data[drive].write_count) / interval
if io_data is not None:
last_io_data[drive] = io_data
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "disk",
"unit": drive,
"values": {
"total": drive_data.total,
"used": drive_data.used,
"free": drive_data.free,
"used_percentage": drive_data.percent,
"bps_read": read_bps,
"bps_write": write_bps,
"iops_read": read_iops,
"iops_write": write_iops,
}
}))
if openvz_burst:
# Sigh, OpenVZ... let's use 'free', since that apparently -does- understand OpenVZ.
lines = subprocess.check_output(["free", "-b"]).splitlines()
_, ram_total, ram_used, ram_free, _, ram_buffers, ram_cached = lines[1].split()
_, _, _, ram_available = lines[2].split()
ram_total = int(ram_total)
ram_free = int(ram_free)
ram_buffers = int(ram_buffers)
ram_cached = int(ram_cached)
ram_available = int(ram_available)
ram_used = int(ram_used)
ram_percent = 1.0 * (ram_total - ram_available) / ram_total * 100
ram_data = FakeRam(ram_total, ram_used, ram_available, ram_percent, ram_buffers, ram_cached)
else:
ram_data = psutil.virtual_memory()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "memory",
"unit": "physical",
"values": {
"total": ram_data.total,
"used": ram_data.used,
"free": ram_data.available,
"used_percentage": ram_data.percent,
"buffers": ram_data.buffers,
"cache": ram_data.cached
}
}))
swap_data = psutil.swap_memory()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "memory",
"unit": "swap",
"values": {
"total": swap_data.total,
"used": swap_data.used,
"free": swap_data.free,
"used_percentage": swap_data.percent
}
}))
net_data = psutil.net_io_counters(pernic=True)
for nic, data in net_data.iteritems():
try:
old_in_b = old_net_data[nic].bytes_recv
old_out_b = old_net_data[nic].bytes_sent
old_in_p = old_net_data[nic].packets_recv
old_out_p = old_net_data[nic].packets_sent
except KeyError, e:
# No old data yet, first run? Save and skip to next...
old_net_data[nic] = data
continue
diff_in_b = data.bytes_recv - old_in_b
diff_out_b = data.bytes_sent - old_out_b
diff_in_p = data.packets_recv - old_in_p
diff_out_p = data.packets_sent - old_out_p
if diff_in_b < 0:
diff_in_b = 0
if diff_out_b < 0:
diff_out_b = 0
if diff_in_p < 0:
diff_in_p = 0
if diff_out_p < 0:
diff_out_p = 0
old_net_data[nic] = data
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "network",
"unit": nic,
"values": {
"bps_in": diff_in_b / interval,
"bps_out": diff_out_b / interval,
"pps_in": diff_in_p / interval,
"pps_out": diff_out_p / interval
}
}))
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "uptime",
"unit": "",
"values": {
"uptime": time.time() - psutil.get_boot_time()
}
}))
time.sleep(interval)

@ -1,143 +0,0 @@
#!/usr/bin/env python2
import zmq, msgpack, time, psutil, yaml, os
ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")
with open("config/machine.yaml", "r") as cfile:
config = yaml.safe_load(cfile)
interval = config["interval"]
old_net_data = {}
while True:
load_avgs = os.getloadavg()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "load_average",
"unit": "",
"values": {
"1m": load_avgs[0],
"5m": load_avgs[1],
"15m": load_avgs[2]
}
}))
cpu_loads = psutil.cpu_percent(percpu=True)
for i in xrange(0, len(cpu_loads)):
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "cpu",
"unit": "core%d" % (i + 1),
"values": {
"load": cpu_loads[i]
}
}))
for drive in config["drives"]:
drive_data = psutil.disk_usage(drive)
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "disk",
"unit": drive,
"values": {
"total": drive_data.total,
"used": drive_data.used,
"free": drive_data.free,
"used_percentage": drive_data.percent
}
}))
ram_data = psutil.virtual_memory()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "memory",
"unit": "physical",
"values": {
"total": ram_data.total,
"used": ram_data.used,
"free": ram_data.available,
"used_percentage": ram_data.percent,
"buffers": ram_data.buffers,
"cache": ram_data.cached
}
}))
swap_data = psutil.virtual_memory()
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "memory",
"unit": "swap",
"values": {
"total": swap_data.total,
"used": swap_data.used,
"free": swap_data.free,
"used_percentage": swap_data.percent
}
}))
net_data = psutil.net_io_counters(pernic=True)
for nic, data in net_data.iteritems():
try:
old_in_b = old_net_data[nic].bytes_recv
old_out_b = old_net_data[nic].bytes_sent
old_in_p = old_net_data[nic].packets_recv
old_out_p = old_net_data[nic].packets_sent
except KeyError, e:
# No old data yet, first run? Save and skip to next...
old_net_data[nic] = data
continue
diff_in_b = data.bytes_recv - old_in_b
diff_out_b = data.bytes_sent - old_out_b
diff_in_p = data.packets_recv - old_in_p
diff_out_p = data.packets_sent - old_out_p
if diff_in_b < 0:
diff_in_b = 0
if diff_out_b < 0:
diff_out_b = 0
if diff_in_p < 0:
diff_in_p = 0
if diff_out_p < 0:
diff_out_p = 0
old_net_data[nic] = data
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "network",
"unit": nic,
"values": {
"bps_in": diff_in_b / interval,
"bps_out": diff_out_b / interval,
"pps_in": diff_in_p / interval,
"pps_out": diff_out_p / interval
}
}))
sock.send(msgpack.packb({
"service": "machine",
"msg_type": "value",
"resource_type": "uptime",
"unit": "",
"values": {
"uptime": time.time() - psutil.get_boot_time()
}
}))
time.sleep(interval)

@ -20,6 +20,7 @@ while True:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(0.5) s.settimeout(0.5)
s.connect(("127.0.0.1", port)) s.connect(("127.0.0.1", port))
s.shutdown(socket.SHUT_RDWR)
s.close() s.close()
up = True up = True
except socket.error, e: except socket.error, e:

@ -0,0 +1,72 @@
#!/usr/bin/env python2
import zmq, msgpack, time, yaml, psutil, fnmatch
ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")
with open("config/processes.yaml", "r") as cfile:
config = yaml.safe_load(cfile)
interval = config["interval"]
old_status = {}
while True:
all_procs = psutil.get_process_list()
for service_name, patterns in config["processes"].iteritems():
matching = []
for proc in all_procs: # Can't use filter() because of exceptions...
try:
if len(proc.cmdline) > 0 and fnmatch.fnmatch(proc.cmdline[0], patterns["name"]):
failed = False
try:
for arg, pattern in patterns["args"].iteritems():
try:
if len(proc.cmdline) < (arg + 1) or not fnmatch.fnmatch(proc.cmdline[arg], pattern):
failed = True
except KeyError, e:
pass
except KeyError, e:
pass
if failed == False:
matching.append(proc)
except psutil._error.NoSuchProcess, e:
pass
if len(matching) > 0:
up = True
else:
up = False
try:
if up == old_status[service_name]:
send_notice = False
else:
send_notice = True
initial = False
except KeyError, e:
send_notice = True
initial = True
old_status[service_name] = up
if send_notice:
if up:
msg_type = "up"
else:
msg_type = "down"
sock.send(msgpack.packb({
"service": "process",
"msg_type": msg_type,
"unit": service_name,
"initial": initial
}))
time.sleep(interval)

@ -0,0 +1,4 @@
#!/bin/bash
# You need squeeze-backports if you run this on squeeze!
apt-get install -y libzmq-dev libffi-dev build-essential python python-dev
pip install pyzmq msgpack-python pynacl pyyaml psutil

@ -0,0 +1,4 @@
# Backports: echo "deb http://backports.debian.org/debian-backports squeeze-backports main" >> /etc/apt/sources.list && apt-get update && apt-get upgrade
apt-get install -y python python-dev && wget cryto.net/~joepie91/pipfix.sh && chmod +x pipfix.sh && ./pipfix.sh
adduser --system --shell /bin/bash --group monitor && apt-get install -y git; su -c "cd ~; git clone https://github.com/joepie91/cryto-status.git" monitor && /home/monitor/cryto-status/deps.sh
# Replace libzmq with a manually compiled version...: wget http://download.zeromq.org/zeromq-4.0.3.tar.gz; tar -xzvf zeromq-4.0.3.tar.gz; cd zeromq-4.0.3; ./configure; make; make install; ldconfig

@ -0,0 +1,54 @@
* allow comments in (parentheses) in units, and ignore these when matching against an alarm pattern...
* web interface (angularjs)
* separate alarm and IRC logic
* monitor inodes
* watchdog on slave and master -> should send WARN notifications
* notifications (text, arbitrary-serialized-data as attachment, DEBUG/INFO/WARN/ERR/CRIT)
* consider redundancy - can already connect multiple masters through pubsub, how to deal with duplicate processing checking?
cprocessd:
-> subscribe to ccollectd
-> debug switch for outputting all to terminal
-> keep up/down state
-> keep last-value state (resource usage)
-> keep track of persistent downtimes (down for more than X time, as configured in config file)
-> alarms (move this from the IRC bot to cprocessd)
-> classify message importance
-> cprocessd-stream socket, PUB that just streams processed data
-> cprocessd-query socket, REP that responds to queries
-> server-status
-> down-list
-> last-value
-> server-list
-> service-list
cmaild:
-> use marrow.mailer
-> receives data from cprocessd-stream
-> sends e-mails for configured importance levels
cbotd:
-> currently named 'alert'
-> receives data from cprocessd-stream
-> IRC bot
-> posts alerts to specified IRC channels, depending on minimum severity level configured for that channel (ie. INFO for #cryto-network but ERR for #crytocc)
csmsd:
-> sends SMS for (critical) alerts
-> receives data from cprocessd-stream
-> Twilio? does a provider-neutral API exist? might need an extra abstraction...
cwebd:
-> offers web interface with streaming status data
-> publicly accessible and password-protected
-> streaming data from cprocessd-stream
-> on-pageload state from cprocessd-query (including 'current downtimes')
-> tornado+zmq ioloop, http://zeromq.github.io/pyzmq/eventloop.html
-> web dashboard
-> AngularJS
-> fancy graphs (via AngularJS? idk if a directive exists for this)
-> show downtimes as well as live per-machine stats
-> also show overview of all machines in a grid, color-coded for average load of all resources
-> historical up/down data
-> sqlite storage? single concurrent write, so should work
-> perhaps letting people sign up for e-mail alerts is an option? to-inbox will be tricky here
Loading…
Cancel
Save