Compare commits
33 Commits
Author | SHA1 | Date |
---|---|---|
Sven Slootweg | 7efb3b3dd2 | 11 years ago |
Sven Slootweg | 59ac723188 | 11 years ago |
Sven Slootweg | d6b0fc2ad4 | 11 years ago |
Sven Slootweg | a796725057 | 11 years ago |
Sven Slootweg | 0c87ee058c | 11 years ago |
Sven Slootweg | 570d8f3b85 | 11 years ago |
Sven Slootweg | 57c015f161 | 11 years ago |
Sven Slootweg | e87d048ee9 | 11 years ago |
Sven Slootweg | b67fdc8ca3 | 11 years ago |
Sven Slootweg | 0dde712144 | 11 years ago |
Sven Slootweg | 4dbe92396c | 11 years ago |
Sven Slootweg | 7dca261010 | 11 years ago |
Sven Slootweg | 5424432ddb | 11 years ago |
Sven Slootweg | 971c5ccce3 | 11 years ago |
Sven Slootweg | 86b013a0b7 | 11 years ago |
Sven Slootweg | 5a7e3815ca | 11 years ago |
Sven Slootweg | ff98278520 | 11 years ago |
Sven Slootweg | 762d74d477 | 11 years ago |
Sven Slootweg | 1df76daa0d | 11 years ago |
Sven Slootweg | 96aa2b020f | 11 years ago |
Sven Slootweg | 5eb46c13e8 | 11 years ago |
Sven Slootweg | e04448dcd8 | 11 years ago |
Sven Slootweg | 11eb813164 | 11 years ago |
Sven Slootweg | e2c9097585 | 11 years ago |
Sven Slootweg | 4cf0601b05 | 11 years ago |
Sven Slootweg | 5db77ab87c | 11 years ago |
Sven Slootweg | ae552ac0f9 | 11 years ago |
Sven Slootweg | 20eaf50791 | 11 years ago |
Sven Slootweg | 97290dbb1c | 11 years ago |
Sven Slootweg | 56ae6b5305 | 11 years ago |
Sven Slootweg | 258f62af22 | 11 years ago |
Sven Slootweg | 4550e0a425 | 11 years ago |
Sven Slootweg | d310a95e7a | 11 years ago |
@ -1 +1,10 @@
|
|||||||
cstatsd/config/*.yaml
|
cstatsd/config/*.yaml
|
||||||
|
ccollectd/config.yaml
|
||||||
|
alert/config.yaml
|
||||||
|
*.pyc
|
||||||
|
ccollectd/pubkey.dat
|
||||||
|
ccollectd/privkey.dat
|
||||||
|
cstatsd/pubkey.dat
|
||||||
|
cstatsd/privkey.dat
|
||||||
|
cstatsd/cstatsd.pid
|
||||||
|
alert/rules.pickle
|
||||||
|
@ -0,0 +1,304 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch
|
||||||
|
import cPickle as pickle
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
with open("config.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open("rules.pickle", "r") as pfile:
|
||||||
|
rules = pickle.load(pfile)
|
||||||
|
except IOError, e:
|
||||||
|
rules = {}
|
||||||
|
|
||||||
|
fetcher = ctx.socket(zmq.SUB)
|
||||||
|
fetcher.setsockopt(zmq.SUBSCRIBE, "")
|
||||||
|
fetcher.connect("tcp://127.0.0.1:8998")
|
||||||
|
|
||||||
|
class Bot(object):
|
||||||
|
def __init__(self, hosts, port, nickname, realname, channels, admins, subsock):
|
||||||
|
self.hosts = hosts
|
||||||
|
self.port = port
|
||||||
|
self.nickname = nickname
|
||||||
|
self.realname = realname
|
||||||
|
self.channels = channels
|
||||||
|
self.admins = admins
|
||||||
|
self.subsock = subsock
|
||||||
|
self.connected = False
|
||||||
|
self.last_down = {}
|
||||||
|
self.known_alarms = {}
|
||||||
|
|
||||||
|
self.command_map = {
|
||||||
|
"422": self.join_all_channels,
|
||||||
|
"376": self.join_all_channels,
|
||||||
|
"PRIVMSG": self.receive_message
|
||||||
|
}
|
||||||
|
|
||||||
|
def split_irc(self, message):
    """Split a raw IRC line into its component tokens.

    A leading ':' marks a prefix token; the first ':' in the remainder
    starts a trailing argument that may contain spaces and is kept as a
    single token. The prefix marker is re-attached to the first token.
    """
    prefix = ":" if message.startswith(":") else ""
    if prefix:
        message = message[1:]

    if ":" in message:
        head, trailing = message.split(":", 1)
        tokens = head.strip().split()
        tokens.append(trailing)
    else:
        tokens = message.split()

    tokens[0] = prefix + tokens[0]
    return tokens
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
while True: # Connect loop
|
||||||
|
host = random.choice(self.hosts)
|
||||||
|
|
||||||
|
self.sock = socket.socket()
|
||||||
|
try:
|
||||||
|
self.sock.connect((host, self.port))
|
||||||
|
except socket.error, e:
|
||||||
|
continue # Reconnect
|
||||||
|
self.send_raw("NICK %s" % self.nickname)
|
||||||
|
self.sock.send("USER %s 0 0 :%s\r\n" % (self.nickname, self.realname))
|
||||||
|
|
||||||
|
buff = ""
|
||||||
|
while True: # Read loop
|
||||||
|
r, w, x = zmq.select([self.sock, self.subsock], [], [])
|
||||||
|
|
||||||
|
for s in r:
|
||||||
|
if s == self.sock.fileno():
|
||||||
|
try:
|
||||||
|
recvdata = self.sock.recv(1024)
|
||||||
|
except socket.error, e:
|
||||||
|
break # Something went wrong, reconnect...
|
||||||
|
|
||||||
|
if len(recvdata) == 0:
|
||||||
|
break # We have disconnected...
|
||||||
|
|
||||||
|
buff += recvdata
|
||||||
|
messages = buff.split("\n")
|
||||||
|
buff = messages.pop()
|
||||||
|
|
||||||
|
for message in messages:
|
||||||
|
self.process_message(self.split_irc(message.strip("\r")))
|
||||||
|
elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
|
||||||
|
# Process incoming data from the subscribe socket...
|
||||||
|
message = msgpack.unpackb(s.recv())
|
||||||
|
self.process_stats(message)
|
||||||
|
|
||||||
|
def send_raw(self, message):
|
||||||
|
self.sock.send("%s\r\n" % message)
|
||||||
|
|
||||||
|
def send_message(self, recipient, message):
|
||||||
|
if self.connected == True:
|
||||||
|
self.send_raw("PRIVMSG %s :%s" % (recipient, message))
|
||||||
|
|
||||||
|
def send_all(self, message):
|
||||||
|
for channel in self.channels:
|
||||||
|
self.send_message(channel, message)
|
||||||
|
|
||||||
|
def join(self, channel):
|
||||||
|
self.send_raw("JOIN %s" % channel)
|
||||||
|
|
||||||
|
def join_all_channels(self, message):
|
||||||
|
self.connected = True
|
||||||
|
for channel in self.channels:
|
||||||
|
self.join(channel)
|
||||||
|
|
||||||
|
def receive_message(self, message):
|
||||||
|
args = message[3].split()
|
||||||
|
sender = message[0][1:].split("!", 1)[0]
|
||||||
|
channel = message[2]
|
||||||
|
|
||||||
|
try:
|
||||||
|
if sender in self.admins:
|
||||||
|
if args[0] == "!addrule":
|
||||||
|
target, rel, value = args[1:4]
|
||||||
|
target = self.parse_target(target)
|
||||||
|
|
||||||
|
if value[-1].lower() in ("k", "m", "g", "t"):
|
||||||
|
unit = value[-1].lower()
|
||||||
|
value = value[:-1]
|
||||||
|
value = float(value)
|
||||||
|
value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))
|
||||||
|
|
||||||
|
rule_id = uuid.uuid4()
|
||||||
|
rules[rule_id] = {
|
||||||
|
"target": target,
|
||||||
|
"operator": rel,
|
||||||
|
"value": value
|
||||||
|
}
|
||||||
|
|
||||||
|
with open("rules.pickle", "w") as pfile:
|
||||||
|
pickle.dump(rules, pfile)
|
||||||
|
|
||||||
|
self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id))
|
||||||
|
except Exception, e:
|
||||||
|
self.send_message(channel, str(e))
|
||||||
|
|
||||||
|
def parse_target(self, target):
    """Parse a rule target of the form host!service.resource:unit.attribute.

    Any segment given as the wildcard '*' is normalized to True so that
    matching code can short-circuit on it. Returns a dict with the five
    named segments.
    """
    host, remainder = target.split("!", 1)
    service, remainder = remainder.split(".", 1)
    resource, remainder = remainder.split(":", 1)
    unit, attribute = remainder.split(".", 1)
    # TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments

    def normalize(segment):
        # '*' means "match anything"; represented as True.
        return True if segment == "*" else segment

    return {
        "host": normalize(host),
        "service": normalize(service),
        "resource": normalize(resource),
        "unit": normalize(unit),
        "attribute": normalize(attribute)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def format_time_duration(self, seconds):
    """Render a duration in seconds as e.g. '1 days, 2 hours, 5 seconds'.

    Zero-valued magnitudes are omitted, and the seconds component is
    clamped up to 1 so the result is never an empty string.
    Based on http://stackoverflow.com/a/20222351/1332715
    """
    days, remainder = divmod(seconds, 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds = divmod(remainder, 60)
    if seconds < 1:
        seconds = 1

    components = (
        ("days", days),
        ("hours", hours),
        ("minutes", minutes),
        ("seconds", seconds),
    )
    return ", ".join("%d %s" % (int(amount), name) for name, amount in components if amount)
|
||||||
|
|
||||||
|
def process_stats(self, message):
|
||||||
|
data = message["message"]
|
||||||
|
data["host"] = message["host"]
|
||||||
|
|
||||||
|
if data["msg_type"] == "up" and data["initial"] == True:
|
||||||
|
return # We don't need to say what is up, initially...
|
||||||
|
|
||||||
|
# TODO: Duration
|
||||||
|
if data["msg_type"] == "up":
|
||||||
|
try:
|
||||||
|
data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data])
|
||||||
|
except KeyError, e:
|
||||||
|
data["duration"] = "0 seconds"
|
||||||
|
self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data)
|
||||||
|
elif data["msg_type"] == "down":
|
||||||
|
self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
|
||||||
|
self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
|
||||||
|
elif data["msg_type"] == "blip":
|
||||||
|
self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
|
||||||
|
elif data["msg_type"] == "value":
|
||||||
|
for rule_id, rule in rules.iteritems():
|
||||||
|
check_vals = {
|
||||||
|
"host": [data["host"]],
|
||||||
|
"service": [data["service"]],
|
||||||
|
"resource": [data["resource_type"]],
|
||||||
|
"unit": [data["unit"]]
|
||||||
|
}
|
||||||
|
|
||||||
|
failed = False
|
||||||
|
for segment in ("host", "service", "resource", "unit"):
|
||||||
|
for val in check_vals[segment]:
|
||||||
|
if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
|
||||||
|
failed = True
|
||||||
|
break
|
||||||
|
if failed:
|
||||||
|
continue # Skip to next
|
||||||
|
|
||||||
|
# We haven't broken out in the past bit of code, so we're still matching the pattern...
|
||||||
|
eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]
|
||||||
|
|
||||||
|
for key in eligible_keys:
|
||||||
|
value = data["values"][key]
|
||||||
|
rule_value = float(rule["value"])
|
||||||
|
operator = rule["operator"]
|
||||||
|
|
||||||
|
if operator == "=":
|
||||||
|
alarm = (value == rule_value)
|
||||||
|
elif operator == ">":
|
||||||
|
alarm = (value > rule_value)
|
||||||
|
elif operator == "<":
|
||||||
|
alarm = (value < rule_value)
|
||||||
|
elif operator == ">=":
|
||||||
|
alarm = (value >= rule_value)
|
||||||
|
elif operator == "<=":
|
||||||
|
alarm = (value <= rule_value)
|
||||||
|
elif operator == "!=":
|
||||||
|
alarm = (value != rule_value)
|
||||||
|
else:
|
||||||
|
alarm = False
|
||||||
|
|
||||||
|
self.trigger_alarm(rule_id, data, alarm, value, key)
|
||||||
|
|
||||||
|
def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
    """Track alarm state and notify only on state transitions.

    An alarm is identified by rule, host, unit and offending attribute.
    The stored state is either False (inactive) or the activation
    timestamp. transmit_alarm is invoked only when the alarm flips
    between active and inactive, never while the state is unchanged.
    """
    alarm_key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)

    if alarm_key not in self.known_alarms:
        # First sighting of this alarm; only an active state is announced.
        if active:
            self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
            self.known_alarms[alarm_key] = time.time()
        else:
            self.known_alarms[alarm_key] = False
    elif self.known_alarms[alarm_key] == False and active:
        # Edge: inactive -> active.
        self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
        self.known_alarms[alarm_key] = time.time()
    elif self.known_alarms[alarm_key] != False and not active:
        # Edge: active -> inactive.
        self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
        self.known_alarms[alarm_key] = False
|
||||||
|
|
||||||
|
def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
|
||||||
|
# At this point, we're sure that we want to notify...
|
||||||
|
rule_target = rules[rule_id]["target"].copy()
|
||||||
|
for k, v in rule_target.iteritems():
|
||||||
|
if v is True:
|
||||||
|
rule_target[k] = "*"
|
||||||
|
|
||||||
|
rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target
|
||||||
|
|
||||||
|
info = {
|
||||||
|
"host": data["host"],
|
||||||
|
"rule_id": rule_id,
|
||||||
|
"rule_pattern": rule_pattern
|
||||||
|
}
|
||||||
|
|
||||||
|
if not active:
|
||||||
|
key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
|
||||||
|
try:
|
||||||
|
info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key])
|
||||||
|
except KeyError, e:
|
||||||
|
info["duration"] = "0 seconds"
|
||||||
|
info["unit"] = data["unit"]
|
||||||
|
info["attribute"] = offending_key
|
||||||
|
|
||||||
|
self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info)
|
||||||
|
else:
|
||||||
|
info["value"] = offending_value
|
||||||
|
info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"])
|
||||||
|
info["unit"] = data["unit"]
|
||||||
|
info["attribute"] = offending_key
|
||||||
|
|
||||||
|
self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info)
|
||||||
|
|
||||||
|
def process_message(self, message):
    """Dispatch one parsed IRC message (a token list from split_irc).

    PING is answered directly with a PONG; any other message is routed
    through self.command_map keyed on the upper-cased command token
    (message[1]). Messages with no registered handler are ignored.
    """
    if message[0].upper() == "PING":
        self.send_raw("PONG %s" % message[1])
        return

    # Look the handler up explicitly instead of wrapping the *call* in a
    # KeyError catch: the old form also silenced KeyErrors raised inside
    # the handler itself, hiding real bugs in the handlers.
    handler = self.command_map.get(message[1].upper())
    if handler is not None:
        handler(message)
|
||||||
|
|
||||||
|
|
||||||
|
bot = Bot(config["irc"]["hosts"], config["irc"]["port"], config["irc"]["nickname"], config["irc"]["realname"], config["irc"]["channels"], config["irc"]["admins"], fetcher)
|
||||||
|
bot.run()
|
@ -0,0 +1,12 @@
|
|||||||
|
irc:
|
||||||
|
hosts:
|
||||||
|
- kerpia.cryto.net
|
||||||
|
- box.cryto.net
|
||||||
|
- arvel.cryto.net
|
||||||
|
port: 6667
|
||||||
|
nickname: StatusBot
|
||||||
|
realname: Cryto System Monitoring Service
|
||||||
|
admins:
|
||||||
|
- joepie91
|
||||||
|
channels:
|
||||||
|
- "#test"
|
@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
|
||||||
|
from nacl.public import PublicKey, PrivateKey, Box
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
distributor = ctx.socket(zmq.PUB)
|
||||||
|
distributor.bind("tcp://127.0.0.1:8998")
|
||||||
|
|
||||||
|
poller = zmq.Poller()
|
||||||
|
|
||||||
|
with open("config.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
with open("privkey.dat", "r") as f:
|
||||||
|
privkey = PrivateKey(binascii.unhexlify(f.read()))
|
||||||
|
|
||||||
|
nodes = config["nodes"]
|
||||||
|
last_node_status = {}
|
||||||
|
socket_map = {}
|
||||||
|
boxes = {}
|
||||||
|
|
||||||
|
def heartbeat():
    """Probe every configured node over TCP and publish status changes.

    For each node a plain TCP connect is attempted up to
    config["heartbeat"]["attempts"] times, with a timeout that shrinks on
    each retry. State transitions ("up"/"down") and a recovery within a
    single cycle ("blip") are published on the distributor socket; the
    very first observation of a node is flagged as initial.
    """
    for hostname, node in nodes.items():
        attempts = 0
        while attempts < config["heartbeat"]["attempts"]:
            try:
                probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                # Give later retries progressively less time to respond.
                probe.settimeout(float(config["heartbeat"]["timeout"]) / (attempts + 1))
                probe.connect((node["ip"], node["port"]))
                probe.shutdown(socket.SHUT_RDWR)
                probe.close()
                up = True
                break
            except socket.error:
                up = False
                attempts += 1

        if hostname in last_node_status:
            status_changed = (up != last_node_status[hostname])
            initial = False
        else:
            # Never seen this node before: always record, flag as initial.
            status_changed = True
            initial = True

        last_node_status[hostname] = up

        msg_type = None
        if status_changed:
            msg_type = "up" if up else "down"
        elif up and attempts > 0:
            # Still up overall, but at least one probe failed this cycle.
            msg_type = "blip"

        if msg_type is not None:
            distributor.send(msgpack.packb({
                "host": config["hostname"],
                "message": {
                    "service": "heartbeat",
                    "msg_type": msg_type,
                    "unit": hostname,
                    "initial": initial
                }
            }))
|
||||||
|
|
||||||
|
timers = zmqtimer.ZmqTimerManager()
|
||||||
|
timers.add_timer(zmqtimer.ZmqTimer(config["heartbeat"]["interval"], heartbeat))
|
||||||
|
|
||||||
|
for hostname, node in config["nodes"].iteritems():
|
||||||
|
boxes[hostname] = Box(privkey, PublicKey(binascii.unhexlify(node["pubkey"])))
|
||||||
|
grabber = ctx.socket(zmq.SUB)
|
||||||
|
grabber.setsockopt(zmq.SUBSCRIBE, "")
|
||||||
|
grabber.connect(node["endpoint"])
|
||||||
|
socket_map[grabber] = hostname
|
||||||
|
poller.register(grabber, zmq.POLLIN)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
timers.check()
|
||||||
|
socks = dict(poller.poll(timers.get_next_interval()))
|
||||||
|
|
||||||
|
for sock in socks:
|
||||||
|
if socks[sock] == zmq.POLLIN:
|
||||||
|
host = socket_map[sock]
|
||||||
|
try:
|
||||||
|
message = msgpack.unpackb(boxes[host].decrypt(sock.recv()))
|
||||||
|
except nacl.exceptions.CryptoError, e:
|
||||||
|
# Probably a spoofed message... skip to next socket
|
||||||
|
sys.stderr.write("Ignoring message... spoofed? (origin: %s)\n" % host) # FIXME: Use logging module...
|
||||||
|
continue
|
||||||
|
except Exception, e:
|
||||||
|
sys.stderr.write(repr(e) + "\n")
|
||||||
|
continue
|
||||||
|
distributor.send(msgpack.packb({
|
||||||
|
"host": host,
|
||||||
|
"message": message
|
||||||
|
}))
|
||||||
|
|
@ -0,0 +1,13 @@
|
|||||||
|
hostname: monitoring.cryto.net
|
||||||
|
|
||||||
|
heartbeat:
|
||||||
|
interval: 5
|
||||||
|
timeout: 1
|
||||||
|
attempts: 3
|
||||||
|
|
||||||
|
nodes:
|
||||||
|
localhost:
|
||||||
|
ip: 127.0.0.1
|
||||||
|
port: 6543
|
||||||
|
endpoint: tcp://127.0.0.1:6543
|
||||||
|
pubkey: bd784ef4065c9bd31627106dc55e26764605a144c6fc45ce93f33cbd19dd7333
|
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env python2
"""Generate a NaCl keypair and store it hex-encoded on disk.

Writes privkey.dat and pubkey.dat in the working directory, then locks
the private key file down to owner read/write only.
"""

import yaml, os, stat, binascii
from nacl.public import PrivateKey

keypair = PrivateKey.generate()

# Hex-encode the raw key bytes so the files are plain text.
with open("privkey.dat", "w") as keyfile:
    keyfile.write(binascii.hexlify(str(keypair)))

with open("pubkey.dat", "w") as keyfile:
    keyfile.write(binascii.hexlify(str(keypair.public_key)))

# The private key must only be readable/writable by its owner.
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
import zmq, msgpack
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
fetcher = ctx.socket(zmq.SUB)
|
||||||
|
fetcher.setsockopt(zmq.SUBSCRIBE, "")
|
||||||
|
fetcher.connect("tcp://127.0.0.1:8998")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
message = msgpack.unpackb(fetcher.recv())
|
||||||
|
print message
|
@ -0,0 +1,41 @@
|
|||||||
|
import time
|
||||||
|
|
||||||
|
class ZmqTimerManager(object):
    """Tracks a set of ZmqTimer objects and computes zmq poll intervals.

    next_call caches the earliest upcoming timer deadline (epoch seconds)
    so that check() can cheaply skip work until something is due.
    """

    def __init__(self):
        self.timers = []
        # 0 means "no deadline cached yet", so the first check() always runs.
        self.next_call = 0

    def add_timer(self, timer):
        """Register a timer to be serviced by check()."""
        self.timers.append(timer)

    def check(self):
        """Run every timer's check() once the cached deadline has passed."""
        if time.time() > self.next_call:
            for timer in self.timers:
                timer.check()

    def get_next_interval(self):
        """Return milliseconds until the next timer is due (0 if overdue).

        Recomputes the cached deadline from all timers whenever the
        cached one is already in the past.
        """
        if time.time() >= self.next_call:
            self.next_call = min(timer.get_next_call() for timer in self.timers)
        remaining = (self.next_call - time.time()) * 1000
        return 0 if remaining < 0 else remaining
|
||||||
|
|
||||||
|
class ZmqTimer(object):
    """Invokes `callback` whenever more than `interval` seconds elapse."""

    def __init__(self, interval, callback):
        self.interval = interval
        self.callback = callback
        # Epoch of the most recent firing; 0 means "never fired yet".
        self.last_call = 0

    def check(self):
        """Fire the callback if the interval has elapsed since the last call."""
        if time.time() > self.last_call + self.interval:
            self.callback()
            self.last_call = time.time()

    def get_next_call(self):
        """Return the epoch timestamp at which this timer is next due."""
        return self.last_call + self.interval
|
@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
echo "Generating keypair..."
|
||||||
|
./genkey 2>/dev/null
|
||||||
|
./bootstrap-config
|
||||||
|
echo "Your public key: `cat pubkey.dat`"
|
||||||
|
echo "Server IP: `curl -s http://wtfismyip.com/text`" 2>/dev/null
|
@ -0,0 +1,86 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import yaml, sys
|
||||||
|
|
||||||
|
master_pubkey = raw_input("Public key of the master server: ")
|
||||||
|
|
||||||
|
print "You'll now be asked to configure ports to check. If you don't want to configure any ports, just hit enter without entering any information."
|
||||||
|
|
||||||
|
ports = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
port = raw_input("Port number: ")
|
||||||
|
if port.strip() == "":
|
||||||
|
break
|
||||||
|
service_name = raw_input("Service name for port %s: " % port)
|
||||||
|
ports[int(port)] = service_name
|
||||||
|
|
||||||
|
print "The same thing, except now for processes to check. Just hit enter without entering any information when you're done; the same goes for the argument list. As a wildcard, you can use *"
|
||||||
|
|
||||||
|
services = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
service_name = raw_input("Service name: ")
|
||||||
|
|
||||||
|
if service_name.strip() == "":
|
||||||
|
break
|
||||||
|
|
||||||
|
process_name = raw_input("Process name: ")
|
||||||
|
|
||||||
|
args = {}
|
||||||
|
argnum = 1
|
||||||
|
while True:
|
||||||
|
arg = raw_input("Argument %d: " % argnum)
|
||||||
|
if arg.strip() == "":
|
||||||
|
break
|
||||||
|
args[argnum] = arg
|
||||||
|
argnum += 1
|
||||||
|
|
||||||
|
services[service_name] = {
|
||||||
|
"name": process_name,
|
||||||
|
"args": args
|
||||||
|
}
|
||||||
|
|
||||||
|
print "Now enter any disk devices you wish to monitor. Leave empty and hit enter when done."
|
||||||
|
|
||||||
|
disks = []
|
||||||
|
|
||||||
|
while True:
|
||||||
|
device_name = raw_input("Device name: ")
|
||||||
|
if device_name.strip() == "":
|
||||||
|
break
|
||||||
|
disks.append(device_name)
|
||||||
|
|
||||||
|
# Write config files...
|
||||||
|
|
||||||
|
modules = []
|
||||||
|
|
||||||
|
modules.append("stats-machine")
|
||||||
|
with open("config/machine.yaml.example", "r") as ef:
|
||||||
|
with open("config/machine.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["drives"] = disks
|
||||||
|
ff.write(yaml.dump(data))
|
||||||
|
|
||||||
|
if len(ports) > 0:
|
||||||
|
modules.append("stats-ports")
|
||||||
|
with open("config/ports.yaml.example", "r") as ef:
|
||||||
|
with open("config/ports.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["ports"] = ports
|
||||||
|
ff.write(yaml.dump(data))
|
||||||
|
|
||||||
|
if len(services) > 0:
|
||||||
|
modules.append("stats-processes")
|
||||||
|
with open("config/processes.yaml.example", "r") as ef:
|
||||||
|
with open("config/processes.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["processes"] = services
|
||||||
|
ff.write(yaml.dump(data))
|
||||||
|
|
||||||
|
with open("config/cstatsd.yaml.example", "r") as ef:
|
||||||
|
with open("config/cstatsd.yaml", "w") as ff:
|
||||||
|
data = yaml.safe_load(ef.read())
|
||||||
|
data["pubkey"] = master_pubkey
|
||||||
|
data["autostart"] = modules
|
||||||
|
ff.write(yaml.dump(data))
|
@ -0,0 +1,7 @@
|
|||||||
|
endpoint: tcp://*:6543
|
||||||
|
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
|
||||||
|
|
||||||
|
autostart:
|
||||||
|
- stats-processes
|
||||||
|
- stats-ports
|
||||||
|
- stats-machine
|
@ -1,4 +1,5 @@
|
|||||||
interval: 1
|
interval: 1
|
||||||
|
|
||||||
drives:
|
drives:
|
||||||
- /
|
- /dev/sda1
|
||||||
|
- /dev/sdb1
|
||||||
|
@ -0,0 +1,15 @@
|
|||||||
|
interval: 5
|
||||||
|
|
||||||
|
processes:
|
||||||
|
radiotray:
|
||||||
|
name: '*python*'
|
||||||
|
args:
|
||||||
|
1: /usr/bin/radiotray
|
||||||
|
|
||||||
|
guake:
|
||||||
|
name: '*python*'
|
||||||
|
args:
|
||||||
|
1: /usr/local/bin/guake
|
||||||
|
|
||||||
|
keepassx:
|
||||||
|
name: keepassx
|
@ -1,12 +1,43 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
import zmq, msgpack
|
import zmq, yaml, binascii, nacl, sys, subprocess, os
|
||||||
|
from nacl.public import PublicKey, PrivateKey, Box
|
||||||
|
|
||||||
|
basedir = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
|
||||||
|
with open("cstatsd.pid", "w") as pidfile:
|
||||||
|
pidfile.write(str(os.getpid()))
|
||||||
|
|
||||||
ctx = zmq.Context()
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
with open("config/cstatsd.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
pubkey = PublicKey(binascii.unhexlify(config["pubkey"]))
|
||||||
|
|
||||||
|
with open("privkey.dat", "r") as f:
|
||||||
|
privkey = PrivateKey(binascii.unhexlify(f.read()))
|
||||||
|
|
||||||
|
box = Box(privkey, pubkey)
|
||||||
|
|
||||||
collector = ctx.socket(zmq.PULL)
|
collector = ctx.socket(zmq.PULL)
|
||||||
collector.bind("ipc:///tmp/cstatsd")
|
collector.bind("ipc:///tmp/cstatsd")
|
||||||
|
|
||||||
|
shipper = ctx.socket(zmq.PUB)
|
||||||
|
shipper.bind(config["endpoint"])
|
||||||
|
|
||||||
|
try:
|
||||||
|
disable_autostart = (sys.argv[1] == "--disable-autostart")
|
||||||
|
except:
|
||||||
|
disable_autostart = False
|
||||||
|
|
||||||
|
if disable_autostart == False:
|
||||||
|
with open("/dev/null", "w+") as stfu:
|
||||||
|
for script in config["autostart"]:
|
||||||
|
print os.path.join(basedir, script)
|
||||||
|
subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
message = msgpack.unpackb(collector.recv())
|
message = collector.recv()
|
||||||
print message
|
nonce = nacl.utils.random(Box.NONCE_SIZE)
|
||||||
|
shipper.send(box.encrypt(message, nonce))
|
||||||
|
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import yaml, os, stat, binascii
|
||||||
|
from nacl.public import PrivateKey
|
||||||
|
|
||||||
|
privkey = PrivateKey.generate()
|
||||||
|
pubkey = privkey.public_key
|
||||||
|
|
||||||
|
with open("privkey.dat", "w") as f:
|
||||||
|
f.write(binascii.hexlify(str(privkey)))
|
||||||
|
|
||||||
|
with open("pubkey.dat", "w") as f:
|
||||||
|
f.write(binascii.hexlify(str(pubkey)))
|
||||||
|
|
||||||
|
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
PID=`cat cstatsd.pid`
|
||||||
|
pkill -P $PID
|
||||||
|
kill $PID
|
@ -0,0 +1,224 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import zmq, msgpack, time, psutil, yaml, os, subprocess
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
# Horrible hack to make check_output exist in 2.6
# http://stackoverflow.com/a/13160748/1332715
def _check_output_compat(*popenargs, **kwargs):
    """Minimal backport of subprocess.check_output for Python 2.6.

    Runs the command, captures stdout, and raises CalledProcessError on a
    non-zero exit status, mirroring the 2.7 stdlib behavior.
    """
    if 'stdout' in kwargs:
        raise ValueError('stdout argument not allowed, it will be overridden.')
    process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
    output, unused_err = process.communicate()
    retcode = process.poll()
    if retcode:
        command = kwargs.get("args")
        if command is None:
            command = popenargs[0]
        raise subprocess.CalledProcessError(retcode, command)
    return output

if "check_output" not in dir(subprocess):  # duck punch it in!
    subprocess.check_output = _check_output_compat
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
sock = ctx.socket(zmq.PUSH)
|
||||||
|
sock.connect("ipc:///tmp/cstatsd")
|
||||||
|
|
||||||
|
with open("config/machine.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
interval = config["interval"]
|
||||||
|
old_net_data = {}
|
||||||
|
|
||||||
|
disk_map = {}
|
||||||
|
last_io_data = {}
|
||||||
|
|
||||||
|
if os.path.exists("/proc/user_beancounters") and not os.path.exists("/proc/vz/vestat"):
|
||||||
|
openvz_burst = True
|
||||||
|
FakeRam = namedtuple("FakeRam", ["total", "used", "available", "percent", "buffers", "cached"])
|
||||||
|
else:
|
||||||
|
openvz_burst = False
|
||||||
|
|
||||||
|
for disk in psutil.disk_partitions():
|
||||||
|
disk_map[disk.device] = disk
|
||||||
|
|
||||||
|
if len(disk_map) == 0:
|
||||||
|
# We're probably on OpenVZ, so /proc/partitions doesn't exist. Fall back to 'df'.
|
||||||
|
FakeDisk = namedtuple("FakeDisk", ["device", "mountpoint"])
|
||||||
|
for line in subprocess.check_output(["df"]).splitlines()[1:]:
|
||||||
|
device, _, _, _, _, mountpoint = line.split()
|
||||||
|
disk_map[device] = FakeDisk(device, mountpoint)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
load_avgs = os.getloadavg()
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "load_average",
|
||||||
|
"unit": "",
|
||||||
|
"values": {
|
||||||
|
"1m": load_avgs[0],
|
||||||
|
"5m": load_avgs[1],
|
||||||
|
"15m": load_avgs[2]
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
cpu_loads = psutil.cpu_percent(percpu=True)
|
||||||
|
|
||||||
|
for i in xrange(0, len(cpu_loads)):
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "cpu",
|
||||||
|
"unit": "core%d" % (i + 1),
|
||||||
|
"values": {
|
||||||
|
"load": cpu_loads[i]
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
try:
|
||||||
|
io_counters = psutil.disk_io_counters(perdisk=True)
|
||||||
|
except IOError, e:
|
||||||
|
io_counters = {} # OpenVZ...
|
||||||
|
|
||||||
|
for drive in config["drives"]:
|
||||||
|
drive_data = psutil.disk_usage(disk_map[drive].mountpoint)
|
||||||
|
io_data = None
|
||||||
|
|
||||||
|
for diskname, data in io_counters.iteritems():
|
||||||
|
if drive.endswith(diskname):
|
||||||
|
io_data = data
|
||||||
|
|
||||||
|
if io_data is None or drive not in last_io_data:
|
||||||
|
read_bps = 0
|
||||||
|
write_bps = 0
|
||||||
|
read_iops = 0
|
||||||
|
write_iops = 0
|
||||||
|
else:
|
||||||
|
read_bps = (io_data.read_bytes - last_io_data[drive].read_bytes) / interval
|
||||||
|
write_bps = (io_data.write_bytes - last_io_data[drive].write_bytes) / interval
|
||||||
|
read_iops = (io_data.read_count - last_io_data[drive].read_count) / interval
|
||||||
|
write_iops = (io_data.write_count - last_io_data[drive].write_count) / interval
|
||||||
|
|
||||||
|
if io_data is not None:
|
||||||
|
last_io_data[drive] = io_data
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "disk",
|
||||||
|
"unit": drive,
|
||||||
|
"values": {
|
||||||
|
"total": drive_data.total,
|
||||||
|
"used": drive_data.used,
|
||||||
|
"free": drive_data.free,
|
||||||
|
"used_percentage": drive_data.percent,
|
||||||
|
"bps_read": read_bps,
|
||||||
|
"bps_write": write_bps,
|
||||||
|
"iops_read": read_iops,
|
||||||
|
"iops_write": write_iops,
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
if openvz_burst:
|
||||||
|
# Sigh, OpenVZ... let's use 'free', since that apparently -does- understand OpenVZ.
|
||||||
|
lines = subprocess.check_output(["free", "-b"]).splitlines()
|
||||||
|
_, ram_total, ram_used, ram_free, _, ram_buffers, ram_cached = lines[1].split()
|
||||||
|
_, _, _, ram_available = lines[2].split()
|
||||||
|
ram_total = int(ram_total)
|
||||||
|
ram_free = int(ram_free)
|
||||||
|
ram_buffers = int(ram_buffers)
|
||||||
|
ram_cached = int(ram_cached)
|
||||||
|
ram_available = int(ram_available)
|
||||||
|
ram_used = int(ram_used)
|
||||||
|
ram_percent = 1.0 * (ram_total - ram_available) / ram_total * 100
|
||||||
|
ram_data = FakeRam(ram_total, ram_used, ram_available, ram_percent, ram_buffers, ram_cached)
|
||||||
|
else:
|
||||||
|
ram_data = psutil.virtual_memory()
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "memory",
|
||||||
|
"unit": "physical",
|
||||||
|
"values": {
|
||||||
|
"total": ram_data.total,
|
||||||
|
"used": ram_data.used,
|
||||||
|
"free": ram_data.available,
|
||||||
|
"used_percentage": ram_data.percent,
|
||||||
|
"buffers": ram_data.buffers,
|
||||||
|
"cache": ram_data.cached
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
swap_data = psutil.swap_memory()
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "memory",
|
||||||
|
"unit": "swap",
|
||||||
|
"values": {
|
||||||
|
"total": swap_data.total,
|
||||||
|
"used": swap_data.used,
|
||||||
|
"free": swap_data.free,
|
||||||
|
"used_percentage": swap_data.percent
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
net_data = psutil.net_io_counters(pernic=True)
|
||||||
|
for nic, data in net_data.iteritems():
|
||||||
|
try:
|
||||||
|
old_in_b = old_net_data[nic].bytes_recv
|
||||||
|
old_out_b = old_net_data[nic].bytes_sent
|
||||||
|
old_in_p = old_net_data[nic].packets_recv
|
||||||
|
old_out_p = old_net_data[nic].packets_sent
|
||||||
|
except KeyError, e:
|
||||||
|
# No old data yet, first run? Save and skip to next...
|
||||||
|
old_net_data[nic] = data
|
||||||
|
continue
|
||||||
|
|
||||||
|
diff_in_b = data.bytes_recv - old_in_b
|
||||||
|
diff_out_b = data.bytes_sent - old_out_b
|
||||||
|
diff_in_p = data.packets_recv - old_in_p
|
||||||
|
diff_out_p = data.packets_sent - old_out_p
|
||||||
|
|
||||||
|
if diff_in_b < 0:
|
||||||
|
diff_in_b = 0
|
||||||
|
|
||||||
|
if diff_out_b < 0:
|
||||||
|
diff_out_b = 0
|
||||||
|
|
||||||
|
if diff_in_p < 0:
|
||||||
|
diff_in_p = 0
|
||||||
|
|
||||||
|
if diff_out_p < 0:
|
||||||
|
diff_out_p = 0
|
||||||
|
|
||||||
|
old_net_data[nic] = data
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "network",
|
||||||
|
"unit": nic,
|
||||||
|
"values": {
|
||||||
|
"bps_in": diff_in_b / interval,
|
||||||
|
"bps_out": diff_out_b / interval,
|
||||||
|
"pps_in": diff_in_p / interval,
|
||||||
|
"pps_out": diff_out_p / interval
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "machine",
|
||||||
|
"msg_type": "value",
|
||||||
|
"resource_type": "uptime",
|
||||||
|
"unit": "",
|
||||||
|
"values": {
|
||||||
|
"uptime": time.time() - psutil.get_boot_time()
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
|
||||||
|
time.sleep(interval)
|
||||||
|
|
@ -1,143 +0,0 @@
|
|||||||
#!/usr/bin/env python2
|
|
||||||
|
|
||||||
import zmq, msgpack, time, psutil, yaml, os
|
|
||||||
|
|
||||||
ctx = zmq.Context()
|
|
||||||
|
|
||||||
sock = ctx.socket(zmq.PUSH)
|
|
||||||
sock.connect("ipc:///tmp/cstatsd")
|
|
||||||
|
|
||||||
with open("config/machine.yaml", "r") as cfile:
|
|
||||||
config = yaml.safe_load(cfile)
|
|
||||||
|
|
||||||
interval = config["interval"]
|
|
||||||
old_net_data = {}
|
|
||||||
|
|
||||||
while True:
|
|
||||||
load_avgs = os.getloadavg()
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "load_average",
|
|
||||||
"unit": "",
|
|
||||||
"values": {
|
|
||||||
"1m": load_avgs[0],
|
|
||||||
"5m": load_avgs[1],
|
|
||||||
"15m": load_avgs[2]
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
cpu_loads = psutil.cpu_percent(percpu=True)
|
|
||||||
|
|
||||||
for i in xrange(0, len(cpu_loads)):
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "cpu",
|
|
||||||
"unit": "core%d" % (i + 1),
|
|
||||||
"values": {
|
|
||||||
"load": cpu_loads[i]
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
for drive in config["drives"]:
|
|
||||||
drive_data = psutil.disk_usage(drive)
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "disk",
|
|
||||||
"unit": drive,
|
|
||||||
"values": {
|
|
||||||
"total": drive_data.total,
|
|
||||||
"used": drive_data.used,
|
|
||||||
"free": drive_data.free,
|
|
||||||
"used_percentage": drive_data.percent
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
ram_data = psutil.virtual_memory()
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "memory",
|
|
||||||
"unit": "physical",
|
|
||||||
"values": {
|
|
||||||
"total": ram_data.total,
|
|
||||||
"used": ram_data.used,
|
|
||||||
"free": ram_data.available,
|
|
||||||
"used_percentage": ram_data.percent,
|
|
||||||
"buffers": ram_data.buffers,
|
|
||||||
"cache": ram_data.cached
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
swap_data = psutil.virtual_memory()
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "memory",
|
|
||||||
"unit": "swap",
|
|
||||||
"values": {
|
|
||||||
"total": swap_data.total,
|
|
||||||
"used": swap_data.used,
|
|
||||||
"free": swap_data.free,
|
|
||||||
"used_percentage": swap_data.percent
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
net_data = psutil.net_io_counters(pernic=True)
|
|
||||||
for nic, data in net_data.iteritems():
|
|
||||||
try:
|
|
||||||
old_in_b = old_net_data[nic].bytes_recv
|
|
||||||
old_out_b = old_net_data[nic].bytes_sent
|
|
||||||
old_in_p = old_net_data[nic].packets_recv
|
|
||||||
old_out_p = old_net_data[nic].packets_sent
|
|
||||||
except KeyError, e:
|
|
||||||
# No old data yet, first run? Save and skip to next...
|
|
||||||
old_net_data[nic] = data
|
|
||||||
continue
|
|
||||||
|
|
||||||
diff_in_b = data.bytes_recv - old_in_b
|
|
||||||
diff_out_b = data.bytes_sent - old_out_b
|
|
||||||
diff_in_p = data.packets_recv - old_in_p
|
|
||||||
diff_out_p = data.packets_sent - old_out_p
|
|
||||||
|
|
||||||
if diff_in_b < 0:
|
|
||||||
diff_in_b = 0
|
|
||||||
|
|
||||||
if diff_out_b < 0:
|
|
||||||
diff_out_b = 0
|
|
||||||
|
|
||||||
if diff_in_p < 0:
|
|
||||||
diff_in_p = 0
|
|
||||||
|
|
||||||
if diff_out_p < 0:
|
|
||||||
diff_out_p = 0
|
|
||||||
|
|
||||||
old_net_data[nic] = data
|
|
||||||
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "network",
|
|
||||||
"unit": nic,
|
|
||||||
"values": {
|
|
||||||
"bps_in": diff_in_b / interval,
|
|
||||||
"bps_out": diff_out_b / interval,
|
|
||||||
"pps_in": diff_in_p / interval,
|
|
||||||
"pps_out": diff_out_p / interval
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
sock.send(msgpack.packb({
|
|
||||||
"service": "machine",
|
|
||||||
"msg_type": "value",
|
|
||||||
"resource_type": "uptime",
|
|
||||||
"unit": "",
|
|
||||||
"values": {
|
|
||||||
"uptime": time.time() - psutil.get_boot_time()
|
|
||||||
}
|
|
||||||
}))
|
|
||||||
|
|
||||||
time.sleep(interval)
|
|
||||||
|
|
@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
|
import zmq, msgpack, time, yaml, psutil, fnmatch
|
||||||
|
|
||||||
|
ctx = zmq.Context()
|
||||||
|
|
||||||
|
sock = ctx.socket(zmq.PUSH)
|
||||||
|
sock.connect("ipc:///tmp/cstatsd")
|
||||||
|
|
||||||
|
with open("config/processes.yaml", "r") as cfile:
|
||||||
|
config = yaml.safe_load(cfile)
|
||||||
|
|
||||||
|
interval = config["interval"]
|
||||||
|
|
||||||
|
old_status = {}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
all_procs = psutil.get_process_list()
|
||||||
|
|
||||||
|
for service_name, patterns in config["processes"].iteritems():
|
||||||
|
matching = []
|
||||||
|
for proc in all_procs: # Can't use filter() because of exceptions...
|
||||||
|
try:
|
||||||
|
if len(proc.cmdline) > 0 and fnmatch.fnmatch(proc.cmdline[0], patterns["name"]):
|
||||||
|
failed = False
|
||||||
|
try:
|
||||||
|
for arg, pattern in patterns["args"].iteritems():
|
||||||
|
try:
|
||||||
|
if len(proc.cmdline) < (arg + 1) or not fnmatch.fnmatch(proc.cmdline[arg], pattern):
|
||||||
|
failed = True
|
||||||
|
except KeyError, e:
|
||||||
|
pass
|
||||||
|
except KeyError, e:
|
||||||
|
pass
|
||||||
|
if failed == False:
|
||||||
|
matching.append(proc)
|
||||||
|
except psutil._error.NoSuchProcess, e:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if len(matching) > 0:
|
||||||
|
up = True
|
||||||
|
else:
|
||||||
|
up = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
if up == old_status[service_name]:
|
||||||
|
send_notice = False
|
||||||
|
else:
|
||||||
|
send_notice = True
|
||||||
|
initial = False
|
||||||
|
except KeyError, e:
|
||||||
|
send_notice = True
|
||||||
|
initial = True
|
||||||
|
|
||||||
|
old_status[service_name] = up
|
||||||
|
|
||||||
|
if send_notice:
|
||||||
|
if up:
|
||||||
|
msg_type = "up"
|
||||||
|
else:
|
||||||
|
msg_type = "down"
|
||||||
|
|
||||||
|
sock.send(msgpack.packb({
|
||||||
|
"service": "process",
|
||||||
|
"msg_type": msg_type,
|
||||||
|
"unit": service_name,
|
||||||
|
"initial": initial
|
||||||
|
}))
|
||||||
|
|
||||||
|
|
||||||
|
time.sleep(interval)
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# You need squeeze-backports if you run this on squeeze!
|
||||||
|
apt-get install -y libzmq-dev libffi-dev build-essential python python-dev
|
||||||
|
pip install pyzmq msgpack-python pynacl pyyaml psutil
|
@ -0,0 +1,4 @@
|
|||||||
|
# Backports: echo "deb http://backports.debian.org/debian-backports squeeze-backports main" >> /etc/apt/sources.list && apt-get update && apt-get upgrade
|
||||||
|
apt-get install -y python python-dev && wget cryto.net/~joepie91/pipfix.sh && chmod +x pipfix.sh && ./pipfix.sh
|
||||||
|
adduser --system --shell /bin/bash --group monitor && apt-get install -y git; su -c "cd ~; git clone https://github.com/joepie91/cryto-status.git" monitor && /home/monitor/cryto-status/deps.sh
|
||||||
|
# Replace libzmq with a manually compiled version...: wget http://download.zeromq.org/zeromq-4.0.3.tar.gz; tar -xzvf zeromq-4.0.3.tar.gz; cd zeromq-4.0.3; ./configure; make; make install; ldconfig
|
@ -0,0 +1,54 @@
|
|||||||
|
* allow comments in (parentheses) in units, and ignore these when matching against an alarm pattern...
|
||||||
|
* web interface (angularjs)
|
||||||
|
* separate alarm and IRC logic
|
||||||
|
* monitor inodes
|
||||||
|
* watchdog on slave and master -> should send WARN notifications
|
||||||
|
* notifications (text, arbitrary-serialized-data as attachment, DEBUG/INFO/WARN/ERR/CRIT)
|
||||||
|
* consider redundancy - can already connect multiple masters through pubsub, how to deal with duplicate processing checking?
|
||||||
|
|
||||||
|
cprocessd:
|
||||||
|
-> subscribe to ccollectd
|
||||||
|
-> debug switch for outputting all to terminal
|
||||||
|
-> keep up/down state
|
||||||
|
-> keep last-value state (resource usage)
|
||||||
|
-> keep track of persistent downtimes (down for more than X time, as configured in config file)
|
||||||
|
-> alarms (move this from the IRC bot to cprocessd)
|
||||||
|
-> classify message importance
|
||||||
|
-> cprocessd-stream socket, PUB that just streams processed data
|
||||||
|
-> cprocessd-query socket, REP that responds to queries
|
||||||
|
-> server-status
|
||||||
|
-> down-list
|
||||||
|
-> last-value
|
||||||
|
-> server-list
|
||||||
|
-> service-list
|
||||||
|
|
||||||
|
cmaild:
|
||||||
|
-> use marrow.mailer
|
||||||
|
-> receives data from cprocessd-stream
|
||||||
|
-> sends e-mails for configured importance levels
|
||||||
|
|
||||||
|
cbotd:
|
||||||
|
-> currently named 'alert'
|
||||||
|
-> receives data from cprocessd-stream
|
||||||
|
-> IRC bot
|
||||||
|
-> posts alerts to specified IRC channels, depending on minimum severity level configured for that channel (ie. INFO for #cryto-network but ERR for #crytocc)
|
||||||
|
|
||||||
|
csmsd:
|
||||||
|
-> sends SMS for (critical) alerts
|
||||||
|
-> receives data from cprocessd-stream
|
||||||
|
-> Twilio? does a provider-neutral API exist? might need an extra abstraction...
|
||||||
|
|
||||||
|
cwebd:
|
||||||
|
-> offers web interface with streaming status data
|
||||||
|
-> publicly accessible and password-protected
|
||||||
|
-> streaming data from cprocessd-stream
|
||||||
|
-> on-pageload state from cprocessd-query (including 'current downtimes')
|
||||||
|
-> tornado+zmq ioloop, http://zeromq.github.io/pyzmq/eventloop.html
|
||||||
|
-> web dashboard
|
||||||
|
-> AngularJS
|
||||||
|
-> fancy graphs (via AngularJS? idk if a directive exists for this)
|
||||||
|
-> show downtimes as well as live per-machine stats
|
||||||
|
-> also show overview of all machines in a grid, color-coded for average load of all resources
|
||||||
|
-> historical up/down data
|
||||||
|
-> sqlite storage? single concurrent write, so should work
|
||||||
|
-> perhaps letting people sign up for e-mail alerts is an option? to-inbox will be tricky here
|
Loading…
Reference in New Issue