Compare commits
No commits in common. 'develop' and 'master' have entirely different histories.
@ -1,304 +0,0 @@
|
|||||||
#!/usr/bin/env python2
# IRC alert bot: subscribes to the local stats distributor and relays
# up/down/blip/alarm events to IRC channels.

import socket, yaml, random, zmq, msgpack, time, uuid, fnmatch
import cPickle as pickle

# One shared zmq context for every socket in this process.
ctx = zmq.Context()

with open("config.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

# Alarm rules persist across restarts in rules.pickle; start empty if the
# file does not exist yet.
# NOTE(review): pickle.load is only safe here as long as rules.pickle is
# written exclusively by this process (it is, in receive_message).
try:
    with open("rules.pickle", "r") as pfile:
        rules = pickle.load(pfile)
except IOError, e:
    rules = {}

# Subscribe (unfiltered) to the distributor's PUB socket on port 8998.
fetcher = ctx.socket(zmq.SUB)
fetcher.setsockopt(zmq.SUBSCRIBE, "")
fetcher.connect("tcp://127.0.0.1:8998")
|
|
||||||
|
|
||||||
class Bot(object):
    """Minimal IRC client that announces monitoring events.

    Reads IRC traffic from a plain TCP socket and monitoring messages from a
    zmq SUB socket (`subsock`), multiplexing both with zmq.select(). Admins
    can add threshold rules via the !addrule command; matching incoming
    "value" messages trigger alarm notifications in all joined channels.
    """

    def __init__(self, hosts, port, nickname, realname, channels, admins, subsock):
        self.hosts = hosts            # candidate IRC servers; one picked at random per connect
        self.port = port
        self.nickname = nickname
        self.realname = realname
        self.channels = channels      # channels to join and broadcast to
        self.admins = admins          # nicknames allowed to use !addrule
        self.subsock = subsock        # zmq SUB socket carrying stats messages
        self.connected = False        # set once the server confirms registration
        self.last_down = {}           # "host!service.unit" -> timestamp of last "down"
        self.known_alarms = {}        # alarm key -> activation timestamp, or False if inactive

        # Numeric 422 (no MOTD) / 376 (end of MOTD) both mean registration
        # finished, so either one triggers the channel joins.
        self.command_map = {
            "422": self.join_all_channels,
            "376": self.join_all_channels,
            "PRIVMSG": self.receive_message
        }

    def split_irc(self, message):
        """Split a raw IRC line into [prefix+command, params..., trailing].

        The trailing parameter (after the first ":") is kept as one element
        with embedded spaces preserved.
        """
        if message[0] == ":":
            prefix = ":"
            message = message[1:]
        else:
            prefix = ""

        if ":" in message:
            rest, last = message.split(":", 1)
            parts = rest.strip().split() + [last]
        else:
            parts = message.split()

        parts[0] = prefix + parts[0]
        return parts

    def run(self):
        """Main loop: connect to a random host, then pump IRC and zmq traffic.

        Never returns; reconnects on socket errors or disconnects.
        """
        while True: # Connect loop
            host = random.choice(self.hosts)

            self.sock = socket.socket()
            try:
                self.sock.connect((host, self.port))
            except socket.error, e:
                continue # Reconnect

            self.send_raw("NICK %s" % self.nickname)
            self.sock.send("USER %s 0 0 :%s\r\n" % (self.nickname, self.realname))

            buff = ""
            while True: # Read loop
                # zmq.select accepts both zmq sockets and plain fds/sockets.
                r, w, x = zmq.select([self.sock, self.subsock], [], [])

                for s in r:
                    if s == self.sock.fileno():
                        try:
                            recvdata = self.sock.recv(1024)
                        except socket.error, e:
                            break # Something went wrong, reconnect...

                        if len(recvdata) == 0:
                            break # We have disconnected...

                        # Accumulate and split on newlines; the last element is
                        # an incomplete line kept for the next recv.
                        buff += recvdata
                        messages = buff.split("\n")
                        buff = messages.pop()

                        for message in messages:
                            self.process_message(self.split_irc(message.strip("\r")))
                        # NOTE(review): the `break`s above only exit the `for s
                        # in r` loop, not the read loop, so a disconnect does
                        # not actually reach the connect loop — confirm intent.
                    elif self.subsock.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
                        # Process incoming data from the subscribe socket...
                        message = msgpack.unpackb(s.recv())
                        self.process_stats(message)

    def send_raw(self, message):
        """Send one raw IRC line, appending the required CRLF."""
        self.sock.send("%s\r\n" % message)

    def send_message(self, recipient, message):
        """PRIVMSG `recipient`; silently dropped until registration completed."""
        if self.connected == True:
            self.send_raw("PRIVMSG %s :%s" % (recipient, message))

    def send_all(self, message):
        """Broadcast `message` to every configured channel."""
        for channel in self.channels:
            self.send_message(channel, message)

    def join(self, channel):
        self.send_raw("JOIN %s" % channel)

    def join_all_channels(self, message):
        """Handler for numerics 376/422: mark connected and join all channels."""
        self.connected = True
        for channel in self.channels:
            self.join(channel)

    def receive_message(self, message):
        """PRIVMSG handler; implements the admin-only !addrule command.

        Syntax: !addrule <host!service.resource:unit.attribute> <op> <value>
        where value may carry a k/m/g/t binary suffix. The new rule is stored
        in the global `rules` dict and persisted to rules.pickle.
        """
        args = message[3].split()
        sender = message[0][1:].split("!", 1)[0]   # nick part of the prefix
        channel = message[2]

        try:
            if sender in self.admins:
                if args[0] == "!addrule":
                    target, rel, value = args[1:4]
                    target = self.parse_target(target)

                    # NOTE(review): indentation reconstructed — the float()
                    # conversion is placed inside the suffix branch since
                    # `unit` is only bound there; process_stats applies
                    # float() to the stored value in either case.
                    if value[-1].lower() in ("k", "m", "g", "t"):
                        unit = value[-1].lower()
                        value = value[:-1]
                        value = float(value)
                        # 1024^1..4 for k/m/g/t respectively.
                        value = value * (1024 ** (("k", "m", "g", "t").index(unit) + 1))

                    rule_id = uuid.uuid4()
                    rules[rule_id] = {
                        "target": target,
                        "operator": rel,
                        "value": value
                    }

                    with open("rules.pickle", "w") as pfile:
                        pickle.dump(rules, pfile)

                    self.send_message(channel, "Added rule for %s with ID %s." % (args[1], rule_id))
        except Exception, e:
            # Report any parse/command error back to the channel.
            self.send_message(channel, str(e))

    def parse_target(self, target):
        """Parse 'host!service.resource:unit.attribute' into a dict.

        A literal "*" in any segment becomes True, meaning "match anything"
        (checked with `is True` in process_stats before fnmatch is applied).
        """
        host, rest = target.split("!", 1)
        service, rest = rest.split(".", 1)
        resource, rest = rest.split(":", 1)
        unit, attribute = rest.split(".", 1)
        # TODO: unit = unit.split("(", 1)[0].strip() # Allow () for comments
        if host == "*":
            host = True
        if service == "*":
            service = True
        if attribute == "*":
            attribute = True
        if resource == "*":
            resource = True
        if unit == "*":
            unit = True
        return {
            "host": host,
            "service": service,
            "resource": resource,
            "unit": unit,
            "attribute": attribute
        }

    def format_time_duration(self, seconds):
        """Render a second count as e.g. '1 days, 2 hours, 3 minutes, 4 seconds'.

        Durations under one second are reported as 1 second.
        """
        # http://stackoverflow.com/a/20222351/1332715
        days, rem = divmod(seconds, 86400)
        hours, rem = divmod(rem, 3600)
        minutes, seconds = divmod(rem, 60)
        if seconds < 1:
            seconds = 1
        locals_ = locals()
        magnitudes_str = ("{n} {magnitude}".format(n=int(locals_[magnitude]), magnitude=magnitude) for magnitude in ("days", "hours", "minutes", "seconds") if locals_[magnitude])
        return ", ".join(magnitudes_str)

    def process_stats(self, message):
        """Dispatch one monitoring message (up/down/blip/value) to IRC.

        "value" messages are evaluated against every stored rule; matches are
        forwarded to trigger_alarm() with the comparison outcome.
        """
        data = message["message"]
        data["host"] = message["host"]

        if data["msg_type"] == "up" and data["initial"] == True:
            return # We don't need to say what is up, initially...

        # TODO: Duration
        if data["msg_type"] == "up":
            try:
                data["duration"] = self.format_time_duration(time.time() - self.last_down["%(host)s!%(service)s.%(unit)s" % data])
            except KeyError, e:
                # No recorded downtime (e.g. bot restarted while service was down).
                data["duration"] = "0 seconds"
            self.send_all("\x02\x030,3 [ UP ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is now back up. It was down for %(duration)s." % data)
        elif data["msg_type"] == "down":
            self.last_down["%(host)s!%(service)s.%(unit)s" % data] = time.time()
            self.send_all("\x02\x030,4 [ DOWN ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that \x036%(unit)s\x03 is \x02down!\x02" % data)
        elif data["msg_type"] == "blip":
            self.send_all("\x02\x030,7 [ WARNING ] \x03\x02 Service \x032%(service)s\x03 on host \x037%(host)s\x03 reports that a blip occurred for \x036%(unit)s\x03!" % data)
        elif data["msg_type"] == "value":
            for rule_id, rule in rules.iteritems():
                check_vals = {
                    "host": [data["host"]],
                    "service": [data["service"]],
                    "resource": [data["resource_type"]],
                    "unit": [data["unit"]]
                }

                # A rule segment of True is a wildcard; otherwise fnmatch.
                failed = False
                for segment in ("host", "service", "resource", "unit"):
                    for val in check_vals[segment]:
                        if rule["target"][segment] is not True and not fnmatch.fnmatch(val, rule["target"][segment]):
                            failed = True
                            break
                if failed:
                    continue # Skip to next

                # We haven't broken out in the past bit of code, so we're still matching the pattern...
                eligible_keys = [key for key in data["values"].keys() if fnmatch.fnmatch(key, rule["target"]["attribute"])]

                for key in eligible_keys:
                    value = data["values"][key]
                    rule_value = float(rule["value"])
                    operator = rule["operator"]

                    if operator == "=":
                        alarm = (value == rule_value)
                    elif operator == ">":
                        alarm = (value > rule_value)
                    elif operator == "<":
                        alarm = (value < rule_value)
                    elif operator == ">=":
                        alarm = (value >= rule_value)
                    elif operator == "<=":
                        alarm = (value <= rule_value)
                    elif operator == "!=":
                        alarm = (value != rule_value)
                    else:
                        # Unknown operator: never alarms.
                        alarm = False

                    self.trigger_alarm(rule_id, data, alarm, value, key)

    def trigger_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
        """Edge-detect alarm state changes so only transitions are announced.

        known_alarms maps a per-(rule, host, unit, attribute) key to the
        activation timestamp when active, or False when inactive.
        """
        key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)

        if key not in self.known_alarms:
            if active:
                self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
                self.known_alarms[key] = time.time()
            else:
                self.known_alarms[key] = False
        else:
            if self.known_alarms[key] == False and active:
                # Alarm activated
                self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
                self.known_alarms[key] = time.time()
            elif self.known_alarms[key] != False and not active:
                # Alarm deactivated
                self.transmit_alarm(rule_id, data, active, offending_value, offending_key)
                self.known_alarms[key] = False

    def transmit_alarm(self, rule_id, data, active, offending_value=None, offending_key=None):
        """Format and broadcast an ALARM or SOLVED notification to all channels."""
        # At this point, we're sure that we want to notify...
        rule_target = rules[rule_id]["target"].copy()
        # Render wildcard segments (stored as True) back to "*" for display.
        for k, v in rule_target.iteritems():
            if v is True:
                rule_target[k] = "*"

        rule_pattern = "%(host)s!%(service)s.%(resource)s:%(unit)s.%(attribute)s" % rule_target

        info = {
            "host": data["host"],
            "rule_id": rule_id,
            "rule_pattern": rule_pattern
        }

        if not active:
            # Recover the activation timestamp to compute how long it was active.
            key = "%s/%s/%s/%s" % (rule_id, data["host"], data["unit"], offending_key)
            try:
                info["duration"] = self.format_time_duration(time.time() - self.known_alarms[key])
            except KeyError, e:
                info["duration"] = "0 seconds"
            info["unit"] = data["unit"]
            info["attribute"] = offending_key

            self.send_all("\x02\x030,3 [ SOLVED ] \x03\x02 Host \x037%(host)s\x03 reports that the alarm for rule %(rule_id)s (\x036%(rule_pattern)s\x03) was resolved for \x034%(unit)s\x03.\x034%(attribute)s\x03. It was active for %(duration)s." % info)
        else:
            info["value"] = offending_value
            info["spec"] = "%s %s" % (rules[rule_id]["operator"], rules[rule_id]["value"])
            info["unit"] = data["unit"]
            info["attribute"] = offending_key

            self.send_all("\x02\x030,7 [ ALARM ] \x03\x02 Host \x037%(host)s\x03 reports that an alarm was triggered for rule %(rule_id)s (\x036%(rule_pattern)s\x03). The reported value was\x034 %(value)s\x03 for\x034 %(unit)s\x03.\x034%(attribute)s\x03 , triggering the \x032%(spec)s\x03 condition." % info)

    def process_message(self, message):
        """Route one parsed IRC line: answer PINGs, else dispatch via command_map."""
        if message[0].upper() == "PING":
            self.send_raw("PONG %s" % message[1])
        else:
            try:
                self.command_map[message[1].upper()](message)
            except KeyError, e:
                # Command/numeric we don't handle — ignore.
                pass
|
|
||||||
|
|
||||||
|
|
||||||
# Wire the bot to the IRC settings from config.yaml and the zmq subscriber,
# then run forever.
bot = Bot(config["irc"]["hosts"], config["irc"]["port"], config["irc"]["nickname"], config["irc"]["realname"], config["irc"]["channels"], config["irc"]["admins"], fetcher)
bot.run()
|
|
@ -1,12 +0,0 @@
|
|||||||
irc:
|
|
||||||
hosts:
|
|
||||||
- kerpia.cryto.net
|
|
||||||
- box.cryto.net
|
|
||||||
- arvel.cryto.net
|
|
||||||
port: 6667
|
|
||||||
nickname: StatusBot
|
|
||||||
realname: Cryto System Monitoring Service
|
|
||||||
admins:
|
|
||||||
- joepie91
|
|
||||||
channels:
|
|
||||||
- "#test"
|
|
@ -1,104 +0,0 @@
|
|||||||
#!/usr/bin/env python2
# Master/distributor: polls node endpoints, decrypts their stats with NaCl,
# and republishes everything on a local PUB socket for consumers (e.g. the
# IRC bot). Also heartbeats each node's TCP port.

import zmq, msgpack, yaml, zmqtimer, binascii, nacl, sys, socket
from nacl.public import PublicKey, PrivateKey, Box

ctx = zmq.Context()

# Local fan-out socket; consumers subscribe here.
distributor = ctx.socket(zmq.PUB)
distributor.bind("tcp://127.0.0.1:8998")

poller = zmq.Poller()

with open("config.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

# Our NaCl private key, stored hex-encoded (see the genkey script).
with open("privkey.dat", "r") as f:
    privkey = PrivateKey(binascii.unhexlify(f.read()))

nodes = config["nodes"]
last_node_status = {}   # hostname -> last observed up/down bool
socket_map = {}         # SUB socket -> hostname it belongs to
boxes = {}              # hostname -> NaCl Box for decrypting its messages
|
|
||||||
|
|
||||||
def heartbeat():
    """TCP-probe every configured node and publish status transitions.

    For each node, tries up to config["heartbeat"]["attempts"] connects with
    a shrinking timeout. Publishes "up"/"down" when the status changed since
    the previous round (with initial=True on the very first observation), or
    "blip" when the node is up but needed retries this round.
    Uses module globals: nodes, config, last_node_status, distributor.
    """
    for hostname, node in nodes.iteritems():
        retries = 0
        while retries < config["heartbeat"]["attempts"]:
            try:
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                # Each retry gets a shorter timeout: timeout / (retries + 1).
                s.settimeout(float(config["heartbeat"]["timeout"]) / (retries + 1))
                s.connect((node["ip"], node["port"]))
                s.shutdown(socket.SHUT_RDWR)
                s.close()
                up = True
                break
            except socket.error, e:
                up = False
                retries += 1

        # First observation of this host raises KeyError -> initial=True.
        try:
            status_changed = (up != last_node_status[hostname])
            initial = False
        except KeyError, e:
            status_changed = True
            initial = True

        last_node_status[hostname] = up

        send_message = False
        if status_changed:
            if up:
                msg_type = "up"
                send_message = True
            else:
                msg_type = "down"
                send_message = True
        else:
            # Same status as before, but it took retries to connect: a blip.
            if up and retries > 0:
                msg_type = "blip"
                send_message = True

        if send_message:
            distributor.send(msgpack.packb({
                "host": config["hostname"],
                "message": {
                    "service": "heartbeat",
                    "msg_type": msg_type,
                    "unit": hostname,
                    "initial": initial
                }
            }))
|
|
||||||
|
|
||||||
# Schedule the heartbeat on the configured interval.
timers = zmqtimer.ZmqTimerManager()
timers.add_timer(zmqtimer.ZmqTimer(config["heartbeat"]["interval"], heartbeat))

# One SUB socket + decryption Box per configured node.
for hostname, node in config["nodes"].iteritems():
    boxes[hostname] = Box(privkey, PublicKey(binascii.unhexlify(node["pubkey"])))
    grabber = ctx.socket(zmq.SUB)
    grabber.setsockopt(zmq.SUBSCRIBE, "")
    grabber.connect(node["endpoint"])
    socket_map[grabber] = hostname
    poller.register(grabber, zmq.POLLIN)

while True:
    # Fire any due timers, then poll no longer than until the next one is due.
    timers.check()
    socks = dict(poller.poll(timers.get_next_interval()))

    for sock in socks:
        if socks[sock] == zmq.POLLIN:
            host = socket_map[sock]
            try:
                # Decrypt with the per-node Box; authenticity failure means
                # the message wasn't sealed with that node's key.
                message = msgpack.unpackb(boxes[host].decrypt(sock.recv()))
            except nacl.exceptions.CryptoError, e:
                # Probably a spoofed message... skip to next socket
                sys.stderr.write("Ignoring message... spoofed? (origin: %s)\n" % host) # FIXME: Use logging module...
                continue
            except Exception, e:
                sys.stderr.write(repr(e) + "\n")
                continue
            # Re-publish in the clear on the local distributor, tagged with
            # the originating host.
            distributor.send(msgpack.packb({
                "host": host,
                "message": message
            }))
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
|||||||
hostname: monitoring.cryto.net
|
|
||||||
|
|
||||||
heartbeat:
|
|
||||||
interval: 5
|
|
||||||
timeout: 1
|
|
||||||
attempts: 3
|
|
||||||
|
|
||||||
nodes:
|
|
||||||
localhost:
|
|
||||||
ip: 127.0.0.1
|
|
||||||
port: 6543
|
|
||||||
endpoint: tcp://127.0.0.1:6543
|
|
||||||
pubkey: bd784ef4065c9bd31627106dc55e26764605a144c6fc45ce93f33cbd19dd7333
|
|
@ -1,15 +0,0 @@
|
|||||||
#!/usr/bin/env python2
# Generate a NaCl keypair and store both halves hex-encoded on disk.
# The private key file is restricted to owner read/write.

import yaml, os, stat, binascii
from nacl.public import PrivateKey

privkey = PrivateKey.generate()
pubkey = privkey.public_key

with open("privkey.dat", "w") as f:
    f.write(binascii.hexlify(str(privkey)))

with open("pubkey.dat", "w") as f:
    f.write(binascii.hexlify(str(pubkey)))

# chmod 600 — only the owner may read the private key.
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
|
@ -1,12 +0,0 @@
|
|||||||
#!/usr/bin/env python2
# Debug consumer: subscribe to the local distributor and print every
# decoded message to stdout.
import zmq, msgpack

ctx = zmq.Context()

fetcher = ctx.socket(zmq.SUB)
fetcher.setsockopt(zmq.SUBSCRIBE, "")
fetcher.connect("tcp://127.0.0.1:8998")

while True:
    message = msgpack.unpackb(fetcher.recv())
    print message
|
|
@ -1,41 +0,0 @@
|
|||||||
import time
|
|
||||||
|
|
||||||
class ZmqTimerManager(object):
    """Schedules a collection of timer objects for use with a zmq poller.

    check() fires any due timers; get_next_interval() returns how many
    milliseconds a poller may sleep before the next timer becomes due.
    Timers must expose check() and get_next_call().
    """

    def __init__(self):
        self.timers = []
        # Absolute timestamp (seconds since epoch) of the earliest pending call.
        self.next_call = 0

    def add_timer(self, timer):
        """Register a timer object exposing check() and get_next_call()."""
        self.timers.append(timer)

    def check(self):
        """Give every timer a chance to fire, but only when something is due."""
        if time.time() > self.next_call:
            for timer in self.timers:
                timer.check()

    def get_next_interval(self):
        """Return milliseconds until the next timer is due (0 if overdue).

        Returns 0 when no timers are registered; previously this raised
        ValueError from min() on an empty sequence.
        """
        if time.time() >= self.next_call:
            if not self.timers:
                return 0
            # Recompute the earliest upcoming call across all timers.
            self.next_call = min(timer.get_next_call() for timer in self.timers)
        # The original duplicated this expression in both branches and
        # special-cased overdue timers; clamping at zero is equivalent.
        return max((self.next_call - time.time()) * 1000, 0)
|
|
||||||
|
|
||||||
class ZmqTimer(object):
    """A recurring timer that invokes `callback` whenever more than
    `interval` seconds have elapsed since the previous invocation."""

    def __init__(self, interval, callback):
        self.interval = interval
        self.callback = callback
        # Timestamp of the most recent firing; 0 means "never fired yet".
        self.last_call = 0

    def check(self):
        """Fire the callback if the timer is due, and restart the clock."""
        deadline = self.last_call + self.interval
        if time.time() > deadline:
            self.callback()
            self.last_call = time.time()

    def get_next_call(self):
        """Absolute timestamp (seconds since epoch) at which this timer is next due."""
        return self.last_call + self.interval
|
|
@ -1,6 +0,0 @@
|
|||||||
#!/bin/bash
# Node bootstrap: generate a NaCl keypair, run the interactive config
# wizard, then print the public key and external IP so the node can be
# registered on the master.
echo "Generating keypair..."
./genkey 2>/dev/null
./bootstrap-config
echo "Your public key: `cat pubkey.dat`"
echo "Server IP: `curl -s http://wtfismyip.com/text`" 2>/dev/null
|
|
@ -1,86 +0,0 @@
|
|||||||
#!/usr/bin/env python2
# Interactive wizard: asks which ports, processes and disks to monitor,
# then writes the config/*.yaml files from their *.example templates and
# records which stats modules cstatsd should autostart.

import yaml, sys

master_pubkey = raw_input("Public key of the master server: ")

print "You'll now be asked to configure ports to check. If you don't want to configure any ports, just hit enter without entering any information."

ports = {}   # port number -> service name

while True:
    port = raw_input("Port number: ")
    if port.strip() == "":
        break
    service_name = raw_input("Service name for port %s: " % port)
    ports[int(port)] = service_name

print "The same thing, except now for processes to check. Just hit enter without entering any information when you're done; the same goes for the argument list. As a wildcard, you can use *"

services = {}   # service name -> {"name": process pattern, "args": {position: pattern}}

while True:
    service_name = raw_input("Service name: ")

    if service_name.strip() == "":
        break

    process_name = raw_input("Process name: ")

    # Argument patterns keyed by 1-based position on the command line.
    args = {}
    argnum = 1
    while True:
        arg = raw_input("Argument %d: " % argnum)
        if arg.strip() == "":
            break
        args[argnum] = arg
        argnum += 1

    services[service_name] = {
        "name": process_name,
        "args": args
    }

print "Now enter any disk devices you wish to monitor. Leave empty and hit enter when done."

disks = []

while True:
    device_name = raw_input("Device name: ")
    if device_name.strip() == "":
        break
    disks.append(device_name)

# Write config files...

# Modules cstatsd will autostart; stats-machine is always enabled.
modules = []

modules.append("stats-machine")
with open("config/machine.yaml.example", "r") as ef:
    with open("config/machine.yaml", "w") as ff:
        data = yaml.safe_load(ef.read())
        data["drives"] = disks
        ff.write(yaml.dump(data))

if len(ports) > 0:
    modules.append("stats-ports")
    with open("config/ports.yaml.example", "r") as ef:
        with open("config/ports.yaml", "w") as ff:
            data = yaml.safe_load(ef.read())
            data["ports"] = ports
            ff.write(yaml.dump(data))

if len(services) > 0:
    modules.append("stats-processes")
    with open("config/processes.yaml.example", "r") as ef:
        with open("config/processes.yaml", "w") as ff:
            data = yaml.safe_load(ef.read())
            data["processes"] = services
            ff.write(yaml.dump(data))

with open("config/cstatsd.yaml.example", "r") as ef:
    with open("config/cstatsd.yaml", "w") as ff:
        data = yaml.safe_load(ef.read())
        data["pubkey"] = master_pubkey
        data["autostart"] = modules
        ff.write(yaml.dump(data))
|
|
@ -1,7 +0,0 @@
|
|||||||
endpoint: tcp://*:6543
|
|
||||||
pubkey: a266a0634790a79c6934385892f7c377d35b8f03b9c6ac7d5bfed4a94f93ba65
|
|
||||||
|
|
||||||
autostart:
|
|
||||||
- stats-processes
|
|
||||||
- stats-ports
|
|
||||||
- stats-machine
|
|
@ -1,5 +1,4 @@
|
|||||||
interval: 1
|
interval: 1
|
||||||
|
|
||||||
drives:
|
drives:
|
||||||
- /dev/sda1
|
- /
|
||||||
- /dev/sdb1
|
|
||||||
|
@ -1,15 +0,0 @@
|
|||||||
interval: 5
|
|
||||||
|
|
||||||
processes:
|
|
||||||
radiotray:
|
|
||||||
name: '*python*'
|
|
||||||
args:
|
|
||||||
1: /usr/bin/radiotray
|
|
||||||
|
|
||||||
guake:
|
|
||||||
name: '*python*'
|
|
||||||
args:
|
|
||||||
1: /usr/local/bin/guake
|
|
||||||
|
|
||||||
keepassx:
|
|
||||||
name: keepassx
|
|
@ -1,43 +1,12 @@
|
|||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
|
|
||||||
import zmq, yaml, binascii, nacl, sys, subprocess, os
|
import zmq, msgpack
|
||||||
from nacl.public import PublicKey, PrivateKey, Box
|
|
||||||
|
|
||||||
basedir = os.path.dirname(os.path.realpath(__file__))
|
|
||||||
|
|
||||||
with open("cstatsd.pid", "w") as pidfile:
|
|
||||||
pidfile.write(str(os.getpid()))
|
|
||||||
|
|
||||||
ctx = zmq.Context()
|
ctx = zmq.Context()
|
||||||
|
|
||||||
with open("config/cstatsd.yaml", "r") as cfile:
|
|
||||||
config = yaml.safe_load(cfile)
|
|
||||||
|
|
||||||
pubkey = PublicKey(binascii.unhexlify(config["pubkey"]))
|
|
||||||
|
|
||||||
with open("privkey.dat", "r") as f:
|
|
||||||
privkey = PrivateKey(binascii.unhexlify(f.read()))
|
|
||||||
|
|
||||||
box = Box(privkey, pubkey)
|
|
||||||
|
|
||||||
collector = ctx.socket(zmq.PULL)
|
collector = ctx.socket(zmq.PULL)
|
||||||
collector.bind("ipc:///tmp/cstatsd")
|
collector.bind("ipc:///tmp/cstatsd")
|
||||||
|
|
||||||
shipper = ctx.socket(zmq.PUB)
|
|
||||||
shipper.bind(config["endpoint"])
|
|
||||||
|
|
||||||
try:
|
|
||||||
disable_autostart = (sys.argv[1] == "--disable-autostart")
|
|
||||||
except:
|
|
||||||
disable_autostart = False
|
|
||||||
|
|
||||||
if disable_autostart == False:
|
|
||||||
with open("/dev/null", "w+") as stfu:
|
|
||||||
for script in config["autostart"]:
|
|
||||||
print os.path.join(basedir, script)
|
|
||||||
subprocess.Popen([os.path.join(basedir, script)], stdout=stfu, stderr=stfu)
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
message = collector.recv()
|
message = msgpack.unpackb(collector.recv())
|
||||||
nonce = nacl.utils.random(Box.NONCE_SIZE)
|
print message
|
||||||
shipper.send(box.encrypt(message, nonce))
|
|
||||||
|
@ -1,15 +0,0 @@
|
|||||||
#!/usr/bin/env python2
# Node-side copy of the keypair generator: create a NaCl keypair and store
# both halves hex-encoded; private key readable by owner only.

import yaml, os, stat, binascii
from nacl.public import PrivateKey

privkey = PrivateKey.generate()
pubkey = privkey.public_key

with open("privkey.dat", "w") as f:
    f.write(binascii.hexlify(str(privkey)))

with open("pubkey.dat", "w") as f:
    f.write(binascii.hexlify(str(pubkey)))

# chmod 600 — only the owner may read the private key.
os.chmod("privkey.dat", stat.S_IRUSR | stat.S_IWUSR)
|
|
@ -1,5 +0,0 @@
|
|||||||
#!/bin/bash
# Stop cstatsd: kill its autostarted child collectors first (by parent
# PID), then the daemon itself. The PID was written by cstatsd on startup.

PID=`cat cstatsd.pid`
pkill -P $PID
kill $PID
|
|
@ -1,224 +0,0 @@
|
|||||||
#!/usr/bin/env python2
# stats-machine: samples machine metrics (load, per-core CPU, disk usage and
# IO rates, RAM, swap, per-NIC traffic, uptime) via psutil every `interval`
# seconds and pushes each as a msgpack blob to the local cstatsd collector
# over ipc:///tmp/cstatsd. Contains several OpenVZ workarounds.

import zmq, msgpack, time, psutil, yaml, os, subprocess
from collections import namedtuple

# Horrible hack to make check_output exist in 2.6
# http://stackoverflow.com/a/13160748/1332715
if "check_output" not in dir( subprocess ): # duck punch it in!
    def f(*popenargs, **kwargs):
        if 'stdout' in kwargs:
            raise ValueError('stdout argument not allowed, it will be overridden.')
        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            raise subprocess.CalledProcessError(retcode, cmd)
        return output
    subprocess.check_output = f

ctx = zmq.Context()

# PUSH socket feeding the local cstatsd daemon.
sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")

with open("config/machine.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

interval = config["interval"]   # seconds between sampling rounds
old_net_data = {}               # previous per-NIC counters, for rate deltas

disk_map = {}                   # device name -> partition info
last_io_data = {}               # previous per-drive IO counters

# OpenVZ burst containers (user_beancounters present, vestat absent) need the
# 'free'-based memory fallback below; FakeRam mimics psutil's memory tuple.
if os.path.exists("/proc/user_beancounters") and not os.path.exists("/proc/vz/vestat"):
    openvz_burst = True
    FakeRam = namedtuple("FakeRam", ["total", "used", "available", "percent", "buffers", "cached"])
else:
    openvz_burst = False

for disk in psutil.disk_partitions():
    disk_map[disk.device] = disk

if len(disk_map) == 0:
    # We're probably on OpenVZ, so /proc/partitions doesn't exist. Fall back to 'df'.
    FakeDisk = namedtuple("FakeDisk", ["device", "mountpoint"])
    for line in subprocess.check_output(["df"]).splitlines()[1:]:
        device, _, _, _, _, mountpoint = line.split()
        disk_map[device] = FakeDisk(device, mountpoint)

while True:
    # Load averages over 1/5/15 minutes.
    load_avgs = os.getloadavg()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "load_average",
        "unit": "",
        "values": {
            "1m": load_avgs[0],
            "5m": load_avgs[1],
            "15m": load_avgs[2]
        }
    }))

    # Per-core CPU usage percentages since the previous call.
    cpu_loads = psutil.cpu_percent(percpu=True)

    for i in xrange(0, len(cpu_loads)):
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "cpu",
            "unit": "core%d" % (i + 1),
            "values": {
                "load": cpu_loads[i]
            }
        }))

    try:
        io_counters = psutil.disk_io_counters(perdisk=True)
    except IOError, e:
        io_counters = {} # OpenVZ...

    # Usage plus byte/io rates derived from counter deltas per configured drive.
    for drive in config["drives"]:
        drive_data = psutil.disk_usage(disk_map[drive].mountpoint)
        io_data = None

        # IO counters are keyed by short disk name (e.g. "sda"); match the
        # configured device path by suffix.
        for diskname, data in io_counters.iteritems():
            if drive.endswith(diskname):
                io_data = data

        # First sample (or no counters available): report zero rates.
        if io_data is None or drive not in last_io_data:
            read_bps = 0
            write_bps = 0
            read_iops = 0
            write_iops = 0
        else:
            read_bps = (io_data.read_bytes - last_io_data[drive].read_bytes) / interval
            write_bps = (io_data.write_bytes - last_io_data[drive].write_bytes) / interval
            read_iops = (io_data.read_count - last_io_data[drive].read_count) / interval
            write_iops = (io_data.write_count - last_io_data[drive].write_count) / interval

        if io_data is not None:
            last_io_data[drive] = io_data

        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "disk",
            "unit": drive,
            "values": {
                "total": drive_data.total,
                "used": drive_data.used,
                "free": drive_data.free,
                "used_percentage": drive_data.percent,
                "bps_read": read_bps,
                "bps_write": write_bps,
                "iops_read": read_iops,
                "iops_write": write_iops,
            }
        }))

    if openvz_burst:
        # Sigh, OpenVZ... let's use 'free', since that apparently -does- understand OpenVZ.
        lines = subprocess.check_output(["free", "-b"]).splitlines()
        _, ram_total, ram_used, ram_free, _, ram_buffers, ram_cached = lines[1].split()
        _, _, _, ram_available = lines[2].split()
        ram_total = int(ram_total)
        ram_free = int(ram_free)
        ram_buffers = int(ram_buffers)
        ram_cached = int(ram_cached)
        ram_available = int(ram_available)
        ram_used = int(ram_used)
        # Percentage of memory in use, excluding buffers/cache.
        ram_percent = 1.0 * (ram_total - ram_available) / ram_total * 100
        ram_data = FakeRam(ram_total, ram_used, ram_available, ram_percent, ram_buffers, ram_cached)
    else:
        ram_data = psutil.virtual_memory()

    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "physical",
        "values": {
            "total": ram_data.total,
            "used": ram_data.used,
            "free": ram_data.available,
            "used_percentage": ram_data.percent,
            "buffers": ram_data.buffers,
            "cache": ram_data.cached
        }
    }))

    swap_data = psutil.swap_memory()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "swap",
        "values": {
            "total": swap_data.total,
            "used": swap_data.used,
            "free": swap_data.free,
            "used_percentage": swap_data.percent
        }
    }))

    # Per-NIC traffic rates from counter deltas; the first sample for a NIC
    # only primes the stored counters.
    net_data = psutil.net_io_counters(pernic=True)
    for nic, data in net_data.iteritems():
        try:
            old_in_b = old_net_data[nic].bytes_recv
            old_out_b = old_net_data[nic].bytes_sent
            old_in_p = old_net_data[nic].packets_recv
            old_out_p = old_net_data[nic].packets_sent
        except KeyError, e:
            # No old data yet, first run? Save and skip to next...
            old_net_data[nic] = data
            continue

        diff_in_b = data.bytes_recv - old_in_b
        diff_out_b = data.bytes_sent - old_out_b
        diff_in_p = data.packets_recv - old_in_p
        diff_out_p = data.packets_sent - old_out_p

        # Counters can reset (e.g. interface bounce); clamp deltas at zero.
        if diff_in_b < 0:
            diff_in_b = 0

        if diff_out_b < 0:
            diff_out_b = 0

        if diff_in_p < 0:
            diff_in_p = 0

        if diff_out_p < 0:
            diff_out_p = 0

        old_net_data[nic] = data

        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "network",
            "unit": nic,
            "values": {
                "bps_in": diff_in_b / interval,
                "bps_out": diff_out_b / interval,
                "pps_in": diff_in_p / interval,
                "pps_out": diff_out_p / interval
            }
        }))

    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "uptime",
        "unit": "",
        "values": {
            "uptime": time.time() - psutil.get_boot_time()
        }
    }))

    time.sleep(interval)
|
|
||||||
|
|
@ -0,0 +1,143 @@
|
|||||||
|
#!/usr/bin/env python2

# Machine-level stats collector: periodically samples load average, per-core
# CPU usage, disk usage, physical memory, swap, per-NIC network rates and
# uptime via psutil, and pushes each sample as a msgpack-encoded dict to the
# local cstatsd PUSH socket.

import zmq, msgpack, time, psutil, yaml, os

ctx = zmq.Context()

sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")

with open("config/machine.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

# Sampling interval in seconds; also the divisor that turns byte/packet
# counter deltas into per-second rates below.
interval = config["interval"]

# Previous per-NIC counters from the last iteration, keyed by interface
# name, so we can compute traffic deltas between runs.
old_net_data = {}

while True:
    # 1/5/15-minute load averages.
    load_avgs = os.getloadavg()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "load_average",
        "unit": "",
        "values": {
            "1m": load_avgs[0],
            "5m": load_avgs[1],
            "15m": load_avgs[2]
        }
    }))

    # Per-core CPU usage percentages; cores are reported as core1..coreN.
    cpu_loads = psutil.cpu_percent(percpu=True)

    for i, core_load in enumerate(cpu_loads):
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "cpu",
            "unit": "core%d" % (i + 1),
            "values": {
                "load": core_load
            }
        }))

    # Disk usage for every mountpoint listed in the config.
    for drive in config["drives"]:
        drive_data = psutil.disk_usage(drive)
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "disk",
            "unit": drive,
            "values": {
                "total": drive_data.total,
                "used": drive_data.used,
                "free": drive_data.free,
                "used_percentage": drive_data.percent
            }
        }))

    # Physical memory. "free" is reported as psutil's `available` (memory
    # usable without swapping, i.e. including reclaimable buffers/cache),
    # which is more meaningful for alerting than the raw `free` figure.
    ram_data = psutil.virtual_memory()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "physical",
        "values": {
            "total": ram_data.total,
            "used": ram_data.used,
            "free": ram_data.available,
            "used_percentage": ram_data.percent,
            "buffers": ram_data.buffers,
            "cache": ram_data.cached
        }
    }))

    # BUG FIX: this previously called psutil.virtual_memory() again, so the
    # "swap" unit reported physical RAM figures. swap_memory() returns the
    # actual swap statistics (total/used/free/percent).
    swap_data = psutil.swap_memory()
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "memory",
        "unit": "swap",
        "values": {
            "total": swap_data.total,
            "used": swap_data.used,
            "free": swap_data.free,
            "used_percentage": swap_data.percent
        }
    }))

    # Per-interface network rates, derived from the delta of the kernel's
    # cumulative byte/packet counters since the previous iteration.
    net_data = psutil.net_io_counters(pernic=True)

    for nic, data in net_data.iteritems():
        try:
            old_in_b = old_net_data[nic].bytes_recv
            old_out_b = old_net_data[nic].bytes_sent
            old_in_p = old_net_data[nic].packets_recv
            old_out_p = old_net_data[nic].packets_sent
        except KeyError:
            # No old data yet, first run? Save and skip to next...
            old_net_data[nic] = data
            continue

        diff_in_b = data.bytes_recv - old_in_b
        diff_out_b = data.bytes_sent - old_out_b
        diff_in_p = data.packets_recv - old_in_p
        diff_out_p = data.packets_sent - old_out_p

        # Counters can wrap or reset (e.g. interface re-created), which
        # would yield a negative delta; clamp to zero rather than report
        # a bogus huge/negative rate.
        if diff_in_b < 0:
            diff_in_b = 0

        if diff_out_b < 0:
            diff_out_b = 0

        if diff_in_p < 0:
            diff_in_p = 0

        if diff_out_p < 0:
            diff_out_p = 0

        old_net_data[nic] = data

        # NOTE(review): under Python 2 these are integer divisions, so the
        # reported rates are truncated towards zero — presumably acceptable
        # for monitoring granularity; confirm before relying on precision.
        sock.send(msgpack.packb({
            "service": "machine",
            "msg_type": "value",
            "resource_type": "network",
            "unit": nic,
            "values": {
                "bps_in": diff_in_b / interval,
                "bps_out": diff_out_b / interval,
                "pps_in": diff_in_p / interval,
                "pps_out": diff_out_p / interval
            }
        }))

    # Seconds since boot. get_boot_time() is the old psutil spelling
    # (renamed to boot_time() in later psutil releases).
    sock.send(msgpack.packb({
        "service": "machine",
        "msg_type": "value",
        "resource_type": "uptime",
        "unit": "",
        "values": {
            "uptime": time.time() - psutil.get_boot_time()
        }
    }))

    time.sleep(interval)
|
||||||
|
|
@ -1,72 +0,0 @@
|
|||||||
#!/usr/bin/env python2

# Process watchdog: periodically matches running processes against the
# configured name/argument patterns and pushes an up/down notice to the
# local cstatsd PUSH socket whenever a service's status changes.

import zmq, msgpack, time, yaml, psutil, fnmatch

ctx = zmq.Context()

sock = ctx.socket(zmq.PUSH)
sock.connect("ipc:///tmp/cstatsd")

with open("config/processes.yaml", "r") as cfile:
    config = yaml.safe_load(cfile)

# Polling interval in seconds.
interval = config["interval"]

# Last known up/down state per service name; used to only emit notices
# on state *changes* (plus one initial notice per service).
old_status = {}

while True:
    all_procs = psutil.get_process_list()

    for service_name, patterns in config["processes"].iteritems():
        # Collect all processes whose cmdline matches this service's
        # patterns. Can't use filter() because of exceptions...
        matching = []
        for proc in all_procs:
            try:
                if len(proc.cmdline) > 0 and fnmatch.fnmatch(proc.cmdline[0], patterns["name"]):
                    # The executable name matched; now every configured
                    # positional-argument pattern must match too.
                    # BUG FIX: `failed` is only defined inside this branch,
                    # so the final check must live here as well — at the
                    # outer level it would read an undefined or stale value.
                    failed = False
                    try:
                        for arg, pattern in patterns["args"].iteritems():
                            # Argument index out of range counts as a
                            # failed match.
                            if len(proc.cmdline) < (arg + 1) or not fnmatch.fnmatch(proc.cmdline[arg], pattern):
                                failed = True
                    except KeyError:
                        # Service config may omit "args" entirely; a bare
                        # name match is then sufficient.
                        pass
                    if not failed:
                        matching.append(proc)
            except psutil._error.NoSuchProcess:
                # Process exited between listing and inspection; skip it.
                pass

        up = len(matching) > 0

        try:
            # Only notify when the state differs from last time.
            send_notice = (up != old_status[service_name])
            initial = False
        except KeyError:
            # First time we see this service: always send a notice and
            # flag it as the initial observation.
            send_notice = True
            initial = True

        old_status[service_name] = up

        if send_notice:
            if up:
                msg_type = "up"
            else:
                msg_type = "down"

            sock.send(msgpack.packb({
                "service": "process",
                "msg_type": msg_type,
                "unit": service_name,
                "initial": initial
            }))

    time.sleep(interval)
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
|||||||
#!/bin/bash
# Install the system and Python dependencies for the monitoring agents
# (zmq bindings, msgpack, NaCl crypto, YAML config parsing, psutil).
# You need squeeze-backports if you run this on squeeze!

# Native build prerequisites: libzmq/libffi headers and a C toolchain
# are needed to compile pyzmq and pynacl.
apt-get install -y libzmq-dev libffi-dev build-essential python python-dev
pip install pyzmq msgpack-python pynacl pyyaml psutil
|
|
@ -1,4 +0,0 @@
|
|||||||
# One-off provisioning notes for a new monitored machine (Debian squeeze era).
# Backports: echo "deb http://backports.debian.org/debian-backports squeeze-backports main" >> /etc/apt/sources.list && apt-get update && apt-get upgrade

# Install Python and bootstrap a working pip via the pipfix helper script.
apt-get install -y python python-dev && wget cryto.net/~joepie91/pipfix.sh && chmod +x pipfix.sh && ./pipfix.sh

# Create a dedicated "monitor" system user, clone the agent repo into its
# home directory, and install dependencies via deps.sh.
adduser --system --shell /bin/bash --group monitor && apt-get install -y git; su -c "cd ~; git clone https://github.com/joepie91/cryto-status.git" monitor && /home/monitor/cryto-status/deps.sh

# Replace libzmq with a manually compiled version...: wget http://download.zeromq.org/zeromq-4.0.3.tar.gz; tar -xzvf zeromq-4.0.3.tar.gz; cd zeromq-4.0.3; ./configure; make; make install; ldconfig
|
|
@ -1,54 +0,0 @@
|
|||||||
* allow comments in (parentheses) in units, and ignore these when matching against an alarm pattern...
|
|
||||||
* web interface (angularjs)
|
|
||||||
* separate alarm and IRC logic
|
|
||||||
* monitor inodes
|
|
||||||
* watchdog on slave and master -> should send WARN notifications
|
|
||||||
* notifications (text, arbitrary-serialized-data as attachment, DEBUG/INFO/WARN/ERR/CRIT)
|
|
||||||
* consider redundancy - can already connect multiple masters through pubsub, how to deal with duplicate processing checking?
|
|
||||||
|
|
||||||
cprocessd:
|
|
||||||
-> subscribe to ccollectd
|
|
||||||
-> debug switch for outputting all to terminal
|
|
||||||
-> keep up/down state
|
|
||||||
-> keep last-value state (resource usage)
|
|
||||||
-> keep track of persistent downtimes (down for more than X time, as configured in config file)
|
|
||||||
-> alarms (move this from the IRC bot to cprocessd)
|
|
||||||
-> classify message importance
|
|
||||||
-> cprocessd-stream socket, PUB that just streams processed data
|
|
||||||
-> cprocessd-query socket, REP that responds to queries
|
|
||||||
-> server-status
|
|
||||||
-> down-list
|
|
||||||
-> last-value
|
|
||||||
-> server-list
|
|
||||||
-> service-list
|
|
||||||
|
|
||||||
cmaild:
|
|
||||||
-> use marrow.mailer
|
|
||||||
-> receives data from cprocessd-stream
|
|
||||||
-> sends e-mails for configured importance levels
|
|
||||||
|
|
||||||
cbotd:
|
|
||||||
-> currently named 'alert'
|
|
||||||
-> receives data from cprocessd-stream
|
|
||||||
-> IRC bot
|
|
||||||
-> posts alerts to specified IRC channels, depending on the minimum severity level configured for that channel (i.e. INFO for #cryto-network but ERR for #crytocc)
|
|
||||||
|
|
||||||
csmsd:
|
|
||||||
-> sends SMS for (critical) alerts
|
|
||||||
-> receives data from cprocessd-stream
|
|
||||||
-> Twilio? does a provider-neutral API exist? might need an extra abstraction...
|
|
||||||
|
|
||||||
cwebd:
|
|
||||||
-> offers web interface with streaming status data
|
|
||||||
-> publicly accessible and password-protected
|
|
||||||
-> streaming data from cprocessd-stream
|
|
||||||
-> on-pageload state from cprocessd-query (including 'current downtimes')
|
|
||||||
-> tornado+zmq ioloop, http://zeromq.github.io/pyzmq/eventloop.html
|
|
||||||
-> web dashboard
|
|
||||||
-> AngularJS
|
|
||||||
-> fancy graphs (via AngularJS? idk if a directive exists for this)
|
|
||||||
-> show downtimes as well as live per-machine stats
|
|
||||||
-> also show overview of all machines in a grid, color-coded for average load of all resources
|
|
||||||
-> historical up/down data
|
|
||||||
-> sqlite storage? only a single concurrent writer is needed, so it should work
|
|
||||||
-> perhaps letting people sign up for e-mail alerts is an option? to-inbox will be tricky here
|
|
Loading…
Reference in New Issue