diff --git a/parse.py b/parse.py index 12330df..a892e08 100644 --- a/parse.py +++ b/parse.py @@ -1,3 +1,5 @@ +#!/usr/bin/python + import re, argparse, os def show_sorted(dictionary): @@ -28,6 +30,7 @@ urls = {} extensions = {} total_digits = 10 +current_lines = 0 try: ignore_extensions = options['extensions'].split(',') @@ -39,67 +42,73 @@ for logpath in options['logfiles']: log = open(logpath, 'r') for line in log: - ip, hostname, dash, datetime, timezone, method, uri, version, status, size, referer, useragent = line.split(' ', 11) - datetime = datetime[1:] - date = datetime.split(':')[0] - timezone = timezone[:-1] - method = method[1:] - version = version[:-1] - useragent = useragent[1:-2] - referer = referer[1:-1] - filename = uri.split('?')[0] - extension = os.path.splitext(filename)[1][1:] - - if extension not in ignore_extensions: - if hostname not in hosts: - hosts[hostname] = 0 - - if referer not in referers: - referers[referer] = 0 + try: + ip, hostname, dash, datetime, timezone, method, uri, version, status, size, referer, useragent = line.split(' ', 11) + datetime = datetime[1:] + date = datetime.split(':')[0] + timezone = timezone[:-1] + method = method[1:] + version = version[:-1] + useragent = useragent[1:-2] + referer = referer[1:-1] + filename = uri.split('?')[0] + extension = os.path.splitext(filename)[1][1:] - if date not in days: - days[date] = 0 - - if filename not in files: - files[filename] = 0 - - if uri not in urls: - urls[uri] = 0 + if extension not in ignore_extensions: + if hostname not in hosts: + hosts[hostname] = 0 - if extension not in extensions: - extensions[extension] = 0 + if referer not in referers: + referers[referer] = 0 - hosts[hostname] += 1 - referers[referer] += 1 - days[date] += 1 - files[filename] += 1 - urls[uri] += 1 - extensions[extension] += 1 - - print "Top days:" - show_sorted(days) - print "" - - print "Top requested hostnames:" - show_sorted(hosts) - print "" - - print "Top files:" - show_sorted(files) - print "" - - print "Top extensions:" - show_sorted(extensions) - print "" - - print "Top referers:" - show_sorted(referers) - print "" - - print "Top URLs:" - show_sorted(urls) - print "" - - + if date not in days: + days[date] = 0 + + if filename not in files: + files[filename] = 0 + + if uri not in urls: + urls[uri] = 0 + + if extension not in extensions: + extensions[extension] = 0 + + hosts[hostname] += 1 + referers[referer] += 1 + days[date] += 1 + files[filename] += 1 + urls[uri] += 1 + extensions[extension] += 1 + except ValueError: + print "Corrupt log line at line %d, contents: %s" % (current_lines + 1, line[:-1]) + + current_lines += 1 + + if current_lines % 1000 == 0: + print "Processed %d lines." % current_lines except IOError: print "Could not find file %s, ignored entry." % logpath + +print "Top days:" +show_sorted(days) +print "" + +print "Top requested hostnames:" +show_sorted(hosts) +print "" + +print "Top files:" +show_sorted(files) +print "" + +print "Top extensions:" +show_sorted(extensions) +print "" + +print "Top referers:" +show_sorted(referers) +print "" + +print "Top URLs:" +show_sorted(urls) +print ""