From 9203d83c035844f914d35934fdcc7a34fe15c072 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sat, 28 Jun 2014 17:03:43 +0200 Subject: [PATCH] Benchmarking and optimizations --- pwhois | 7 ++++++- pythonwhois/parse.py | 35 +++++++++++++++++++++++++++++++---- test.py | 22 +++++++++++++++++++++- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/pwhois b/pwhois index a1ad45b..09563f4 100755 --- a/pwhois +++ b/pwhois @@ -101,7 +101,12 @@ else: if key in contact_data and contact_data[key] is not None: label = " " + value + (" " * (widest_label - len(value))) + " :" if sys.version_info < (3, 0): - actual_data = unicode(contact_data[key]) + if type(contact_data[key]) == str: + actual_data = contact_data[key].decode("utf-8") + elif type(contact_data[key]) == datetime.datetime: + actual_data = unicode(contact_data[key]) + else: + actual_data = contact_data[key] else: actual_data = str(contact_data[key]) if "\n" in actual_data: # Indent multi-line values properly diff --git a/pythonwhois/parse.py b/pythonwhois/parse.py index 475beea..6a96c06 100644 --- a/pythonwhois/parse.py +++ b/pythonwhois/parse.py @@ -48,6 +48,9 @@ read_dataset("states_au.dat", states_au, 0, 1) read_dataset("states_us.dat", states_us, "abbreviation", "name", is_dict=True) read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True) +def precompile_regexes(source, flags=0): + return [re.compile(regex, flags) for regex in source] + grammar = { "_data": { 'id': ['Domain ID:[ ]*(?P<val>.+)'], @@ -389,6 +392,30 @@ organization_regexes = ( r"\ss\.?a\.?r\.?l\.?($|\s)", ) +grammar["_data"]["id"] = precompile_regexes(grammar["_data"]["id"], re.IGNORECASE) +grammar["_data"]["status"] = precompile_regexes(grammar["_data"]["status"], re.IGNORECASE) +grammar["_data"]["creation_date"] = precompile_regexes(grammar["_data"]["creation_date"], re.IGNORECASE) +grammar["_data"]["expiration_date"] = precompile_regexes(grammar["_data"]["expiration_date"], re.IGNORECASE) 
+grammar["_data"]["updated_date"] = precompile_regexes(grammar["_data"]["updated_date"], re.IGNORECASE) +grammar["_data"]["registrar"] = precompile_regexes(grammar["_data"]["registrar"], re.IGNORECASE) +grammar["_data"]["whois_server"] = precompile_regexes(grammar["_data"]["whois_server"], re.IGNORECASE) +grammar["_data"]["nameservers"] = precompile_regexes(grammar["_data"]["nameservers"], re.IGNORECASE) +grammar["_data"]["emails"] = precompile_regexes(grammar["_data"]["emails"], re.IGNORECASE) + +grammar["_dateformats"] = precompile_regexes(grammar["_dateformats"], re.IGNORECASE) + +registrant_regexes = precompile_regexes(registrant_regexes) +tech_contact_regexes = precompile_regexes(tech_contact_regexes) +billing_contact_regexes = precompile_regexes(billing_contact_regexes) +admin_contact_regexes = precompile_regexes(admin_contact_regexes) +nic_contact_regexes = precompile_regexes(nic_contact_regexes) +organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE) + +nic_contact_references["registrant"] = precompile_regexes(nic_contact_references["registrant"]) +nic_contact_references["tech"] = precompile_regexes(nic_contact_references["tech"]) +nic_contact_references["admin"] = precompile_regexes(nic_contact_references["admin"]) +nic_contact_references["billing"] = precompile_regexes(nic_contact_references["billing"]) + if sys.version_info < (3, 0): def is_string(data): """Test for string with support for python 2.""" @@ -409,7 +436,7 @@ def parse_raw_whois(raw_data, normalized=[], never_query_handles=True, handle_se if (rule_key in data) == False: for line in segment.splitlines(): for regex in rule_regexes: - result = re.search(regex, line, re.IGNORECASE) + result = re.search(regex, line) if result is not None: val = result.group("val").strip() @@ -634,7 +661,7 @@ def normalize_data(data, normalized): new_lines = [] for i, line in enumerate(lines): for regex in organization_regexes: - if re.search(regex, line, re.IGNORECASE): + if 
re.search(regex, line): new_lines.append(line) del lines[i] break @@ -650,7 +677,7 @@ def normalize_data(data, normalized): lines = [x.strip() for x in contact["street"].splitlines()] if len(lines) > 1: for regex in organization_regexes: - if re.search(regex, lines[0], re.IGNORECASE): + if re.search(regex, lines[0]): contact["organization"] = lines[0] contact["street"] = "\n".join(lines[1:]) break @@ -714,7 +741,7 @@ def parse_dates(dates): for date in dates: for rule in grammar['_dateformats']: - result = re.match(rule, date, re.IGNORECASE) + result = re.match(rule, date) if result is not None: try: diff --git a/test.py b/test.py index 4117c69..0833b54 100755 --- a/test.py +++ b/test.py @@ -1,6 +1,6 @@ #!/usr/bin/env python2 -import sys, argparse, os, pythonwhois, json, datetime, codecs +import sys, argparse, os, pythonwhois, json, datetime, codecs, time import pkgutil import encodings @@ -94,6 +94,8 @@ else: targets.sort() if args.mode[0] == "run": + times_default = [] + times_normalized = [] errors = False suites = [] for target in targets: @@ -134,7 +136,9 @@ if args.mode[0] == "run": total = len(suites) * 2 for target, data, target_default, target_normalized in suites: for normalization in (True, []): + start_time = time.time() parsed = pythonwhois.parse.parse_raw_whois(data, normalized=normalization) + time_taken = (time.time() - start_time) * 1000 # in ms parsed = json.loads(encoded_json_dumps(parsed)) # Stupid Unicode hack if normalization == True: @@ -155,6 +159,10 @@ if args.mode[0] == "run": sys.stdout.write(OK) sys.stdout.write(progress_prefix + "%s passed in %s mode.\n" % (target, mode)) sys.stderr.write(ENDC) + if normalization == True: + times_normalized.append(time_taken) + else: + times_default.append(time_taken) total_passed += 1 else: sys.stderr.write(FAIL) @@ -169,6 +177,18 @@ if args.mode[0] == "run": total_failed += 1 done += 1 + if len(times_default) > 0: + average_default = int(sum(times_default) / float(len(times_default))) + min_default = 
min(times_default) + max_default = max(times_default) + sys.stdout.write("Timing in default mode: %dms avg, %dms min, %dms max\n" % (average_default, min_default, max_default)) + + if len(times_normalized) > 0: + average_normalized = int(sum(times_normalized) / float(len(times_normalized))) + min_normalized = min(times_normalized) + max_normalized = max(times_normalized) + sys.stdout.write("Timing in normalized mode: %dms avg, %dms min, %dms max\n" % (average_normalized, min_normalized, max_normalized)) + if total_failed == 0: sys.stdout.write(OK) sys.stdout.write("All tests passed!\n")