Benchmarking and optimizations

master
Sven Slootweg 10 years ago
parent 672f64975c
commit 9203d83c03

@@ -101,7 +101,12 @@ else:
 		if key in contact_data and contact_data[key] is not None:
 			label = " " + value + (" " * (widest_label - len(value))) + " :"
 			if sys.version_info < (3, 0):
-				actual_data = unicode(contact_data[key])
+				if type(contact_data[key]) == str:
+					actual_data = contact_data[key].decode("utf-8")
+				elif type(contact_data[key]) == datetime.datetime:
+					actual_data = unicode(contact_data[key])
+				else:
+					actual_data = contact_data[key]
 			else:
 				actual_data = str(contact_data[key])
 			if "\n" in actual_data: # Indent multi-line values properly

@@ -48,6 +48,9 @@ read_dataset("states_au.dat", states_au, 0, 1)
 read_dataset("states_us.dat", states_us, "abbreviation", "name", is_dict=True)
 read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True)
 
+def precompile_regexes(source, flags=0):
+	return [re.compile(regex, flags) for regex in source]
+
 grammar = {
 	"_data": {
 		'id': ['Domain ID:[ ]*(?P<val>.+)'],
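precompile_regexes() moves pattern compilation out of the per-line matching loops. Python's re module does cache compiled patterns internally, but the repeated cache lookups still add up when a few hundred patterns are applied to every line of every WHOIS response. A rough, self-contained comparison (not from this commit; absolute numbers will vary by machine):

    import re, timeit

    pattern = r"Domain ID:[ ]*(?P<val>.+)"
    line = "Domain ID: D123456-EXAMPLE"
    compiled = re.compile(pattern)

    # Module-level re.search() recompiles/looks up the pattern on every call;
    # the compiled pattern object skips that work entirely.
    print(timeit.timeit(lambda: re.search(pattern, line), number=100000))
    print(timeit.timeit(lambda: compiled.search(line), number=100000))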
@@ -389,6 +392,30 @@ organization_regexes = (
 	r"\ss\.?a\.?r\.?l\.?($|\s)",
 )
 
+grammar["_data"]["id"] = precompile_regexes(grammar["_data"]["id"], re.IGNORECASE)
+grammar["_data"]["status"] = precompile_regexes(grammar["_data"]["status"], re.IGNORECASE)
+grammar["_data"]["creation_date"] = precompile_regexes(grammar["_data"]["creation_date"], re.IGNORECASE)
+grammar["_data"]["expiration_date"] = precompile_regexes(grammar["_data"]["expiration_date"], re.IGNORECASE)
+grammar["_data"]["updated_date"] = precompile_regexes(grammar["_data"]["updated_date"], re.IGNORECASE)
+grammar["_data"]["registrar"] = precompile_regexes(grammar["_data"]["registrar"], re.IGNORECASE)
+grammar["_data"]["whois_server"] = precompile_regexes(grammar["_data"]["whois_server"], re.IGNORECASE)
+grammar["_data"]["nameservers"] = precompile_regexes(grammar["_data"]["nameservers"], re.IGNORECASE)
+grammar["_data"]["emails"] = precompile_regexes(grammar["_data"]["emails"], re.IGNORECASE)
+
+grammar["_dateformats"] = precompile_regexes(grammar["_dateformats"], re.IGNORECASE)
+
+registrant_regexes = precompile_regexes(registrant_regexes)
+tech_contact_regexes = precompile_regexes(tech_contact_regexes)
+billing_contact_regexes = precompile_regexes(billing_contact_regexes)
+admin_contact_regexes = precompile_regexes(admin_contact_regexes)
+nic_contact_regexes = precompile_regexes(nic_contact_regexes)
+organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE)
+
+nic_contact_references["registrant"] = precompile_regexes(nic_contact_references["registrant"])
+nic_contact_references["tech"] = precompile_regexes(nic_contact_references["tech"])
+nic_contact_references["admin"] = precompile_regexes(nic_contact_references["admin"])
+nic_contact_references["billing"] = precompile_regexes(nic_contact_references["billing"])
+
 if sys.version_info < (3, 0):
 	def is_string(data):
 		"""Test for string with support for python 2."""
@@ -409,7 +436,7 @@ def parse_raw_whois(raw_data, normalized=[], never_query_handles=True, handle_se
 		if (rule_key in data) == False:
 			for line in segment.splitlines():
 				for regex in rule_regexes:
-					result = re.search(regex, line, re.IGNORECASE)
+					result = re.search(regex, line)
 					if result is not None:
 						val = result.group("val").strip()
@@ -634,7 +661,7 @@ def normalize_data(data, normalized):
 			new_lines = []
 			for i, line in enumerate(lines):
 				for regex in organization_regexes:
-					if re.search(regex, line, re.IGNORECASE):
+					if re.search(regex, line):
 						new_lines.append(line)
 						del lines[i]
 						break
@@ -650,7 +677,7 @@ def normalize_data(data, normalized):
 		lines = [x.strip() for x in contact["street"].splitlines()]
 		if len(lines) > 1:
 			for regex in organization_regexes:
-				if re.search(regex, lines[0], re.IGNORECASE):
+				if re.search(regex, lines[0]):
 					contact["organization"] = lines[0]
 					contact["street"] = "\n".join(lines[1:])
 					break
@@ -714,7 +741,7 @@ def parse_dates(dates):
 	for date in dates:
 		for rule in grammar['_dateformats']:
-			result = re.match(rule, date, re.IGNORECASE)
+			result = re.match(rule, date)
 			if result is not None:
 				try:
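Note that the module-level re.match() and re.search() accept an already-compiled pattern and simply delegate to its methods, so these call sites keep working unchanged apart from the dropped flag. Equivalent forms, using a hypothetical date pattern rather than one from the grammar:

    import re

    rule = re.compile(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})")
    assert re.match(rule, "2014-06-01") is not None  # module-level call...
    assert rule.match("2014-06-01") is not None      # ...delegates to the pattern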

@@ -1,6 +1,6 @@
 #!/usr/bin/env python2
-import sys, argparse, os, pythonwhois, json, datetime, codecs
+import sys, argparse, os, pythonwhois, json, datetime, codecs, time
 import pkgutil
 import encodings
@@ -94,6 +94,8 @@ else:
 	targets.sort()
 
 if args.mode[0] == "run":
+	times_default = []
+	times_normalized = []
 	errors = False
 	suites = []
 	for target in targets:
@@ -134,7 +136,9 @@ if args.mode[0] == "run":
 	total = len(suites) * 2
 	for target, data, target_default, target_normalized in suites:
 		for normalization in (True, []):
+			start_time = time.time()
 			parsed = pythonwhois.parse.parse_raw_whois(data, normalized=normalization)
+			time_taken = (time.time() - start_time) * 1000 # in ms
 			parsed = json.loads(encoded_json_dumps(parsed)) # Stupid Unicode hack
 			if normalization == True:
@@ -155,6 +159,10 @@ if args.mode[0] == "run":
 					sys.stdout.write(OK)
 					sys.stdout.write(progress_prefix + "%s passed in %s mode.\n" % (target, mode))
 					sys.stderr.write(ENDC)
+					if normalization == True:
+						times_normalized.append(time_taken)
+					else:
+						times_default.append(time_taken)
 					total_passed += 1
 				else:
 					sys.stderr.write(FAIL)
@@ -169,6 +177,18 @@ if args.mode[0] == "run":
 			total_failed += 1
 		done += 1
 
+	if len(times_default) > 0:
+		average_default = int(sum(times_default) / float(len(times_default)))
+		min_default = min(times_default)
+		max_default = max(times_default)
+		sys.stdout.write("Timing in default mode: %dms avg, %dms min, %dms max\n" % (average_default, min_default, max_default))
+
+	if len(times_normalized) > 0:
+		average_normalized = int(sum(times_normalized) / float(len(times_normalized)))
+		min_normalized = min(times_normalized)
+		max_normalized = max(times_normalized)
+		sys.stdout.write("Timing in normalized mode: %dms avg, %dms min, %dms max\n" % (average_normalized, min_normalized, max_normalized))
+
 	if total_failed == 0:
 		sys.stdout.write(OK)
 		sys.stdout.write("All tests passed!\n")
