Manually merge pullreq #4 due to mistarget

master
Sven Slootweg 11 years ago
commit dda0525cac

@@ -1,4 +1,13 @@
-import re, datetime
+from __future__ import print_function
+import re, sys, datetime
 
+if sys.version_info[0] >= 3:
+    def iteritems(d):
+        return iter(d.items())
+else:
+    def iteritems(d):
+        return d.iteritems()
+
+
 grammar = {
     "_data": {
@@ -142,8 +151,8 @@ def parse_raw_whois(raw_data, normalized=[]):
     raw_data = [segment.replace("\r", "") for segment in raw_data] # Carriage returns are the devil
 
     for segment in raw_data:
-        for rule_key, rule_regexes in grammar['_data'].iteritems():
-            if data.has_key(rule_key) == False:
+        for rule_key, rule_regexes in iteritems(grammar['_data']):
+            if rule_key not in data:
                 for line in segment.splitlines():
                     for regex in rule_regexes:
                         result = re.search(regex, line, re.IGNORECASE)
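The same hunk drops dict.has_key(), which was removed in Python 3, in favour of the in operator, which works on both interpreter lines. A small sketch with a hypothetical dictionary:

    data = {"registrar": ["Example Registrar"]}  # hypothetical parse result

    # Python 2 only:   if data.has_key("nameservers") == False:
    # Portable form used in the hunk above:
    if "nameservers" not in data:
        data["nameservers"] = []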
@@ -153,7 +162,7 @@ def parse_raw_whois(raw_data, normalized=[]):
                             if val != "":
                                 try:
                                     data[rule_key].append(val)
-                                except KeyError, e:
+                                except KeyError as e:
                                     data[rule_key] = [val]
 
     # Whois.com is a bit special... Fabulous.com also seems to use this format.
@@ -163,7 +172,7 @@ def parse_raw_whois(raw_data, normalized=[]):
             for match in re.findall("[ ]+(.+)\n", chunk):
                 try:
                     data["nameservers"].append(match.strip())
-                except KeyError, e:
+                except KeyError as e:
                     data["nameservers"] = [match.strip()]
         # Nominet also needs some special attention
         match = re.search(" Registrar:\n (.+)\n", segment)
@@ -176,7 +185,7 @@ def parse_raw_whois(raw_data, normalized=[]):
                 match = match.split()[0]
                 try:
                     data["nameservers"].append(match.strip())
-                except KeyError, e:
+                except KeyError as e:
                     data["nameservers"] = [match.strip()]
         # .am plays the same game
         match = re.search(" DNS servers:([\s\S]*?\n)\n", segment)
@@ -186,7 +195,7 @@ def parse_raw_whois(raw_data, normalized=[]):
                 match = match.split()[0]
                 try:
                     data["nameservers"].append(match.strip())
-                except KeyError, e:
+                except KeyError as e:
                     data["nameservers"] = [match.strip()]
         # SIDN isn't very standard either.
         match = re.search("Registrar:\n\s+(\S.*)", segment)
@@ -199,7 +208,7 @@ def parse_raw_whois(raw_data, normalized=[]):
                 match = match.split()[0]
                 try:
                     data["nameservers"].append(match.strip())
-                except KeyError, e:
+                except KeyError as e:
                     data["nameservers"] = [match.strip()]
         # The .ie WHOIS server puts ambiguous status information in an unhelpful order
         match = re.search('ren-status:\s*(.+)', segment)
@@ -212,34 +221,34 @@ def parse_raw_whois(raw_data, normalized=[]):
     try:
         data['expiration_date'] = remove_duplicates(data['expiration_date'])
         data['expiration_date'] = parse_dates(data['expiration_date'])
-    except KeyError, e:
+    except KeyError as e:
         pass # Not present
 
     try:
         data['creation_date'] = remove_duplicates(data['creation_date'])
         data['creation_date'] = parse_dates(data['creation_date'])
-    except KeyError, e:
+    except KeyError as e:
         pass # Not present
 
     try:
         data['updated_date'] = remove_duplicates(data['updated_date'])
         data['updated_date'] = parse_dates(data['updated_date'])
-    except KeyError, e:
+    except KeyError as e:
         pass # Not present
 
     try:
         data['nameservers'] = remove_duplicates([ns.rstrip(".") for ns in data['nameservers']])
-    except KeyError, e:
+    except KeyError as e:
         pass # Not present
 
     try:
         data['emails'] = remove_duplicates(data['emails'])
-    except KeyError, e:
+    except KeyError as e:
         pass # Not present
 
     try:
         data['registrar'] = remove_duplicates(data['registrar'])
-    except KeyError, e:
+    except KeyError as e:
         pass # Not present
 
     # Remove e-mail addresses if they are already listed for any of the contacts
@@ -248,11 +257,11 @@ def parse_raw_whois(raw_data, normalized=[]):
         if data["contacts"][contact] is not None:
             try:
                 known_emails.append(data["contacts"][contact]["email"])
-            except KeyError, e:
+            except KeyError as e:
                 pass # No e-mail recorded for this contact...
     try:
         data['emails'] = [email for email in data["emails"] if email not in known_emails]
-    except KeyError, e:
+    except KeyError as e:
         pass # Not present
 
     for key in data.keys():
@@ -281,7 +290,7 @@ def normalize_data(data, normalized):
             else:
                 data[key] = [normalize_name(item, abbreviation_threshold=threshold, length_threshold=1) for item in data[key]]
 
-    for contact_type, contact in data['contacts'].iteritems():
+    for contact_type, contact in iteritems(data['contacts']):
         if contact is not None:
             for key in ("email",):
                 if key in contact and contact[key] is not None and (normalized == True or key in normalized):
@@ -301,7 +310,7 @@ def normalize_data(data, normalized):
             for key in contact.keys():
                 try:
                     contact[key] = contact[key].strip(", ")
-                except AttributeError, e:
+                except AttributeError as e:
                     pass # Not a string
 
     return data
@@ -368,37 +377,37 @@ def parse_dates(dates):
                     # This will require some more guesswork - some WHOIS servers present the name of the month
                     try:
                         month = int(result.group("month"))
-                    except ValueError, e:
+                    except ValueError as e:
                         # Apparently not a number. Look up the corresponding number.
                         try:
                             month = grammar['_months'][result.group("month").lower()]
-                        except KeyError, e:
+                        except KeyError as e:
                             # Unknown month name, default to 0
                             month = 0
 
                     try:
                         hour = int(result.group("hour"))
-                    except IndexError, e:
+                    except IndexError as e:
                         hour = 0
-                    except TypeError, e:
+                    except TypeError as e:
                         hour = 0
 
                     try:
                         minute = int(result.group("minute"))
-                    except IndexError, e:
+                    except IndexError as e:
                         minute = 0
-                    except TypeError, e:
+                    except TypeError as e:
                         minute = 0
 
                     try:
                         second = int(result.group("second"))
-                    except IndexError, e:
+                    except IndexError as e:
                         second = 0
-                    except TypeError, e:
+                    except TypeError as e:
                         second = 0
 
                     break
-                except ValueError, e:
+                except ValueError as e:
                     # Something went horribly wrong, maybe there is no valid date present?
                     year = 0
                     month = 0
@@ -406,16 +415,16 @@ def parse_dates(dates):
                     hour = 0
                     minute = 0
                     second = 0
-                    print e.message
+                    print(e.message)
         try:
             if year > 0:
                 try:
                     parsed_dates.append(datetime.datetime(year, month, day, hour, minute, second))
-                except ValueError, e:
+                except ValueError as e:
                     # We might have gotten the day and month the wrong way around, let's try it the other way around
                     # If you're not using an ISO-standard date format, you're an evil registrar!
                     parsed_dates.append(datetime.datetime(year, day, month, hour, minute, second))
-        except UnboundLocalError, e:
+        except UnboundLocalError as e:
             pass
 
     if len(parsed_dates) > 0:
@@ -623,7 +632,7 @@ def parse_registrants(data):
             admin_contact = data_reference
             break
 
     # Post-processing
     for obj in (registrant, tech_contact, billing_contact, admin_contact):
         if obj is not None:
             for key in obj.keys():
@@ -642,7 +651,7 @@ def parse_registrants(data):
                     try:
                         street_items.append(obj["street%d" % i])
                         del obj["street%d" % i]
-                    except KeyError, e:
+                    except KeyError as e:
                         break
                     i += 1
                 obj["street"] = "\n".join(street_items)
