import re, datetime grammar = { "_data": { 'id': ['Domain ID:[ ]*(?P.+)'], 'status': ['\[Status\]\s*(?P.+)', 'Status\s*:\s?(?P.+)', 'state:\s*(?P.+)'], 'creation_date': ['\[Created on\]\s*(?P.+)', 'Created on[.]*: [a-zA-Z]+, (?P.+)', 'Creation Date:\s?(?P.+)', 'Created Date:\s?(?P.+)', 'Created on:\s?(?P.+)', 'Created on\s?[.]*:\s?(?P.+)\.', 'Date Registered\s?[.]*:\s?(?P.+)', 'Domain Created\s?[.]*:\s?(?P.+)', 'Domain registered\s?[.]*:\s?(?P.+)', 'Domain record activated\s?[.]*:\s*?(?P.+)', 'Record created on\s?[.]*:?\s*?(?P.+)', 'Record created\s?[.]*:?\s*?(?P.+)', 'Created\s?[.]*:?\s*?(?P.+)', 'Registered on\s?[.]*:?\s*?(?P.+)', 'Registered\s?[.]*:?\s*?(?P.+)', 'Domain Create Date\s?[.]*:?\s*?(?P.+)', 'Domain Registration Date\s?[.]*:?\s*?(?P.+)', 'created:\s*(?P.+)', 'created-date:\s*(?P.+)', 'registered:\s*(?P.+)'], 'expiration_date': ['\[Expires on\]\s*(?P.+)', 'Registrar Registration Expiration Date:[ ]*(?P.+)-[0-9]{4}', 'Expires on[.]*: [a-zA-Z]+, (?P.+)', 'Expiration Date:\s?(?P.+)', 'Expires on:\s?(?P.+)', 'Expires on\s?[.]*:\s?(?P.+)\.', 'Expiry Date\s?[.]*:\s?(?P.+)', 'Expiry\s*:\s?(?P.+)', 'Domain Currently Expires\s?[.]*:\s?(?P.+)', 'Record will expire on\s?[.]*:\s?(?P.+)', 'Domain expires\s?[.]*:\s*?(?P.+)', 'Record expires on\s?[.]*:?\s*?(?P.+)', 'Record expires\s?[.]*:?\s*?(?P.+)', 'Expires\s?[.]*:?\s*?(?P.+)', 'Expire Date\s?[.]*:?\s*?(?P.+)', 'Expired\s?[.]*:?\s*?(?P.+)', 'Domain Expiration Date\s?[.]*:?\s*?(?P.+)', 'paid-till:\s*(?P.+)', 'expire:\s*(?P.+)'], 'updated_date': ['\[Last Updated\]\s*(?P.+)', 'Record last updated on[.]*: [a-zA-Z]+, (?P.+)', 'Updated Date:\s?(?P.+)', #'Database last updated on\s?[.]*:?\s*?(?P.+)\s[a-z]+\.?', 'Record last updated on\s?[.]*:?\s?(?P.+)\.', 'Domain record last updated\s?[.]*:\s*?(?P.+)', 'Domain Last Updated\s?[.]*:\s*?(?P.+)', 'Last updated on:\s?(?P.+)', 'Date Modified\s?[.]*:\s?(?P.+)', 'Last Modified\s?[.]*:\s?(?P.+)', 'Domain Last Updated Date\s?[.]*:\s?(?P.+)', 'Record last updated\s?[.]*:\s?(?P.+)', 'Modified\s?[.]*:\s?(?P.+)', 'changed:\s*(?P.+)', 'Last Update\s?[.]*:\s?(?P.+)', 'Last updated on (?P.+) [a-z]{3,4}', 'Last updated:\s*(?P.+)', 'Last update of whois database:\s?[a-z]{3}, (?P.+) [a-z]{3,4}'], 'registrar': ['registrar:\s*(?P.+)', 'Registrar:\s*(?P.+)', 'Sponsoring Registrar Organization:\s*(?P.+)', 'Registered through:\s?(?P.+)', 'Registrar Name[.]*:\s?(?P.+)', 'Record maintained by:\s?(?P.+)', 'Registration Service Provided By:\s?(?P.+)', 'Registrar of Record:\s?(?P.+)', '\tName:\t\s(?P.+)'], 'whois_server': ['Whois Server:\s?(?P.+)', 'Registrar Whois:\s?(?P.+)'], 'nameservers': ['Name Server:[ ]*(?P[^ ]+)', '(?[a-z]*d?ns[0-9]+([a-z]{3})?\.([a-z0-9-]+\.)+[a-z0-9]+)', 'nameserver:\s*(?P.+)', 'nserver:\s*(?P[^[\s]+)', 'Name Server[.]+ (?P[^[\s]+)', 'DNS[0-9]+:\s*(?P.+)', 'ns[0-9]+:\s*(?P.+)', 'NS [0-9]+\s*:\s*(?P.+)', '(?[a-z0-9-]+\.d?ns[0-9]*\.([a-z0-9-]+\.)+[a-z0-9]+)', '(?([a-z0-9-]+\.)+[a-z0-9]+)(\s+([0-9]{1,3}\.){3}[0-9]{1,3})', '(?d?ns\.([a-z0-9-]+\.)+[a-z0-9]+)'], 'emails': ['(?P[\w.-]+@[\w.-]+\.[\w]{2,6})', # Really need to fix this, much longer TLDs now exist... '(?P[\w.-]+\sAT\s[\w.-]+\sDOT\s[\w]{2,6})'] }, "_dateformats": ( '(?P[0-9]{1,2})[./ -](?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P[0-9]{4}|[0-9]{2})' '(\s+(?P[0-9]{1,2})[:.](?P[0-9]{1,2})[:.](?P[0-9]{1,2}))?', '[a-z]{3}\s(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P[0-9]{1,2})' '(\s+(?P[0-9]{1,2})[:.](?P[0-9]{1,2})[:.](?P[0-9]{1,2}))?' '\s[a-z]{3}\s(?P[0-9]{4}|[0-9]{2})', '(?P[0-9]{4})[./-]?(?P[0-9]{2})[./-]?(?P[0-9]{2})(\s|T)((?P[0-9]{1,2})[:.](?P[0-9]{1,2})[:.](?P[0-9]{1,2}))', '(?P[0-9]{4})[./-](?P[0-9]{1,2})[./-](?P[0-9]{1,2})', '(?P[0-9]{1,2})[./ -](?P[0-9]{1,2})[./ -](?P[0-9]{4}|[0-9]{2})', '(?PJan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (?P[0-9]{1,2}),? (?P[0-9]{4})' ), "_months": { 'jan': 1, 'january': 1, 'feb': 2, 'february': 2, 'mar': 3, 'march': 3, 'apr': 4, 'april': 4, 'may': 5, 'jun': 6, 'june': 6, 'jul': 7, 'july': 7, 'aug': 8, 'august': 8, 'sep': 9, 'sept': 9, 'september': 9, 'oct': 10, 'october': 10, 'nov': 11, 'november': 11, 'dec': 12, 'december': 12 } } def parse_raw_whois(raw_data, normalized=[]): data = {} raw_data = [segment.replace("\r", "") for segment in raw_data] # Carriage returns are the devil for segment in raw_data: for rule_key, rule_regexes in grammar['_data'].iteritems(): if data.has_key(rule_key) == False: for line in segment.splitlines(): for regex in rule_regexes: result =, line, re.IGNORECASE) if result is not None: val ="val").strip() if val != "": try: data[rule_key].append(val) except KeyError, e: data[rule_key] = [val] # is a bit special... match ="Name Servers:([/s/S]+)\n\n", segment) if match is not None: chunk = for match in re.findall("[ ]+(.+)\n", chunk): try: data["nameservers"].append(match.strip()) except KeyError, e: data["nameservers"] = [match.strip()] # Nominet also needs some special attention match =" Registrar:\n (.+)\n", segment) if match is not None: data["registrar"] = [] match =" Name servers:([\s\S]*?\n)\n", segment) if match is not None: chunk = for match in re.findall(" (.+)\n", chunk): match = match.split()[0] try: data["nameservers"].append(match.strip()) except KeyError, e: data["nameservers"] = [match.strip()] data["contacts"] = parse_registrants(raw_data) # Parse dates try: data['expiration_date'] = remove_duplicates(data['expiration_date']) data['expiration_date'] = parse_dates(data['expiration_date']) except KeyError, e: pass # Not present try: data['creation_date'] = remove_duplicates(data['creation_date']) data['creation_date'] = parse_dates(data['creation_date']) except KeyError, e: pass # Not present try: data['updated_date'] = remove_duplicates(data['updated_date']) data['updated_date'] = parse_dates(data['updated_date']) except KeyError, e: pass # Not present try: data['nameservers'] = remove_duplicates([ns.rstrip(".") for ns in data['nameservers']]) except KeyError, e: pass # Not present try: data['emails'] = remove_duplicates(data['emails']) except KeyError, e: pass # Not present try: data['registrar'] = remove_duplicates(data['registrar']) except KeyError, e: pass # Not present # Remove e-mail addresses if they are already listed for any of the contacts known_emails = [] for contact in ("registrant", "tech", "admin", "billing"): if data["contacts"][contact] is not None: try: known_emails.append(data["contacts"][contact]["email"]) except KeyError, e: pass # No e-mail recorded for this contact... try: data['emails'] = [email for email in data["emails"] if email not in known_emails] except KeyError, e: pass # Not present data["raw"] = raw_data if normalized != []: data = normalize_data(data, normalized) return data def normalize_data(data, normalized): for key in ("nameservers", "emails", "whois_server"): if key in data and data[key] is not None and (normalized == True or key in normalized): if isinstance(data[key], basestring): data[key] = data[key].lower() else: data[key] = [item.lower() for item in data[key]] for key in ("registrar", "status"): if key in data and data[key] is not None and (normalized == True or key in normalized): if isinstance(data[key], basestring) and data[key].isupper(): if len(data[key] > 3): # Don't change abbreviations.. # This won't do newlines correctly... Fix that! (known issue with eg., data[key] = " ".join(word.capitalize() for word in data[key].split(" ")) else: # This might mess up the order? Also seems like there may be another bug here... data[key] = [" ".join(word.capitalize() for word in item.split(" ")) for item in data[key] if item.isupper() and len(item) > 3] + [item for item in data[key] if not item.isupper() or len(item) <= 3] for contact_type, contact in data['contacts'].iteritems(): if contact is not None: for key in ("email",): if key in contact and contact[key] is not None and (normalized == True or key in normalized): if isinstance(contact[key], basestring): contact[key] = contact[key].lower() else: contact[key] = [item.lower() for item in contact[key]] for key in ("name", "street"): if key in contact and contact[key] is not None and (normalized == True or key in normalized): contact[key] = normalize_name(contact[key], abbreviation_threshold=3) for key in ("city", "organization", "state", "country"): if key in contact and contact[key] is not None and (normalized == True or key in normalized): contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3) for key in contact.keys(): try: contact[key] = contact[key].strip(", ") except AttributeError, e: pass # Not a string return data def normalize_name(value, abbreviation_threshold=4, length_threshold=8): normalized_lines = [] for line in value.split("\n"): line = line.strip(",") # Get rid of useless comma's if (line.isupper() or line.islower()) and len(line) >= length_threshold: # This line is likely not capitalized properly words = line.split() normalized_words = [] if len(words) >= 1: # First word if len(words[0]) >= abbreviation_threshold and "." not in words[0]: normalized_words.append(words[0].capitalize()) else: # Probably an abbreviation or domain, leave it alone normalized_words.append(words[0]) if len(words) >= 3: # Words between the first and last for word in words[1:-1]: if len(word) >= abbreviation_threshold and "." not in word: normalized_words.append(word.capitalize()) else: # Probably an abbreviation or domain, leave it alone normalized_words.append(word) if len(words) >= 2: # Last word if len(words[-1]) >= abbreviation_threshold and "." not in words[-1]: normalized_words.append(words[-1].capitalize()) else: # Probably an abbreviation or domain, leave it alone normalized_words.append(words[-1]) line = " ".join(normalized_words) normalized_lines.append(line) return "\n".join(normalized_lines) def parse_dates(dates): global grammar parsed_dates = [] for date in dates: for rule in grammar['_dateformats']: result = re.match(rule, date, re.IGNORECASE) if result is not None: try: # These are always numeric. If they fail, there is no valid date present. year = int("year")) day = int("day")) # Detect and correct shorthand year notation if year < 60: year += 2000 elif year < 100: year += 1900 # This will require some more guesswork - some WHOIS servers present the name of the month try: month = int("month")) except ValueError, e: # Apparently not a number. Look up the corresponding number. try: month = grammar['_months']["month").lower()] except KeyError, e: # Unknown month name, default to 0 month = 0 try: hour = int("hour")) except IndexError, e: hour = 0 except TypeError, e: hour = 0 try: minute = int("minute")) except IndexError, e: minute = 0 except TypeError, e: minute = 0 try: second = int("second")) except IndexError, e: second = 0 except TypeError, e: second = 0 break except ValueError, e: # Something went horribly wrong, maybe there is no valid date present? year = 0 month = 0 day = 0 hour = 0 minute = 0 second = 0 print e.message try: if year > 0: try: parsed_dates.append(datetime.datetime(year, month, day, hour, minute, second)) except ValueError, e: # We might have gotten the day and month the wrong way around, let's try it the other way around # If you're not using an ISO-standard date format, you're an evil registrar! parsed_dates.append(datetime.datetime(year, day, month, hour, minute, second)) except UnboundLocalError, e: pass if len(parsed_dates) > 0: return parsed_dates else: return None def remove_duplicates(data): cleaned_list = [] for entry in data: if entry not in cleaned_list: cleaned_list.append(entry) return cleaned_list def parse_registrants(data): registrant = None tech_contact = None billing_contact = None admin_contact = None registrant_regexes = [ " Registrant:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. "Registrant:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH "Registrant ID:(?P.+)\nRegistrant Name:(?P.*)\n(?:Registrant Organization:(?P.*)\n)?Registrant Street1:(?P.*)\n(?:Registrant Street2:(?P.*)\n)?(?:Registrant Street3:(?P.*)\n)?Registrant City:(?P.*)\nRegistrant State/Province:(?P.*)\nRegistrant Postal Code:(?P.*)\nRegistrant Country:(?P.*)\nRegistrant Phone:(?P.*)\n(?:Registrant Phone Ext.:(?P.*)\n)?(?:Registrant FAX:(?P.*)\n)?(?:Registrant FAX Ext.:(?P.*)\n)?Registrant Email:(?P.*)", # Public Interest Registry (.org), "Registrant ID:\s*(?P.+)\nRegistrant Name:\s*(?P.+)\nRegistrant Organization:\s*(?P.*)\nRegistrant Address1:\s*(?P.+)\nRegistrant Address2:\s*(?P.*)\nRegistrant City:\s*(?P.+)\nRegistrant State/Province:\s*(?P.+)\nRegistrant Postal Code:\s*(?P.+)\nRegistrant Country:\s*(?P.+)\nRegistrant Country Code:\s*(?P.+)\nRegistrant Phone Number:\s*(?P.+)\nRegistrant Email:\s*(?P.+)\n", # .CO Internet "Registrant Contact: (?P.+)\nRegistrant Organization: (?P.+)\nRegistrant Name: (?P.+)\nRegistrant Street: (?P.+)\nRegistrant City: (?P.+)\nRegistrant Postal Code: (?P.+)\nRegistrant State: (?P.+)\nRegistrant Country: (?P.+)\nRegistrant Phone: (?P.*)\nRegistrant Phone Ext: (?P.*)\nRegistrant Fax: (?P.*)\nRegistrant Fax Ext: (?P.*)\nRegistrant Email: (?P.*)\n", # Key-Systems GmbH "(?:Registrant ID:[ ]*(?P.*)\n)?Registrant Name:[ ]*(?P.*)\nRegistrant Organization:[ ]*(?P.*)\nRegistrant Street:[ ]*(?P.+)\n(?:Registrant Street:[ ]*(?P.+)\n)?Registrant City:[ ]*(?P.+)\nRegistrant State\/Province:[ ]*(?P.+)\nRegistrant Postal Code:[ ]*(?P.+)\nRegistrant Country:[ ]*(?P.+)\n(?:Registrant Phone:[ ]*(?P.*)\n)?(?:Registrant Phone Ext:[ ]*(?P.*)\n)?(?:Registrant Fax:[ ]*(?P.*)\n)?(?:Registrant Fax Ext:[ ]*(?P.*)\n)?(?:Registrant Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum) "Registrant\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # " Registrant Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # "owner-id:[ ]*(?P.*)\n(?:owner-organization:[ ]*(?P.*)\n)?owner-name:[ ]*(?P.*)\nowner-street:[ ]*(?P.*)\nowner-city:[ ]*(?P.*)\nowner-zip:[ ]*(?P.*)\nowner-country:[ ]*(?P.*)\n(?:owner-phone:[ ]*(?P.*)\n)?(?:owner-fax:[ ]*(?P.*)\n)?owner-email:[ ]*(?P.*)", # InterNetworX "Holder of domain name:\n(?P[\S\s]+)\n(?P.+)\n(?P[A-Z0-9-]+)\s+(?P.+)\n(?P.+)\nContractual Language", # "\n\n(?:Owner)?\s+: (?P.*)\n(?:\s+: (?P.*)\n)?\s+: (?P.*)\n\s+: (?P.*)\n\s+: (?P.*)\n\s+: (?P.*)\n", # "Contact Information:\n\[Name\]\s*(?P.*)\n\[Email\]\s*(?P.*)\n\[Web Page\]\s*(?P.*)\n\[Postal code\]\s*(?P.*)\n\[Postal Address\]\s*(?P.*)\n(?:\s+(?P.*)\n)?(?:\s+(?P.*)\n)?\[Phone\]\s*(?P.*)\n\[Fax\]\s*(?P.*)\n", # "Registrant ID:[ ]*(?P.*)\nRegistrant Name:[ ]*(?P.*)\nRegistrant Address1:[ ]*(?P.*)\n(?:Registrant Address2:[ ]*(?P.*)\n)?(?:Registrant Address3:[ ]*(?P.*)\n)?Registrant City:[ ]*(?P.*)\nRegistrant State/Province:[ ]*(?P.*)\nRegistrant Postal Code:[ ]*(?P.*)\nRegistrant Country:[ ]*(?P.*)\nRegistrant Country Code:[ ]*.*\nRegistrant Phone Number:[ ]*(?P.*)\nRegistrant Email:[ ]*(?P.*)", # .US (NeuStar) " Organisation Name[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n(?: Organisation Address[.]* (?P.*)\n)? Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)\n Organisation Address[.]* (?P.*)", # Melbourne IT (what a horrid format...) "Registrant:[ ]*(?P.+)\n[\s\S]*Eligibility Name:[ ]*(?P.+)\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n", # .au business "Eligibility Type:[ ]*Citizen\/Resident\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n[\s\S]*Registrant Contact Name:[ ]*(?P.+)\n", # .au individual "Registrant:[ ]*(?P.+)\n[\s\S]*Eligibility Type:[ ]*(Higher Education Institution|Company|Incorporated Association|Other)\n[\s\S]*Registrant Contact ID:[ ]*(?P.+)\n[\s\S]*Registrant Contact Name:[ ]*(?P.+)\n", # .au educational, company, 'incorporated association' (non-profit?), other (spotted for, unsure if also for others) " Registrant:\n (?P.+)\n\n Registrant type:\n .*\n\n Registrant's address:\n The registrant .* opted to have", # Nominet (.uk) with hidden address " Registrant:\n (?P.+)\n\n Registrant type:\n .*\n\n Registrant's address:\n (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)", # Nominet (.uk) with visible address "person:\s+(?P.+)", # (person) "org:\s+(?P.+)", # (organization) ] tech_contact_regexes = [ " Technical Contact:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. "Technical Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH "Tech ID:(?P.+)\nTech Name:(?P.*)\n(:?Tech Organization:(?P.*)\n)?Tech Street1:(?P.*)\n(?:Tech Street2:(?P.*)\n)?(?:Tech Street3:(?P.*)\n)?Tech City:(?P.*)\nTech State/Province:(?P.*)\nTech Postal Code:(?P.*)\nTech Country:(?P.*)\nTech Phone:(?P.*)\n(?:Tech Phone Ext.:(?P.*)\n)?(?:Tech FAX:(?P.*)\n)?(?:Tech FAX Ext.:(?P.*)\n)?Tech Email:(?P.*)", # Public Interest Registry (.org), "Technical Contact ID:\s*(?P.+)\nTechnical Contact Name:\s*(?P.+)\nTechnical Contact Organization:\s*(?P.*)\nTechnical Contact Address1:\s*(?P.+)\nTechnical Contact Address2:\s*(?P.*)\nTechnical Contact City:\s*(?P.+)\nTechnical Contact State/Province:\s*(?P.+)\nTechnical Contact Postal Code:\s*(?P.+)\nTechnical Contact Country:\s*(?P.+)\nTechnical Contact Country Code:\s*(?P.+)\nTechnical Contact Phone Number:\s*(?P.+)\nTechnical Contact Email:\s*(?P.+)\n", # .CO Internet "Tech Contact: (?P.+)\nTech Organization: (?P.+)\nTech Name: (?P.+)\nTech Street: (?P.+)\nTech City: (?P.+)\nTech Postal Code: (?P.+)\nTech State: (?P.+)\nTech Country: (?P.+)\nTech Phone: (?P.*)\nTech Phone Ext: (?P.*)\nTech Fax: (?P.*)\nTech Fax Ext: (?P.*)\nTech Email: (?P.*)\n", # Key-Systems GmbH "(?:Tech ID:[ ]*(?P.*)\n)?Tech[ ]*Name:[ ]*(?P.*)\nTech[ ]*Organization:[ ]*(?P.*)\nTech[ ]*Street:[ ]*(?P.+)\n(?:Tech[ ]*Street:[ ]*(?P.+)\n)?Tech[ ]*City:[ ]*(?P.+)\nTech[ ]*State\/Province:[ ]*(?P.+)\nTech[ ]*Postal[ ]*Code:[ ]*(?P.+)\nTech[ ]*Country:[ ]*(?P.+)\n(?:Tech[ ]*Phone:[ ]*(?P.*)\n)?(?:Tech[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Tech[ ]*Fax:[ ]*(?P.*)\n)?(?:Tech[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Tech[ ]*Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum) "Technical Contact\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # " Technical Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # "tech-id:[ ]*(?P.*)\n(?:tech-organization:[ ]*(?P.*)\n)?tech-name:[ ]*(?P.*)\ntech-street:[ ]*(?P.*)\ntech-city:[ ]*(?P.*)\ntech-zip:[ ]*(?P.*)\ntech-country:[ ]*(?P.*)\n(?:tech-phone:[ ]*(?P.*)\n)?(?:tech-fax:[ ]*(?P.*)\n)?tech-email:[ ]*(?P.*)", # InterNetworX "Technical contact:\n(?P[\S\s]+)\n(?P.+)\n(?P[A-Z0-9-]+)\s+(?P.+)\n(?P.+)\n\n", # "Tech Contact ID:[ ]*(?P.+)\nTech Contact Name:[ ]*(?P.+)", # .au "Technical Contact ID:[ ]*(?P.*)\nTechnical Contact Name:[ ]*(?P.*)\nTechnical Contact Address1:[ ]*(?P.*)\n(?:Technical Contact Address2:[ ]*(?P.*)\n)?(?:Technical Contact Address3:[ ]*(?P.*)\n)?Technical Contact City:[ ]*(?P.*)\nTechnical Contact State/Province:[ ]*(?P.*)\nTechnical Contact Postal Code:[ ]*(?P.*)\nTechnical Contact Country:[ ]*(?P.*)\nTechnical Contact Country Code:[ ]*.*\nTechnical Contact Phone Number:[ ]*(?P.*)\nTechnical Contact Email:[ ]*(?P.*)", # .US (NeuStar) "Tech Name[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n(?: Tech Address[.]* (?P.*)\n)? Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Email[.]* (?P.*)\n Tech Phone[.]* (?P.*)\n Tech Fax[.]* (?P.*)", # Melbourne IT ] admin_contact_regexes = [ " Administrative Contact:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. "Administrative Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH "Admin ID:(?P.+)\nAdmin Name:(?P.*)\n(?:Admin Organization:(?P.*)\n)?Admin Street1:(?P.*)\n(?:Admin Street2:(?P.*)\n)?(?:Admin Street3:(?P.*)\n)?Admin City:(?P.*)\nAdmin State/Province:(?P.*)\nAdmin Postal Code:(?P.*)\nAdmin Country:(?P.*)\nAdmin Phone:(?P.*)\n(?:Admin Phone Ext.:(?P.*)\n)?(?:Admin FAX:(?P.*)\n)?(?:Admin FAX Ext.:(?P.*)\n)?Admin Email:(?P.*)", # Public Interest Registry (.org), "Administrative Contact ID:\s*(?P.+)\nAdministrative Contact Name:\s*(?P.+)\nAdministrative Contact Organization:\s*(?P.*)\nAdministrative Contact Address1:\s*(?P.+)\nAdministrative Contact Address2:\s*(?P.*)\nAdministrative Contact City:\s*(?P.+)\nAdministrative Contact State/Province:\s*(?P.+)\nAdministrative Contact Postal Code:\s*(?P.+)\nAdministrative Contact Country:\s*(?P.+)\nAdministrative Contact Country Code:\s*(?P.+)\nAdministrative Contact Phone Number:\s*(?P.+)\nAdministrative Contact Email:\s*(?P.+)\n", # .CO Internet "Admin Contact: (?P.+)\nAdmin Organization: (?P.+)\nAdmin Name: (?P.+)\nAdmin Street: (?P.+)\nAdmin City: (?P.+)\nAdmin State: (?P.+)\nAdmin Postal Code: (?P.+)\nAdmin Country: (?P.+)\nAdmin Phone: (?P.*)\nAdmin Phone Ext: (?P.*)\nAdmin Fax: (?P.*)\nAdmin Fax Ext: (?P.*)\nAdmin Email: (?P.*)\n", # Key-Systems GmbH "(?:Admin ID:[ ]*(?P.*)\n)?Admin[ ]*Name:[ ]*(?P.*)\nAdmin[ ]*Organization:[ ]*(?P.*)\nAdmin[ ]*Street:[ ]*(?P.+)\n(?:Admin[ ]*Street:[ ]*(?P.+)\n)?Admin[ ]*City:[ ]*(?P.+)\nAdmin[ ]*State\/Province:[ ]*(?P.+)\nAdmin[ ]*Postal[ ]*Code:[ ]*(?P.+)\nAdmin[ ]*Country:[ ]*(?P.+)\n(?:Admin[ ]*Phone:[ ]*(?P.*)\n)?(?:Admin[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Admin[ ]*Fax:[ ]*(?P.*)\n)?(?:Admin[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Admin[ ]*Email:[ ]*(?P.+)\n)?", # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum) "Administrative Contact\n(?: (?P.+)\n)? (?P.+)\n Email:(?P.+)\n (?P.+)\n(?: (?P.+)\n)? (?P.+) (?P.+)\n (?P.+)\n Tel: (?P.+)\n\n", # " Administrative Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # "admin-id:[ ]*(?P.*)\n(?:admin-organization:[ ]*(?P.*)\n)?admin-name:[ ]*(?P.*)\nadmin-street:[ ]*(?P.*)\nadmin-city:[ ]*(?P.*)\nadmin-zip:[ ]*(?P.*)\nadmin-country:[ ]*(?P.*)\n(?:admin-phone:[ ]*(?P.*)\n)?(?:admin-fax:[ ]*(?P.*)\n)?admin-email:[ ]*(?P.*)", # InterNetworX "Administrative Contact ID:[ ]*(?P.*)\nAdministrative Contact Name:[ ]*(?P.*)\nAdministrative Contact Address1:[ ]*(?P.*)\n(?:Administrative Contact Address2:[ ]*(?P.*)\n)?(?:Administrative Contact Address3:[ ]*(?P.*)\n)?Administrative Contact City:[ ]*(?P.*)\nAdministrative Contact State/Province:[ ]*(?P.*)\nAdministrative Contact Postal Code:[ ]*(?P.*)\nAdministrative Contact Country:[ ]*(?P.*)\nAdministrative Contact Country Code:[ ]*.*\nAdministrative Contact Phone Number:[ ]*(?P.*)\nAdministrative Contact Email:[ ]*(?P.*)", # .US (NeuStar) "Admin Name[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n(?: Admin Address[.]* (?P.*)\n)? Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Email[.]* (?P.*)\n Admin Phone[.]* (?P.*)\n Admin Fax[.]* (?P.*)", # Melbourne IT ] billing_contact_regexes = [ "Billing ID:(?P.+)\nBilling Name:(?P.*)\nBilling Organization:(?P.*)\nBilling Street1:(?P.*)\n(?:Billing Street2:(?P.*)\n)?(?:Billing Street3:(?P.*)\n)?Billing City:(?P.*)\nBilling State/Province:(?P.*)\nBilling Postal Code:(?P.*)\nBilling Country:(?P.*)\nBilling Phone:(?P.*)\n(?:Billing Phone Ext.:(?P.*)\n)?(?:Billing FAX:(?P.*)\n)?(?:Billing FAX Ext.:(?P.*)\n)?Billing Email:(?P.*)", # "Billing Contact ID:\s*(?P.+)\nBilling Contact Name:\s*(?P.+)\nBilling Contact Organization:\s*(?P.*)\nBilling Contact Address1:\s*(?P.+)\nBilling Contact Address2:\s*(?P.*)\nBilling Contact City:\s*(?P.+)\nBilling Contact State/Province:\s*(?P.+)\nBilling Contact Postal Code:\s*(?P.+)\nBilling Contact Country:\s*(?P.+)\nBilling Contact Country Code:\s*(?P.+)\nBilling Contact Phone Number:\s*(?P.+)\nBilling Contact Email:\s*(?P.+)\n", # .CO Internet "Billing Contact: (?P.+)\nBilling Organization: (?P.+)\nBilling Name: (?P.+)\nBilling Street: (?P.+)\nBilling City: (?P.+)\nBilling Postal Code: (?P.+)\nBilling State: (?P.+)\nBilling Country: (?P.+)\nBilling Phone: (?P.*)\nBilling Phone Ext: (?P.*)\nBilling Fax: (?P.*)\nBilling Fax Ext: (?P.*)\nBilling Email: (?P.*)\n", # Key-Systems GmbH "(?:Billing ID:[ ]*(?P.*)\n)?Billing[ ]*Name:[ ]*(?P.*)\nBilling[ ]*Organization:[ ]*(?P.*)\nBilling[ ]*Street:[ ]*(?P.+)\n(?:Billing[ ]*Street:[ ]*(?P.+)\n)?Billing[ ]*City:[ ]*(?P.+)\nBilling[ ]*State\/Province:[ ]*(?P.+)\nBilling[ ]*Postal[ ]*Code:[ ]*(?P.+)\nBilling[ ]*Country:[ ]*(?P.+)\n(?:Billing[ ]*Phone:[ ]*(?P.*)\n)?(?:Billing[ ]*Phone[ ]*Ext:[ ]*(?P.*)\n)?(?:Billing[ ]*Fax:[ ]*(?P.*)\n)?(?:Billing[ ]*Fax[ ]*Ext:\s*?(?P.*)\n)?(?:Billing[ ]*Email:[ ]*(?P.+)\n)?", # Musedoma (.museum) "Billing Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH " Billing Contact Details:[ ]*\n (?P.*)\n (?P.*)[ ]{2,}\((?P.*)\)\n (?P.*)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.*)\n (?P.*),(?P.*)\n (?P.*)\n Tel. (?P.*)", # "billing-id:[ ]*(?P.*)\n(?:billing-organization:[ ]*(?P.*)\n)?billing-name:[ ]*(?P.*)\nbilling-street:[ ]*(?P.*)\nbilling-city:[ ]*(?P.*)\nbilling-zip:[ ]*(?P.*)\nbilling-country:[ ]*(?P.*)\n(?:billing-phone:[ ]*(?P.*)\n)?(?:billing-fax:[ ]*(?P.*)\n)?billing-email:[ ]*(?P.*)", # InterNetworX "Billing Contact ID:[ ]*(?P.*)\nBilling Contact Name:[ ]*(?P.*)\nBilling Contact Address1:[ ]*(?P.*)\n(?:Billing Contact Address2:[ ]*(?P.*)\n)?(?:Billing Contact Address3:[ ]*(?P.*)\n)?Billing Contact City:[ ]*(?P.*)\nBilling Contact State/Province:[ ]*(?P.*)\nBilling Contact Postal Code:[ ]*(?P.*)\nBilling Contact Country:[ ]*(?P.*)\nBilling Contact Country Code:[ ]*.*\nBilling Contact Phone Number:[ ]*(?P.*)\nBilling Contact Email:[ ]*(?P.*)", # .US (NeuStar) ] # Some registries use NIC handle references instead of directly listing contacts... nic_contact_regexes = [ "personname:\s*(?P.+)\norganization:\s*(?P.+)\nstreet address:\s*(?P.+)\npostal code:\s*(?P.+)\ncity:\s*(?P.+)\ncountry:\s*(?P.+)\nphone:\s*(?P.+)\nfax-no:\s*(?P.+)\ne-mail:\s*(?P.+)\nnic-hdl:\s*(?P.+)\nchanged:\s*(?P.+)", # "nic-hdl:\s*(?P.+)\ntype:\s*(?P.+)\ncontact:\s*(?P.+)\n(?:.+\n)*?(?:address:\s*(?P.+)\naddress:\s*(?P.+)\naddress:\s*(?P.+)\naddress:\s*(?P.+)\n)?(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P.+)\n)?(?:.+\n)*?changed:\s*(?P[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness without country field "nic-hdl:\s*(?P.+)\ntype:\s*(?P.+)\ncontact:\s*(?P.+)\n(?:.+\n)*?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P.+)\n)?(?:.+\n)*?changed:\s*(?P[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness any country -at all- "nic-hdl:\s*(?P.+)\ntype:\s*(?P.+)\ncontact:\s*(?P.+)\n(?:.+\n)*?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?country:\s*(?P.+)\n(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P.+)\n)?(?:.+\n)*?changed:\s*(?P[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness with country field ] nic_contact_references = { "registrant": [ "registrant:\s*(?P.+)", # "holder-c:\s*(?P.+)", # AFNIC "holder:\s*(?P.+)", # (they apparently want to be difficult, and won't give you contact info for the handle over their WHOIS service) ], "tech": [ "tech-c:\s*(?P.+)", #, AFNIC, ], "admin": [ "admin-c:\s*(?P.+)", #, AFNIC, ], "billing": [ "billing-c:\s*(?P.+)" # ] } for segment in data: for regex in registrant_regexes: match =, segment) if match is not None: registrant = match.groupdict() break for segment in data: for regex in tech_contact_regexes: match =, segment) if match is not None: tech_contact = match.groupdict() break for segment in data: for regex in admin_contact_regexes: match =, segment) if match is not None: admin_contact = match.groupdict() break for segment in data: for regex in billing_contact_regexes: match =, segment) if match is not None: billing_contact = match.groupdict() break # Find NIC handle contact definitions handle_contacts = [] for regex in nic_contact_regexes: for segment in data: matches = re.finditer(regex, segment) for match in matches: handle_contacts.append(match.groupdict()) # Find NIC handle references and process them for category in nic_contact_references: for regex in nic_contact_references[category]: for segment in data: match =, segment) if match is not None: data_reference = match.groupdict() if data_reference["handle"] == "-": pass # Blank else: for contact in handle_contacts: if contact["handle"] == data_reference["handle"]: data_reference.update(contact) if category == "registrant": registrant = data_reference elif category == "tech": tech_contact = data_reference elif category == "billing": billing_contact = data_reference elif category == "admin": admin_contact = data_reference break # Post-processing for obj in (registrant, tech_contact, billing_contact, admin_contact): if obj is not None: for key in obj.keys(): if obj[key] is None or obj[key].strip() == "": # Just chomp all surrounding whitespace del obj[key] else: obj[key] = obj[key].strip() if "phone_ext" in obj: if "phone" in obj: obj["phone"] += "ext. %s" % obj["phone_ext"] del obj["phone_ext"] if "street1" in obj: street_items = [] i = 1 while True: try: street_items.append(obj["street%d" % i]) del obj["street%d" % i] except KeyError, e: break i += 1 obj["street"] = "\n".join(street_items) if 'changedate' in obj: obj['changedate'] = parse_dates([obj['changedate']])[0] if 'street' in obj and "\n" in obj["street"] and 'postalcode' not in obj: # Deal with certain mad WHOIS servers that don't properly delimit address data... (yes, AFNIC, looking at you) lines = [x.strip() for x in obj["street"].splitlines()] if " " in lines[-1]: postal_code, city = lines[-1].split(" ", 1) obj["postalcode"] = postal_code obj["city"] = city obj["street"] = "\n".join(lines[:-1]) return { "registrant": registrant, "tech": tech_contact, "admin": admin_contact, "billing": billing_contact, }