diff --git a/pythonwhois/parse.py b/pythonwhois/parse.py index 0265426..67f6290 100644 --- a/pythonwhois/parse.py +++ b/pythonwhois/parse.py @@ -2,7 +2,7 @@ from __future__ import print_function import re, sys, datetime, csv, pkgutil from . import net, shared -try: +try: from io import StringIO except ImportError: from cStringIO import StringIO @@ -25,13 +25,13 @@ def read_dataset(filename, destination, abbrev_key, name_key, is_dict=False): destination[line[abbrev_key]] = line[name_key] except IOError as e: pass - + airports = {} countries = {} states_au = {} states_us = {} states_ca = {} - + try: reader = csv.reader(pkgdata("airports.dat").splitlines()) @@ -50,7 +50,7 @@ read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True) def precompile_regexes(source, flags=0): return [re.compile(regex, flags) for regex in source] - + grammar = { "_data": { 'id': ['Domain ID:[ ]*(?P.+)'], @@ -157,7 +157,7 @@ grammar = { '(?<=[ .]{2})(?P[a-z0-9-]+\.d?ns[0-9]*\.([a-z0-9-]+\.)+[a-z0-9]+)', '(?<=[ .]{2})(?P([a-z0-9-]+\.)+[a-z0-9]+)(\s+([0-9]{1,3}\.){3}[0-9]{1,3})', '(?<=[ .]{2})[^a-z0-9.-](?Pd?ns\.([a-z0-9-]+\.)+[a-z0-9]+)', - 'Nserver:\s*(?P.+)'], + 'Nserver:\s*(?P.+)'], 'emails': ['(?P[\w.-]+@[\w.-]+\.[\w]{2,6})', # Really need to fix this, much longer TLDs now exist... '(?P[\w.-]+\sAT\s[\w.-]+\sDOT\s[\w]{2,6})'] }, @@ -235,7 +235,7 @@ registrant_regexes = [ "Domain Owner:\n\t(?P.+)\n\n[\s\S]*?(?:Registrant Contact:\n\t(?P.+))?\n\nRegistrant(?:'s)? (?:a|A)ddress:(?:\n\t(?P.+)\n(?:\t(?P.+)\n)?(?:\t(?P.+)\n)?\t(?P.+)\n\t(?P.+))?\n\t(?P.+)(?:\n\t(?P.+) \(Phone\)\n\t(?P.+) \(FAX\)\n\t(?P.+))?\n\n", # .ac.uk - what a mess... "Registrant ID: (?P.+)\nRegistrant: (?P.+)\nRegistrant Contact Email: (?P.+)", # .cn (CNNIC) "Registrant contact:\n (?P.+)\n (?P.*)\n (?P.+), (?P.+) (?P.+) (?P.+)\n\n", # Fabulous.com - "registrant-name:\s*(?P.+)\n(registrant-organization:\s*(?P.*)\n)?registrant-type:\s*(?P.+)\nregistrant-address:\s*(?P.+)\nregistrant-postcode:\s*(?P.+)\nregistrant-city:\s*(?P.+)\nregistrant-country:\s*(?P.+)\n(?:registrant-phone:\s*(?P.+)\n)?(?:registrant-email:\s*(?P.+)\n)?", # Hetzner + "registrant-name:\s*(?P.+)\n(?:registrant-organization:\s*(?P.*)\n)?registrant-type:\s*(?P.+)\nregistrant-address:\s*(?P.+)\nregistrant-postcode:\s*(?P.+)\nregistrant-city:\s*(?P.+)\nregistrant-country:\s*(?P.+)\n(?:registrant-phone:\s*(?P.+)\n)?(?:registrant-email:\s*(?P.+)\n)?", # Hetzner "Registrant Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication "Contact Information : For Customer # [0-9]+[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication alternative (private WHOIS) format? "Registrant:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) @@ -271,7 +271,7 @@ tech_contact_regexes = [ "Technical Contacts\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it // NOTE: Why does this say 'Contacts'? Can it have multiple? "Tech Name[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n(?: Tech Address[.]* (?P.*)\n)? Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Address[.]* (?P.*)\n Tech Email[.]* (?P.*)\n Tech Phone[.]* (?P.*)\n Tech Fax[.]* (?P.*)", # Melbourne IT "Technical contact:\n(?: (?P.+)\n)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com - "tech-c-name:\s*(?P.+)\n(tech-c-organization:\s*(?P.*)\n)?tech-c-type:\s*(?P.+)\ntech-c-address:\s*(?P.+)\ntech-c-postcode:\s*(?P.+)\ntech-c-city:\s*(?P.+)\ntech-c-country:\s*(?P.+)\n(?:tech-c-phone:\s*(?P.+)\n)?(?:tech-c-email:\s*(?P.+)\n)?", # Hetzner + "tech-c-name:\s*(?P.+)\n(?:tech-c-organization:\s*(?P.*)\n)?tech-c-type:\s*(?P.+)\ntech-c-address:\s*(?P.+)\ntech-c-postcode:\s*(?P.+)\ntech-c-city:\s*(?P.+)\ntech-c-country:\s*(?P.+)\n(?:tech-c-phone:\s*(?P.+)\n)?(?:tech-c-email:\s*(?P.+)\n)?", # Hetzner "Admin Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication " Technical contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am "Technical:\n\s*Name:\s*(?P.*)\n\s*Organisation:\s*(?P.*)\n\s*Language:.*\n\s*Phone:\s*(?P.*)\n\s*Fax:\s*(?P.*)\n\s*Email:\s*(?P.*)\n", # EURid @@ -304,7 +304,7 @@ admin_contact_regexes = [ "Admin Contact\n Name: (?P.+)\n(?: Organization: (?P.+)\n)? ContactID: (?P.+)\n(?: Address: (?P.+)\n(?: (?P.+)\n(?: (?P.+)\n)?)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n)?(?: Created: (?P.+)\n)?(?: Last Update: (?P.+)\n)?", # nic.it "Admin Name[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n(?: Admin Address[.]* (?P.*)\n)? Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Address[.]* (?P.*)\n Admin Email[.]* (?P.*)\n Admin Phone[.]* (?P.*)\n Admin Fax[.]* (?P.*)", # Melbourne IT "Administrative contact:\n(?: (?P.+)\n)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com - "admin-c-name:\s*(?P.+)\n(admin-c-organization:\s*(?P.*)\n)?admin-c-type:\s*(?P.+)\nadmin-c-address:\s*(?P.+)\nadmin-c-postcode:\s*(?P.+)\nadmin-c-city:\s*(?P.+)\nadmin-c-country:\s*(?P.+)\n(?:admin-c-phone:\s*(?P.+)\n)?(?:admin-c-email:\s*(?P.+)\n)?", # Hetzner + "admin-c-name:\s*(?P.+)\n(?:admin-c-organization:\s*(?P.*)\n)?admin-c-type:\s*(?P.+)\nadmin-c-address:\s*(?P.+)\nadmin-c-postcode:\s*(?P.+)\nadmin-c-city:\s*(?P.+)\nadmin-c-country:\s*(?P.+)\n(?:admin-c-phone:\s*(?P.+)\n)?(?:admin-c-email:\s*(?P.+)\n)?", # Hetzner "Tech Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication " Administrative contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am "Administrative Contact:\n Name: (?P.+)\n City: (?P.+)\n State: (?P.+)\n Country: (?P.+)\n", # Akky (.com.mx) @@ -553,7 +553,7 @@ def parse_raw_whois(raw_data, normalized=None, never_query_handles=True, handle_ data["nameservers"].append(match.strip()) except KeyError as e: data["nameservers"] = [match.strip()] - + data["contacts"] = parse_registrants(raw_data, never_query_handles, handle_server) @@ -645,7 +645,7 @@ def normalize_data(data, normalized): for country, source in (("united states", states_us), ("australia", states_au), ("canada", states_ca)): if country in contact["country"].lower() and contact["state"] in source: contact["state"] = source[contact["state"]] - + for key in ("email",): if key in contact and contact[key] is not None and (normalized == True or key in normalized): if is_string(contact[key]): @@ -660,7 +660,7 @@ def normalize_data(data, normalized): for key in ("city", "organization", "state", "country"): if key in contact and contact[key] is not None and (normalized == True or key in normalized): contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3) - + if "name" in contact and "organization" not in contact: lines = [x.strip() for x in contact["name"].splitlines()] new_lines = [] @@ -674,10 +674,10 @@ def normalize_data(data, normalized): contact["name"] = "\n".join(lines) else: del contact["name"] - + if len(new_lines) > 0: contact["organization"] = "\n".join(new_lines) - + if "street" in contact and "organization" not in contact: lines = [x.strip() for x in contact["street"].splitlines()] if len(lines) > 1: @@ -686,7 +686,7 @@ def normalize_data(data, normalized): contact["organization"] = lines[0] contact["street"] = "\n".join(lines[1:]) break - + for key in list(contact.keys()): try: contact[key] = contact[key].strip(", ") @@ -831,10 +831,10 @@ def remove_suffixes(data): # Removes everything before and after the first non-whitespace continuous string. # Used to get rid of IP suffixes for nameservers. cleaned_list = [] - + for entry in data: cleaned_list.append(re.search("([^\s]+)\s*[\s]*", entry).group(1).lstrip()) - + return cleaned_list def parse_registrants(data, never_query_handles=True, handle_server=""): @@ -911,7 +911,7 @@ def parse_registrants(data, never_query_handles=True, handle_server=""): elif category == "admin": admin_contact = data_reference break - + # Post-processing for obj in (registrant, tech_contact, billing_contact, admin_contact): if obj is not None: @@ -986,12 +986,12 @@ def fetch_nic_contact(handle, lookup_server): response = net.get_whois_raw(handle, lookup_server) response = [segment.replace("\r", "") for segment in response] # Carriage returns are the devil results = parse_nic_contact(response) - + if len(results) > 0: return results[0] else: raise shared.WhoisException("No contact data found in the response.") - + def parse_nic_contact(data): handle_contacts = [] for regex in nic_contact_regexes: @@ -999,5 +999,5 @@ def parse_nic_contact(data): matches = re.finditer(regex, segment) for match in matches: handle_contacts.append(match.groupdict()) - + return handle_contacts