diff --git a/.gitignore b/.gitignore index 674f765..b20f88c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ build dist *.egg-info +.tox diff --git a/README.md b/README.md index 7f008f2..629179d 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Make sure to verify (using `pwhois` or otherwise) that the WHOIS data for the do ./test.py update thedomain.com -### Running the full test suite +### Running all tests ./test.py run all @@ -77,6 +77,10 @@ Make sure to verify (using `pwhois` or otherwise) that the WHOIS data for the do ./test.py run thedomain.com +### Running the full test suite including support for multiple python versions + + tox + ### Generating documentation You need [ZippyDoc](http://cryto.net/zippydoc) (which can be installed through `pip install zippydoc`). diff --git a/pythonwhois/parse.py b/pythonwhois/parse.py index c15d6d2..b3d5168 100644 --- a/pythonwhois/parse.py +++ b/pythonwhois/parse.py @@ -1,4 +1,5 @@ -import re, datetime +from __future__ import print_function +import re, sys, datetime grammar = { "_data": { @@ -136,26 +137,37 @@ grammar = { } } + +if sys.version_info < (3, 0): + def is_string(data): + """Test for string with support for python 2.""" + return isinstance(data, basestring) +else: + def is_string(data): + """Test for string with support for python 3.""" + return isinstance(data, str) + + def parse_raw_whois(raw_data, normalized=[]): data = {} - + raw_data = [segment.replace("\r", "") for segment in raw_data] # Carriage returns are the devil - + for segment in raw_data: - for rule_key, rule_regexes in grammar['_data'].iteritems(): - if data.has_key(rule_key) == False: + for rule_key, rule_regexes in grammar['_data'].items(): + if (rule_key in data) == False: for line in segment.splitlines(): for regex in rule_regexes: result = re.search(regex, line, re.IGNORECASE) - + if result is not None: val = result.group("val").strip() if val != "": try: data[rule_key].append(val) - except KeyError, e: + except KeyError as e: data[rule_key] = [val] - + # Whois.com is a bit special... Fabulous.com also seems to use this format. match = re.search("Name Servers:([/s/S]+)\n\n", segment) if match is not None: @@ -163,7 +175,7 @@ def parse_raw_whois(raw_data, normalized=[]): for match in re.findall("[ ]+(.+)\n", chunk): try: data["nameservers"].append(match.strip()) - except KeyError, e: + except KeyError as e: data["nameservers"] = [match.strip()] # Nominet also needs some special attention match = re.search(" Registrar:\n (.+)\n", segment) @@ -176,7 +188,7 @@ def parse_raw_whois(raw_data, normalized=[]): match = match.split()[0] try: data["nameservers"].append(match.strip()) - except KeyError, e: + except KeyError as e: data["nameservers"] = [match.strip()] # .am plays the same game match = re.search(" DNS servers:([\s\S]*?\n)\n", segment) @@ -186,7 +198,7 @@ def parse_raw_whois(raw_data, normalized=[]): match = match.split()[0] try: data["nameservers"].append(match.strip()) - except KeyError, e: + except KeyError as e: data["nameservers"] = [match.strip()] # SIDN isn't very standard either. match = re.search("Registrar:\n\s+(\S.*)", segment) @@ -199,109 +211,109 @@ def parse_raw_whois(raw_data, normalized=[]): match = match.split()[0] try: data["nameservers"].append(match.strip()) - except KeyError, e: + except KeyError as e: data["nameservers"] = [match.strip()] # The .ie WHOIS server puts ambiguous status information in an unhelpful order match = re.search('ren-status:\s*(.+)', segment) if match is not None: data["status"].insert(0, match.group(1).strip()) - + data["contacts"] = parse_registrants(raw_data) - + # Parse dates try: data['expiration_date'] = remove_duplicates(data['expiration_date']) data['expiration_date'] = parse_dates(data['expiration_date']) - except KeyError, e: + except KeyError as e: pass # Not present - + try: data['creation_date'] = remove_duplicates(data['creation_date']) data['creation_date'] = parse_dates(data['creation_date']) - except KeyError, e: + except KeyError as e: pass # Not present - + try: data['updated_date'] = remove_duplicates(data['updated_date']) data['updated_date'] = parse_dates(data['updated_date']) - except KeyError, e: + except KeyError as e: pass # Not present - + try: data['nameservers'] = remove_duplicates([ns.rstrip(".") for ns in data['nameservers']]) - except KeyError, e: + except KeyError as e: pass # Not present - + try: data['emails'] = remove_duplicates(data['emails']) - except KeyError, e: + except KeyError as e: pass # Not present - + try: data['registrar'] = remove_duplicates(data['registrar']) - except KeyError, e: + except KeyError as e: pass # Not present - + # Remove e-mail addresses if they are already listed for any of the contacts known_emails = [] for contact in ("registrant", "tech", "admin", "billing"): if data["contacts"][contact] is not None: try: known_emails.append(data["contacts"][contact]["email"]) - except KeyError, e: + except KeyError as e: pass # No e-mail recorded for this contact... try: data['emails'] = [email for email in data["emails"] if email not in known_emails] - except KeyError, e: + except KeyError as e: pass # Not present - - for key in data.keys(): + + for key in list(data.keys()): if data[key] is None or len(data[key]) == 0: del data[key] - + data["raw"] = raw_data - + if normalized != []: data = normalize_data(data, normalized) - + return data def normalize_data(data, normalized): for key in ("nameservers", "emails", "whois_server"): if key in data and data[key] is not None and (normalized == True or key in normalized): - if isinstance(data[key], basestring): + if is_string(data[key]): data[key] = data[key].lower() else: data[key] = [item.lower() for item in data[key]] - + for key, threshold in (("registrar", 4), ("status", 3)): if key in data and data[key] is not None and (normalized == True or key in normalized): - if isinstance(data[key], basestring): + if is_string(data[key]): data[key] = normalize_name(data[key], abbreviation_threshold=threshold, length_threshold=1) else: data[key] = [normalize_name(item, abbreviation_threshold=threshold, length_threshold=1) for item in data[key]] - - for contact_type, contact in data['contacts'].iteritems(): + + for contact_type, contact in data['contacts'].items(): if contact is not None: for key in ("email",): if key in contact and contact[key] is not None and (normalized == True or key in normalized): - if isinstance(contact[key], basestring): + if isinstance(contact[key], str): contact[key] = contact[key].lower() else: contact[key] = [item.lower() for item in contact[key]] - + for key in ("name", "street"): if key in contact and contact[key] is not None and (normalized == True or key in normalized): contact[key] = normalize_name(contact[key], abbreviation_threshold=3) - + for key in ("city", "organization", "state", "country"): if key in contact and contact[key] is not None and (normalized == True or key in normalized): contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3) - - for key in contact.keys(): + + for key in list(contact.keys()): try: contact[key] = contact[key].strip(", ") - except AttributeError, e: + except AttributeError as e: pass # Not a string return data @@ -348,57 +360,57 @@ def normalize_name(value, abbreviation_threshold=4, length_threshold=8, lowercas def parse_dates(dates): global grammar parsed_dates = [] - + for date in dates: for rule in grammar['_dateformats']: result = re.match(rule, date, re.IGNORECASE) - + if result is not None: try: # These are always numeric. If they fail, there is no valid date present. year = int(result.group("year")) day = int(result.group("day")) - + # Detect and correct shorthand year notation if year < 60: year += 2000 elif year < 100: year += 1900 - + # This will require some more guesswork - some WHOIS servers present the name of the month try: month = int(result.group("month")) - except ValueError, e: + except ValueError as e: # Apparently not a number. Look up the corresponding number. try: month = grammar['_months'][result.group("month").lower()] - except KeyError, e: + except KeyError as e: # Unknown month name, default to 0 month = 0 - + try: hour = int(result.group("hour")) - except IndexError, e: + except IndexError as e: hour = 0 - except TypeError, e: + except TypeError as e: hour = 0 - + try: minute = int(result.group("minute")) - except IndexError, e: + except IndexError as e: minute = 0 - except TypeError, e: + except TypeError as e: minute = 0 - + try: second = int(result.group("second")) - except IndexError, e: + except IndexError as e: second = 0 - except TypeError, e: + except TypeError as e: second = 0 - + break - except ValueError, e: + except ValueError as e: # Something went horribly wrong, maybe there is no valid date present? year = 0 month = 0 @@ -406,18 +418,18 @@ def parse_dates(dates): hour = 0 minute = 0 second = 0 - print e.message + print(e.message) try: if year > 0: try: parsed_dates.append(datetime.datetime(year, month, day, hour, minute, second)) - except ValueError, e: + except ValueError as e: # We might have gotten the day and month the wrong way around, let's try it the other way around # If you're not using an ISO-standard date format, you're an evil registrar! parsed_dates.append(datetime.datetime(year, day, month, hour, minute, second)) - except UnboundLocalError, e: + except UnboundLocalError as e: pass - + if len(parsed_dates) > 0: return parsed_dates else: @@ -425,11 +437,11 @@ def parse_dates(dates): def remove_duplicates(data): cleaned_list = [] - + for entry in data: if entry not in cleaned_list: cleaned_list.append(entry) - + return cleaned_list def preprocess_regex(regex): @@ -440,7 +452,7 @@ def parse_registrants(data): tech_contact = None billing_contact = None admin_contact = None - + registrant_regexes = [ " Registrant:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. "Registrant:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH @@ -492,7 +504,7 @@ def parse_registrants(data): "Admin Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication " Technical contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am ] - + admin_contact_regexes = [ " Administrative Contact:[ ]*\n (?P.*)\n (?P.*)\n (?P.*)\n (?P.*), (?P.*) (?P.*)\n (?P.*)\n(?: Phone: (?P.*)\n)? Email: (?P.*)\n", # Corporate Domains, Inc. "Administrative Contact:\n (?P.+)\n (?P.+)\n(?: (?P.*)\n)?(?: (?P.*)\n)? (?P.+), (?P.+)\n (?P.+)\n (?P.+)\n (?P.+)\n\n", # OVH @@ -511,7 +523,7 @@ def parse_registrants(data): "Tech Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication " Administrative contact:\n (?P.+)\n (?P.*)\n (?P.+)\n (?P.+) (?P\S+),[ ]+(?P.+)\n (?P.+)\n (?P.+)\n (?P.*)\n (?P.*)", # .am ] - + billing_contact_regexes = [ "Billing ID:(?P.+)\nBilling Name:(?P.*)\nBilling Organization:(?P.*)\nBilling Street1:(?P.*)\n(?:Billing Street2:(?P.*)\n)?(?:Billing Street3:(?P.*)\n)?Billing City:(?P.*)\nBilling State/Province:(?P.*)\nBilling Postal Code:(?P.*)\nBilling Country:(?P.*)\nBilling Phone:(?P.*)\n(?:Billing Phone Ext.:(?P.*)\n)?(?:Billing FAX:(?P.*)\n)?(?:Billing FAX Ext.:(?P.*)\n)?Billing Email:(?P.*)", # nic.pw "Billing Contact ID:\s*(?P.+)\nBilling Contact Name:\s*(?P.+)\nBilling Contact Organization:\s*(?P.*)\nBilling Contact Address1:\s*(?P.+)\nBilling Contact Address2:\s*(?P.*)\nBilling Contact City:\s*(?P.+)\nBilling Contact State/Province:\s*(?P.+)\nBilling Contact Postal Code:\s*(?P.+)\nBilling Contact Country:\s*(?P.+)\nBilling Contact Country Code:\s*(?P.+)\nBilling Contact Phone Number:\s*(?P.+)\nBilling Contact Email:\s*(?P.+)\n", # .CO Internet @@ -525,18 +537,18 @@ def parse_registrants(data): "Billing contact:\n(?: (?P.+)\n)? (?P.+)\n (?P.+)\n (?P.+)\n (?P.+), (?P.+) (?P.+) (?P.+)\n Phone: (?P.*)\n Fax: (?P.*)\n", # Fabulous.com "Billing Contact Information :[ ]*\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n[ ]+(?P.*)\n\n", # GAL Communication ] - + # Some registries use NIC handle references instead of directly listing contacts... - + nic_contact_regexes = [ "personname:\s*(?P.+)\norganization:\s*(?P.+)\nstreet address:\s*(?P.+)\npostal code:\s*(?P.+)\ncity:\s*(?P.+)\ncountry:\s*(?P.+)\n(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:e-mail:\s*(?P.+)\n)?nic-hdl:\s*(?P.+)\nchanged:\s*(?P.+)", # nic.at "person:\s*(?P.+)\nnic-hdl:\s*(?P.+)\n", # .ie "nic-hdl:\s*(?P.+)\ntype:\s*(?P.+)\ncontact:\s*(?P.+)\n(?:.+\n)*?(?:address:\s*(?P.+)\naddress:\s*(?P.+)\naddress:\s*(?P.+)\naddress:\s*(?P.+)\n)?(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P.+)\n)?(?:.+\n)*?changed:\s*(?P[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness without country field "nic-hdl:\s*(?P.+)\ntype:\s*(?P.+)\ncontact:\s*(?P.+)\n(?:.+\n)*?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P.+)\n)?(?:.+\n)*?changed:\s*(?P[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness any country -at all- "nic-hdl:\s*(?P.+)\ntype:\s*(?P.+)\ncontact:\s*(?P.+)\n(?:.+\n)*?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?(?:address:\s*(?P.+)\n)?country:\s*(?P.+)\n(?:phone:\s*(?P.+)\n)?(?:fax-no:\s*(?P.+)\n)?(?:.+\n)*?(?:e-mail:\s*(?P.+)\n)?(?:.+\n)*?changed:\s*(?P[0-9]{2}\/[0-9]{2}\/[0-9]{4}).*\n", # AFNIC madness with country field - + ] - + nic_contact_references = { "registrant": [ "registrant:\s*(?P.+)", # nic.at @@ -553,7 +565,7 @@ def parse_registrants(data): "billing-c:\s*(?P.+)" # iis.se ] } - + # Why do the below? The below is meant to handle with an edge case (issue #2) where a partial match followed # by a failure, for a regex containing the \s*.+ pattern, would send the regex module on a wild goose hunt for # matching positions. The workaround is to use \S.* instead of .+, but in the interest of keeping the regexes @@ -563,35 +575,35 @@ def parse_registrants(data): tech_contact_regexes = [preprocess_regex(regex) for regex in tech_contact_regexes] admin_contact_regexes = [preprocess_regex(regex) for regex in admin_contact_regexes] billing_contact_regexes = [preprocess_regex(regex) for regex in billing_contact_regexes] - + for segment in data: for regex in registrant_regexes: match = re.search(regex, segment) if match is not None: registrant = match.groupdict() break - + for segment in data: for regex in tech_contact_regexes: match = re.search(regex, segment) if match is not None: tech_contact = match.groupdict() break - + for segment in data: for regex in admin_contact_regexes: match = re.search(regex, segment) if match is not None: admin_contact = match.groupdict() break - + for segment in data: for regex in billing_contact_regexes: match = re.search(regex, segment) if match is not None: billing_contact = match.groupdict() break - + # Find NIC handle contact definitions handle_contacts = [] for regex in nic_contact_regexes: @@ -599,7 +611,7 @@ def parse_registrants(data): matches = re.finditer(regex, segment) for match in matches: handle_contacts.append(match.groupdict()) - + # Find NIC handle references and process them for category in nic_contact_references: for regex in nic_contact_references[category]: @@ -622,11 +634,11 @@ def parse_registrants(data): elif category == "admin": admin_contact = data_reference break - - # Post-processing + + # Post-processing for obj in (registrant, tech_contact, billing_contact, admin_contact): if obj is not None: - for key in obj.keys(): + for key in list(obj.keys()): if obj[key] is None or obj[key].strip() == "": # Just chomp all surrounding whitespace del obj[key] else: @@ -642,7 +654,7 @@ def parse_registrants(data): try: street_items.append(obj["street%d" % i]) del obj["street%d" % i] - except KeyError, e: + except KeyError as e: break i += 1 obj["street"] = "\n".join(street_items) @@ -663,7 +675,7 @@ def parse_registrants(data): if 'lastname' in obj: elements.append(obj["lastname"]) obj["name"] = " ".join(elements) - + return { "registrant": registrant, "tech": tech_contact, diff --git a/setup.py b/setup.py index b9637bf..d9b46bf 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,13 @@ from setuptools import setup setup(name='pythonwhois', - version='2.0.5', + version='2.1.0', description='Module for retrieving and parsing the WHOIS data for a domain. Supports most domains. No dependencies.', author='Sven Slootweg', author_email='pythonwhois@cryto.net', url='http://cryto.net/pythonwhois', packages=['pythonwhois'], + install_requires=['argparse'], provides=['pythonwhois'], scripts=["pwhois"], license="WTFPL" diff --git a/test.py b/test.py index 78dc30b..f478747 100755 --- a/test.py +++ b/test.py @@ -1,6 +1,27 @@ #!/usr/bin/env python2 -import sys, argparse, os, pythonwhois, json, datetime +import sys, argparse, os, pythonwhois, json, datetime, codecs +import pkgutil +import encodings + + +def get_codecs(): + """Dynamically get list of codecs in python.""" + false_positives = set(["aliases"]) + found = set(name for imp, name, ispkg in pkgutil.iter_modules(encodings.__path__) if not ispkg) + found.difference_update(false_positives) + return found + + +def read_encoded_file(file_path): + """Try reading file using all codecs. Return the first succesfull one.""" + for encoding in get_codecs(): + try: + with codecs.open(file_path, "r", encoding) as f: + return f.read() + except Exception: + pass + parser = argparse.ArgumentParser(description="Runs or modifies the test suite for python-whois.") parser.add_argument("mode", nargs=1, choices=["run", "update"], default="run", help="Whether to run or update the tests. Only update if you know what you're doing!") @@ -14,7 +35,7 @@ ENDC = '\033[0m' def encoded_json_dumps(obj): try: return json.dumps(obj, default=json_fallback) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: return json.dumps(recursive_encode(obj, "latin-1"), default=json_fallback) def json_fallback(obj): @@ -24,7 +45,7 @@ def json_fallback(obj): return obj def recursive_encode(obj, encoding): - for key in obj.keys(): + for key in list(obj.keys()): if isinstance(obj[key], dict): obj[key] = recursive_encode(obj[key], encoding) elif isinstance(obj[key], list): @@ -74,18 +95,26 @@ if args.mode[0] == "run": suites = [] for target in targets: try: - with open(os.path.join("test/data", target), "r") as f: + with codecs.open(os.path.join("test/data", target), "r") as f: data = f.read().split("\n--\n") - except IOError, e: + except IOError as e: sys.stderr.write("Invalid domain %(domain)s specified. No test case or base data exists.\n" % {"domain": target}) errors = True continue - try: - with open(os.path.join("test/target_default", target), "r") as f: + except UnicodeDecodeError: + try: + # Try cp1252 (ufpa.br uses that) + with codecs.open(os.path.join("test/data", target), "r", 'cp1252') as f: + data = f.read().split("\n--\n") + except UnicodeDecodeError as e: + # Fall back to trying all registered codecs + data = read_encoded_file(os.path.join("test/data", target)).split("\n--\n") + try: + with codecs.open(os.path.join("test/target_default", target), "r") as f: default = f.read() - with open(os.path.join("test/target_normalized", target), "r") as f: + with codecs.open(os.path.join("test/target_normalized", target), "r") as f: normalized = f.read() - except IOError, e: + except IOError as e: sys.stderr.write("Missing target data for domain %(domain)s. Run `./test.py update %(domain)s` to correct this, after verifying that pythonwhois can correctly parse this particular domain.\n" % {"domain": target}) errors = True continue @@ -152,10 +181,10 @@ elif args.mode[0] == "update": updates = [] for target in targets: try: - with open(os.path.join("test/data", target), "r") as f: + with codecs.open(os.path.join("test/data", target), "r") as f: data = f.read().split("\n--\n") updates.append((target, data)) - except IOError, e: + except IOError as e: sys.stderr.write("Invalid domain %(domain)s specified. No base data exists.\n" % {"domain": target}) errors = True continue @@ -166,8 +195,8 @@ elif args.mode[0] == "update": for target, data in updates: default = pythonwhois.parse.parse_raw_whois(data) normalized = pythonwhois.parse.parse_raw_whois(data, normalized=True) - with open(os.path.join("test/target_default", target), "w") as f: + with codecs.open(os.path.join("test/target_default", target), "w") as f: f.write(encoded_json_dumps(default)) - with open(os.path.join("test/target_normalized", target), "w") as f: - f.write(encoded_json_dumps(normalized)) - print "Generated target data for %s." % target + with codecs.open(os.path.join("test/target_normalized", target), "w") as f: + f.write(encoded_json_dumps(normalized)) + print("Generated target data for %s." % target) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..cd1c703 --- /dev/null +++ b/tox.ini @@ -0,0 +1,7 @@ +[tox] +envlist = py26,py27,py33 + +[testenv] +usedevelop = True +setenv = VIRTUAL_ENV={envdir} +commands = python test.py run all