From 71b2e3b1420c3188db7b15743cf3dd382d425a9a Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Thu, 21 Nov 2013 07:52:44 +0100
Subject: [PATCH] Data normalization

---
 pwhois               |  2 +-
 pythonwhois/parse.py | 64 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/pwhois b/pwhois
index 1f3c077..768300d 100755
--- a/pwhois
+++ b/pwhois
@@ -18,7 +18,7 @@ else:
     if args.raw == True:
         print "\n--\n".join(data)
     else:
-        parsed = pythonwhois.parse.parse_raw_whois(data)
+        parsed = pythonwhois.parse.parse_raw_whois(data, normalized=True)
         data_map = OrderedDict({})
 
         # This defines the fields shown in the output
diff --git a/pythonwhois/parse.py b/pythonwhois/parse.py
index 6ac0bff..02dccc2 100644
--- a/pythonwhois/parse.py
+++ b/pythonwhois/parse.py
@@ -116,7 +116,7 @@ grammar = {
     }
 }
 
-def parse_raw_whois(raw_data):
+def parse_raw_whois(raw_data, normalized=None):
     data = {}
 
     raw_data = [segment.replace("\r", "") for segment in raw_data] # Carriage returns are the devil
@@ -178,6 +178,68 @@ def parse_raw_whois(raw_data):
 
     data["raw"] = raw_data
 
+    if normalized:
+        data = normalize_data(data, normalized)
+
+    return data
+
+def _lowercase(value):
+    """Lowercase a string, or every string in a list."""
+    if isinstance(value, basestring):
+        return value.lower()
+    return [item.lower() for item in value]
+
+def _capitalize_shouting(text):
+    """Capitalize each word of an all-uppercase string.
+
+    Mixed-case and lowercase strings are returned unchanged.
+    """
+    if not text.isupper():
+        return text
+    return " ".join(word.capitalize() for word in text.split(" "))
+
+def _fix_case(value):
+    """Apply _capitalize_shouting to a string, or to each item of a list.
+
+    List order is preserved.
+    """
+    if isinstance(value, basestring):
+        return _capitalize_shouting(value)
+    return [_capitalize_shouting(item) for item in value]
+
+def _normalize_fields(record, keys, transform, normalized):
+    """Rewrite record[key] in place for each selected key.
+
+    A key is skipped when it is missing from the record, when its value is
+    None, or when `normalized` is neither True nor a container holding it.
+    """
+    for key in keys:
+        if record.get(key) is None:
+            continue
+        if normalized == True or key in normalized:
+            record[key] = transform(record[key])
+
+def normalize_data(data, normalized):
+    """Normalize the letter case of selected fields in parsed WHOIS data.
+
+    `normalized` is either True (normalize every supported field) or a
+    container of field names to normalize. The dictionary is modified in
+    place and also returned.
+
+    name_servers, emails, whois_server and each contact's email are
+    lowercased. registrar, status and each contact's name, street, city and
+    state get word-by-word capitalization, but only when the value is
+    entirely uppercase.
+    """
+    _normalize_fields(data, ("name_servers", "emails", "whois_server"), _lowercase, normalized)
+    _normalize_fields(data, ("registrar", "status"), _fix_case, normalized)
+
+    # Contact fields are selected by their own key names ("email", "name", ...).
+    for contact in (data.get("contacts") or {}).values():
+        if contact is not None:
+            _normalize_fields(contact, ("email",), _lowercase, normalized)
+            _normalize_fields(contact, ("name", "street", "city", "state"), _fix_case, normalized)
+
     return data
 
 def parse_dates(dates):