Improved normalization

master
Sven Slootweg 11 years ago
parent e8b7742387
commit 05d148fa7f

@ -244,15 +244,13 @@ def normalize_data(data, normalized):
else:
contact[key] = [item.lower() for item in contact[key]]
for key in ("state", "country", "organization"):
if key in contact and contact[key] is not None and (normalized == True or key in normalized) and contact[key].isupper():
if len(contact[key]) > 2: # Two letter values are usually abbreviations and need to be in uppercase
contact[key] = " ".join(word.capitalize() for word in contact[key].strip(", ").split(" "))
for key in ("name", "street"):
if key in contact and contact[key] is not None and (normalized == True or key in normalized):
contact[key] = normalize_name(contact[key], abbreviation_threshold=3)
for key in ("name", "street", "city"):
if key in contact and contact[key] is not None and (normalized == True or key in normalized) and (contact[key].islower() or contact[key].isupper()):
if len(contact[key]) > 2: # Two letter values are usually abbreviations and need to be in original case
contact[key] = " ".join(word.capitalize() for word in contact[key].strip(", ").split(" "))
for key in ("city", "organization", "state", "country"):
if key in contact and contact[key] is not None and (normalized == True or key in normalized):
contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3)
for key in contact.keys():
try:
@ -261,6 +259,40 @@ def normalize_data(data, normalized):
pass # Not a string
return data
def normalize_name(value, abbreviation_threshold=4, length_threshold=8):
    """Repair the capitalization of single-case lines in a string.

    ``value`` is processed line by line (split on newlines). Leading and
    trailing commas are stripped from every line. A line whose cased
    characters are all uppercase or all lowercase, and which is at least
    ``length_threshold`` characters long, is treated as improperly
    capitalized: it is split on whitespace, each word is normalized, and
    the words are re-joined with single spaces. Every other line is kept
    unchanged apart from the comma stripping.

    Within a line that is being normalized, a word is capitalized (first
    character upper, the rest lower) only when it is at least
    ``abbreviation_threshold`` characters long and contains no ".";
    shorter words and words containing a dot are kept verbatim.

    Args:
        value: The string to normalize; may contain several lines.
        abbreviation_threshold: Minimum word length for a word to be
            capitalized rather than left alone.
        length_threshold: Minimum line length for a single-case line to
            be normalized at all.

    Returns:
        The processed lines joined back together with newlines.
    """

    def normalize_word(word):
        # The same rule applies to every word regardless of its position
        # in the line.
        if len(word) >= abbreviation_threshold and "." not in word:
            return word.capitalize()
        # Probably an abbreviation or domain, leave it alone
        return word

    normalized_lines = []
    for line in value.split("\n"):
        line = line.strip(",")  # Get rid of useless commas
        if (line.isupper() or line.islower()) and len(line) >= length_threshold:
            # This line is likely not capitalized properly
            line = " ".join(normalize_word(word) for word in line.split())
        normalized_lines.append(line)
    return "\n".join(normalized_lines)
def parse_dates(dates):
global grammar
parsed_dates = []

File diff suppressed because one or more lines are too long

@ -1 +1 @@
{"status": ["OK"], "updated_date": ["2013-04-30T15:06:57"], "contacts": {"admin": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/a", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}, "tech": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/a", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}, "registrant": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/a", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}, "billing": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/a", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}}, "expiration_date": ["2020-01-01T23:59:59"], "emails": [], "raw": ["This whois service is provided by CentralNic Ltd and only contains\ninformation pertaining to Internet domain names we have registered for\nour customers. By using this service you are agreeing (1) not to use any\ninformation presented here for any purpose other than determining\nownership of domain names, (2) not to store or reproduce this data in \nany way, (3) not to use any high-volume, automated, electronic processes\nto obtain data from this service. Abuse of this service is monitored and\nactions in contravention of these terms will result in being permanently\nblacklisted. 
All data is (c) CentralNic Ltd https://www.centralnic.com/\n\nDomain ID:CNIC-DO949898\nDomain Name:NIC.PW\nCreated On:2012-10-12T10:19:46.0Z\nLast Updated On:2013-04-30T15:06:57.0Z\nExpiration Date:2020-01-01T23:59:59.0Z\nStatus:OK\nRegistrant ID:H2661317\nRegistrant Name:Registry Manager\nRegistrant Organization:.PW Registry\nRegistrant Street1:N/A\nRegistrant City:N/A\nRegistrant State/Province:N/A\nRegistrant Postal Code:N/A\nRegistrant Country:PW\nRegistrant Phone:\nRegistrant Email:contact@registry.pw\nAdmin ID:H2661317\nAdmin Name:Registry Manager\nAdmin Organization:.PW Registry\nAdmin Street1:N/A\nAdmin City:N/A\nAdmin State/Province:N/A\nAdmin Postal Code:N/A\nAdmin Country:PW\nAdmin Phone:\nAdmin Email:contact@registry.pw\nTech ID:H2661317\nTech Name:Registry Manager\nTech Organization:.PW Registry\nTech Street1:N/A\nTech City:N/A\nTech State/Province:N/A\nTech Postal Code:N/A\nTech Country:PW\nTech Phone:\nTech Email:contact@registry.pw\nBilling ID:H2661317\nBilling Name:Registry Manager\nBilling Organization:.PW Registry\nBilling Street1:N/A\nBilling City:N/A\nBilling State/Province:N/A\nBilling Postal Code:N/A\nBilling Country:PW\nBilling Phone:\nBilling Email:contact@registry.pw\nSponsoring Registrar ID:H2661317\nSponsoring Registrar Organization:.PW Registry\nSponsoring Registrar Street1:N/A\nSponsoring Registrar City:N/A\nSponsoring Registrar State/Province:N/A\nSponsoring Registrar Postal Code:N/A\nSponsoring Registrar Country:PW\nSponsoring Registrar Phone:N/A\nSponsoring Registrar Website:http://www.registry.pw\nName Server:NS0.CENTRALNIC-DNS.COM\nName Server:NS1.CENTRALNIC-DNS.COM\nName Server:NS2.CENTRALNIC-DNS.COM\nName Server:NS3.CENTRALNIC-DNS.COM\nName Server:NS4.CENTRALNIC-DNS.COM\nName Server:NS5.CENTRALNIC-DNS.COM\nDNSSEC:Unsigned\n\n\n\n"], "whois_server": null, "registrar": [".PW Registry"], "name_servers": ["ns0.centralnic-dns.com", "ns1.centralnic-dns.com", "ns2.centralnic-dns.com", "ns3.centralnic-dns.com", "ns4.centralnic-dns.com", 
"ns5.centralnic-dns.com"], "creation_date": ["2012-10-12T10:19:46", "2012-10-12T10:19:46", "2012-10-12T10:19:46"], "id": ["CNIC-DO949898"]}
{"status": ["OK"], "updated_date": ["2013-04-30T15:06:57"], "contacts": {"admin": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/A", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}, "tech": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/A", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}, "registrant": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/A", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}, "billing": {"city": "N/a", "handle": "H2661317", "name": "Registry Manager", "state": "N/a", "street": "N/A", "country": "PW", "postalcode": "N/A", "organization": ".PW Registry", "email": "contact@registry.pw"}}, "expiration_date": ["2020-01-01T23:59:59"], "emails": [], "raw": ["This whois service is provided by CentralNic Ltd and only contains\ninformation pertaining to Internet domain names we have registered for\nour customers. By using this service you are agreeing (1) not to use any\ninformation presented here for any purpose other than determining\nownership of domain names, (2) not to store or reproduce this data in \nany way, (3) not to use any high-volume, automated, electronic processes\nto obtain data from this service. Abuse of this service is monitored and\nactions in contravention of these terms will result in being permanently\nblacklisted. 
All data is (c) CentralNic Ltd https://www.centralnic.com/\n\nDomain ID:CNIC-DO949898\nDomain Name:NIC.PW\nCreated On:2012-10-12T10:19:46.0Z\nLast Updated On:2013-04-30T15:06:57.0Z\nExpiration Date:2020-01-01T23:59:59.0Z\nStatus:OK\nRegistrant ID:H2661317\nRegistrant Name:Registry Manager\nRegistrant Organization:.PW Registry\nRegistrant Street1:N/A\nRegistrant City:N/A\nRegistrant State/Province:N/A\nRegistrant Postal Code:N/A\nRegistrant Country:PW\nRegistrant Phone:\nRegistrant Email:contact@registry.pw\nAdmin ID:H2661317\nAdmin Name:Registry Manager\nAdmin Organization:.PW Registry\nAdmin Street1:N/A\nAdmin City:N/A\nAdmin State/Province:N/A\nAdmin Postal Code:N/A\nAdmin Country:PW\nAdmin Phone:\nAdmin Email:contact@registry.pw\nTech ID:H2661317\nTech Name:Registry Manager\nTech Organization:.PW Registry\nTech Street1:N/A\nTech City:N/A\nTech State/Province:N/A\nTech Postal Code:N/A\nTech Country:PW\nTech Phone:\nTech Email:contact@registry.pw\nBilling ID:H2661317\nBilling Name:Registry Manager\nBilling Organization:.PW Registry\nBilling Street1:N/A\nBilling City:N/A\nBilling State/Province:N/A\nBilling Postal Code:N/A\nBilling Country:PW\nBilling Phone:\nBilling Email:contact@registry.pw\nSponsoring Registrar ID:H2661317\nSponsoring Registrar Organization:.PW Registry\nSponsoring Registrar Street1:N/A\nSponsoring Registrar City:N/A\nSponsoring Registrar State/Province:N/A\nSponsoring Registrar Postal Code:N/A\nSponsoring Registrar Country:PW\nSponsoring Registrar Phone:N/A\nSponsoring Registrar Website:http://www.registry.pw\nName Server:NS0.CENTRALNIC-DNS.COM\nName Server:NS1.CENTRALNIC-DNS.COM\nName Server:NS2.CENTRALNIC-DNS.COM\nName Server:NS3.CENTRALNIC-DNS.COM\nName Server:NS4.CENTRALNIC-DNS.COM\nName Server:NS5.CENTRALNIC-DNS.COM\nDNSSEC:Unsigned\n\n\n\n"], "whois_server": null, "registrar": [".PW Registry"], "name_servers": ["ns0.centralnic-dns.com", "ns1.centralnic-dns.com", "ns2.centralnic-dns.com", "ns3.centralnic-dns.com", "ns4.centralnic-dns.com", 
"ns5.centralnic-dns.com"], "creation_date": ["2012-10-12T10:19:46", "2012-10-12T10:19:46", "2012-10-12T10:19:46"], "id": ["CNIC-DO949898"]}

@ -1 +1 @@
{"status": ["Active", "ok"], "updated_date": ["2006-10-11T00:00:00", "2013-10-28T00:00:00", "2009-04-03T00:00:00"], "contacts": {"admin": {"fax": "+33 3 20 20 09 58", "handle": "OK62-FRNIC", "phone": "+33 3 20 20 09 57", "street": "Sarl Ovh\n140, quai du Sartel", "postalcode": "59100", "city": "Roubaix", "name": "Octave Klaba", "country": "FR", "type": "PERSON", "changedate": "2009-04-03T00:00:00"}, "tech": {"handle": "OVH5-FRNIC", "phone": "+33 8 99 70 17 61", "street": "OVH\n140, quai du Sartel", "postalcode": "59100", "city": "Roubaix", "name": "Ovh Net", "country": "FR", "type": "ROLE", "email": "tech@ovh.net", "changedate": "2006-10-11T00:00:00"}, "registrant": {"fax": "+33 3 20 20 09 58", "handle": "SO255-FRNIC", "phone": "+33 8 99 70 17 61", "street": "140, Quai Du Sartel", "postalcode": "59100", "city": "Roubaix", "name": "Ovh Sas", "country": "FR", "type": "ORGANIZATION", "email": "oles@ovh.net", "changedate": "2013-10-28T00:00:00"}, "billing": null}, "expiration_date": null, "id": null, "creation_date": ["1999-11-12T00:00:00", "1999-10-21T00:00:00"], "raw": ["%%\n%% This is the AFNIC Whois server.\n%%\n%% complete date format : DD/MM/YYYY\n%% short date format : DD/MM\n%% version : FRNIC-2.5\n%%\n%% Rights restricted by copyright.\n%% See http://www.afnic.fr/afnic/web/mentions-legales-whois_en\n%%\n%% Use '-h' option to obtain more information about this service.\n%%\n%% [77.162.55.23 REQUEST] >> -V Md5.0 ovh.fr\n%%\n%% RL Net [##########] - RL IP [#########.]\n%%\n\ndomain: ovh.fr\nstatus: ACTIVE\nhold: NO\nholder-c: SO255-FRNIC\nadmin-c: OK62-FRNIC\ntech-c: OVH5-FRNIC\nzone-c: NFC1-FRNIC\nnsl-id: NSL16790-FRNIC\nregistrar: OVH\nanniversary: 12/11\ncreated: 12/11/1999\nlast-update: 03/04/2009\nsource: FRNIC\n\nns-list: NSL16790-FRNIC\nnserver: dns.ovh.net\nnserver: dns10.ovh.net\nnserver: ns.ovh.net\nnserver: ns10.ovh.net\nsource: FRNIC\n\nregistrar: OVH\ntype: Isp Option 1\naddress: 2 Rue Kellermann\naddress: ROUBAIX\ncountry: FR\nphone: +33 8 99 70 17 
61\nfax-no: +33 3 20 20 09 58\ne-mail: support@ovh.net\nwebsite: http://www.ovh.com\nanonymous: NO\nregistered: 21/10/1999\nsource: FRNIC\n\nnic-hdl: OVH5-FRNIC\ntype: ROLE\ncontact: OVH NET\naddress: OVH\naddress: 140, quai du Sartel\naddress: 59100 Roubaix\ncountry: FR\nphone: +33 8 99 70 17 61\ne-mail: tech@ovh.net\ntrouble: Information: http://www.ovh.fr\ntrouble: Questions: mailto:tech@ovh.net\ntrouble: Spam: mailto:abuse@ovh.net\nadmin-c: OK217-FRNIC\ntech-c: OK217-FRNIC\nnotify: tech@ovh.net\nregistrar: OVH\nchanged: 11/10/2006 tech@ovh.net\nanonymous: NO\nobsoleted: NO\nsource: FRNIC\n\nnic-hdl: SO255-FRNIC\ntype: ORGANIZATION\ncontact: OVH SAS\naddress: 140, quai du sartel\naddress: 59100 Roubaix\ncountry: FR\nphone: +33 8 99 70 17 61\nfax-no: +33 3 20 20 09 58\ne-mail: oles@ovh.net\nregistrar: OVH\nchanged: 28/10/2013 nic@nic.fr\nanonymous: NO\nobsoleted: NO\neligstatus: ok\neligdate: 01/09/2011 12:03:35\nsource: FRNIC\n\nnic-hdl: OK62-FRNIC\ntype: PERSON\ncontact: Octave Klaba\naddress: Sarl Ovh\naddress: 140, quai du Sartel\naddress: 59100 Roubaix\ncountry: FR\nphone: +33 3 20 20 09 57\nfax-no: +33 3 20 20 09 58\nregistrar: OVH\nchanged: 03/04/2009 nic@nic.fr\nanonymous: NO\nobsoleted: NO\nsource: FRNIC\n"], "whois_server": null, "registrar": ["OVH"], "name_servers": ["dns.ovh.net", "dns10.ovh.net", "ns.ovh.net", "ns10.ovh.net"], "emails": ["support@ovh.net", "abuse@ovh.net", "nic@nic.fr"]}
{"status": ["Active", "ok"], "updated_date": ["2006-10-11T00:00:00", "2013-10-28T00:00:00", "2009-04-03T00:00:00"], "contacts": {"admin": {"fax": "+33 3 20 20 09 58", "handle": "OK62-FRNIC", "phone": "+33 3 20 20 09 57", "street": "Sarl Ovh\n140, quai du Sartel", "postalcode": "59100", "city": "Roubaix", "name": "Octave Klaba", "country": "FR", "type": "PERSON", "changedate": "2009-04-03T00:00:00"}, "tech": {"handle": "OVH5-FRNIC", "phone": "+33 8 99 70 17 61", "street": "OVH\n140, quai du Sartel", "postalcode": "59100", "city": "Roubaix", "name": "OVH NET", "country": "FR", "type": "ROLE", "email": "tech@ovh.net", "changedate": "2006-10-11T00:00:00"}, "registrant": {"fax": "+33 3 20 20 09 58", "handle": "SO255-FRNIC", "phone": "+33 8 99 70 17 61", "street": "140, Quai du Sartel", "postalcode": "59100", "city": "Roubaix", "name": "OVH SAS", "country": "FR", "type": "ORGANIZATION", "email": "oles@ovh.net", "changedate": "2013-10-28T00:00:00"}, "billing": null}, "expiration_date": null, "id": null, "creation_date": ["1999-11-12T00:00:00", "1999-10-21T00:00:00"], "raw": ["%%\n%% This is the AFNIC Whois server.\n%%\n%% complete date format : DD/MM/YYYY\n%% short date format : DD/MM\n%% version : FRNIC-2.5\n%%\n%% Rights restricted by copyright.\n%% See http://www.afnic.fr/afnic/web/mentions-legales-whois_en\n%%\n%% Use '-h' option to obtain more information about this service.\n%%\n%% [77.162.55.23 REQUEST] >> -V Md5.0 ovh.fr\n%%\n%% RL Net [##########] - RL IP [#########.]\n%%\n\ndomain: ovh.fr\nstatus: ACTIVE\nhold: NO\nholder-c: SO255-FRNIC\nadmin-c: OK62-FRNIC\ntech-c: OVH5-FRNIC\nzone-c: NFC1-FRNIC\nnsl-id: NSL16790-FRNIC\nregistrar: OVH\nanniversary: 12/11\ncreated: 12/11/1999\nlast-update: 03/04/2009\nsource: FRNIC\n\nns-list: NSL16790-FRNIC\nnserver: dns.ovh.net\nnserver: dns10.ovh.net\nnserver: ns.ovh.net\nnserver: ns10.ovh.net\nsource: FRNIC\n\nregistrar: OVH\ntype: Isp Option 1\naddress: 2 Rue Kellermann\naddress: ROUBAIX\ncountry: FR\nphone: +33 8 99 70 17 
61\nfax-no: +33 3 20 20 09 58\ne-mail: support@ovh.net\nwebsite: http://www.ovh.com\nanonymous: NO\nregistered: 21/10/1999\nsource: FRNIC\n\nnic-hdl: OVH5-FRNIC\ntype: ROLE\ncontact: OVH NET\naddress: OVH\naddress: 140, quai du Sartel\naddress: 59100 Roubaix\ncountry: FR\nphone: +33 8 99 70 17 61\ne-mail: tech@ovh.net\ntrouble: Information: http://www.ovh.fr\ntrouble: Questions: mailto:tech@ovh.net\ntrouble: Spam: mailto:abuse@ovh.net\nadmin-c: OK217-FRNIC\ntech-c: OK217-FRNIC\nnotify: tech@ovh.net\nregistrar: OVH\nchanged: 11/10/2006 tech@ovh.net\nanonymous: NO\nobsoleted: NO\nsource: FRNIC\n\nnic-hdl: SO255-FRNIC\ntype: ORGANIZATION\ncontact: OVH SAS\naddress: 140, quai du sartel\naddress: 59100 Roubaix\ncountry: FR\nphone: +33 8 99 70 17 61\nfax-no: +33 3 20 20 09 58\ne-mail: oles@ovh.net\nregistrar: OVH\nchanged: 28/10/2013 nic@nic.fr\nanonymous: NO\nobsoleted: NO\neligstatus: ok\neligdate: 01/09/2011 12:03:35\nsource: FRNIC\n\nnic-hdl: OK62-FRNIC\ntype: PERSON\ncontact: Octave Klaba\naddress: Sarl Ovh\naddress: 140, quai du Sartel\naddress: 59100 Roubaix\ncountry: FR\nphone: +33 3 20 20 09 57\nfax-no: +33 3 20 20 09 58\nregistrar: OVH\nchanged: 03/04/2009 nic@nic.fr\nanonymous: NO\nobsoleted: NO\nsource: FRNIC\n"], "whois_server": null, "registrar": ["OVH"], "name_servers": ["dns.ovh.net", "dns10.ovh.net", "ns.ovh.net", "ns10.ovh.net"], "emails": ["support@ovh.net", "abuse@ovh.net", "nic@nic.fr"]}
Loading…
Cancel
Save