@ -1,4 +1,5 @@
import re , datetime
from __future__ import print_function
import re , sys , datetime
grammar = {
" _data " : {
@ -136,26 +137,37 @@ grammar = {
}
}
# Pick the text base type once at import time: Python 2 has ``basestring``
# (covering both ``str`` and ``unicode``), Python 3 only has ``str``.
if sys.version_info < (3, 0):
    _STRING_TYPES = basestring  # noqa: F821 - only evaluated on Python 2
else:
    _STRING_TYPES = str


def is_string(data):
    """Return True when ``data`` is a text string on the running Python version."""
    return isinstance(data, _STRING_TYPES)
def parse_raw_whois ( raw_data , normalized = [ ] ) :
data = { }
raw_data = [ segment . replace ( " \r " , " " ) for segment in raw_data ] # Carriage returns are the devil
for segment in raw_data :
for rule_key , rule_regexes in grammar [ ' _data ' ] . iteritems ( ) :
if data . has_key ( rule_key ) == False :
for rule_key , rule_regexes in grammar [ ' _data ' ] . ite ms( ) :
if ( rule_key in data ) == False :
for line in segment . splitlines ( ) :
for regex in rule_regexes :
result = re . search ( regex , line , re . IGNORECASE )
if result is not None :
val = result . group ( " val " ) . strip ( )
if val != " " :
try :
data [ rule_key ] . append ( val )
except KeyError , e :
except KeyError as e :
data [ rule_key ] = [ val ]
# Whois.com is a bit special... Fabulous.com also seems to use this format.
match = re . search ( " Name Servers:([/s/S]+) \n \n " , segment )
if match is not None :
@ -163,7 +175,7 @@ def parse_raw_whois(raw_data, normalized=[]):
for match in re . findall ( " [ ]+(.+) \n " , chunk ) :
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# Nominet also needs some special attention
match = re . search ( " Registrar: \n (.+) \n " , segment )
@ -176,7 +188,7 @@ def parse_raw_whois(raw_data, normalized=[]):
match = match . split ( ) [ 0 ]
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# .am plays the same game
match = re . search ( " DNS servers:([ \ s \ S]*? \n ) \n " , segment )
@ -186,7 +198,7 @@ def parse_raw_whois(raw_data, normalized=[]):
match = match . split ( ) [ 0 ]
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# SIDN isn't very standard either.
match = re . search ( " Registrar: \n \ s+( \ S.*) " , segment )
@ -199,109 +211,109 @@ def parse_raw_whois(raw_data, normalized=[]):
match = match . split ( ) [ 0 ]
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# The .ie WHOIS server puts ambiguous status information in an unhelpful order
match = re . search ( ' ren-status: \ s*(.+) ' , segment )
if match is not None :
data [ " status " ] . insert ( 0 , match . group ( 1 ) . strip ( ) )
data [ " contacts " ] = parse_registrants ( raw_data )
# Parse dates
try :
data [ ' expiration_date ' ] = remove_duplicates ( data [ ' expiration_date ' ] )
data [ ' expiration_date ' ] = parse_dates ( data [ ' expiration_date ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' creation_date ' ] = remove_duplicates ( data [ ' creation_date ' ] )
data [ ' creation_date ' ] = parse_dates ( data [ ' creation_date ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' updated_date ' ] = remove_duplicates ( data [ ' updated_date ' ] )
data [ ' updated_date ' ] = parse_dates ( data [ ' updated_date ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' nameservers ' ] = remove_duplicates ( [ ns . rstrip ( " . " ) for ns in data [ ' nameservers ' ] ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' emails ' ] = remove_duplicates ( data [ ' emails ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' registrar ' ] = remove_duplicates ( data [ ' registrar ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
# Remove e-mail addresses if they are already listed for any of the contacts
known_emails = [ ]
for contact in ( " registrant " , " tech " , " admin " , " billing " ) :
if data [ " contacts " ] [ contact ] is not None :
try :
known_emails . append ( data [ " contacts " ] [ contact ] [ " email " ] )
except KeyError , e :
except KeyError as e :
pass # No e-mail recorded for this contact...
try :
data [ ' emails ' ] = [ email for email in data [ " emails " ] if email not in known_emails ]
except KeyError , e :
except KeyError as e :
pass # Not present
for key in data . keys ( ) :
for key in list ( data . keys ( ) ) :
if data [ key ] is None or len ( data [ key ] ) == 0 :
del data [ key ]
data [ " raw " ] = raw_data
if normalized != [ ] :
data = normalize_data ( data , normalized )
return data
def normalize_data(data, normalized):
    """Normalize selected fields of a parsed WHOIS result, in place.

    Args:
        data: Parsed WHOIS dictionary. Must contain a ``contacts`` mapping of
            contact type to a contact dict (or ``None``).
        normalized: Either ``True`` to normalize every supported field, or a
            container of field names to normalize.

    Returns:
        The same ``data`` object, after modification.

    Raises:
        KeyError: If ``data`` has no ``contacts`` entry.
    """
    def _selected(field):
        # ``== True`` (not ``is True``) is kept on purpose so that callers
        # passing other truthy-equal values keep their existing behavior.
        return normalized == True or field in normalized  # noqa: E712

    # Fields that are simply lowercased (single string or list of strings).
    for key in ("nameservers", "emails", "whois_server"):
        if key in data and data[key] is not None and _selected(key):
            if is_string(data[key]):
                data[key] = data[key].lower()
            else:
                data[key] = [item.lower() for item in data[key]]

    # Fields run through normalize_name (defined elsewhere in this module)
    # with a per-field abbreviation threshold.
    for key, threshold in (("registrar", 4), ("status", 3)):
        if key in data and data[key] is not None and _selected(key):
            if is_string(data[key]):
                data[key] = normalize_name(
                    data[key], abbreviation_threshold=threshold, length_threshold=1)
            else:
                data[key] = [
                    normalize_name(
                        item, abbreviation_threshold=threshold, length_threshold=1)
                    for item in data[key]
                ]

    for contact_type, contact in data['contacts'].items():
        if contact is None:
            continue

        for key in ("email",):
            if key in contact and contact[key] is not None and _selected(key):
                # Use the version-aware helper here as well: a plain ``str``
                # check would treat a Python 2 ``unicode`` value as a list and
                # split it into single characters.
                if is_string(contact[key]):
                    contact[key] = contact[key].lower()
                else:
                    contact[key] = [item.lower() for item in contact[key]]

        for key in ("name", "street"):
            if key in contact and contact[key] is not None and _selected(key):
                contact[key] = normalize_name(contact[key], abbreviation_threshold=3)

        for key in ("city", "organization", "state", "country"):
            if key in contact and contact[key] is not None and _selected(key):
                contact[key] = normalize_name(
                    contact[key], abbreviation_threshold=3, length_threshold=3)

        # Trim stray separators from every string value, regardless of which
        # fields were selected for normalization.
        for key in list(contact.keys()):
            try:
                contact[key] = contact[key].strip(", ")
            except AttributeError:
                pass  # Not a string

    return data
@ -348,57 +360,57 @@ def normalize_name(value, abbreviation_threshold=4, length_threshold=8, lowercas
def parse_dates ( dates ) :
global grammar
parsed_dates = [ ]
for date in dates :
for rule in grammar [ ' _dateformats ' ] :
result = re . match ( rule , date , re . IGNORECASE )
if result is not None :
try :
# These are always numeric. If they fail, there is no valid date present.
year = int ( result . group ( " year " ) )
day = int ( result . group ( " day " ) )
# Detect and correct shorthand year notation
if year < 60 :
year + = 2000
elif year < 100 :
year + = 1900
# This will require some more guesswork - some WHOIS servers present the name of the month
try :
month = int ( result . group ( " month " ) )
except ValueError , e :
except ValueError as e :
# Apparently not a number. Look up the corresponding number.
try :
month = grammar [ ' _months ' ] [ result . group ( " month " ) . lower ( ) ]
except KeyError , e :
except KeyError as e :
# Unknown month name, default to 0
month = 0
try :
hour = int ( result . group ( " hour " ) )
except IndexError , e :
except IndexError as e :
hour = 0
except TypeError , e :
except TypeError as e :
hour = 0
try :
minute = int ( result . group ( " minute " ) )
except IndexError , e :
except IndexError as e :
minute = 0
except TypeError , e :
except TypeError as e :
minute = 0
try :
second = int ( result . group ( " second " ) )
except IndexError , e :
except IndexError as e :
second = 0
except TypeError , e :
except TypeError as e :
second = 0
break
except ValueError , e :
except ValueError as e :
# Something went horribly wrong, maybe there is no valid date present?
year = 0
month = 0
@ -406,18 +418,18 @@ def parse_dates(dates):
hour = 0
minute = 0
second = 0
print e . message
print ( e . message )
try :
if year > 0 :
try :
parsed_dates . append ( datetime . datetime ( year , month , day , hour , minute , second ) )
except ValueError , e :
except ValueError as e :
# We might have gotten the day and month the wrong way around, let's try it the other way around
# If you're not using an ISO-standard date format, you're an evil registrar!
parsed_dates . append ( datetime . datetime ( year , day , month , hour , minute , second ) )
except UnboundLocalError , e :
except UnboundLocalError as e :
pass
if len ( parsed_dates ) > 0 :
return parsed_dates
else :
@ -425,11 +437,11 @@ def parse_dates(dates):
def remove_duplicates(data):
    """Return a new list with the entries of ``data``, keeping first occurrences only.

    Order is preserved. Entries are compared by equality, so unhashable
    entries are supported.
    """
    unique = []
    for candidate in data:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique
def preprocess_regex ( regex ) :
@ -440,7 +452,7 @@ def parse_registrants(data):
tech_contact = None
billing_contact = None
admin_contact = None
registrant_regexes = [
" Registrant:[ ]* \n (?P<organization>.*) \n (?P<name>.*) \n (?P<street>.*) \n (?P<city>.*), (?P<state>.*) (?P<postalcode>.*) \n (?P<country>.*) \n (?: Phone: (?P<phone>.*) \n )? Email: (?P<email>.*) \n " , # Corporate Domains, Inc.
" Registrant: \n (?P<name>.+) \n (?P<street1>.+) \n (?: (?P<street2>.*) \n )?(?: (?P<street3>.*) \n )? (?P<postalcode>.+), (?P<city>.+) \n (?P<country>.+) \n (?P<phone>.+) \n (?P<email>.+) \n \n " , # OVH
@ -492,7 +504,7 @@ def parse_registrants(data):
" Admin Contact Information :[ ]* \n [ ]+(?P<firstname>.*) \n [ ]+(?P<lastname>.*) \n [ ]+(?P<organization>.*) \n [ ]+(?P<email>.*) \n [ ]+(?P<street>.*) \n [ ]+(?P<city>.*) \n [ ]+(?P<postalcode>.*) \n [ ]+(?P<phone>.*) \n [ ]+(?P<fax>.*) \n \n " , # GAL Communication
" Technical contact: \n (?P<name>.+) \n (?P<organization>.*) \n (?P<street>.+) \n (?P<city>.+) (?P<state> \ S+),[ ]+(?P<postalcode>.+) \n (?P<country>.+) \n (?P<email>.+) \n (?P<phone>.*) \n (?P<fax>.*) " , # .am
]
admin_contact_regexes = [
" Administrative Contact:[ ]* \n (?P<organization>.*) \n (?P<name>.*) \n (?P<street>.*) \n (?P<city>.*), (?P<state>.*) (?P<postalcode>.*) \n (?P<country>.*) \n (?: Phone: (?P<phone>.*) \n )? Email: (?P<email>.*) \n " , # Corporate Domains, Inc.
" Administrative Contact: \n (?P<name>.+) \n (?P<street1>.+) \n (?: (?P<street2>.*) \n )?(?: (?P<street3>.*) \n )? (?P<postalcode>.+), (?P<city>.+) \n (?P<country>.+) \n (?P<phone>.+) \n (?P<email>.+) \n \n " , # OVH
@ -511,7 +523,7 @@ def parse_registrants(data):
" Tech Contact Information :[ ]* \n [ ]+(?P<firstname>.*) \n [ ]+(?P<lastname>.*) \n [ ]+(?P<organization>.*) \n [ ]+(?P<email>.*) \n [ ]+(?P<street>.*) \n [ ]+(?P<city>.*) \n [ ]+(?P<postalcode>.*) \n [ ]+(?P<phone>.*) \n [ ]+(?P<fax>.*) \n \n " , # GAL Communication
" Administrative contact: \n (?P<name>.+) \n (?P<organization>.*) \n (?P<street>.+) \n (?P<city>.+) (?P<state> \ S+),[ ]+(?P<postalcode>.+) \n (?P<country>.+) \n (?P<email>.+) \n (?P<phone>.*) \n (?P<fax>.*) " , # .am
]
billing_contact_regexes = [
" Billing ID:(?P<handle>.+) \n Billing Name:(?P<name>.*) \n Billing Organization:(?P<organization>.*) \n Billing Street1:(?P<street1>.*) \n (?:Billing Street2:(?P<street2>.*) \n )?(?:Billing Street3:(?P<street3>.*) \n )?Billing City:(?P<city>.*) \n Billing State/Province:(?P<state>.*) \n Billing Postal Code:(?P<postalcode>.*) \n Billing Country:(?P<country>.*) \n Billing Phone:(?P<phone>.*) \n (?:Billing Phone Ext.:(?P<phone_ext>.*) \n )?(?:Billing FAX:(?P<fax>.*) \n )?(?:Billing FAX Ext.:(?P<fax_ext>.*) \n )?Billing Email:(?P<email>.*) " , # nic.pw
" Billing Contact ID: \ s*(?P<handle>.+) \n Billing Contact Name: \ s*(?P<name>.+) \n Billing Contact Organization: \ s*(?P<organization>.*) \n Billing Contact Address1: \ s*(?P<street1>.+) \n Billing Contact Address2: \ s*(?P<street2>.*) \n Billing Contact City: \ s*(?P<city>.+) \n Billing Contact State/Province: \ s*(?P<state>.+) \n Billing Contact Postal Code: \ s*(?P<postalcode>.+) \n Billing Contact Country: \ s*(?P<country>.+) \n Billing Contact Country Code: \ s*(?P<country_code>.+) \n Billing Contact Phone Number: \ s*(?P<phone>.+) \n Billing Contact Email: \ s*(?P<email>.+) \n " , # .CO Internet
@ -525,18 +537,18 @@ def parse_registrants(data):
" Billing contact: \n (?: (?P<organization>.+) \n )? (?P<name>.+) \n (?P<email>.+) \n (?P<street>.+) \n (?P<city>.+), (?P<state>.+) (?P<postalcode>.+) (?P<country>.+) \n Phone: (?P<phone>.*) \n Fax: (?P<fax>.*) \n " , # Fabulous.com
" Billing Contact Information :[ ]* \n [ ]+(?P<firstname>.*) \n [ ]+(?P<lastname>.*) \n [ ]+(?P<organization>.*) \n [ ]+(?P<email>.*) \n [ ]+(?P<street>.*) \n [ ]+(?P<city>.*) \n [ ]+(?P<postalcode>.*) \n [ ]+(?P<phone>.*) \n [ ]+(?P<fax>.*) \n \n " , # GAL Communication
]
# Some registries use NIC handle references instead of directly listing contacts...
nic_contact_regexes = [
" personname: \ s*(?P<name>.+) \n organization: \ s*(?P<organization>.+) \n street address: \ s*(?P<street>.+) \n postal code: \ s*(?P<postalcode>.+) \n city: \ s*(?P<city>.+) \n country: \ s*(?P<country>.+) \n (?:phone: \ s*(?P<phone>.+) \n )?(?:fax-no: \ s*(?P<fax>.+) \n )?(?:e-mail: \ s*(?P<email>.+) \n )?nic-hdl: \ s*(?P<handle>.+) \n changed: \ s*(?P<changedate>.+) " , # nic.at
" person: \ s*(?P<name>.+) \n nic-hdl: \ s*(?P<handle>.+) \n " , # .ie
" nic-hdl: \ s*(?P<handle>.+) \n type: \ s*(?P<type>.+) \n contact: \ s*(?P<name>.+) \n (?:.+ \n )*?(?:address: \ s*(?P<street1>.+) \n address: \ s*(?P<street2>.+) \n address: \ s*(?P<street3>.+) \n address: \ s*(?P<country>.+) \n )?(?:phone: \ s*(?P<phone>.+) \n )?(?:fax-no: \ s*(?P<fax>.+) \n )?(?:.+ \n )*?(?:e-mail: \ s*(?P<email>.+) \n )?(?:.+ \n )*?changed: \ s*(?P<changedate>[0-9] {2} \ /[0-9] {2} \ /[0-9] {4} ).* \n " , # AFNIC madness without country field
" nic-hdl: \ s*(?P<handle>.+) \n type: \ s*(?P<type>.+) \n contact: \ s*(?P<name>.+) \n (?:.+ \n )*?(?:address: \ s*(?P<street1>.+) \n )?(?:address: \ s*(?P<street2>.+) \n )?(?:address: \ s*(?P<street3>.+) \n )?(?:phone: \ s*(?P<phone>.+) \n )?(?:fax-no: \ s*(?P<fax>.+) \n )?(?:.+ \n )*?(?:e-mail: \ s*(?P<email>.+) \n )?(?:.+ \n )*?changed: \ s*(?P<changedate>[0-9] {2} \ /[0-9] {2} \ /[0-9] {4} ).* \n " , # AFNIC madness any country -at all-
" nic-hdl: \ s*(?P<handle>.+) \n type: \ s*(?P<type>.+) \n contact: \ s*(?P<name>.+) \n (?:.+ \n )*?(?:address: \ s*(?P<street1>.+) \n )?(?:address: \ s*(?P<street2>.+) \n )?(?:address: \ s*(?P<street3>.+) \n )?(?:address: \ s*(?P<street4>.+) \n )?country: \ s*(?P<country>.+) \n (?:phone: \ s*(?P<phone>.+) \n )?(?:fax-no: \ s*(?P<fax>.+) \n )?(?:.+ \n )*?(?:e-mail: \ s*(?P<email>.+) \n )?(?:.+ \n )*?changed: \ s*(?P<changedate>[0-9] {2} \ /[0-9] {2} \ /[0-9] {4} ).* \n " , # AFNIC madness with country field
]
nic_contact_references = {
" registrant " : [
" registrant: \ s*(?P<handle>.+) " , # nic.at
@ -553,7 +565,7 @@ def parse_registrants(data):
" billing-c: \ s*(?P<handle>.+) " # iis.se
]
}
# Why do the below? The below is meant to deal with an edge case (issue #2) where a partial match followed
# by a failure, for a regex containing the \s*.+ pattern, would send the regex module on a wild goose hunt for
# matching positions. The workaround is to use \S.* instead of .+, but in the interest of keeping the regexes
@ -563,35 +575,35 @@ def parse_registrants(data):
tech_contact_regexes = [ preprocess_regex ( regex ) for regex in tech_contact_regexes ]
admin_contact_regexes = [ preprocess_regex ( regex ) for regex in admin_contact_regexes ]
billing_contact_regexes = [ preprocess_regex ( regex ) for regex in billing_contact_regexes ]
for segment in data :
for regex in registrant_regexes :
match = re . search ( regex , segment )
if match is not None :
registrant = match . groupdict ( )
break
for segment in data :
for regex in tech_contact_regexes :
match = re . search ( regex , segment )
if match is not None :
tech_contact = match . groupdict ( )
break
for segment in data :
for regex in admin_contact_regexes :
match = re . search ( regex , segment )
if match is not None :
admin_contact = match . groupdict ( )
break
for segment in data :
for regex in billing_contact_regexes :
match = re . search ( regex , segment )
if match is not None :
billing_contact = match . groupdict ( )
break
# Find NIC handle contact definitions
handle_contacts = [ ]
for regex in nic_contact_regexes :
@ -599,7 +611,7 @@ def parse_registrants(data):
matches = re . finditer ( regex , segment )
for match in matches :
handle_contacts . append ( match . groupdict ( ) )
# Find NIC handle references and process them
for category in nic_contact_references :
for regex in nic_contact_references [ category ] :
@ -622,11 +634,11 @@ def parse_registrants(data):
elif category == " admin " :
admin_contact = data_reference
break
# Post-processing
# Post-processing
for obj in ( registrant , tech_contact , billing_contact , admin_contact ) :
if obj is not None :
for key in obj . keys ( ) :
for key in list ( obj . keys ( ) ) :
if obj [ key ] is None or obj [ key ] . strip ( ) == " " : # Just chomp all surrounding whitespace
del obj [ key ]
else :
@ -642,7 +654,7 @@ def parse_registrants(data):
try :
street_items . append ( obj [ " street %d " % i ] )
del obj [ " street %d " % i ]
except KeyError , e :
except KeyError as e :
break
i + = 1
obj [ " street " ] = " \n " . join ( street_items )
@ -663,7 +675,7 @@ def parse_registrants(data):
if ' lastname ' in obj :
elements . append ( obj [ " lastname " ] )
obj [ " name " ] = " " . join ( elements )
return {
" registrant " : registrant ,
" tech " : tech_contact ,