@ -1,4 +1,5 @@
import re , datetime
from __future__ import print_function
import re , sys , datetime
grammar = {
" _data " : {
@ -136,14 +137,25 @@ grammar = {
}
}
# Select the string check that matches the running interpreter: Python 3 has
# only ``str`` for text, while Python 2 has both ``str`` and ``unicode``,
# which share the ``basestring`` base class.
if sys.version_info >= (3, 0):
    def is_string(data):
        """Return True if ``data`` is a ``str`` instance (Python 3 variant)."""
        return isinstance(data, str)
else:
    def is_string(data):
        """Return True if ``data`` is a ``basestring`` instance (Python 2 variant)."""
        return isinstance(data, basestring)
def parse_raw_whois ( raw_data , normalized = [ ] ) :
data = { }
raw_data = [ segment . replace ( " \r " , " " ) for segment in raw_data ] # Carriage returns are the devil
for segment in raw_data :
for rule_key , rule_regexes in grammar [ ' _data ' ] . iteritems ( ) :
if data . has_key ( rule_key ) == False :
for rule_key , rule_regexes in grammar [ ' _data ' ] . ite ms( ) :
if ( rule_key in data ) == False :
for line in segment . splitlines ( ) :
for regex in rule_regexes :
result = re . search ( regex , line , re . IGNORECASE )
@ -153,7 +165,7 @@ def parse_raw_whois(raw_data, normalized=[]):
if val != " " :
try :
data [ rule_key ] . append ( val )
except KeyError , e :
except KeyError as e :
data [ rule_key ] = [ val ]
# Whois.com is a bit special... Fabulous.com also seems to use this format.
@ -163,7 +175,7 @@ def parse_raw_whois(raw_data, normalized=[]):
for match in re . findall ( " [ ]+(.+) \n " , chunk ) :
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# Nominet also needs some special attention
match = re . search ( " Registrar: \n (.+) \n " , segment )
@ -176,7 +188,7 @@ def parse_raw_whois(raw_data, normalized=[]):
match = match . split ( ) [ 0 ]
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# .am plays the same game
match = re . search ( " DNS servers:([ \ s \ S]*? \n ) \n " , segment )
@ -186,7 +198,7 @@ def parse_raw_whois(raw_data, normalized=[]):
match = match . split ( ) [ 0 ]
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# SIDN isn't very standard either.
match = re . search ( " Registrar: \n \ s+( \ S.*) " , segment )
@ -199,7 +211,7 @@ def parse_raw_whois(raw_data, normalized=[]):
match = match . split ( ) [ 0 ]
try :
data [ " nameservers " ] . append ( match . strip ( ) )
except KeyError , e :
except KeyError as e :
data [ " nameservers " ] = [ match . strip ( ) ]
# The .ie WHOIS server puts ambiguous status information in an unhelpful order
match = re . search ( ' ren-status: \ s*(.+) ' , segment )
@ -212,34 +224,34 @@ def parse_raw_whois(raw_data, normalized=[]):
try :
data [ ' expiration_date ' ] = remove_duplicates ( data [ ' expiration_date ' ] )
data [ ' expiration_date ' ] = parse_dates ( data [ ' expiration_date ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' creation_date ' ] = remove_duplicates ( data [ ' creation_date ' ] )
data [ ' creation_date ' ] = parse_dates ( data [ ' creation_date ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' updated_date ' ] = remove_duplicates ( data [ ' updated_date ' ] )
data [ ' updated_date ' ] = parse_dates ( data [ ' updated_date ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' nameservers ' ] = remove_duplicates ( [ ns . rstrip ( " . " ) for ns in data [ ' nameservers ' ] ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' emails ' ] = remove_duplicates ( data [ ' emails ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
try :
data [ ' registrar ' ] = remove_duplicates ( data [ ' registrar ' ] )
except KeyError , e :
except KeyError as e :
pass # Not present
# Remove e-mail addresses if they are already listed for any of the contacts
@ -248,14 +260,14 @@ def parse_raw_whois(raw_data, normalized=[]):
if data [ " contacts " ] [ contact ] is not None :
try :
known_emails . append ( data [ " contacts " ] [ contact ] [ " email " ] )
except KeyError , e :
except KeyError as e :
pass # No e-mail recorded for this contact...
try :
data [ ' emails ' ] = [ email for email in data [ " emails " ] if email not in known_emails ]
except KeyError , e :
except KeyError as e :
pass # Not present
for key in data . keys ( ) :
for key in list ( data . keys ( ) ) :
if data [ key ] is None or len ( data [ key ] ) == 0 :
del data [ key ]
@ -269,23 +281,23 @@ def parse_raw_whois(raw_data, normalized=[]):
def normalize_data ( data , normalized ) :
for key in ( " nameservers " , " emails " , " whois_server " ) :
if key in data and data [ key ] is not None and ( normalized == True or key in normalized ) :
if isinstance ( data [ key ] , basestring ) :
if is_string ( data [ key ] ) :
data [ key ] = data [ key ] . lower ( )
else :
data [ key ] = [ item . lower ( ) for item in data [ key ] ]
for key , threshold in ( ( " registrar " , 4 ) , ( " status " , 3 ) ) :
if key in data and data [ key ] is not None and ( normalized == True or key in normalized ) :
if isinstance ( data [ key ] , basestring ) :
if is_string ( data [ key ] ) :
data [ key ] = normalize_name ( data [ key ] , abbreviation_threshold = threshold , length_threshold = 1 )
else :
data [ key ] = [ normalize_name ( item , abbreviation_threshold = threshold , length_threshold = 1 ) for item in data [ key ] ]
for contact_type , contact in data [ ' contacts ' ] . ite rite ms( ) :
for contact_type , contact in data [ ' contacts ' ] . ite ms( ) :
if contact is not None :
for key in ( " email " , ) :
if key in contact and contact [ key ] is not None and ( normalized == True or key in normalized ) :
if isinstance ( contact [ key ] , basestring ) :
if isinstance ( contact [ key ] , str ) :
contact [ key ] = contact [ key ] . lower ( )
else :
contact [ key ] = [ item . lower ( ) for item in contact [ key ] ]
@ -298,10 +310,10 @@ def normalize_data(data, normalized):
if key in contact and contact [ key ] is not None and ( normalized == True or key in normalized ) :
contact [ key ] = normalize_name ( contact [ key ] , abbreviation_threshold = 3 , length_threshold = 3 )
for key in contact . keys ( ) :
for key in list ( contact . keys ( ) ) :
try :
contact [ key ] = contact [ key ] . strip ( " , " )
except AttributeError , e :
except AttributeError as e :
pass # Not a string
return data
@ -368,37 +380,37 @@ def parse_dates(dates):
# This will require some more guesswork - some WHOIS servers present the name of the month
try :
month = int ( result . group ( " month " ) )
except ValueError , e :
except ValueError as e :
# Apparently not a number. Look up the corresponding number.
try :
month = grammar [ ' _months ' ] [ result . group ( " month " ) . lower ( ) ]
except KeyError , e :
except KeyError as e :
# Unknown month name, default to 0
month = 0
try :
hour = int ( result . group ( " hour " ) )
except IndexError , e :
except IndexError as e :
hour = 0
except TypeError , e :
except TypeError as e :
hour = 0
try :
minute = int ( result . group ( " minute " ) )
except IndexError , e :
except IndexError as e :
minute = 0
except TypeError , e :
except TypeError as e :
minute = 0
try :
second = int ( result . group ( " second " ) )
except IndexError , e :
except IndexError as e :
second = 0
except TypeError , e :
except TypeError as e :
second = 0
break
except ValueError , e :
except ValueError as e :
# Something went horribly wrong, maybe there is no valid date present?
year = 0
month = 0
@ -406,16 +418,16 @@ def parse_dates(dates):
hour = 0
minute = 0
second = 0
print e . message
print ( e . message )
try :
if year > 0 :
try :
parsed_dates . append ( datetime . datetime ( year , month , day , hour , minute , second ) )
except ValueError , e :
except ValueError as e :
# We might have gotten the day and month the wrong way around, let's try it the other way around
# If you're not using an ISO-standard date format, you're an evil registrar!
parsed_dates . append ( datetime . datetime ( year , day , month , hour , minute , second ) )
except UnboundLocalError , e :
except UnboundLocalError as e :
pass
if len ( parsed_dates ) > 0 :
@ -626,7 +638,7 @@ def parse_registrants(data):
# Post-processing
for obj in ( registrant , tech_contact , billing_contact , admin_contact ) :
if obj is not None :
for key in obj . keys ( ) :
for key in list ( obj . keys ( ) ) :
if obj [ key ] is None or obj [ key ] . strip ( ) == " " : # Just chomp all surrounding whitespace
del obj [ key ]
else :
@ -642,7 +654,7 @@ def parse_registrants(data):
try :
street_items . append ( obj [ " street %d " % i ] )
del obj [ " street %d " % i ]
except KeyError , e :
except KeyError as e :
break
i + = 1
obj [ " street " ] = " \n " . join ( street_items )