import re , datetime
# Extraction grammar shared by the parsing functions in this module.
#
#   "_data"        maps a result field name to a list of regular expressions.
#                  Every expression exposes the extracted value through a
#                  named group called "val". parse_raw_whois() applies them
#                  line by line with re.IGNORECASE.
#   "_dateformats" is a tuple of regular expressions describing the supported
#                  date notations. Each one exposes "year", "month" and "day"
#                  groups and, for some notations, "hour", "minute" and
#                  "second". parse_dates() tries them in order with re.match.
#   "_months"      maps lower-case month names and abbreviations to their
#                  month number.
#
# All expressions are raw strings so that regex escapes such as \s or \[ are
# not interpreted (and warned about) as string escape sequences.
grammar = {
    "_data": {
        'status': [
            r'\[Status\]\s*(?P<val>.+)',
            r'Status\s*:\s?(?P<val>.+)',
            r'state:\s*(?P<val>.+)',
        ],
        'creation_date': [
            r'\[Created on\]\s*(?P<val>.+)',
            r'Creation Date:\s?(?P<val>.+)',
            r'Created on:\s?(?P<val>.+)',
            r'Created on\s?[.]*:\s?(?P<val>.+)\.',
            r'Date Registered\s?[.]*:\s?(?P<val>.+)',
            r'Domain Created\s?[.]*:\s?(?P<val>.+)',
            r'Domain registered\s?[.]*:\s?(?P<val>.+)',
            r'Domain record activated\s?[.]*:\s*?(?P<val>.+)',
            r'Record created on\s?[.]*:?\s*?(?P<val>.+)',
            r'Record created\s?[.]*:?\s*?(?P<val>.+)',
            r'Created\s?[.]*:?\s*?(?P<val>.+)',
            r'Registered on\s?[.]*:?\s*?(?P<val>.+)',
            r'Registered\s?[.]*:?\s*?(?P<val>.+)',
            r'Domain Create Date\s?[.]*:?\s*?(?P<val>.+)',
            r'Domain Registration Date\s?[.]*:?\s*?(?P<val>.+)',
            r'created:\s*(?P<val>.+)',
            r'registered:\s*(?P<val>.+)',
        ],
        'expiration_date': [
            r'\[Expires on\]\s*(?P<val>.+)',
            r'Expiration Date:\s?(?P<val>.+)',
            r'Expires on:\s?(?P<val>.+)',
            r'Expires on\s?[.]*:\s?(?P<val>.+)\.',
            r'Expiry Date\s?[.]*:\s?(?P<val>.+)',
            r'Expiry\s*:\s?(?P<val>.+)',
            r'Domain Currently Expires\s?[.]*:\s?(?P<val>.+)',
            r'Record will expire on\s?[.]*:\s?(?P<val>.+)',
            r'Domain expires\s?[.]*:\s*?(?P<val>.+)',
            r'Record expires on\s?[.]*:?\s*?(?P<val>.+)',
            r'Record expires\s?[.]*:?\s*?(?P<val>.+)',
            r'Expires\s?[.]*:?\s*?(?P<val>.+)',
            r'Expire Date\s?[.]*:?\s*?(?P<val>.+)',
            r'Expired\s?[.]*:?\s*?(?P<val>.+)',
            r'Domain Expiration Date\s?[.]*:?\s*?(?P<val>.+)',
            r'paid-till:\s*(?P<val>.+)',
            r'expire:\s*(?P<val>.+)',
        ],
        'updated_date': [
            r'\[Last Updated\]\s*(?P<val>.+)',
            r'Updated Date:\s?(?P<val>.+)',
            #'Database last updated on\s?[.]*:?\s*?(?P<val>.+)\s[a-z]+\.?',
            r'Record last updated on\s?[.]*:?\s?(?P<val>.+)\.',
            r'Domain record last updated\s?[.]*:\s*?(?P<val>.+)',
            r'Domain Last Updated\s?[.]*:\s*?(?P<val>.+)',
            r'Last updated on:\s?(?P<val>.+)',
            r'Date Modified\s?[.]*:\s?(?P<val>.+)',
            r'Last Modified\s?[.]*:\s?(?P<val>.+)',
            r'Domain Last Updated Date\s?[.]*:\s?(?P<val>.+)',
            r'Record last updated\s?[.]*:\s?(?P<val>.+)',
            r'Modified\s?[.]*:\s?(?P<val>.+)',
            r'changed:\s*(?P<val>.+)',
            r'Last Update\s?[.]*:\s?(?P<val>.+)',
            r'Last updated on (?P<val>.+) [a-z]{3}',
            r'Last update of whois database:\s?[a-z]{3}, (?P<val>.+) [a-z]{3}',
        ],
        'registrar': [
            r'registrar:\s*(?P<val>.+)',
            r'Registrar:\s*(?P<val>.+)',
            r'Registered through:\s?(?P<val>.+)',
            r'Registrar Name:\s?(?P<val>.+)',
            r'Record maintained by:\s?(?P<val>.+)',
            r'Registration Service Provided By:\s?(?P<val>.+)',
            r'Registrar of Record:\s?(?P<val>.+)',
            # In a raw string the regex engine resolves \t itself; it still
            # matches a literal tab character.
            r'\tName:\t\s(?P<val>.+)',
        ],
        'whois_server': [
            r'Whois Server:\s?(?P<val>.+)',
            r'Registrar Whois:\s?(?P<val>.+)',
        ],
        'name_servers': [
            r'Name Server:[ ]*(?P<val>[^ ]+)',
            r'(?P<val>[a-z]*d?ns[0-9]+([a-z]{3})?\.([a-z0-9-]+\.)+[a-z0-9]+)',
            r'nameserver:\s*(?P<val>.+)',
            # The "[" inside the character class is escaped to avoid the
            # "possible nested set" FutureWarning; the set is unchanged.
            r'nserver:\s*(?P<val>[^\[\s]+)',
            r'DNS[0-9]+:\s*(?P<val>.+)',
            r'ns[0-9]+:\s*(?P<val>.+)',
            r'NS [0-9]+\s*:\s*(?P<val>.+)',
            r'(?P<val>[a-z0-9-]+\.d?ns[0-9]*\.([a-z0-9-]+\.)+[a-z0-9]+)',
            r'(?P<val>([a-z0-9-]+\.)+[a-z0-9]+)(\s+([0-9]{1,3}\.){3}[0-9]{1,3})',
            r'[^a-z0-9.-](?P<val>d?ns\.([a-z0-9-]+\.)+[a-z0-9]+)',
        ],
        'emails': [
            r'(?P<val>[\w.-]+@[\w.-]+\.[\w]{2,6})',  # Really need to fix this, much longer TLDs now exist...
            r'(?P<val>[\w.-]+\sAT\s[\w.-]+\sDOT\s[\w]{2,6})',
        ],
    },
    "_dateformats": (
        # day, month name, year, optional time
        r'(?P<day>[0-9]{1,2})[./ -](?P<month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P<year>[0-9]{4}|[0-9]{2})'
        r'(\s+(?P<hour>[0-9]{1,2})[:.](?P<minute>[0-9]{1,2})[:.](?P<second>[0-9]{1,2}))?',
        # three letters, month name, day, optional time, three letters, year
        r'[a-z]{3}\s(?P<month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[./ -](?P<day>[0-9]{1,2})'
        r'(\s+(?P<hour>[0-9]{1,2})[:.](?P<minute>[0-9]{1,2})[:.](?P<second>[0-9]{1,2}))?'
        r'\s[a-z]{3}\s(?P<year>[0-9]{4}|[0-9]{2})',
        # four-digit year, numeric month, day
        r'(?P<year>[0-9]{4})[./-](?P<month>[0-9]{1,2})[./-](?P<day>[0-9]{1,2})',
        # day, numeric month, year
        r'(?P<day>[0-9]{1,2})[./ -](?P<month>[0-9]{1,2})[./ -](?P<year>[0-9]{4}|[0-9]{2})',
        # compact yyyymmdd followed by a mandatory time
        r'(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})\s((?P<hour>[0-9]{1,2})[:.](?P<minute>[0-9]{1,2})[:.](?P<second>[0-9]{1,2}))',
    ),
    "_months": {
        'jan': 1,
        'january': 1,
        'feb': 2,
        'february': 2,
        'mar': 3,
        'march': 3,
        'apr': 4,
        'april': 4,
        'may': 5,
        'jun': 6,
        'june': 6,
        'jul': 7,
        'july': 7,
        'aug': 8,
        'august': 8,
        'sep': 9,
        'sept': 9,
        'september': 9,
        'oct': 10,
        'october': 10,
        'nov': 11,
        'november': 11,
        'dec': 12,
        'december': 12,
    },
}
def parse_raw_whois(raw_data, normalized=None):
    """Extract structured fields from raw WHOIS response text.

    raw_data   -- sequence of response strings (segments). Carriage returns
                  are stripped from every segment before parsing.
    normalized -- passed through to normalize_data(): either True or a
                  collection of field names to normalize. The default (no
                  normalization) behaves exactly like passing an empty list.

    Returns a dict containing one entry per field defined in
    grammar['_data'] (a list of values, a list of datetimes for the date
    fields, or None when nothing was found), plus "contacts" (the result of
    parse_registrants()) and "raw" (the cleaned-up segments).
    """
    # A mutable default would be shared between calls; create it per call.
    if normalized is None:
        normalized = []

    # Carriage returns are the devil
    raw_data = [segment.replace("\r", "") for segment in raw_data]

    data = {}
    for segment in raw_data:
        for rule_key, rule_regexes in grammar['_data'].items():
            # A field that was already found in an earlier segment is not
            # extended by later segments. The check is made once per segment,
            # so a single segment can still contribute several values.
            if rule_key in data:
                continue
            for line in segment.splitlines():
                for regex in rule_regexes:
                    result = re.search(regex, line, re.IGNORECASE)
                    if result is None:
                        continue
                    val = result.group("val").strip()
                    if val != "":
                        data.setdefault(rule_key, []).append(val)

    # Fill all missing values with None
    for rule_key in grammar['_data']:
        data.setdefault(rule_key, None)

    data["contacts"] = parse_registrants(raw_data)

    # Parse dates
    for date_key in ('expiration_date', 'creation_date', 'updated_date'):
        if data[date_key] is not None:
            data[date_key] = parse_dates(remove_duplicates(data[date_key]))

    if data['name_servers'] is not None:
        # Strip the trailing dot of fully qualified names before de-duplicating.
        data['name_servers'] = remove_duplicates([ns.rstrip(".") for ns in data['name_servers']])

    for list_key in ('emails', 'registrar'):
        if data[list_key] is not None:
            data[list_key] = remove_duplicates(data[list_key])

    # Remove e-mail addresses if they are already listed for any of the contacts
    known_emails = []
    for contact in ("registrant", "tech", "admin", "billing"):
        details = data["contacts"][contact]
        # No e-mail recorded for this contact if the key is missing...
        if details is not None and "email" in details:
            known_emails.append(details["email"])

    if data['emails'] is not None:
        data['emails'] = [email for email in data["emails"] if email not in known_emails]

    data["raw"] = raw_data

    if normalized != []:
        data = normalize_data(data, normalized)

    return data
try:
    _string_types = basestring  # Python 2: covers both str and unicode
except NameError:
    _string_types = str  # Python 3


def _capitalize_words(text):
    """Capitalize every space-separated word of *text* (rest lower-cased)."""
    return " ".join(word.capitalize() for word in text.split(" "))


def normalize_data(data, normalized):
    """Normalize the letter case of selected fields of a parsed WHOIS result.

    data       -- result dict as built by parse_raw_whois(); it is modified
                  in place and also returned.
    normalized -- True to normalize every supported field, or a collection
                  of field names to normalize.

    Rules:
      * name_servers, emails, whois_server and contact "email" values are
        lower-cased.
      * registrar and status values are word-capitalized only when they are
        entirely upper-case; other values are kept untouched and the order
        of list items is preserved.
      * contact "name", "street", "city" and "state" values are always
        word-capitalized.
    """
    normalize_all = normalized == True  # noqa: E712 - keeps the original equality semantics

    def wanted(key):
        return normalize_all or key in normalized

    for key in ("name_servers", "emails", "whois_server"):
        if data.get(key) is not None and wanted(key):
            if isinstance(data[key], _string_types):
                data[key] = data[key].lower()
            else:
                data[key] = [item.lower() for item in data[key]]

    for key in ("registrar", "status"):
        if data.get(key) is not None and wanted(key):
            value = data[key]
            if isinstance(value, _string_types):
                # A plain string must never reach the list branch below,
                # where it would be split into single characters.
                if value.isupper():
                    data[key] = _capitalize_words(value)
            else:
                data[key] = [
                    _capitalize_words(item) if item.isupper() else item
                    for item in value
                ]

    # A result without contact information simply has nothing to normalize.
    for contact in (data.get('contacts') or {}).values():
        if contact is None:
            continue

        for key in ("email",):
            if contact.get(key) is not None and wanted(key):
                if isinstance(contact[key], _string_types):
                    contact[key] = contact[key].lower()
                else:
                    contact[key] = [item.lower() for item in contact[key]]

        for key in ("name", "street", "city", "state"):
            if contact.get(key) is not None and wanted(key):
                contact[key] = _capitalize_words(contact[key])

    return data
def _optional_time_part(match, name):
    """Return the named group of *match* as an int, or 0 when it is absent."""
    try:
        return int(match.group(name))
    except (IndexError, TypeError):
        # IndexError: the date format defines no such group.
        # TypeError: the optional group took no part in the match (None).
        return 0


def _extract_date_fields(match):
    """Convert a date-format match into numeric fields.

    Returns a (year, month, day, hour, minute, second) tuple, or None when
    the month is a name that is not listed in grammar['_months'].
    Raises ValueError when the year or day group is not numeric.
    """
    # These are always numeric. If they fail, there is no valid date present.
    year = int(match.group("year"))
    day = int(match.group("day"))

    # Detect and correct shorthand year notation
    if year < 60:
        year += 2000
    elif year < 100:
        year += 1900

    # This will require some more guesswork - some WHOIS servers present the name of the month
    month_text = match.group("month")
    try:
        month = int(month_text)
    except ValueError:
        # Apparently not a number. Look up the corresponding number.
        month = grammar['_months'].get(month_text.lower())
        if month is None:
            return None

    return (
        year,
        month,
        day,
        _optional_time_part(match, "hour"),
        _optional_time_part(match, "minute"),
        _optional_time_part(match, "second"),
    )


def parse_dates(dates):
    """Parse date strings into datetime objects.

    dates -- iterable of date strings as extracted from a WHOIS response.

    Each string is matched against the expressions in
    grammar['_dateformats'] (anchored at the start, case-insensitive); the
    first expression that matches decides how the string is interpreted.
    Strings that match no expression, or that do not describe a valid
    calendar date, are skipped.

    Returns the list of parsed datetimes in input order, or None when no
    date could be parsed at all.
    """
    parsed_dates = []

    for text in dates:
        # Reset per input string so values of a previous date can never be
        # reused for a string that fails to parse.
        fields = None

        for rule in grammar['_dateformats']:
            result = re.match(rule, text, re.IGNORECASE)
            if result is None:
                continue
            try:
                fields = _extract_date_fields(result)
            except ValueError:
                # Something went horribly wrong, maybe there is no valid
                # date present? Give the next format a chance.
                continue
            break

        if fields is None:
            continue

        year, month, day, hour, minute, second = fields
        try:
            parsed = datetime.datetime(year, month, day, hour, minute, second)
        except ValueError:
            # We might have gotten the day and month the wrong way around, let's try it the other way around
            # If you're not using an ISO-standard date format, you're an evil registrar!
            try:
                parsed = datetime.datetime(year, day, month, hour, minute, second)
            except ValueError:
                # Not a valid date in either order; skip it instead of failing.
                continue
        parsed_dates.append(parsed)

    if len(parsed_dates) > 0:
        return parsed_dates
    else:
        return None
def remove_duplicates(data):
    """Return a new list holding the entries of *data* without repeats.

    Only the first occurrence of each entry is kept, so the relative order
    of the remaining entries matches the input. Entries are compared with
    ``in`` (equality), so they do not need to be hashable.
    """
    unique_entries = []
    for candidate in data:
        if candidate in unique_entries:
            continue
        unique_entries.append(candidate)
    return unique_entries
# Extract registrant, technical, administrative and billing contact details from WHOIS text.
#
# data is iterated as a sequence of text segments; every pattern below is applied to each
# segment with re.search / re.finditer and no flags, i.e. matching is case-sensitive.
#
# Returns a dict with one entry per contact role (registrant, tech, admin, billing). Each
# value is either None (no pattern matched) or a dict built from the named groups of the
# matching pattern, cleaned up by the post-processing step at the end of the function.
def parse_registrants ( data ) :
registrant = None
tech_contact = None
billing_contact = None
admin_contact = None
# Patterns for contact blocks that are listed inline in the response. The trailing comment
# on each pattern names the registry/registrar whose output format it targets.
registrant_regexes = [
" Registrant: \n (?P<name>.+) \n (?P<street1>.+) \n (?: (?P<street2>.*) \n )?(?: (?P<street3>.*) \n )? (?P<postalcode>.+), (?P<city>.+) \n (?P<country>.+) \n (?P<phone>.+) \n (?P<email>.+) \n \n " , # OVH
" Registrant ID:(?P<handle>.+) \n Registrant Name:(?P<name>.*) \n Registrant Organization:(?P<organization>.*) \n Registrant Street1:(?P<street1>.*) \n Registrant Street2:(?P<street2>.*) \n Registrant Street3:(?P<street3>.*) \n Registrant City:(?P<city>.*) \n Registrant State/Province:(?P<state>.*) \n Registrant Postal Code:(?P<postalcode>.*) \n Registrant Country:(?P<country>.*) \n Registrant Phone:(?P<phone>.*) \n Registrant Phone Ext.:(?P<phone_ext>.*) \n Registrant FAX:(?P<fax>.*) \n Registrant FAX Ext.:(?P<fax_ext>.*) \n Registrant Email:(?P<email>.*) " , # Public Interest Registry (.org)
" Registrant ID: \ s*(?P<handle>.+) \n Registrant Name: \ s*(?P<name>.+) \n Registrant Organization: \ s*(?P<organization>.*) \n Registrant Address1: \ s*(?P<street1>.+) \n Registrant Address2: \ s*(?P<street2>.*) \n Registrant City: \ s*(?P<city>.+) \n Registrant State/Province: \ s*(?P<state>.+) \n Registrant Postal Code: \ s*(?P<postalcode>.+) \n Registrant Country: \ s*(?P<country>.+) \n Registrant Country Code: \ s*(?P<country_code>.+) \n Registrant Phone Number: \ s*(?P<phone>.+) \n Registrant Email: \ s*(?P<email>.+) \n " , # .CO Internet
" Registrant Contact: (?P<handle>.+) \n Registrant Organization: (?P<organization>.+) \n Registrant Name: (?P<name>.+) \n Registrant Street: (?P<street>.+) \n Registrant City: (?P<city>.+) \n Registrant Postal Code: (?P<postalcode>.+) \n Registrant State: (?P<state>.+) \n Registrant Country: (?P<country>.+) \n Registrant Phone: (?P<phone>.*) \n Registrant Phone Ext: (?P<phone_ext>.*) \n Registrant Fax: (?P<fax>.*) \n Registrant Fax Ext: (?P<fax_ext>.*) \n Registrant Email: (?P<email>.*) \n " , # Key-Systems GmbH
" (?:Registrant ID:[ ]*(?P<handle>.*) \n )?Registrant Name:[ ]*(?P<name>.*) \n Registrant Organization:[ ]*(?P<organization>.*) \n Registrant Street:[ ]*(?P<street1>.+) \n (?:Registrant Street:[ ]*(?P<street2>.+) \n )?Registrant City:[ ]*(?P<city>.+) \n Registrant State \ /Province:[ ]*(?P<state>.+) \n Registrant Postal Code:[ ]*(?P<postalcode>.+) \n Registrant Country:[ ]*(?P<country>.+) \n (?:Registrant Phone:[ ]*(?P<phone>.*) \n )?(?:Registrant Phone Ext:[ ]*(?P<phone_ext>.*) \n )?(?:Registrant Fax:[ ]*(?P<fax>.*) \n )?(?:Registrant Fax Ext:[ ]*(?P<fax_ext>.*) \n )?(?:Registrant Email:[ ]*(?P<email>.+) \n )? " , # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum)
" Registrant \n (?P<name>.+) \n Email:(?P<email>.+) \n (?P<street1>.+) \n (?: (?P<street2>.+) \n )? (?P<postalcode>.+) (?P<city>.+) \n (?P<country>.+) \n Tel: (?P<phone>.+) \n \n " , # internet.bs
" Holder of domain name: \n (?P<name>[ \ S \ s]+) \n (?P<street>.+) \n (?P<postalcode>[A-Z0-9-]+) \ s+(?P<city>.+) \n (?P<country>.+) \n Contractual Language " , # nic.ch
" \n \n (?:Owner)? \ s+: (?P<name>.*) \n (?: \ s+: (?P<organization>.*) \n )? \ s+: (?P<street>.*) \n \ s+: (?P<city>.*) \n \ s+: (?P<state>.*) \n \ s+: (?P<country>.*) \n " , # nic.io
" Contact Information: \n \ [Name \ ] \ s*(?P<name>.*) \n \ [Email \ ] \ s*(?P<email>.*) \n \ [Web Page \ ] \ s*(?P<url>.*) \n \ [Postal code \ ] \ s*(?P<postalcode>.*) \n \ [Postal Address \ ] \ s*(?P<street1>.*) \n (?: \ s+(?P<street2>.*) \n )?(?: \ s+(?P<street3>.*) \n )? \ [Phone \ ] \ s*(?P<phone>.*) \n \ [Fax \ ] \ s*(?P<fax>.*) \n " , # jprs.jp
" Registrant:[ ]*(?P<name>.+) \n [ \ s \ S]*Eligibility Name:[ ]*(?P<organization>.+) \n [ \ s \ S]*Registrant Contact ID:[ ]*(?P<handle>.+) \n " , # .au business
" Eligibility Type:[ ]*Citizen \ /Resident \n [ \ s \ S]*Registrant Contact ID:[ ]*(?P<handle>.+) \n [ \ s \ S]*Registrant Contact Name:[ ]*(?P<name>.+) \n " , # .au individual
" Registrant:[ ]*(?P<organization>.+) \n [ \ s \ S]*Eligibility Type:[ ]*(Higher Education Institution|Company|Incorporated Association|Other) \n [ \ s \ S]*Registrant Contact ID:[ ]*(?P<handle>.+) \n [ \ s \ S]*Registrant Contact Name:[ ]*(?P<name>.+) \n " , # .au educational, company, 'incorporated association' (non-profit?), other (spotted for linux.conf.au, unsure if also for others)
" person: \ s+(?P<name>.+) " , # nic.ru (person)
" org: \ s+(?P<organization>.+) " , # nic.ru (organization)
]
tech_contact_regexes = [
" Technical Contact: \n (?P<name>.+) \n (?P<street1>.+) \n (?: (?P<street2>.*) \n )?(?: (?P<street3>.*) \n )? (?P<postalcode>.+), (?P<city>.+) \n (?P<country>.+) \n (?P<phone>.+) \n (?P<email>.+) \n \n " , # OVH
" Tech ID:(?P<handle>.+) \n Tech Name:(?P<name>.*) \n Tech Organization:(?P<organization>.*) \n Tech Street1:(?P<street1>.*) \n Tech Street2:(?P<street2>.*) \n Tech Street3:(?P<street3>.*) \n Tech City:(?P<city>.*) \n Tech State/Province:(?P<state>.*) \n Tech Postal Code:(?P<postalcode>.*) \n Tech Country:(?P<country>.*) \n Tech Phone:(?P<phone>.*) \n Tech Phone Ext.:(?P<phone_ext>.*) \n Tech FAX:(?P<fax>.*) \n Tech FAX Ext.:(?P<fax_ext>.*) \n Tech Email:(?P<email>.*) " , # Public Interest Registry (.org)
" Technical Contact ID: \ s*(?P<handle>.+) \n Technical Contact Name: \ s*(?P<name>.+) \n Technical Contact Organization: \ s*(?P<organization>.*) \n Technical Contact Address1: \ s*(?P<street1>.+) \n Technical Contact Address2: \ s*(?P<street2>.*) \n Technical Contact City: \ s*(?P<city>.+) \n Technical Contact State/Province: \ s*(?P<state>.+) \n Technical Contact Postal Code: \ s*(?P<postalcode>.+) \n Technical Contact Country: \ s*(?P<country>.+) \n Technical Contact Country Code: \ s*(?P<country_code>.+) \n Technical Contact Phone Number: \ s*(?P<phone>.+) \n Technical Contact Email: \ s*(?P<email>.+) \n " , # .CO Internet
" Tech Contact: (?P<handle>.+) \n Tech Organization: (?P<organization>.+) \n Tech Name: (?P<name>.+) \n Tech Street: (?P<street>.+) \n Tech City: (?P<city>.+) \n Tech Postal Code: (?P<postalcode>.+) \n Tech State: (?P<state>.+) \n Tech Country: (?P<country>.+) \n Tech Phone: (?P<phone>.*) \n Tech Phone Ext: (?P<phone_ext>.*) \n Tech Fax: (?P<fax>.*) \n Tech Fax Ext: (?P<fax_ext>.*) \n Tech Email: (?P<email>.*) \n " , # Key-Systems GmbH
" (?:Tech ID:[ ]*(?P<handle>.*) \n )?Tech[ ]*Name:[ ]*(?P<name>.*) \n Tech[ ]*Organization:[ ]*(?P<organization>.*) \n Tech[ ]*Street:[ ]*(?P<street1>.+) \n (?:Tech[ ]*Street:[ ]*(?P<street2>.+) \n )?Tech[ ]*City:[ ]*(?P<city>.+) \n Tech[ ]*State \ /Province:[ ]*(?P<state>.+) \n Tech[ ]*Postal[ ]*Code:[ ]*(?P<postalcode>.+) \n Tech[ ]*Country:[ ]*(?P<country>.+) \n (?:Tech[ ]*Phone:[ ]*(?P<phone>.*) \n )?(?:Tech[ ]*Phone[ ]*Ext:[ ]*(?P<phone_ext>.*) \n )?(?:Tech[ ]*Fax:[ ]*(?P<fax>.*) \n )?(?:Tech[ ]*Fax[ ]*Ext: \ s*?(?P<fax_ext>.*) \n )?(?:Tech[ ]*Email:[ ]*(?P<email>.+) \n )? " , # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum)
" Technical Contact \n (?P<name>.+) \n Email:(?P<email>.+) \n (?P<street1>.+) \n (?: (?P<street2>.+) \n )? (?P<postalcode>.+) (?P<city>.+) \n (?P<country>.+) \n Tel: (?P<phone>.+) \n \n " , # internet.bs
" Technical contact: \n (?P<name>[ \ S \ s]+) \n (?P<street>.+) \n (?P<postalcode>[A-Z0-9-]+) \ s+(?P<city>.+) \n (?P<country>.+) \n \n " , # nic.ch
" Tech Contact ID:[ ]*(?P<handle>.+) \n Tech Contact Name:[ ]*(?P<name>.+) " # .au
]
admin_contact_regexes = [
" Administrative Contact: \n (?P<name>.+) \n (?P<street1>.+) \n (?: (?P<street2>.*) \n )?(?: (?P<street3>.*) \n )? (?P<postalcode>.+), (?P<city>.+) \n (?P<country>.+) \n (?P<phone>.+) \n (?P<email>.+) \n \n " , # OVH
" Admin ID:(?P<handle>.+) \n Admin Name:(?P<name>.*) \n Admin Organization:(?P<organization>.*) \n Admin Street1:(?P<street1>.*) \n Admin Street2:(?P<street2>.*) \n Admin Street3:(?P<street3>.*) \n Admin City:(?P<city>.*) \n Admin State/Province:(?P<state>.*) \n Admin Postal Code:(?P<postalcode>.*) \n Admin Country:(?P<country>.*) \n Admin Phone:(?P<phone>.*) \n Admin Phone Ext.:(?P<phone_ext>.*) \n Admin FAX:(?P<fax>.*) \n Admin FAX Ext.:(?P<fax_ext>.*) \n Admin Email:(?P<email>.*) " , # Public Interest Registry (.org)
" Administrative Contact ID: \ s*(?P<handle>.+) \n Administrative Contact Name: \ s*(?P<name>.+) \n Administrative Contact Organization: \ s*(?P<organization>.*) \n Administrative Contact Address1: \ s*(?P<street1>.+) \n Administrative Contact Address2: \ s*(?P<street2>.*) \n Administrative Contact City: \ s*(?P<city>.+) \n Administrative Contact State/Province: \ s*(?P<state>.+) \n Administrative Contact Postal Code: \ s*(?P<postalcode>.+) \n Administrative Contact Country: \ s*(?P<country>.+) \n Administrative Contact Country Code: \ s*(?P<country_code>.+) \n Administrative Contact Phone Number: \ s*(?P<phone>.+) \n Administrative Contact Email: \ s*(?P<email>.+) \n " , # .CO Internet
" Admin Contact: (?P<handle>.+) \n Admin Organization: (?P<organization>.+) \n Admin Name: (?P<name>.+) \n Admin Street: (?P<street>.+) \n Admin City: (?P<city>.+) \n Admin State: (?P<state>.+) \n Admin Postal Code: (?P<postalcode>.+) \n Admin Country: (?P<country>.+) \n Admin Phone: (?P<phone>.*) \n Admin Phone Ext: (?P<phone_ext>.*) \n Admin Fax: (?P<fax>.*) \n Admin Fax Ext: (?P<fax_ext>.*) \n Admin Email: (?P<email>.*) \n " , # Key-Systems GmbH
" (?:Admin ID:[ ]*(?P<handle>.*) \n )?Admin[ ]*Name:[ ]*(?P<name>.*) \n Admin[ ]*Organization:[ ]*(?P<organization>.*) \n Admin[ ]*Street:[ ]*(?P<street1>.+) \n (?:Admin[ ]*Street:[ ]*(?P<street2>.+) \n )?Admin[ ]*City:[ ]*(?P<city>.+) \n Admin[ ]*State \ /Province:[ ]*(?P<state>.+) \n Admin[ ]*Postal[ ]*Code:[ ]*(?P<postalcode>.+) \n Admin[ ]*Country:[ ]*(?P<country>.+) \n (?:Admin[ ]*Phone:[ ]*(?P<phone>.*) \n )?(?:Admin[ ]*Phone[ ]*Ext:[ ]*(?P<phone_ext>.*) \n )?(?:Admin[ ]*Fax:[ ]*(?P<fax>.*) \n )?(?:Admin[ ]*Fax[ ]*Ext: \ s*?(?P<fax_ext>.*) \n )?(?:Admin[ ]*Email:[ ]*(?P<email>.+) \n )? " , # WildWestDomains, GoDaddy, Namecheap/eNom, Ascio, Musedoma (.museum)
" Administrative Contact \n (?P<name>.+) \n Email:(?P<email>.+) \n (?P<street1>.+) \n (?: (?P<street2>.+) \n )? (?P<postalcode>.+) (?P<city>.+) \n (?P<country>.+) \n Tel: (?P<phone>.+) \n \n " , # internet.bs
]
billing_contact_regexes = [
" Billing Contact ID: \ s*(?P<handle>.+) \n Billing Contact Name: \ s*(?P<name>.+) \n Billing Contact Organization: \ s*(?P<organization>.*) \n Billing Contact Address1: \ s*(?P<street1>.+) \n Billing Contact Address2: \ s*(?P<street2>.*) \n Billing Contact City: \ s*(?P<city>.+) \n Billing Contact State/Province: \ s*(?P<state>.+) \n Billing Contact Postal Code: \ s*(?P<postalcode>.+) \n Billing Contact Country: \ s*(?P<country>.+) \n Billing Contact Country Code: \ s*(?P<country_code>.+) \n Billing Contact Phone Number: \ s*(?P<phone>.+) \n Billing Contact Email: \ s*(?P<email>.+) \n " , # .CO Internet
" Billing Contact: (?P<handle>.+) \n Billing Organization: (?P<organization>.+) \n Billing Name: (?P<name>.+) \n Billing Street: (?P<street>.+) \n Billing City: (?P<city>.+) \n Billing Postal Code: (?P<postalcode>.+) \n Billing State: (?P<state>.+) \n Billing Country: (?P<country>.+) \n Billing Phone: (?P<phone>.*) \n Billing Phone Ext: (?P<phone_ext>.*) \n Billing Fax: (?P<fax>.*) \n Billing Fax Ext: (?P<fax_ext>.*) \n Billing Email: (?P<email>.*) \n " , # Key-Systems GmbH
" (?:Billing ID:[ ]*(?P<handle>.*) \n )?Billing[ ]*Name:[ ]*(?P<name>.*) \n Billing[ ]*Organization:[ ]*(?P<organization>.*) \n Billing[ ]*Street:[ ]*(?P<street1>.+) \n (?:Billing[ ]*Street:[ ]*(?P<street2>.+) \n )?Billing[ ]*City:[ ]*(?P<city>.+) \n Billing[ ]*State \ /Province:[ ]*(?P<state>.+) \n Billing[ ]*Postal[ ]*Code:[ ]*(?P<postalcode>.+) \n Billing[ ]*Country:[ ]*(?P<country>.+) \n (?:Billing[ ]*Phone:[ ]*(?P<phone>.*) \n )?(?:Billing[ ]*Phone[ ]*Ext:[ ]*(?P<phone_ext>.*) \n )?(?:Billing[ ]*Fax:[ ]*(?P<fax>.*) \n )?(?:Billing[ ]*Fax[ ]*Ext: \ s*?(?P<fax_ext>.*) \n )?(?:Billing[ ]*Email:[ ]*(?P<email>.+) \n )? " , # Musedoma (.museum)
" Billing Contact: \n (?P<name>.+) \n (?P<street1>.+) \n (?: (?P<street2>.*) \n )?(?: (?P<street3>.*) \n )? (?P<postalcode>.+), (?P<city>.+) \n (?P<country>.+) \n (?P<phone>.+) \n (?P<email>.+) \n \n " , # OVH
]
# Some registries use NIC handle references instead of directly listing contacts...
# The patterns in nic_contact_regexes describe the full contact records; every one of them
# captures a "handle" group that is used further down to link a record to a role.
nic_contact_regexes = [
" personname: \ s*(?P<name>.+) \n organization: \ s*(?P<organization>.+) \n street address: \ s*(?P<street>.+) \n postal code: \ s*(?P<postalcode>.+) \n city: \ s*(?P<city>.+) \n country: \ s*(?P<country>.+) \n phone: \ s*(?P<phone>.+) \n fax-no: \ s*(?P<fax>.+) \n e-mail: \ s*(?P<email>.+) \n nic-hdl: \ s*(?P<handle>.+) \n changed: \ s*(?P<changedate>.+) " , # nic.at
" nic-hdl: \ s*(?P<handle>.+) \n type: \ s*(?P<type>.+) \n contact: \ s*(?P<name>.+) \n (?:.+ \n )*?(?:address: \ s*(?P<street1>.+) \n address: \ s*(?P<street2>.+) \n address: \ s*(?P<street3>.+) \n address: \ s*(?P<country>.+) \n )?(?:phone: \ s*(?P<phone>.+) \n )?(?:fax-no: \ s*(?P<fax>.+) \n )?(?:.+ \n )*?(?:e-mail: \ s*(?P<email>.+) \n )?(?:.+ \n )*?changed: \ s*(?P<changedate>[0-9] {2} \ /[0-9] {2} \ /[0-9] {4} ).* \n " , # AFNIC madness without country field
" nic-hdl: \ s*(?P<handle>.+) \n type: \ s*(?P<type>.+) \n contact: \ s*(?P<name>.+) \n (?:.+ \n )*?(?:address: \ s*(?P<street1>.+) \n )?(?:address: \ s*(?P<street2>.+) \n )?(?:address: \ s*(?P<street3>.+) \n )?(?:phone: \ s*(?P<phone>.+) \n )?(?:fax-no: \ s*(?P<fax>.+) \n )?(?:.+ \n )*?(?:e-mail: \ s*(?P<email>.+) \n )?(?:.+ \n )*?changed: \ s*(?P<changedate>[0-9] {2} \ /[0-9] {2} \ /[0-9] {4} ).* \n " , # AFNIC madness any country -at all-
" nic-hdl: \ s*(?P<handle>.+) \n type: \ s*(?P<type>.+) \n contact: \ s*(?P<name>.+) \n (?:.+ \n )*?(?:address: \ s*(?P<street1>.+) \n )?(?:address: \ s*(?P<street2>.+) \n )?(?:address: \ s*(?P<street3>.+) \n )?(?:address: \ s*(?P<street4>.+) \n )?country: \ s*(?P<country>.+) \n (?:phone: \ s*(?P<phone>.+) \n )?(?:fax-no: \ s*(?P<fax>.+) \n )?(?:.+ \n )*?(?:e-mail: \ s*(?P<email>.+) \n )?(?:.+ \n )*?changed: \ s*(?P<changedate>[0-9] {2} \ /[0-9] {2} \ /[0-9] {4} ).* \n " , # AFNIC madness with country field
]
# Patterns that find, per contact role, the line referencing a NIC handle.
nic_contact_references = {
" registrant " : [
" registrant: \ s*(?P<handle>.+) " , # nic.at
" holder-c: \ s*(?P<handle>.+) " , # AFNIC
] ,
" tech " : [
" tech-c: \ s*(?P<handle>.+) " , # nic.at, AFNIC
] ,
" admin " : [
" admin-c: \ s*(?P<handle>.+) " , # nic.at, AFNIC
] ,
}
# Inline contacts: run every pattern of a role against every segment and keep the named
# groups of a match as that role's contact dict.
# NOTE(review): indentation is not visible here; each `break` presumably leaves only the
# inner per-segment loop, in which case a later pattern that also matches would overwrite
# the result of an earlier one - confirm that this is intended.
for regex in registrant_regexes :
for segment in data :
match = re . search ( regex , segment )
if match is not None :
registrant = match . groupdict ( )
break
for regex in tech_contact_regexes :
for segment in data :
match = re . search ( regex , segment )
if match is not None :
tech_contact = match . groupdict ( )
break
for regex in admin_contact_regexes :
for segment in data :
match = re . search ( regex , segment )
if match is not None :
admin_contact = match . groupdict ( )
break
for regex in billing_contact_regexes :
for segment in data :
match = re . search ( regex , segment )
if match is not None :
billing_contact = match . groupdict ( )
break
# Find NIC handle contact definitions
# finditer is used so that every contact record found in a segment is collected.
handle_contacts = [ ]
for regex in nic_contact_regexes :
for segment in data :
matches = re . finditer ( regex , segment )
for match in matches :
handle_contacts . append ( match . groupdict ( ) )
# Find NIC handle references and process them
# A reference match only carries the handle; the fields of every collected record with an
# equal handle are merged into it before it replaces the contact of that role.
for category in nic_contact_references :
for regex in nic_contact_references [ category ] :
for segment in data :
match = re . search ( regex , segment )
if match is not None :
data_reference = match . groupdict ( )
for contact in handle_contacts :
if contact [ " handle " ] == data_reference [ " handle " ] :
data_reference . update ( contact )
if category == " registrant " :
registrant = data_reference
elif category == " tech " :
tech_contact = data_reference
# NOTE(review): nic_contact_references defines no billing entry, so the billing branch
# below cannot be taken with the table as written.
elif category == " billing " :
billing_contact = data_reference
elif category == " admin " :
admin_contact = data_reference
break
# Post-processing
# Applied to every contact that was found: drop empty fields, fold the phone extension into
# the phone number, merge numbered street lines, parse the change date and split a trailing
# "postal code + city" line off the street when no postal code was captured.
for obj in ( registrant , tech_contact , billing_contact , admin_contact ) :
if obj is not None :
# NOTE(review): deleting entries while looping over obj.keys() only works where keys()
# returns a list (Python 2); with a Python 3 key view this raises RuntimeError.
for key in obj . keys ( ) :
if obj [ key ] is None or obj [ key ] . strip ( ) == " " : # Just chomp all surrounding whitespace
del obj [ key ]
# The extension is appended to the phone value (when there is one) and then removed.
if " phone_ext " in obj :
if " phone " in obj :
obj [ " phone " ] + = " ext. %s " % obj [ " phone_ext " ]
del obj [ " phone_ext " ]
# Collect the consecutively numbered street fields until the first missing number and
# store them, joined by newlines, under a single street key.
if " street1 " in obj :
street_items = [ ]
i = 1
while True :
try :
street_items . append ( obj [ " street %d " % i ] )
del obj [ " street %d " % i ]
except KeyError , e :
break
i + = 1
obj [ " street " ] = " \n " . join ( street_items )
# parse_dates is given a one-element list and the first parsed value is kept.
# NOTE(review): parse_dates returns None when nothing could be parsed, in which case the
# [0] subscript raises TypeError - confirm the captured value is always parseable.
if ' changedate ' in obj :
obj [ ' changedate ' ] = parse_dates ( [ obj [ ' changedate ' ] ] ) [ 0 ]
if ' street ' in obj and " \n " in obj [ " street " ] and ' postalcode ' not in obj :
# Deal with certain mad WHOIS servers that don't properly delimit address data... (yes, AFNIC, looking at you)
# The last street line is split once at the separator: the first part becomes the postal
# code, the remainder the city, and that line is removed from the street value.
lines = [ x . strip ( ) for x in obj [ " street " ] . splitlines ( ) ]
if " " in lines [ - 1 ] :
postal_code , city = lines [ - 1 ] . split ( " " , 1 )
obj [ " postalcode " ] = postal_code
obj [ " city " ] = city
obj [ " street " ] = " \n " . join ( lines [ : - 1 ] )
return {
" registrant " : registrant ,
" tech " : tech_contact ,
" admin " : admin_contact ,
" billing " : billing_contact ,
}