Python 3 compatibility

This includes converting one of the test data files to
UTF-8; it was Windows-encoded, which failed under
Python 3's default encoding when calling file.read().
Ziad Sawalha 10 years ago
parent 1af983ad4f
commit 1e79e33126
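
As an aside on the encoding fix the message describes: a minimal sketch of re-encoding a Windows-encoded (cp1252) file as UTF-8, using a placeholder path since the affected test data file is not named here.

    import codecs

    def convert_to_utf8(path, source_encoding="cp1252"):
        # Read the file with its original Windows encoding, then rewrite it as UTF-8
        # so Python 3's default open()/read() can decode it without errors.
        with codecs.open(path, "r", source_encoding) as f:
            text = f.read()
        with codecs.open(path, "w", "utf-8") as f:
            f.write(text)

    convert_to_utf8("test/data/example.tld")  # placeholder file name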

@@ -1,14 +1,6 @@
from __future__ import print_function
import re, sys, datetime
if sys.version_info[0] >= 3:
def iteritems(d):
return iter(d.items())
else:
def iteritems(d):
return d.iteritems()
grammar = {
"_data": {
'id': ['Domain ID:[ ]*(?P<val>.+)'],
@@ -145,14 +137,25 @@ grammar = {
}
}
if sys.version_info < (3, 0):
def is_string(data):
"""Test for string with support for python 2."""
return isinstance(data, basestring)
else:
def is_string(data):
"""Test for string with support for python 3."""
return isinstance(data, str)
def parse_raw_whois(raw_data, normalized=[]):
data = {}
raw_data = [segment.replace("\r", "") for segment in raw_data] # Carriage returns are the devil
for segment in raw_data:
for rule_key, rule_regexes in iteritems(grammar['_data']):
if rule_key not in data:
for rule_key, rule_regexes in grammar['_data'].items():
if (rule_key in data) == False:
for line in segment.splitlines():
for regex in rule_regexes:
result = re.search(regex, line, re.IGNORECASE)
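
The hunk above drops the iteritems() shim in favour of plain dict.items(), which exists on both Python 2 (returning a list) and Python 3 (returning a view); a small sketch of the pattern, not part of the patch, with a hypothetical grammar entry:

    grammar = {"_data": {"id": ["Domain ID:[ ]*(?P<val>.+)"]}}

    # .items() is iterable on both interpreters, so no compatibility shim is needed;
    # on Python 2 it builds a list, on Python 3 it returns a dict view.
    for rule_key, rule_regexes in grammar["_data"].items():
        print(rule_key)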
@@ -264,7 +267,7 @@ def parse_raw_whois(raw_data, normalized=[]):
except KeyError as e:
pass # Not present
for key in data.keys():
for key in list(data.keys()):
if data[key] is None or len(data[key]) == 0:
del data[key]
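
Wrapping data.keys() in list() matters on Python 3, where keys() is a live view and deleting entries while iterating over it raises a RuntimeError; a sketch with hypothetical values:

    data = {"id": None, "status": [], "emails": ["hostmaster@example.com"]}

    # list(...) takes a snapshot of the keys, so entries can be deleted safely;
    # iterating data.keys() directly would raise RuntimeError on Python 3.
    for key in list(data.keys()):
        if data[key] is None or len(data[key]) == 0:
            del data[key]

    print(data)  # {'emails': ['hostmaster@example.com']}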
@@ -278,23 +281,23 @@ def parse_raw_whois(raw_data, normalized=[]):
def normalize_data(data, normalized):
for key in ("nameservers", "emails", "whois_server"):
if key in data and data[key] is not None and (normalized == True or key in normalized):
if isinstance(data[key], basestring):
if is_string(data[key]):
data[key] = data[key].lower()
else:
data[key] = [item.lower() for item in data[key]]
for key, threshold in (("registrar", 4), ("status", 3)):
if key in data and data[key] is not None and (normalized == True or key in normalized):
if isinstance(data[key], basestring):
if is_string(data[key]):
data[key] = normalize_name(data[key], abbreviation_threshold=threshold, length_threshold=1)
else:
data[key] = [normalize_name(item, abbreviation_threshold=threshold, length_threshold=1) for item in data[key]]
for contact_type, contact in iteritems(data['contacts']):
for contact_type, contact in data['contacts'].items():
if contact is not None:
for key in ("email",):
if key in contact and contact[key] is not None and (normalized == True or key in normalized):
if isinstance(contact[key], basestring):
if isinstance(contact[key], str):
contact[key] = contact[key].lower()
else:
contact[key] = [item.lower() for item in contact[key]]
@@ -307,7 +310,7 @@ def normalize_data(data, normalized):
if key in contact and contact[key] is not None and (normalized == True or key in normalized):
contact[key] = normalize_name(contact[key], abbreviation_threshold=3, length_threshold=3)
for key in contact.keys():
for key in list(contact.keys()):
try:
contact[key] = contact[key].strip(", ")
except AttributeError as e:
@@ -635,7 +638,7 @@ def parse_registrants(data):
# Post-processing
for obj in (registrant, tech_contact, billing_contact, admin_contact):
if obj is not None:
for key in obj.keys():
for key in list(obj.keys()):
if obj[key] is None or obj[key].strip() == "": # Just chomp all surrounding whitespace
del obj[key]
else:

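The is_string() helper introduced above replaces the Python 2-only basestring check, so normalize_data can keep one branch for a single string and one for a list of strings; a usage sketch with hypothetical nameserver values, using isinstance(..., str) directly as the Python 3 branch does:

    def lower_values(value):
        # Mirrors normalize_data's branch: lowercase one string, or each string in a list.
        if isinstance(value, str):  # is_string(value) in the patched module
            return value.lower()
        return [item.lower() for item in value]

    print(lower_values("NS1.EXAMPLE.COM"))                       # ns1.example.com
    print(lower_values(["NS1.EXAMPLE.COM", "NS2.EXAMPLE.COM"]))  # ['ns1.example.com', 'ns2.example.com']
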
@@ -1,6 +1,27 @@
#!/usr/bin/env python2
import sys, argparse, os, pythonwhois, json, datetime
import sys, argparse, os, pythonwhois, json, datetime, codecs
import pkgutil
import encodings
def get_codecs():
"""Dynamically get list of codecs in python."""
false_positives = set(["aliases"])
found = set(name for imp, name, ispkg in pkgutil.iter_modules(encodings.__path__) if not ispkg)
found.difference_update(false_positives)
return found
def read_encoded_file(file_path):
"""Try reading file using all codecs. Return the first succesfull one."""
for encoding in get_codecs():
try:
with codecs.open(file_path, "r", encoding) as f:
return f.read()
except Exception:
pass
parser = argparse.ArgumentParser(description="Runs or modifies the test suite for python-whois.")
parser.add_argument("mode", nargs=1, choices=["run", "update"], default="run", help="Whether to run or update the tests. Only update if you know what you're doing!")
@@ -14,7 +35,7 @@ ENDC = '\033[0m'
def encoded_json_dumps(obj):
try:
return json.dumps(obj, default=json_fallback)
except UnicodeDecodeError, e:
except UnicodeDecodeError as e:
return json.dumps(recursive_encode(obj, "latin-1"), default=json_fallback)
def json_fallback(obj):
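
The comma form, except UnicodeDecodeError, e:, is Python 2-only syntax and a SyntaxError on Python 3; the as form parses on Python 2.6+ and Python 3 alike. A minimal illustration:

    try:
        int("not a number")
    except ValueError as e:  # the "as" form works on Python 2.6+ and Python 3
        print(e)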
@@ -24,7 +45,7 @@ def json_fallback(obj):
return obj
def recursive_encode(obj, encoding):
for key in obj.keys():
for key in list(obj.keys()):
if isinstance(obj[key], dict):
obj[key] = recursive_encode(obj[key], encoding)
elif isinstance(obj[key], list):
@@ -74,18 +95,26 @@ if args.mode[0] == "run":
suites = []
for target in targets:
try:
with open(os.path.join("test/data", target), "r") as f:
with codecs.open(os.path.join("test/data", target), "r") as f:
data = f.read().split("\n--\n")
except IOError, e:
except IOError as e:
sys.stderr.write("Invalid domain %(domain)s specified. No test case or base data exists.\n" % {"domain": target})
errors = True
continue
except UnicodeDecodeError:
try:
# Try cp1252 (ufpa.br uses that)
with codecs.open(os.path.join("test/data", target), "r", 'cp1252') as f:
data = f.read().split("\n--\n")
except UnicodeDecodeError as e:
# Fall back to trying all registered codecs
data = read_encoded_file(os.path.join("test/data", target)).split("\n--\n")
try:
with open(os.path.join("test/target_default", target), "r") as f:
with codecs.open(os.path.join("test/target_default", target), "r") as f:
default = f.read()
with open(os.path.join("test/target_normalized", target), "r") as f:
with codecs.open(os.path.join("test/target_normalized", target), "r") as f:
normalized = f.read()
except IOError, e:
except IOError as e:
sys.stderr.write("Missing target data for domain %(domain)s. Run `./test.py update %(domain)s` to correct this, after verifying that pythonwhois can correctly parse this particular domain.\n" % {"domain": target})
errors = True
continue
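
The run path now degrades gracefully: the file is first read with the default encoding, retried as cp1252 (the encoding the ufpa.br test data uses), and finally handed to read_encoded_file() to probe every available codec. A minimal illustration of why the first attempt can fail, using Python 3 semantics:

    windows_bytes = "é".encode("cp1252")       # b'\xe9'
    try:
        windows_bytes.decode("utf-8")          # raises UnicodeDecodeError
    except UnicodeDecodeError:
        print(windows_bytes.decode("cp1252"))  # 'é'; the cp1252 fallback succeeds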
@@ -152,10 +181,10 @@ elif args.mode[0] == "update":
updates = []
for target in targets:
try:
with open(os.path.join("test/data", target), "r") as f:
with codecs.open(os.path.join("test/data", target), "r") as f:
data = f.read().split("\n--\n")
updates.append((target, data))
except IOError, e:
except IOError as e:
sys.stderr.write("Invalid domain %(domain)s specified. No base data exists.\n" % {"domain": target})
errors = True
continue
@@ -166,8 +195,8 @@ elif args.mode[0] == "update":
for target, data in updates:
default = pythonwhois.parse.parse_raw_whois(data)
normalized = pythonwhois.parse.parse_raw_whois(data, normalized=True)
with open(os.path.join("test/target_default", target), "w") as f:
with codecs.open(os.path.join("test/target_default", target), "w") as f:
f.write(encoded_json_dumps(default))
with open(os.path.join("test/target_normalized", target), "w") as f:
with codecs.open(os.path.join("test/target_normalized", target), "w") as f:
f.write(encoded_json_dumps(normalized))
print "Generated target data for %s." % target
print("Generated target data for %s." % target)
