From 3f7b94687160ac7f5d38d9cb4c88dd695afdb39f Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Sat, 8 Feb 2014 20:38:10 +0100
Subject: [PATCH] Fix regular expression corner case that led to long parsing times

---
 pythonwhois/parse.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pythonwhois/parse.py b/pythonwhois/parse.py
index 8437424..192001a 100644
--- a/pythonwhois/parse.py
+++ b/pythonwhois/parse.py
@@ -397,6 +397,9 @@ def remove_duplicates(data):
 
 	return cleaned_list
 
+def preprocess_regex(regex):
+	return re.sub(r"\\s\*\(\?P<([^>]+)>\.\+\)", r"\s*(?P<\1>\S.*)", regex)
+
 def parse_registrants(data):
 	registrant = None
 	tech_contact = None
@@ -499,6 +502,17 @@ def parse_registrants(data):
 		]
 	}
 
+	# Why do the below? The below is meant to handle an edge case (issue #2) where a partial match followed
+	# by a failure, for a regex containing the \s*.+ pattern, would send the regex module on a wild goose chase
+	# for matching positions. The workaround is to use \S.* instead of .+, but in the interest of keeping the
+	# regexes consistent and compact, it's more practical to do this (predictable) conversion at runtime.
+	registrant_regexes = [preprocess_regex(regex) for regex in registrant_regexes]
+	tech_contact_regexes = [preprocess_regex(regex) for regex in tech_contact_regexes]
+	admin_contact_regexes = [preprocess_regex(regex) for regex in admin_contact_regexes]
+	billing_contact_regexes = [preprocess_regex(regex) for regex in billing_contact_regexes]
+	nic_contact_regexes = [preprocess_regex(regex) for regex in nic_contact_regexes]
+	nic_contact_references = {field: [preprocess_regex(regex) for regex in items] for field, items in nic_contact_references.iteritems()}
+
 	for segment in data:
 		for regex in registrant_regexes:
 			match = re.search(regex, segment)
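
To see the conversion in isolation, below is a minimal standalone sketch of what preprocess_regex does, written for Python 3 rather than the Python 2 used by the patch above (which is why the replacement template doubles its backslashes: since Python 3.7, re.sub rejects unknown escapes such as \s in the replacement string). The sample "Registrant Name" pattern is a hypothetical contact-line regex for illustration, not one taken from pythonwhois.

import re

def preprocess_regex(regex):
    # Rewrite "\s*(?P<name>.+)" into "\s*(?P<name>\S.*)" so the captured group
    # can no longer start on whitespace, removing the ambiguity between "\s*"
    # and ".+" that made failed matches retry many split points.
    return re.sub(r"\\s\*\(\?P<([^>]+)>\.\+\)", r"\\s*(?P<\1>\\S.*)", regex)

original = r"Registrant Name:\s*(?P<name>.+)\n"   # hypothetical example pattern
rewritten = preprocess_regex(original)

print(rewritten)  # Registrant Name:\s*(?P<name>\S.*)\n

# Both variants accept the same well-formed input...
assert re.search(rewritten, "Registrant Name: John Doe\n").group("name") == "John Doe"
# ...but on near-misses the rewritten pattern fails fast instead of letting the
# regex engine shuffle characters back and forth between "\s*" and ".+".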