From 3f7b94687160ac7f5d38d9cb4c88dd695afdb39f Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Sat, 8 Feb 2014 20:38:10 +0100
Subject: [PATCH] Fix regular expression corner case that led to long parsing times

---
 pythonwhois/parse.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pythonwhois/parse.py b/pythonwhois/parse.py
index 8437424..192001a 100644
--- a/pythonwhois/parse.py
+++ b/pythonwhois/parse.py
@@ -397,6 +397,9 @@ def remove_duplicates(data):
 
 	return cleaned_list
 
+def preprocess_regex(regex):
+	return re.sub(r"\\s\*\(\?P<([^>]+)>\.\+\)", r"\s*(?P<\1>\S.*)", regex)
+
 def parse_registrants(data):
 	registrant = None
 	tech_contact = None
@@ -499,6 +502,17 @@ def parse_registrants(data):
 		]
 	}
 
+	# Why do the below? The below is meant to handle an edge case (issue #2) where a partial match followed
+	# by a failure, for a regex containing the \s*.+ pattern, would send the regex module on a wild goose chase
+	# for matching positions. The workaround is to use \S.* instead of .+, but in the interest of keeping the
+	# regexes consistent and compact, it's more practical to do this (predictable) conversion at runtime.
+	registrant_regexes = [preprocess_regex(regex) for regex in registrant_regexes]
+	tech_contact_regexes = [preprocess_regex(regex) for regex in tech_contact_regexes]
+	admin_contact_regexes = [preprocess_regex(regex) for regex in admin_contact_regexes]
+	billing_contact_regexes = [preprocess_regex(regex) for regex in billing_contact_regexes]
+	nic_contact_regexes = [preprocess_regex(regex) for regex in nic_contact_regexes]
+	nic_contact_references = {field: [preprocess_regex(regex) for regex in items] for field, items in nic_contact_references.iteritems()}
+
 	for segment in data:
 		for regex in registrant_regexes:
 			match = re.search(regex, segment)
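
To see the conversion in isolation, below is a minimal standalone sketch of what preprocess_regex does, written for Python 3 rather than the Python 2 used by the patch above (which is why the replacement template doubles its backslashes: since Python 3.7, re.sub rejects unknown escapes such as \s in the replacement string). The sample "Registrant Name" pattern is a hypothetical contact-line regex for illustration, not one taken from pythonwhois.

import re

def preprocess_regex(regex):
    # Rewrite "\s*(?P<name>.+)" into "\s*(?P<name>\S.*)" so the captured group
    # can no longer start on whitespace, removing the ambiguity between "\s*"
    # and ".+" that made failed matches retry many split points.
    return re.sub(r"\\s\*\(\?P<([^>]+)>\.\+\)", r"\\s*(?P<\1>\\S.*)", regex)

original = r"Registrant Name:\s*(?P<name>.+)\n"   # hypothetical example pattern
rewritten = preprocess_regex(original)

print(rewritten)  # Registrant Name:\s*(?P<name>\S.*)\n

# Both variants accept the same well-formed input...
assert re.search(rewritten, "Registrant Name: John Doe\n").group("name") == "John Doe"
# ...but on near-misses the rewritten pattern fails fast instead of letting the
# regex engine shuffle characters back and forth between "\s*" and ".+".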