Fix regular expression corner case that led to long parsing times

master
Sven Slootweg 10 years ago
parent d7f1cc181b
commit 3f7b946871

@ -397,6 +397,9 @@ def remove_duplicates(data):
return cleaned_list
def preprocess_regex(regex):
return re.sub(r"\\s\*\(\?P<([^>]+)>\.\+\)", r"\s*(?P<\1>\S.*)", regex)
def parse_registrants(data):
registrant = None
tech_contact = None
@ -499,6 +502,17 @@ def parse_registrants(data):
]
}
# Why do the below? The below is meant to handle an edge case (issue #2) where a partial match followed
# by a failure, for a regex containing the \s*.+ pattern, would send the regex module on a wild goose chase for
# matching positions. The workaround is to use \S.* instead of .+, but in the interest of keeping the regexes
# consistent and compact, it's more practical to do this (predictable) conversion on runtime.
registrant_regexes = [preprocess_regex(regex) for regex in registrant_regexes]
tech_contact_regexes = [preprocess_regex(regex) for regex in tech_contact_regexes]
admin_contact_regexes = [preprocess_regex(regex) for regex in admin_contact_regexes]
billing_contact_regexes = [preprocess_regex(regex) for regex in billing_contact_regexes]
nic_contact_regexes = [preprocess_regex(regex) for regex in nic_contact_regexes]
nic_contact_references = {field: [preprocess_regex(regex) for regex in items] for field, items in nic_contact_references.iteritems()}
for segment in data:
for regex in registrant_regexes:
match = re.search(regex, segment)

Loading…
Cancel
Save