Skip pages that are already scraped

Branch: master
Author: Sven Slootweg, 12 years ago
Parent: 1ed4df5fa1
Commit: 8a8db12690

@@ -31,12 +31,15 @@ def fetch_archives():
     status_code, headers, response = fetch_page_headers(start_page, default_headers)
     if status_code == 200:
-        archive_pages = re.findall("http:\/\/www\.devilskitchen\.me\.uk\/[0-9]{4}_[0-9]{2}_[0-9]{2}_archive\.html", response)
+        archive_pages = re.findall("(http:\/\/www\.devilskitchen\.me\.uk\/([0-9]{4})_([0-9]{2})_[0-9]{2}_archive\.html)", response)
         for page in archive_pages:
-            print "Scraping %s..." % page
-            fetch_articles(page)
-            time.sleep(20)
+            if os.path.exists("%s/%s-%s" % (options['output_dir'], page[1], page[2])):
+                print "%s-%s already exists, skipping..." % (page[1], page[2])
+            else:
+                print "Scraping %s..." % page[0]
+                fetch_articles(page[0])
+                time.sleep(5)
     else:
         print "ERROR: Failed to retrieve archive index! Exiting..."
         exit(1)
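
In short: the regex now wraps the full URL plus its year and month components in capture groups, so each findall() result is a (url, year, month) tuple. Before fetching, the scraper checks whether a YYYY-MM directory already exists under the output directory and skips the page if so; the per-page delay also drops from 20 to 5 seconds. Below is a minimal Python 3 sketch of the same skip-if-already-scraped pattern; fetch_articles() and the options dict are stand-ins assumed to behave like the script's own helpers.

import os
import re
import time

# Stand-ins for the script's own helper and config (assumptions for this sketch).
options = {"output_dir": "output"}

def fetch_articles(url):
    print("Fetching articles from %s..." % url)

# Three capture groups, mirroring the new regex: full archive URL, year, month.
ARCHIVE_RE = re.compile(
    r"(http://www\.devilskitchen\.me\.uk/([0-9]{4})_([0-9]{2})_[0-9]{2}_archive\.html)"
)

def scrape_archives(response):
    # findall() returns one (url, year, month) tuple per match.
    for url, year, month in ARCHIVE_RE.findall(response):
        month_dir = os.path.join(options["output_dir"], "%s-%s" % (year, month))
        if os.path.exists(month_dir):
            # A directory for this month already exists from an earlier run.
            print("%s-%s already exists, skipping..." % (year, month))
        else:
            print("Scraping %s..." % url)
            fetch_articles(url)
            time.sleep(5)  # be polite between requests

One design note: using the YYYY-MM directory's existence as the "already scraped" marker makes interrupted runs resumable with no extra state file, at the cost of re-scraping any month whose directory was created but only partially filled.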
