From 1811276fbd83f8d44058c4f7aa809045f60c1307 Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Thu, 4 Oct 2012 04:43:29 +0200
Subject: [PATCH] Initial commit

---
Review notes (this section is ignored by git am):
 * Line structure was restored from a copy that had been collapsed onto two
   lines, so the nesting of the loops in run.py is reconstructed.
 * File contents were revised during review, so the blob ids on the "index"
   lines below no longer match them.

 run.py         | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 webshotslib.py | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 run.py
 create mode 100644 webshotslib.py

diff --git a/run.py b/run.py
new file mode 100644
index 0000000..bf5875b
--- /dev/null
+++ b/run.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+"""Collect community.webshots.com member names and save them to users.txt."""
+import webshotslib, time
+
+START_URL = "http://community.webshots.com/"
+USERS_FILE = "users.txt"
+REQUEST_DELAY = 0.5  # seconds to wait after processing each page
+
+
+def save_users(users):
+    """Overwrite USERS_FILE with the given names, one per line."""
+    # "with" closes the file even if the write fails.
+    with open(USERS_FILE, "w") as userfile:
+        userfile.write("\n".join(users))
+
+
+to_parse = []
+users = []  # names in discovery order; this is what gets written out
+seen_users = set()  # same names, for constant-time duplicate checks
+users_last_save = 0
+
+print "Starting...",
+
+# Phase 1: queue every category page and each listing page it links to.
+for category in webshotslib.get_category_listings(START_URL):
+    to_parse.append(category)
+    print "\rTotal pages to be parsed: %d" % len(to_parse),
+
+    for listing_page in webshotslib.get_user_listings(category):
+        to_parse.append(listing_page)
+        print "\rTotal pages to be parsed: %d" % len(to_parse),
+
+    time.sleep(REQUEST_DELAY)
+
+print ""
+
+# Phase 2: pull the user names out of every queued page.
+for listing_page in to_parse:
+    for user in webshotslib.get_users(listing_page):
+        if user not in seen_users:
+            seen_users.add(user)
+            users.append(user)
+            print "\rUsers found: %d" % len(users),
+
+    # Checkpoint once per page while the total sits just past a multiple of
+    # 1000, so an interrupted crawl keeps most of what it found.
+    if len(users) % 1000 < 100 and users_last_save != len(users):
+        save_users(users)
+        users_last_save = len(users)
+    time.sleep(REQUEST_DELAY)
+
+# Final save: without it, names found after the last checkpoint were lost.
+if users_last_save != len(users):
+    save_users(users)
diff --git a/webshotslib.py b/webshotslib.py
new file mode 100644
index 0000000..af00ec3
--- /dev/null
+++ b/webshotslib.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python
+"""Scraping helpers for community.webshots.com listing pages."""
+
+import urllib, re
+
+# NOTE(review): the first two patterns have no capture group, so findall()
+# returns the matched text itself, yet run.py passes the results on as URLs.
+# They look like they lost their HTML markup somewhere -- confirm against
+# the original commit before relying on them.
+CATEGORY_LINK = re.compile('top members')
+PAGE_LINK = re.compile('[0-9]+-[0-9]+')
+USER_LINK = re.compile(r'http://community\.webshots\.com/user/([^/\'"]+)')
+
+
+def _fetch(url):
+    """Return the body of url, closing the connection afterwards."""
+    response = urllib.urlopen(url)
+    try:
+        return response.read()
+    finally:
+        response.close()
+
+
+def get_category_listings(url):
+    """Return every CATEGORY_LINK match found on the page at url."""
+    return CATEGORY_LINK.findall(_fetch(url))
+
+
+def get_user_listings(url):
+    """Return the PAGE_LINK matches on the page at url, minus the first."""
+    # Slicing drops the first match and is a no-op on an empty list.
+    return PAGE_LINK.findall(_fetch(url))[1:]
+
+
+def get_users(url):
+    """Return the user names linked from the page at url.
+
+    Names come from /user/NAME profile links; "my" is skipped.
+    """
+    return [name for name in USER_LINK.findall(_fetch(url)) if name != "my"]