Initial commit

2012-10-04 04:43:29 +02:00 · 2012-10-04 04:43:29 +02:00 · 1811276fbd
commit 1811276fbd
2 changed files with 60 additions and 0 deletions
--- a/run.py
+++ b/run.py
@ -0,0 +1,38 @@
+#!/usr/bin/python
+import webshotslib, time
+
+# Crawl script: first collect every listing page reachable from the community
+# front page, then scan each of those pages for user names and write the
+# de-duplicated names to USERS_FILE.
+
+USERS_FILE = "users.txt"
+
+def save_users(path, names):
+	"""Overwrite `path` with `names`, one name per line."""
+	# `with` closes the file even if the write raises.
+	with open(path, "w") as userfile:
+		userfile.write("\n".join(names))
+
+to_parse = []
+to_parse_count = 0
+users = []            # user names in discovery order; this is what gets saved
+seen_users = set()    # same names as `users`, kept for O(1) duplicate checks
+users_count = 0
+users_last_save = 0   # value of users_count at the most recent save
+
+print "Starting...",
+
+# Phase 1: gather the category pages and the user-listing pages under them.
+for category in webshotslib.get_category_listings("http://community.webshots.com/"):
+	to_parse.append(category)
+	to_parse_count += 1
+	# "\r" plus the trailing comma redraws the counter on the same line.
+	print "\rTotal pages to be parsed: %d" % to_parse_count,
+
+	for listing_page in webshotslib.get_user_listings(category):
+		to_parse.append(listing_page)
+		to_parse_count += 1
+		print "\rTotal pages to be parsed: %d" % to_parse_count,
+
+	# Pause between categories to avoid hammering the server.
+	time.sleep(0.5)
+
+print ""
+
+# Phase 2: extract user names from every collected page.
+for listing_page in to_parse:
+	for user in webshotslib.get_users(listing_page):
+		if user in seen_users:
+			continue
+
+		seen_users.add(user)
+		users.append(user)
+		users_count += 1
+		print "\rUsers found: %d" % users_count,
+
+		# Periodic checkpoint; the cadence is unchanged from the original
+		# (any new user whose running count falls in the first 100 of a
+		# block of 1000).
+		if users_count % 1000 < 100 and users_last_save != users_count:
+			save_users(USERS_FILE, users)
+			users_last_save = users_count
+	time.sleep(0.5)
+
+# The checkpoint only fires for part of each block of 1000, so users found
+# after the last checkpoint would otherwise never reach the file.
+if users_last_save != users_count:
+	save_users(USERS_FILE, users)
+	users_last_save = users_count
--- a/webshotslib.py
+++ b/webshotslib.py
@ -0,0 +1,22 @@
+#!/usr/bin/python
+
+import urllib, re
+
+def get_category_listings(url):
+	"""Return the "top members" listing URLs linked from the page at `url`.
+
+	Fetches `url` and collects the href of every anchor whose text is exactly
+	"top members" and whose target has the form
+	http://www.webshots.com/members/<segment>/<segment>.html.
+
+	Returns a list of URL strings in the order they appear on the page (empty
+	if nothing matches). Errors raised while fetching are not caught here.
+	"""
+	response = urllib.urlopen(url)
+	try:
+		contents = response.read()
+	finally:
+		# Release the connection even if the read fails part-way through.
+		response.close()
+
+	return re.findall('<a href="(http:\/\/www\.webshots\.com\/members\/[^/]+\/[^/]+\.html)">top members</a>', contents)
+
+def get_user_listings(url):
+	"""Return the numbered-range listing URLs linked from the page at `url`.
+
+	Fetches `url` and collects the href of every anchor whose text is a
+	numeric range such as "101-200" and whose target has the form
+	http://www.webshots.com/members/<segment>/<segment>.html.
+
+	The first match is always discarded.
+	NOTE(review): presumably the first range link points back at the page
+	being read, which the caller already has -- confirm against the site.
+
+	Returns a list of URL strings in page order (empty if there are fewer
+	than two matches). Errors raised while fetching are not caught here.
+	"""
+	response = urllib.urlopen(url)
+	try:
+		contents = response.read()
+	finally:
+		# Release the connection even if the read fails part-way through.
+		response.close()
+
+	matches = re.findall('<a href="(http:\/\/www\.webshots\.com\/members\/[^/]+\/[^/]+\.html)">[0-9]+-[0-9]+</a>', contents)
+
+	# Slicing drops the first match and is safe on an empty list.
+	return matches[1:]
+
+def get_users(url):
+	"""Return the user names referenced by profile links on the page at `url`.
+
+	Fetches `url` and captures the path segment that follows every occurrence
+	of http://community.webshots.com/user/ (up to the next "/", "'" or '"').
+	The segment "my" is filtered out.
+
+	Returns a list of name strings in page order. A name is repeated if it is
+	linked more than once; de-duplication is left to the caller. Errors
+	raised while fetching are not caught here.
+	"""
+	response = urllib.urlopen(url)
+	try:
+		contents = response.read()
+	finally:
+		# Release the connection even if the read fails part-way through.
+		response.close()
+
+	matches = re.findall('http:\/\/community\.webshots\.com\/user\/([^/\'"]+)', contents)
+	return [value for value in matches if value != "my"]