From c738ec4a6cd48f9960e98d4b6deaee9779e5ccc1 Mon Sep 17 00:00:00 2001
From: Sven Slootweg
Date: Sun, 20 May 2012 23:46:39 +0200
Subject: [PATCH] Find all full image URLs on a page

---
Reviewer notes (this section is ignored when the patch is applied):
  * Replaces the placeholder regex comment in `4c` with a working script.
  * The script is Python 2 (it uses `urllib2` and the `print` statement).
  * It downloads one hard-coded page URL, collects every protocol-relative
    `.jpg` link captured by the regex, keeps only the first occurrence of
    each link (in page order), and prints the resulting list.

 4c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/4c b/4c
index 213e68f..15c280d 100644
--- a/4c
+++ b/4c
@@ -1,3 +1,19 @@
 #!/usr/bin/python
 
-# Regex: href="\/\/images\.4chan\.org\/b\/src\/[0-9]+\.jpg"
+import re, urllib2
+
+regex = 'href="(\/\/images\.4chan\.org\/b\/src\/[0-9]+\.jpg)"'
+url = 'http://boards.4chan.org/b/res/400860795'
+
+page = urllib2.urlopen(url).read()
+
+search = re.compile(regex)
+matches = search.finditer(page)
+
+urls = []
+
+for match in matches:
+    if match.group(1) not in urls:
+        urls.append(match.group(1))
+
+print urls