# 4chandownloader/4c  (~70 lines, ~2 KiB, plaintext)
# NOTE: this header and the timestamps formerly scattered through the file
# were artifacts of the raw/blame paste view this script was copied from.
#!/usr/bin/python3
# Downloads all full-size images from one or more 4chan threads.
#
# Protip: want to monitor a thread and download all new images every 5 seconds?
# while x=0; do 4c [-nf] url; sleep 5; done
#
# NOTE(review): ported from Python 2 (urllib/urllib2, print statements) to
# Python 3. The CLI flags and the console output are unchanged.

import argparse
import os
import re

from urllib.error import HTTPError
from urllib.request import urlopen, urlretrieve

# Full-size images are linked with protocol-relative URLs, e.g.
#   //images.4chan.org/b/src/1234567890.jpg
# Compiled once here instead of per-thread inside the loop.
# NOTE(review): this host/path layout dates from 2012 -- 4chan has since moved
# image hosting (i.4cdn.org); confirm the pattern against a live thread.
IMAGE_HREF_RE = re.compile(
    r'href="(//images\.4chan\.org/[a-z]+/src/[0-9]+\.[a-z]+)"')


def extract_image_urls(page):
    """Return the protocol-relative image URLs found in *page*.

    Order is first-seen; duplicates are removed (each image is linked from
    both its thumbnail and its filename, so every URL appears twice).
    """
    seen = set()  # O(1) membership test instead of the old O(n) list scan
    urls = []
    for match in IMAGE_HREF_RE.finditer(page):
        url = match.group(1)
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


def download_thread(url, newdir=False, force_redownload=False):
    """Download every image of the thread at *url*.

    Files go into the current directory, or into a per-thread subdirectory
    (named after the last path component of *url*) when *newdir* is true.
    Existing files are skipped unless *force_redownload* is set.  Bad URLs
    and HTTP errors are reported and skipped, never raised.
    """
    print("Thread URL: %s" % url)
    try:
        # Decode leniently: we only need the href attributes to survive.
        page = urlopen(url).read().decode("utf-8", "replace")
    except ValueError:
        print("That does not look like a valid URL.")
        return
    except HTTPError as err:
        # BUG FIX: the old message always claimed 404; report the real code.
        print("The given URL returns a HTTP %d status code - the thread may "
              "have died." % err.code)
        return

    if newdir:
        # rstrip('/') so a trailing slash cannot yield an empty dir name
        thread_id = url.rstrip('/').split('/')[-1]
        target_dir = "%s/" % thread_id
        if not os.path.exists(thread_id):
            os.makedirs(thread_id)
    else:
        target_dir = ""

    urls = extract_image_urls(page)
    total = len(urls)
    print(" Parsed thread. Total images: %d" % total)

    for current, image_url in enumerate(urls, start=1):
        download_url = "http:%s" % image_url
        filename = download_url.split('/')[-1]
        path = target_dir + filename
        if os.path.exists(path) and not force_redownload:
            print("Skipped existing file %s (%d/%d)." % (filename, current, total))
        else:
            urlretrieve(download_url, path)
            print("Downloaded %s (%d/%d)." % (filename, current, total))


def main():
    """Parse the command line and download each listed thread in turn."""
    parser = argparse.ArgumentParser(
        description='Downloads all full-size images in one or more arbitrary '
                    '4chan threads.')
    parser.add_argument('urllist', metavar='url', type=str, nargs='+',
                        help='the URLs of the threads')
    parser.add_argument('-n', '--newdir', dest='newdir', action='store_true',
                        help='create a new directory for each thread in the '
                             'current directory')
    parser.add_argument('-f', '--force', dest='force_redownload',
                        action='store_true',
                        help='force redownloading every image, overwriting it '
                             'if it already exists')
    args = parser.parse_args()

    for url in args.urllist:
        download_thread(url, newdir=args.newdir,
                        force_redownload=args.force_redownload)

    print("Done.")


if __name__ == '__main__':
    main()