# 4chandownloader/4c  (~70 lines, ~2 KiB, plaintext)
# NOTE: this header and the timestamps formerly scattered through the file
# were artifacts of the raw/blame paste view this script was copied from.
#!/usr/bin/python3
# Downloads all full-size images from one or more 4chan threads.
#
# Protip: want to monitor a thread and download all new images every 5 seconds?
# while x=0; do 4c [-nf] url; sleep 5; done
#
# NOTE(review): ported from Python 2 (urllib/urllib2, print statements) to
# Python 3. The CLI flags and the console output are unchanged.

import argparse
import os
import re

from urllib.error import HTTPError
from urllib.request import urlopen, urlretrieve

# Full-size images are linked with protocol-relative URLs, e.g.
#   //images.4chan.org/b/src/1234567890.jpg
# Compiled once here instead of per-thread inside the loop.
# NOTE(review): this host/path layout dates from 2012 -- 4chan has since moved
# image hosting (i.4cdn.org); confirm the pattern against a live thread.
IMAGE_HREF_RE = re.compile(
    r'href="(//images\.4chan\.org/[a-z]+/src/[0-9]+\.[a-z]+)"')


def extract_image_urls(page):
    """Return the protocol-relative image URLs found in *page*.

    Order is first-seen; duplicates are removed (each image is linked from
    both its thumbnail and its filename, so every URL appears twice).
    """
    seen = set()  # O(1) membership test instead of the old O(n) list scan
    urls = []
    for match in IMAGE_HREF_RE.finditer(page):
        url = match.group(1)
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


def download_thread(url, newdir=False, force_redownload=False):
    """Download every image of the thread at *url*.

    Files go into the current directory, or into a per-thread subdirectory
    (named after the last path component of *url*) when *newdir* is true.
    Existing files are skipped unless *force_redownload* is set.  Bad URLs
    and HTTP errors are reported and skipped, never raised.
    """
    print("Thread URL: %s" % url)
    try:
        # Decode leniently: we only need the href attributes to survive.
        page = urlopen(url).read().decode("utf-8", "replace")
    except ValueError:
        print("That does not look like a valid URL.")
        return
    except HTTPError as err:
        # BUG FIX: the old message always claimed 404; report the real code.
        print("The given URL returns a HTTP %d status code - the thread may "
              "have died." % err.code)
        return

    if newdir:
        # rstrip('/') so a trailing slash cannot yield an empty dir name
        thread_id = url.rstrip('/').split('/')[-1]
        target_dir = "%s/" % thread_id
        if not os.path.exists(thread_id):
            os.makedirs(thread_id)
    else:
        target_dir = ""

    urls = extract_image_urls(page)
    total = len(urls)
    print(" Parsed thread. Total images: %d" % total)

    for current, image_url in enumerate(urls, start=1):
        download_url = "http:%s" % image_url
        filename = download_url.split('/')[-1]
        path = target_dir + filename
        if os.path.exists(path) and not force_redownload:
            print("Skipped existing file %s (%d/%d)." % (filename, current, total))
        else:
            urlretrieve(download_url, path)
            print("Downloaded %s (%d/%d)." % (filename, current, total))


def main():
    """Parse the command line and download each listed thread in turn."""
    parser = argparse.ArgumentParser(
        description='Downloads all full-size images in one or more arbitrary '
                    '4chan threads.')
    parser.add_argument('urllist', metavar='url', type=str, nargs='+',
                        help='the URLs of the threads')
    parser.add_argument('-n', '--newdir', dest='newdir', action='store_true',
                        help='create a new directory for each thread in the '
                             'current directory')
    parser.add_argument('-f', '--force', dest='force_redownload',
                        action='store_true',
                        help='force redownloading every image, overwriting it '
                             'if it already exists')
    args = parser.parse_args()

    for url in args.urllist:
        download_thread(url, newdir=args.newdir,
                        force_redownload=args.force_redownload)

    print("Done.")


if __name__ == '__main__':
    main()