Release 1.0
commit
4786ea0443
@ -0,0 +1,2 @@
|
|||||||
|
*.pyc
|
||||||
|
config.json
|
@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"db": {
|
||||||
|
"host": "localhost",
|
||||||
|
"user": "nzbspider",
|
||||||
|
"pass": "sekrit",
|
||||||
|
"db": "nzbspider",
|
||||||
|
"table": "releases"
|
||||||
|
}
|
||||||
|
}
|
@ -1,4 +1,160 @@
|
|||||||
import re, oursql, requests, sys, json, shlex, argparse, os, random

from sources.nzbindex import NzbindexSpider
from sources.binsearch import BinsearchSpider
from shared import NotFoundException

# Command-line interface: exactly one of --config (database mode) or --list
# (flat-file mode) must be given; the remaining options refine behaviour.
parser = argparse.ArgumentParser(description="Automatically download NZBs for releases")
parser.add_argument("--config", dest="config", action="store", help="Use a configuration file to match against the database as source")
parser.add_argument("--list", dest="list", action="store", help="Use a newline-delimited list of releases as source")
parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)")
parser.add_argument("--iplist", dest="iplist", action="store", help="Bind every request to a random IP from a newline-delimited list")
parser.add_argument("--limit", dest="limit", action="store", type=int, help="How many records to select in configuration file mode, at most (default: 250)", default=250)
parser.add_argument("--skip", dest="skip", action="store", help="Optionally, a path to a newline-delimited list of release names to always skip")
args = parser.parse_args()

if args.config is not None:
    mode = "config"
elif args.list is not None:
    mode = "list"
else:
    sys.stderr.write("You must specify either a configuration file or a release list.\n")
    sys.exit(1)

# Optional list of source IPs to bind outgoing requests to.
# An empty string means "don't bind to a specific IP".
if args.iplist is not None:
    with open(args.iplist, "r") as iplist_file:
        iplist = iplist_file.read().splitlines()
else:
    iplist = [""]

# Optional list of release names that should never be downloaded.
if args.skip is not None:
    with open(args.skip, "r") as skip_file:
        skiplist = skip_file.read().splitlines()
else:
    skiplist = []

if mode == "config":
    try:
        with open("config.json", "r") as conf_file:
            conf = json.load(conf_file)
    except IOError:
        sys.stderr.write("You must have a valid config.json.\n")
        sys.exit(1)

    # The table name is interpolated into SQL below, so restrict it to a
    # safe character set to prevent injection via the config file.
    if not re.match("^[a-zA-Z0-9_-]+$", conf['db']['table']):
        sys.stderr.write("Table name must be a-z, A-Z, 0-9, _, -\n")
        sys.exit(1)

    try:
        searchconf_file = open(args.config, "r")
    except IOError:
        sys.stderr.write("The specified configuration file doesn't exist.\n")
        sys.exit(1)

    queries = searchconf_file.read().splitlines()
    searchconf_file.close()

    db = oursql.connect(host=conf['db']['host'], user=conf['db']['user'], passwd=conf['db']['pass'], db=conf['db']['db'], autoreconnect=True)
    c = db.cursor()

    releases = []

    # Each query line is shell-quoted: "<title> <section> <target_dir>";
    # a literal "-" means "no filter" for title or section.
    for query in queries:
        title, section, target = shlex.split(query)

        fields = []
        values = []

        if title != "-":
            fields.append("`release` LIKE ?")
            values.append("%" + title + "%")

        if section != "-":
            fields.append("`section` LIKE ?")
            values.append("%" + section + "%")

        values.append(args.limit)

        # Only consider releases older than a day, presumably so posts have
        # had time to propagate before we look for their NZBs.
        if len(fields) == 0:
            db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % conf['db']['table']
        else:
            db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % (conf['db']['table'], " AND ".join(fields))

        c.execute(db_query, values)

        for row in c:
            releases.append((row[0], target))
elif mode == "list":
    if args.target is None:
        sys.stderr.write("You did not specify a target directory with --target.\n")
        sys.exit(1)

    try:
        list_file = open(args.list, "r")
    except IOError:
        sys.stderr.write("The specified list file doesn't exist.\n")
        sys.exit(1)

    releases = [(release, args.target) for release in list_file.read().splitlines()]
    list_file.close()

sys.stdout.write("Found %d releases.\n" % len(releases))

downloaded = 0
skipped = 0
errors = 0
notfound = 0

notfound_list = []

for release in releases:
    release_name, target_dir = release
    target_path = os.path.join(target_dir, "%s.nzb" % release_name)

    if os.path.exists(target_path):
        # This NZB was already downloaded.
        skipped += 1
        continue

    if release_name in notfound_list:
        # This NZB couldn't be found before
        notfound += 1
        continue

    if release_name in skiplist:
        # This release should be skipped
        skipped += 1
        continue

    try:
        os.makedirs(target_dir)
    except OSError:
        # Target directory already exists
        pass

    # Try nzbindex first, fall back to binsearch.
    try:
        spider = NzbindexSpider(random.choice(iplist))
        results = spider.find(release_name)
    except NotFoundException:
        try:
            spider = BinsearchSpider(random.choice(iplist))
            results = spider.find(release_name)
        except NotFoundException:
            sys.stderr.write("Could not find release %s\n" % release_name)
            notfound_list.append(release_name)
            notfound += 1
            continue

    # Process result
    result = results[0]

    try:
        result.download(target_path)
    except Exception as e:
        errors += 1
        sys.stderr.write("Downloading NZB for %s failed: %s\n" % (release_name, repr(e)))
        continue

    sys.stdout.write("Downloaded NZB for %s.\n" % release_name)
    downloaded += 1

sys.stdout.write("Finished. %d downloaded, %d skipped, %d errors and %d not found.\n" % (downloaded, skipped, errors, notfound))
|
@ -0,0 +1,74 @@
|
|||||||
|
import requests, random, socket
|
||||||
|
|
||||||
|
# These are just some random useragents, you can replace these with a different list
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.63 Safari/537.36"
]


class NotFoundException(Exception):
    """Raised when a spider finds no usable results for a release."""
    pass


class DownloadException(Exception):
    """Raised when downloading an NZB fails (e.g. a non-200 response)."""
    pass


# Very nasty monkeypatching ahead!
# Save the pristine implementation exactly once, so that a re-import of this
# module can never clobber it with an already-patched function.
if not hasattr(socket, "real_create_connection"):
    socket.real_create_connection = socket.create_connection
|
||||||
|
|
||||||
|
class ModifiedSession(requests.Session):
    """A requests.Session that binds outgoing connections to a fixed source IP.

    Pass bound_ip="..." to the constructor; an empty string means "no binding".
    A random User-Agent from `user_agents` is picked once per session.
    """

    def __init__(self, *args, **kwargs):
        # pop() removes our extra keyword so it never reaches
        # requests.Session.__init__, and supplies the "" default in one step.
        self.bound_ip = kwargs.pop('bound_ip', "")

        requests.Session.__init__(self, *args, **kwargs)
        self.headers['User-Agent'] = random.choice(user_agents)

    def patch_socket(self):
        # Globally swap socket.create_connection for a version that binds to
        # our source IP; requests offers no cleaner hook at this level.
        socket.create_connection = get_patched_func(self.bound_ip)

    def unpatch_socket(self):
        socket.create_connection = socket.real_create_connection

    def get(self, *args, **kwargs):
        """GET with the socket patched; always restores the real socket."""
        self.patch_socket()
        try:
            return requests.Session.get(self, *args, **kwargs)
        finally:
            # Without the finally, a failed request would leave the global
            # monkeypatch in place for every other user of the process.
            self.unpatch_socket()

    def post(self, *args, **kwargs):
        """POST with the socket patched; always restores the real socket."""
        self.patch_socket()
        try:
            return requests.Session.post(self, *args, **kwargs)
        finally:
            self.unpatch_socket()
||||||
|
|
||||||
|
def get_patched_func(bind_addr):
    """Return a socket.create_connection replacement bound to `bind_addr`.

    The returned function forwards to the saved socket.real_create_connection
    but forces source_address to (bind_addr, 0), ignoring any caller-supplied
    source address.
    """
    def set_src_addr(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        # Mirror socket.create_connection's real signature so callers that
        # omit the timeout (or pass keywords) don't blow up.
        return socket.real_create_connection(address, timeout, (bind_addr, 0))

    return set_src_addr
|
||||||
|
|
||||||
|
# You're looking at duct tape and tie-wraps. It's like your local Home
|
||||||
|
# Depot, except in Python.
|
||||||
|
|
||||||
|
def download_file(request, target):
    """Stream the body of a completed requests response to the file `target`.

    Raises DownloadException when the response status is not 200. The file is
    always closed, even if writing fails part-way through.
    """
    if request.status_code == 200:
        # Stream in sane-sized chunks; the iter_content() default is tiny.
        with open(target, "wb") as f:
            for chunk in request.iter_content(4096):
                f.write(chunk)
    else:
        raise DownloadException("Status code was %s" % request.status_code)
|
@ -1,2 +1 @@
|
|||||||
class NzbindexSpider(object):
    """Placeholder spider; superseded by the full implementation."""
    pass
|
|
||||||
|
@ -1,2 +1,89 @@
|
|||||||
|
from shared import NotFoundException, ModifiedSession, download_file
|
||||||
|
import requests, re, HTMLParser
|
||||||
|
|
||||||
class BinsearchSpider(object):
    """Scrapes binsearch.info search results for NZBs matching a release name."""

    def __init__(self, bound_ip):
        # Source IP to bind outgoing requests to ("" = no binding).
        self.bound_ip = bound_ip

    def find(self, name):
        """Search binsearch for `name` and return a list of BinsearchResult.

        Raises NotFoundException when no usable result matches.
        """
        unescaper = HTMLParser.HTMLParser()
        self.session = ModifiedSession(bound_ip=self.bound_ip)

        response = self.session.get("https://binsearch.info/index.php", params={
            "q": name,
            "m": "",
            "adv_age": "600",
            "max": "100",
            "adv_g": "",
            "adv_sort": "date",
            "minsize": "100",
            "maxsize": "",
            "adv_col": "on",
            "adv_nfo": "on",
            "font": "",
            "postdate": "",
            "server": ""
        }, verify=False)

        search_results = []

        # Nice try, corrupting your HTML to deter scrapers. Not going to stop me, though.
        for row in re.findall(r'<tr[^>]+>(.*?)<a href="browse\.php', response.text, re.DOTALL):
            if 'requires password' in row:
                # Password protected
                continue

            title_match = re.search(r'<span[^>]*class="s"[^>]*>(.*?)<\/span>', row, re.DOTALL)
            if title_match is None:
                continue

            # Strip tags and unescape HTML entities to recover the plain title.
            title = unescaper.unescape(re.sub("<[^>]+>", "", title_match.group(1)))

            if name.lower() in title.lower():
                # The checkbox name is the numeric post id needed for download.
                id_match = re.search(r'<input[^>]*type="checkbox"[^>]*name="([0-9]+)"[^>]*>', row)
                if id_match is not None:
                    search_results.append(BinsearchResult(name, title, id_match.group(1), self, response.url))

        if len(search_results) == 0:
            raise NotFoundException("No results were found.")

        return search_results
|
||||||
|
|
||||||
|
class BinsearchResult(object):
    """A single binsearch search hit; knows how to fetch its NZB."""

    def __init__(self, name, title, id_, spider, searchurl):
        # name: the original search term; title: the matched post title;
        # id_: binsearch's numeric checkbox id for the post;
        # spider: the owning spider (supplies the HTTP session);
        # searchurl: URL of the search page, later sent as Referer.
        self.name = name
        self.title = title
        self.id_ = id_
        self.spider = spider
        self.searchurl = searchurl

    def show(self):
        # Single parenthesised argument: same output on Python 2,
        # and also valid syntax on Python 3.
        print("%s -> %s (%s)" % (self.title, self.id_, self.name))

    def download(self, target_path):
        """Request the NZB for this post and save it to target_path."""
        data_dict = {"action": "nzb"}
        data_dict[self.id_] = "on"

        # binsearch checks the Referer header on NZB downloads.
        self.spider.session.headers['Referer'] = self.searchurl

        response = self.spider.session.post("https://www.binsearch.info/fcgi/nzb.fcgi", params={
            "q": self.name,
            "m": "",
            "adv_age": "600",
            "max": "100",
            "adv_g": "",
            "adv_sort": "date",
            "minsize": "100",
            "maxsize": "",
            "adv_col": "on",
            "adv_nfo": "on",
            "font": "",
            "postdate": "",
            "server": ""
        }, data=data_dict)

        download_file(response, target_path)
|
||||||
|
@ -1 +1,67 @@
|
|||||||
|
from shared import NotFoundException, ModifiedSession, download_file
|
||||||
|
import requests, re, HTMLParser
|
||||||
|
|
||||||
|
class NzbindexSpider(object):
    """Scrapes nzbindex.com search results for NZBs matching a release name."""

    def __init__(self, bound_ip):
        # Source IP to bind outgoing requests to ("" = no binding).
        self.bound_ip = bound_ip

    def find(self, name):
        """Search nzbindex for `name` and return a list of NzbindexResult.

        Raises NotFoundException when no usable result matches.
        """
        unescaper = HTMLParser.HTMLParser()
        self.session = ModifiedSession(bound_ip=self.bound_ip)
        # nzbindex requires agreeing to its disclaimer before searching.
        self.session.post("https://nzbindex.com/agree/", data={"agree": "I agree"}, verify=False)

        response = self.session.get("https://nzbindex.com/search/", params={
            "q": name,
            "age": "",
            "max": "50",
            "minage": "",
            "sort": "agedesc",
            "minsize": "100",
            "maxsize": "",
            "dq": "",
            "poster": "",
            "nfo": "",
            "hasnfo": "1",
            "complete": "1",
            "hidespam": "1",
            "more": "1"
        }, verify=False)

        search_results = []

        for row in re.findall(r"<tr[^>]*>(.*?)<\/tr>", response.text, re.DOTALL):
            if 'class="threat"' in row:
                # Password protected or otherwise unsuitable for download
                continue

            label_match = re.search(r"<label[^>]*>(.*?)<\/label>", row, re.DOTALL)
            if label_match is None:
                continue

            # Strip tags and unescape HTML entities to recover the plain title.
            title = unescaper.unescape(re.sub("<[^>]*>", "", label_match.group(1)))

            if name.lower() in title.lower():
                url_match = re.search(r'https?:\/\/nzbindex\.com\/download\/[^"]+\.nzb', row)
                if url_match is not None:
                    search_results.append(NzbindexResult(title, url_match.group(0), self))

        if len(search_results) == 0:
            raise NotFoundException("No results were found.")

        return search_results
|
||||||
|
|
||||||
|
class NzbindexResult(object):
    """A single nzbindex search hit; knows how to fetch its NZB."""

    def __init__(self, title, url, spider):
        # title: the matched post title; url: direct .nzb download URL;
        # spider: the owning spider (supplies the HTTP session).
        self.title = title
        self.url = url
        self.spider = spider

    def show(self):
        # Single parenthesised argument: same output on Python 2,
        # and also valid syntax on Python 3.
        print("%s -> %s" % (self.title, self.url))

    def download(self, target_path):
        """Download this result's NZB file to target_path."""
        download_file(self.spider.session.get(self.url), target_path)
|
||||||
|
@ -0,0 +1,11 @@
|
|||||||
|
SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";

-- One row per unique release name spotted by the spider.
CREATE TABLE IF NOT EXISTS `releases` (
  `releaseid` int(11) NOT NULL AUTO_INCREMENT,
  -- UNIX timestamp; the downloader only selects rows older than 24h
  -- (compared against UNIX_TIMESTAMP(NOW()) - 86400).
  `time` int(11) NOT NULL,
  -- NOTE(review): presumably the usenet section/category — confirm with the writer.
  `section` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
  `release` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
  PRIMARY KEY (`releaseid`),
  -- Duplicate release names are rejected at insert time.
  UNIQUE KEY `release` (`release`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
|
Loading…
Reference in New Issue