Release 1.0

master
Sven Slootweg 11 years ago
commit 4786ea0443

2
.gitignore vendored

@ -0,0 +1,2 @@
*.pyc
config.json

@ -11,8 +11,77 @@ extend. Contributions welcome.
## Installing
You'll need to `pip install oursql` (this will require having the MySQL
development libraries installed). Other than that, just run main.py.
You'll need to `pip install oursql requests` (this will require having
the MySQL development libraries installed). Other than that, just run
main.py.
## Usage
You can use nzbspider with either a release list or a configuration
file.
Using `--iplist` you can specify a newline-delimited file that
contains all the available IPs on your machine. nzbspider will randomly
pick one for every search query. If not specified, the OS default is
used.
Using `--skip` you can specify a newline-delimited file that contains
all release names that should be skipped, no matter what. This works in
both modes.
### Release list
This is a text file, specified with the `--list` parameter, that
contains a newline-delimited list of release names to search for. You
will need to use the `--target` parameter to specify what directory to
download the NZBs to.
### Configuration file
This is a text file using a specific configuration syntax to select
specific releases from a pre-filled MySQL database, to search for. Use
the `--config` parameter to specify the path of the configuration file
you wish to use.
To use this mode, you will need to copy config.json.example to
config.json and change the database details to match yours. A (basic)
database schema is included. Only results that are at least 24 hours old
will be matched, regardless of your configuration.
The configuration file format is as follows:
* Newline-delimited, a new predicate on every line.
* Three whitespace-delimited fields: release name, section, and target
directory.
* Enter `-` for either or both of the first two fields to match regardless
of the release name or section (depending on which you fill in as `-`).
* The `%` character is used to denote a multi-character wildcard
anywhere in the first two fields.
* The first two fields are enclosed in wildcard characters by default.
* The target directory does not have to exist; it will be created if it
doesn't.
* You must enclose a field value in `"` quotes if it contains a space.
An example configuration file (the real configuration format doesn't
allow comments, so don't copy this verbatim!):
- MP3 ./mp3s # Will select everything in section 'MP3'
- - ./everything # Will select absolutely everything
IMMERSE - ./immerse # Will select everything labeled 'IMMERSE'
Mad.Men%720p - ./madmen # Will select every 720p episode of Mad Men
Note that these searches are run against your own database, not directly
against the NZB indexing sites! You'll still need a list of valid
release names pre-filled in your database.
Using `--limit` you can override the default limit of matched results.
The default is the 250 newest results.
## Notes
The script will assume that all release names in your database are safe
to use as filenames. No sanitization or conversion of the filenames will
take place.
## License

@ -0,0 +1,9 @@
{
"db": {
"host": "localhost",
"user": "nzbspider",
"pass": "sekrit",
"db": "nzbspider",
"table": "releases"
}
}

@ -1,4 +1,160 @@
import re, oursql, requests, sys, json, shlex, argparse, os, random

from sources.nzbindex import NzbindexSpider
from sources.binsearch import BinsearchSpider
from shared import NotFoundException

# Automatically downloads NZB files for a list of release names, either from a
# flat text file (--list) or by matching predicates in a configuration file
# against a pre-filled MySQL database (--config).

parser = argparse.ArgumentParser(description="Automatically download NZBs for releases")
parser.add_argument("--config", dest="config", action="store", help="Use a configuration file to match against the database as source")
parser.add_argument("--list", dest="list", action="store", help="Use a newline-delimited list of releases as source")
parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)")
parser.add_argument("--iplist", dest="iplist", action="store", help="Bind every request to a random IP from a newline-delimited list")
# type=int: without it a CLI-supplied limit arrives as a string and gets bound
# to the SQL LIMIT placeholder as text.
parser.add_argument("--limit", dest="limit", action="store", type=int, help="How many records to select in configuration file mode, at most (default: 250)", default=250)
parser.add_argument("--skip", dest="skip", action="store", help="Optionally, a path to a newline-delimited list of release names to always skip")

args = parser.parse_args()

if args.config is not None:
    mode = "config"
elif args.list is not None:
    mode = "list"
else:
    sys.stderr.write("You must specify either a configuration file or a release list.\n")
    exit(1)

if args.iplist is not None:
    # One source IP per line; a random one is picked for every request.
    with open(args.iplist, "r") as iplist_file:
        iplist = iplist_file.read().splitlines()
else:
    iplist = [""]  # "" means "let the OS pick the source address"

if args.skip is not None:
    with open(args.skip, "r") as skip_file:
        skiplist = skip_file.read().splitlines()
else:
    # Empty list (not [""]) so a release with an empty name is never skipped
    # by accident.
    skiplist = []

if mode == "config":
    try:
        with open("config.json", "r") as config_file:
            conf = json.load(config_file)
    except IOError as e:
        sys.stderr.write("You must have a valid config.json.\n")
        exit(1)

    # The table name is interpolated directly into the query string below, so
    # restrict it to a safe character set to prevent SQL injection via
    # config.json.
    if not re.match("^[a-zA-Z0-9_-]+$", conf['db']['table']):
        sys.stderr.write("Table name must be a-z, A-Z, 0-9, _, -\n")
        exit(1)

    try:
        searchconf_file = open(args.config, "r")
    except IOError as e:
        sys.stderr.write("The specified configuration file doesn't exist.\n")
        exit(1)

    queries = searchconf_file.read().splitlines()
    searchconf_file.close()

    db = oursql.connect(host=conf['db']['host'], user=conf['db']['user'], passwd=conf['db']['pass'], db=conf['db']['db'], autoreconnect=True)
    c = db.cursor()

    releases = []

    for query in queries:
        # Each predicate line: release-name section target-dir ("-" = any).
        title, section, target = shlex.split(query)

        fields = []
        values = []

        if title != "-":
            fields.append("`release` LIKE ?")
            values.append("%" + title + "%")

        if section != "-":
            fields.append("`section` LIKE ?")
            values.append("%" + section + "%")

        values.append(args.limit)

        # Only rows at least 24 hours old are matched, per the README.
        if len(fields) == 0:
            db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % conf['db']['table']
        else:
            db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % (conf['db']['table'], " AND ".join(fields))

        c.execute(db_query, values)

        for row in c:
            releases.append((row[0], target))
elif mode == "list":
    if args.target is None:
        sys.stderr.write("You did not specify a target directory with --target.\n")
        exit(1)

    try:
        list_file = open(args.list, "r")
    except IOError as e:
        sys.stderr.write("The specified list file doesn't exist.\n")
        exit(1)

    releases = [(release, args.target) for release in list_file.read().splitlines()]
    list_file.close()

sys.stdout.write("Found %d releases.\n" % len(releases))

downloaded = 0
skipped = 0
errors = 0
notfound = 0
notfound_list = set()  # set, not list: O(1) membership tests in the loop

for release_name, target_dir in releases:
    target_path = os.path.join(target_dir, "%s.nzb" % release_name)

    if os.path.exists(target_path):
        # This NZB was already downloaded.
        skipped += 1
        continue

    if release_name in notfound_list:
        # This NZB couldn't be found before
        notfound += 1
        continue

    if release_name in skiplist:
        # This release should be skipped
        skipped += 1
        continue

    try:
        os.makedirs(target_dir)
    except OSError as e:
        # Target directory already exists
        pass

    try:
        spider = NzbindexSpider(random.choice(iplist))
        results = spider.find(release_name)
    except NotFoundException as e:
        # Fall back to binsearch when nzbindex comes up empty.
        try:
            spider = BinsearchSpider(random.choice(iplist))
            results = spider.find(release_name)
        except NotFoundException as e:
            sys.stderr.write("Could not find release %s\n" % release_name)
            notfound_list.add(release_name)
            notfound += 1
            continue

    # Process the first (newest) result only.
    result = results[0]

    try:
        result.download(target_path)
    except Exception as e:
        errors += 1
        sys.stderr.write("Downloading NZB for %s failed: %s\n" % (release_name, repr(e)))
        continue

    sys.stdout.write("Downloaded NZB for %s.\n" % release_name)
    downloaded += 1

sys.stdout.write("Finished. %d downloaded, %d skipped, %d errors and %d not found.\n" % (downloaded, skipped, errors, notfound))

@ -0,0 +1,74 @@
import requests, random, socket

# These are just some random useragents, you can replace these with a different list
# (one is picked at random per session, making scraping requests look like
# ordinary browser traffic).
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.63 Safari/537.36"
]
class NotFoundException(Exception):
    """Raised by spiders when a search yields no usable results."""
    pass

class DownloadException(Exception):
    """Raised by download_file when an NZB download returns a non-200 status."""
    pass
# Very nasty monkeypatching ahead!
# Remember the untouched implementation so patch/unpatch can flip between
# them; the hasattr guard prevents a re-import from saving an already-patched
# function as the "real" one.
if not hasattr(socket, "real_create_connection"):
    socket.real_create_connection = socket.create_connection

class ModifiedSession(requests.Session):
    """requests.Session that sends a randomized User-Agent and can bind
    outgoing connections to a specific local IP (``bound_ip``; "" lets the
    OS choose the source address).

    NOTE(review): patch_socket/unpatch_socket mutate the *global*
    socket.create_connection, so concurrent sessions with different bound
    IPs would race. The surrounding code uses one session at a time.
    """

    def __init__(self, *args, **kwargs):
        # kwargs.pop with a default replaces the Python-2-only
        # `except KeyError, e` form and is equivalent to the old
        # try/del/except dance.
        self.bound_ip = kwargs.pop("bound_ip", "")
        requests.Session.__init__(self, *args, **kwargs)
        self.headers['User-Agent'] = random.choice(user_agents)

    def patch_socket(self):
        # Make every new connection bind its local side to bound_ip.
        socket.create_connection = get_patched_func(self.bound_ip)

    def unpatch_socket(self):
        socket.create_connection = socket.real_create_connection

    def get(self, *args, **kwargs):
        self.patch_socket()
        try:
            # try/finally guarantees the global patch is undone even when the
            # request raises (the original leaked the patch on error).
            return requests.Session.get(self, *args, **kwargs)
        finally:
            self.unpatch_socket()

    def post(self, *args, **kwargs):
        self.patch_socket()
        try:
            return requests.Session.post(self, *args, **kwargs)
        finally:
            self.unpatch_socket()
def get_patched_func(bind_addr):
    """Return a drop-in replacement for socket.create_connection that binds
    the local side of every connection to ``bind_addr`` (port 0 = any port).

    The original version unpacked ``args[1]`` unconditionally and crashed
    with IndexError when the caller omitted the timeout or passed keyword
    arguments; this version tolerates both.
    """
    def set_src_addr(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *args, **kwargs):
        # Any source_address the caller supplied is deliberately discarded
        # and replaced with the configured bind address.
        return socket.real_create_connection(address, timeout, (bind_addr, 0))
    return set_src_addr
# You're looking at duct tape and tie-wraps. It's like your local Home
# Depot, except in Python.
def download_file(request, target):
    """Stream the body of ``request`` (a requests.Response-like object) to
    the file at path ``target``.

    Raises DownloadException when the response status is not 200.
    """
    if request.status_code != 200:
        raise DownloadException("Status code was %s" % request.status_code)
    # `with` guarantees the file handle is closed even if a chunk read
    # raises mid-stream (the original leaked the handle in that case).
    with open(target, "wb") as f:
        for chunk in request.iter_content():
            f.write(chunk)

@ -1,2 +1 @@
class NzbindexSpider(object):
    # NOTE(review): empty placeholder — presumably superseded by the full
    # implementation in sources/nzbindex.py; confirm this stub is unused.
    pass

@ -1,2 +1,89 @@
from shared import NotFoundException, ModifiedSession, download_file
import requests, re, HTMLParser
class BinsearchSpider(object):
    """Searches binsearch.info for NZBs matching a release name."""

    def __init__(self, bound_ip):
        # Source IP to bind outgoing requests to ("" = OS default).
        self.bound_ip = bound_ip

    def find(self, name):
        """Search binsearch for ``name`` and return a list of
        BinsearchResult objects (newest first, as served by the site).

        Raises NotFoundException when no matching, downloadable result
        exists.
        """
        parser = HTMLParser.HTMLParser()
        self.session = ModifiedSession(bound_ip=self.bound_ip)

        response = self.session.get("https://binsearch.info/index.php", params={
            "q": name,
            "m": "",
            "adv_age": "600",
            "max": "100",
            "adv_g": "",
            "adv_sort": "date",
            "minsize": "100",
            "maxsize": "",
            "adv_col": "on",
            "adv_nfo": "on",
            "font": "",
            "postdate": "",
            "server": ""
        }, verify=False)

        search_results = []

        # Nice try, corrupting your HTML to deter scrapers. Not going to stop me, though.
        # Raw strings so the backslash escapes reach the regex engine
        # untouched instead of triggering invalid-escape warnings.
        results = re.findall(r'<tr[^>]+>(.*?)<a href="browse\.php', response.text, re.DOTALL)

        for result in results:
            if 'requires password' in result:
                # Password protected
                continue

            match = re.search(r'<span[^>]*class="s"[^>]*>(.*?)<\/span>', result, re.DOTALL)

            if match is None:
                continue

            # Strip tags, then unescape HTML entities to get the plain title.
            title = parser.unescape(re.sub("<[^>]+>", "", match.group(1)))

            if name.lower() in title.lower():
                # The checkbox name is the post id used by the NZB export form.
                match = re.search(r'<input[^>]*type="checkbox"[^>]*name="([0-9]+)"[^>]*>', result)

                if match is not None:
                    search_results.append(BinsearchResult(name, title, match.group(1), self, response.url))

        if len(search_results) == 0:
            raise NotFoundException("No results were found.")

        return search_results
class BinsearchResult(object):
    """A single NZB search hit from binsearch, downloadable via its post id."""

    def __init__(self, name, title, id_, spider, searchurl):
        self.name = name            # release name that was searched for
        self.title = title          # title parsed from the result row
        self.id_ = id_              # binsearch checkbox id identifying the post
        self.spider = spider        # owning spider, reused for its session
        self.searchurl = searchurl  # search results URL, sent as Referer

    def show(self):
        # Parenthesized single-argument print behaves identically as a
        # Python 2 statement and a Python 3 function call.
        print("%s -> %s (%s)" % (self.title, self.id_, self.name))

    def download(self, target_path):
        """POST the NZB-export form and stream the .nzb to ``target_path``."""
        data_dict = {"action": "nzb"}
        data_dict[self.id_] = "on"

        # binsearch checks the Referer against the originating search page.
        self.spider.session.headers['Referer'] = self.searchurl

        response = self.spider.session.post("https://www.binsearch.info/fcgi/nzb.fcgi", params={
            "q": self.name,
            "m": "",
            "adv_age": "600",
            "max": "100",
            "adv_g": "",
            "adv_sort": "date",
            "minsize": "100",
            "maxsize": "",
            "adv_col": "on",
            "adv_nfo": "on",
            "font": "",
            "postdate": "",
            "server": ""
        }, data=data_dict)

        download_file(response, target_path)

@ -1 +1,67 @@
from shared import NotFoundException, ModifiedSession, download_file
import requests, re, HTMLParser
class NzbindexSpider(object):
    """Searches nzbindex.com for NZBs matching a release name."""

    def __init__(self, bound_ip):
        # Source IP to bind outgoing requests to ("" = OS default).
        self.bound_ip = bound_ip

    def find(self, name):
        """Search nzbindex for ``name`` and return a list of NzbindexResult
        objects.

        Raises NotFoundException when no matching, downloadable result
        exists.
        """
        parser = HTMLParser.HTMLParser()
        self.session = ModifiedSession(bound_ip=self.bound_ip)

        # nzbindex requires agreeing to the disclaimer once per session
        # before search results are served.
        self.session.post("https://nzbindex.com/agree/", data={"agree": "I agree"}, verify=False)

        response = self.session.get("https://nzbindex.com/search/", params={
            "q": name,
            "age": "",
            "max": "50",
            "minage": "",
            "sort": "agedesc",
            "minsize": "100",
            "maxsize": "",
            "dq": "",
            "poster": "",
            "nfo": "",
            "hasnfo": "1",
            "complete": "1",
            "hidespam": "1",
            "more": "1"
        }, verify=False)

        search_results = []

        # Raw strings so the backslash escapes reach the regex engine
        # untouched instead of triggering invalid-escape warnings.
        results = re.findall(r"<tr[^>]*>(.*?)<\/tr>", response.text, re.DOTALL)

        for result in results:
            if 'class="threat"' in result:
                # Password protected or otherwise unsuitable for download
                continue

            match = re.search(r"<label[^>]*>(.*?)<\/label>", result, re.DOTALL)

            if match is None:
                continue

            # Strip tags, then unescape HTML entities to get the plain title.
            title = parser.unescape(re.sub("<[^>]*>", "", match.group(1)))

            if name.lower() in title.lower():
                match = re.search(r'https?:\/\/nzbindex\.com\/download\/[^"]+\.nzb', result)

                if match is not None:
                    search_results.append(NzbindexResult(title, match.group(0), self))

        if len(search_results) == 0:
            raise NotFoundException("No results were found.")

        return search_results
class NzbindexResult(object):
    """A single NZB search hit from nzbindex, with a direct download URL."""

    def __init__(self, title, url, spider):
        self.title = title    # human-readable title parsed from the result row
        self.url = url        # direct .nzb download URL
        self.spider = spider  # owning spider, reused for its session

    def show(self):
        # Parenthesized single-argument print behaves identically as a
        # Python 2 statement and a Python 3 function call.
        print("%s -> %s" % (self.title, self.url))

    def download(self, target_path):
        """Fetch the NZB over the spider's session and save it to
        ``target_path``."""
        download_file(self.spider.session.get(self.url), target_path)

@ -0,0 +1,11 @@
SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";

-- Release index queried by nzbspider's configuration mode
-- (see the `db.table` setting in config.json).
CREATE TABLE IF NOT EXISTS `releases` (
  `releaseid` int(11) NOT NULL AUTO_INCREMENT,
  `time` int(11) NOT NULL, -- unix timestamp; only rows older than 24h are matched
  `section` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
  `release` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
  PRIMARY KEY (`releaseid`),
  UNIQUE KEY `release` (`release`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
Loading…
Cancel
Save