From d502f111511a1c7bfe47f240b41cd30aa2641aaf Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 00:06:38 +0200 Subject: [PATCH 01/17] Use requests --- README.md | 5 +++-- main.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fea08af..82b3efb 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,9 @@ extend. Contributions welcome. ## Installing -You'll need to `pip install oursql` (this will require having the MySQL -development libraries installed). Other than that, just run main.py. +You'll need to `pip install oursql requests` (this will require having +the MySQL development libraries installed). Other than that, just run +main.py. ## License diff --git a/main.py b/main.py index 822e2df..31f2657 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -import re, oursql +import re, oursql, requests from sources.nzbindex import NzbindexSpider from sources.binsearch import BinsearchSpider From 60825fd9fe61f6649637f89ad43e71eb0219f652 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 03:00:44 +0200 Subject: [PATCH 02/17] Code so far --- .gitignore | 2 + main.py | 100 ++++++++++++++++++++++++++++++++++++++++++- shared.py | 31 ++++++++++++++ sources/__init__.py | 3 +- sources/binsearch.py | 5 +++ sources/nzbindex.py | 62 +++++++++++++++++++++++++++ 6 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 .gitignore create mode 100644 shared.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..936a3c8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +config.json diff --git a/main.py b/main.py index 31f2657..8a986bb 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,102 @@ -import re, oursql, requests +import re, oursql, requests, sys, json, shlex, argparse from sources.nzbindex import NzbindexSpider from sources.binsearch import BinsearchSpider +from shared import NotFoundException + +parser = argparse.ArgumentParser(description="Automatically download NZBs for releases") +parser.add_argument("--config", dest="config", action="store", help="Use a configuration file to match against the database as source") +parser.add_argument("--list", dest="list", action="store", help="Use a newline-delimited list of releases as source") +parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)") +parser.add_argument("--iplist", dest="list", action="store", help="Bind every request to a random IP from a newline-delimited list") +args = parser.parse_args() + +if args.config is not None: + mode = "config" +elif args.list is not None: + mode = "list" +else: + sys.stderr.write("You must specify either a configuration file or a release list.\n") + exit(1) + +if mode == "config": + try: + conf = json.load(open("config.json", "r")) + except IOError, e: + sys.stderr.write("You must have a valid config.json.\n") + exit(1) + + if not re.match("^[a-zA-Z0-9_-]+$", conf['db']['table']): + sys.stderr.write("Table name must be a-z, A-Z, 0-9, _, -\n") + exit(1) + + try: + searchconf_file = open(args.config, "r") + except IOError, e: + sys.stderr.write("The specified configuration file doesn't exist.\n") + exit(1) + + queries = searchconf_file.read().splitlines() + searchconf_file.close() + + db = oursql.connect(host=conf['db']['host'], user=conf['db']['user'], passwd=conf['db']['pass'], db=conf['db']['db'], autoreconnect=True) + c = db.cursor() + + releases = [] + + for query in queries: + title, section, target = shlex.split(query) + + fields = [] + values = [] + + if title != "-": + fields.append("`release` LIKE ?") + values.append("%" + title + "%") + + if section != "-": + fields.append("`section` LIKE ?") + values.append("%" + section + "%") + + if len(fields) == 0: + db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400)" % conf['db']['table'] + else: + db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400)" % (conf['db']['table'], " AND ".join(fields)) + + c.execute(db_query, values) + + for row in c: + releases.append((row[0], target)) +elif mode == "list": + if args.target is None: + sys.stderr.write("You did not specify a target directory with --target.\n") + exit(1) + + try: + list_file = open(args.list, "r") + except IOError, e: + sys.stderr.write("The specified list file doesn't exist.\n") + exit(1) + + releases = [(release, args.target) for release in list_file.read().splitlines()] + list_file.close() + +sys.stdout.write("Found %d releases.\n" % len(releases)) + +for release in releases: + release_name, target_dir = release + + try: + spider = NzbindexSpider() + results = spider.find(release_name) + except NotFoundException, e: + try: + spider = BinsearchSpider() + results = spider.find(release_name) + except NotFoundException, e: + sys.stderr.write("Could not find release %s\n" % release_name) + continue + + # Process result + for result in results: + result.show() diff --git a/shared.py b/shared.py new file mode 100644 index 0000000..6611001 --- /dev/null +++ b/shared.py @@ -0,0 +1,31 @@ +import requests, random + +# These are just some random useragents, you can replace these with a different list +user_agents = [ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0", + "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1", +] + +class NotFoundException(Exception): + pass + +class ModifiedSession(requests.Session): + def __init__(self, *args, **kwargs): + requests.Session.__init__(self, *args, **kwargs) + self.headers['user-agent'] = random.choice(user_agents) + +def download_file(self, request, target): + if request.status_code == 200: + f = open(target, "wb") + + for chunk in request.iter_content(): + f.write(chunk) + + f.close() diff --git a/sources/__init__.py b/sources/__init__.py index 77f1e4a..8b13789 100644 --- a/sources/__init__.py +++ b/sources/__init__.py @@ -1,2 +1 @@ -class NzbindexSpider(object): - pass + diff --git a/sources/binsearch.py b/sources/binsearch.py index ed9d53d..21a57d2 100644 --- a/sources/binsearch.py +++ b/sources/binsearch.py @@ -1,2 +1,7 @@ +from shared import NotFoundException + class BinsearchSpider(object): pass + +class BinsearchResult(object): + pass diff --git a/sources/nzbindex.py b/sources/nzbindex.py index 8b13789..509170f 100644 --- a/sources/nzbindex.py +++ b/sources/nzbindex.py @@ -1 +1,63 @@ +from shared import NotFoundException, ModifiedSession, download_file +import requests, re, HTMLParser +class NzbindexSpider(object): + def find(self, name): + parser = HTMLParser.HTMLParser() + self.session = ModifiedSession() + self.session.post("https://nzbindex.com/agree/", data={"agree": "I agree"}, verify=False) + + response = self.session.get("https://nzbindex.com/search/", params={ + "q": name, + "age": "", + "max": "50", + "minage": "", + "sort": "agedesc", + "minsize": "100", + "maxsize": "", + "dq": "", + "poster": "", + "nfo": "", + "hasnfo": "1", + "complete": "1", + "hidespam": "1", + "more": "1" + }, verify=False) + + search_results = [] + + results = re.findall("]+>(.*?)<\/tr>", response.text, re.DOTALL) + + for result in results: + if 'class="threat"' in result: + # Password protected or otherwise unsuitable for download + continue + + match = re.search("]+>(.*?)<\/label>", result, re.DOTALL) + + if match is None: + continue + + title = parser.unescape(re.sub("<[^>]+>", "", match.group(1))) + + if name.lower() in title.lower(): + match = re.search('https?:\/\/nzbindex\.com\/download\/[^"]+\.nzb', result) + + if match is not None: + search_results.append(NzbindexResult(title, match.group(0))) + + if len(search_results) == 0: + raise NotFoundException("No results were found.") + + return search_results +class NzbindexResult(object): + def __init__(self, title, url, spider): + self.title = title + self.url = url + self.spider = spider + + def show(self): + print "%s -> %s" % (self.title, self.url) + + def download(self, target_path): + download_file(self.spider.session.get(self.url), target_path) From d14d4ea9cb3fe410547359bc86594234021bfe57 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:35:42 +0200 Subject: [PATCH 03/17] Add note about sanitation --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 82b3efb..70dcd2b 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,12 @@ You'll need to `pip install oursql requests` (this will require having the MySQL development libraries installed). Other than that, just run main.py. +## Notes + +The script will assume that all releasenames in your database are safe +as a filename. No sanitation or conversion of the filenames will take +place. + ## License Licensed under the WTFPL or, if you take issue with that for some From 696d4f1c6e513ff9ab2584769b2fdf939598804e Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:36:32 +0200 Subject: [PATCH 04/17] Fixes in NZBIndex spider --- sources/nzbindex.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sources/nzbindex.py b/sources/nzbindex.py index 509170f..9d4f427 100644 --- a/sources/nzbindex.py +++ b/sources/nzbindex.py @@ -26,30 +26,31 @@ class NzbindexSpider(object): search_results = [] - results = re.findall("]+>(.*?)<\/tr>", response.text, re.DOTALL) + results = re.findall("]*>(.*?)<\/tr>", response.text, re.DOTALL) for result in results: if 'class="threat"' in result: # Password protected or otherwise unsuitable for download continue - match = re.search("]+>(.*?)<\/label>", result, re.DOTALL) + match = re.search("]*>(.*?)<\/label>", result, re.DOTALL) if match is None: continue - title = parser.unescape(re.sub("<[^>]+>", "", match.group(1))) + title = parser.unescape(re.sub("<[^>]*>", "", match.group(1))) if name.lower() in title.lower(): match = re.search('https?:\/\/nzbindex\.com\/download\/[^"]+\.nzb', result) if match is not None: - search_results.append(NzbindexResult(title, match.group(0))) + search_results.append(NzbindexResult(title, match.group(0), self)) if len(search_results) == 0: raise NotFoundException("No results were found.") return search_results + class NzbindexResult(object): def __init__(self, title, url, spider): self.title = title From 0917f06de7d39515d691159b71235cbad49ef2d8 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:36:52 +0200 Subject: [PATCH 05/17] Implement BinSearch spider --- sources/binsearch.py | 80 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/sources/binsearch.py b/sources/binsearch.py index 21a57d2..0402018 100644 --- a/sources/binsearch.py +++ b/sources/binsearch.py @@ -1,7 +1,81 @@ -from shared import NotFoundException +from shared import NotFoundException, ModifiedSession, download_file +import requests, re, HTMLParser class BinsearchSpider(object): - pass + def find(self, name): + parser = HTMLParser.HTMLParser() + self.session = ModifiedSession() + + response = self.session.get("https://binsearch.info/index.php", params={ + "q": name, + "m": "", + "adv_age": "600", + "max": "100", + "adv_g": "", + "adv_sort": "date", + "minsize": "100", + "maxsize": "", + "adv_col": "on", + "adv_nfo": "on", + "font": "", + "postdate": "" + }, verify=False) + + search_results = [] + + # Nice try, corrupting your HTML to deter scrapers. Not going to stop me, though. + results = re.findall(']+>(.*?)]*>(.*?)<\/span>', result, re.DOTALL) + + if match is None: + continue + + title = parser.unescape(re.sub("<[^>]+>", "", match.group(1))) + + if name.lower() in title.lower(): + match = re.search(']*type="checkbox"[^>]*name="([0-9]+)"[^>]*>', result) + + if match is not None: + search_results.append(BinsearchResult(name, title, match.group(1), self)) + + if len(search_results) == 0: + raise NotFoundException("No results were found.") + + return search_results class BinsearchResult(object): - pass + def __init__(self, name, title, id_, spider): + self.name = name + self.title = title + self.id_ = id_ + self.spider = spider + + def show(self): + print "%s -> %s (%s)" % (self.title, self.id_, self.name) + + def download(self, target_path): + data_dict = {"action": "nzb"} + data_dict[self.id_] = "on" + + response = self.spider.session.post("https://www.binsearch.info/fcgi/nzb.fcgi", params={ + "q": self.name, + "m": "", + "adv_age": "600", + "max": "100", + "adv_g": "", + "adv_sort": "date", + "minsize": "100", + "maxsize": "", + "adv_col": "on", + "adv_nfo": "on", + "font": "", + "postdate": "" + }, data=data_dict) + + download_file(response, target_path) From c81ffda57512dc90e86eed3840c9a837d94b48d2 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:37:04 +0200 Subject: [PATCH 06/17] Finish main script --- main.py | 34 +++++++++++++++++++++++++++++++--- shared.py | 2 +- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 8a986bb..d7d0032 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -import re, oursql, requests, sys, json, shlex, argparse +import re, oursql, requests, sys, json, shlex, argparse, os from sources.nzbindex import NzbindexSpider from sources.binsearch import BinsearchSpider @@ -83,8 +83,25 @@ elif mode == "list": sys.stdout.write("Found %d releases.\n" % len(releases)) +downloaded = 0 +skipped = 0 +errors = 0 +notfound = 0 + for release in releases: release_name, target_dir = release + target_path = os.path.join(target_dir, "%s.nzb" % release_name) + + if os.path.exists(target_path): + # This NZB was already downloaded. + skipped += 1 + continue + + try: + os.makedirs(target_dir) + except OSError, e: + # Target directory already exists + pass try: spider = NzbindexSpider() @@ -95,8 +112,19 @@ for release in releases: results = spider.find(release_name) except NotFoundException, e: sys.stderr.write("Could not find release %s\n" % release_name) + notfound += 1 continue # Process result - for result in results: - result.show() + result = results[1] + + try: + result.download(target_path) + except Exception, e: + errors += 1 + sys.stderr.write("Downloading NZB for %s failed: %s\n" % (release_name, repr(e))) + + sys.stdout.write("Downloaded NZB for %s.\n" % release_name) + downloaded += 1 + +sys.stdout.write("Finished. %d downloaded, %d skipped, %d errors and %d not found.\n" % (downloaded, skipped, errors, notfound)) diff --git a/shared.py b/shared.py index 6611001..86af281 100644 --- a/shared.py +++ b/shared.py @@ -21,7 +21,7 @@ class ModifiedSession(requests.Session): requests.Session.__init__(self, *args, **kwargs) self.headers['user-agent'] = random.choice(user_agents) -def download_file(self, request, target): +def download_file(request, target): if request.status_code == 200: f = open(target, "wb") From ab2f4e24c3c8c7c8594bab3a08adc97fc25aaaf6 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:47:31 +0200 Subject: [PATCH 07/17] Add usage instructions --- README.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/README.md b/README.md index 70dcd2b..16b7fc4 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,53 @@ You'll need to `pip install oursql requests` (this will require having the MySQL development libraries installed). Other than that, just run main.py. +## Usage + +You can use nzbspider with either a release list or a configuration +file. + +### Release list + +This is a text file, specified with the `--list` parameter, that +contains a newline-delimited list of release names to search for. You +will need to use the `--target` parameter to specify what directory to +download the NZBs to. + +### Configuration file + +This is a text file using a specific configuration syntax to select +specific releases from a pre-filled MySQl database, to search for. To +use this mode, you will need to copy config.json.example to config.json +and change the database details to match yours. A (basic) database +schema is included. Only results that are at least 24 hours old will be +matched, regardless of your configuration. + +The configuration file format is as follows: + +* Newline-delimited, a new predicate on every line. +* Three whitespace-delimited fields: release name, section, and target + directory. +* Enter `-` for any or both of the first two fields to match regardless + of the release name or section (depending on which you fill in as `-`). +* The `%` character is used to denote a multi-character wildcard + anywhere in the first two fields. +* The first two fields are enclosed in wildcard characters by default. +* The target directory does not have to exist; it will be created if it + doesn't. +* You must enclose a field value in `"` quotes if it contains a space. + +An example configuration file (the real configuration format doesn't +allow comments, so don't copy this verbatim!): + + - MP3 ./mp3s # Will select everything in section 'MP3' + - - ./everything # Will select absolutely everything + IMMERSE - ./immerse # Will select everything labeled 'IMMERSE' + Mad.Men%720p - ./madmen # Will select every 720p episode of Mad Men + +Note that these searches are run against your own database, not directly +against the NZB indexing sites! You'll still need a list of valid +release names pre-filled in your database. + ## Notes The script will assume that all releasenames in your database are safe From 23d1a08d0b74a1d157d05df3644dd44c53ccd77e Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:49:05 +0200 Subject: [PATCH 08/17] Modifications to usage instructions --- README.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 16b7fc4..3437804 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,14 @@ download the NZBs to. ### Configuration file This is a text file using a specific configuration syntax to select -specific releases from a pre-filled MySQl database, to search for. To -use this mode, you will need to copy config.json.example to config.json -and change the database details to match yours. A (basic) database -schema is included. Only results that are at least 24 hours old will be -matched, regardless of your configuration. +specific releases from a pre-filled MySQl database, to search for. Use +the `--config` parameter to specify the path of the configuration file +you wish to use. + +To use this mode, you will need to copy config.json.example to +config.json and change the database details to match yours. A (basic) +database schema is included. Only results that are at least 24 hours old +will be matched, regardless of your configuration. The configuration file format is as follows: From 2eb0bad5793aae4e40f20b2c28f840ee595ac6a0 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:57:39 +0200 Subject: [PATCH 09/17] Include database structure --- structure.sql | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 structure.sql diff --git a/structure.sql b/structure.sql new file mode 100644 index 0000000..ea5efd5 --- /dev/null +++ b/structure.sql @@ -0,0 +1,11 @@ +SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO"; +SET time_zone = "+00:00"; + +CREATE TABLE IF NOT EXISTS `releases` ( + `releaseid` int(11) NOT NULL AUTO_INCREMENT, + `time` int(11) NOT NULL, + `section` varchar(50) COLLATE utf8_unicode_ci NOT NULL, + `release` varchar(255) COLLATE utf8_unicode_ci NOT NULL, + PRIMARY KEY (`releaseid`), + UNIQUE KEY `release` (`release`) +) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; From efb1efdb17e291a72d4a762c4ff7059246ea3a65 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 15:58:11 +0200 Subject: [PATCH 10/17] Add example configuration --- config.json.example | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 config.json.example diff --git a/config.json.example b/config.json.example new file mode 100644 index 0000000..0bb93dd --- /dev/null +++ b/config.json.example @@ -0,0 +1,9 @@ +{ + "db": { + "host": "localhost", + "user": "nzbspider", + "pass": "sekrit", + "db": "nzbspider", + "table": "releases" + } +} From 73b1881e86c4eea7be799bea1ada06d0e55dd160 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 18:31:18 +0200 Subject: [PATCH 11/17] Don't retry NZBs that can't be found straight away, and limit results to last 250 --- main.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index d7d0032..9317bd4 100644 --- a/main.py +++ b/main.py @@ -59,9 +59,9 @@ if mode == "config": values.append("%" + section + "%") if len(fields) == 0: - db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400)" % conf['db']['table'] + db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT 250" % conf['db']['table'] else: - db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400)" % (conf['db']['table'], " AND ".join(fields)) + db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT 250" % (conf['db']['table'], " AND ".join(fields)) c.execute(db_query, values) @@ -88,6 +88,8 @@ skipped = 0 errors = 0 notfound = 0 +notfound_list = [] + for release in releases: release_name, target_dir = release target_path = os.path.join(target_dir, "%s.nzb" % release_name) @@ -96,6 +98,11 @@ for release in releases: # This NZB was already downloaded. skipped += 1 continue + + if release_name in notfound_list: + # This NZB couldn't be found before + notfound += 1 + continue try: os.makedirs(target_dir) @@ -112,6 +119,7 @@ for release in releases: results = spider.find(release_name) except NotFoundException, e: sys.stderr.write("Could not find release %s\n" % release_name) + notfound_list.append(release_name) notfound += 1 continue From cacbd735b77cae85df36d7ab50ef685acf43a277 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 18:37:25 +0200 Subject: [PATCH 12/17] Wow, that was stupid. --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 9317bd4..561fdb4 100644 --- a/main.py +++ b/main.py @@ -124,7 +124,7 @@ for release in releases: continue # Process result - result = results[1] + result = results[0] try: result.download(target_path) From e7e6cba90ff98dca337e73e73e137ae283b13385 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 19:39:31 +0200 Subject: [PATCH 13/17] Monkeypatch in support for binding sessions to random IPs using an IP list --- main.py | 14 ++++++++++---- shared.py | 39 ++++++++++++++++++++++++++++++++++++++- sources/binsearch.py | 5 ++++- sources/nzbindex.py | 5 ++++- 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 561fdb4..49e5f3d 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -import re, oursql, requests, sys, json, shlex, argparse, os +import re, oursql, requests, sys, json, shlex, argparse, os, random from sources.nzbindex import NzbindexSpider from sources.binsearch import BinsearchSpider @@ -8,7 +8,7 @@ parser = argparse.ArgumentParser(description="Automatically download NZBs for re parser.add_argument("--config", dest="config", action="store", help="Use a configuration file to match against the database as source") parser.add_argument("--list", dest="list", action="store", help="Use a newline-delimited list of releases as source") parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)") -parser.add_argument("--iplist", dest="list", action="store", help="Bind every request to a random IP from a newline-delimited list") +parser.add_argument("--iplist", dest="iplist", action="store", help="Bind every request to a random IP from a newline-delimited list") args = parser.parse_args() if args.config is not None: @@ -18,6 +18,12 @@ elif args.list is not None: else: sys.stderr.write("You must specify either a configuration file or a release list.\n") exit(1) + +if args.iplist is not None: + iplist_file = open(args.iplist, "r") + iplist = iplist_file.read().splitlines() +else: + iplist = [""] if mode == "config": try: @@ -111,11 +117,11 @@ for release in releases: pass try: - spider = NzbindexSpider() + spider = NzbindexSpider(random.choice(iplist)) results = spider.find(release_name) except NotFoundException, e: try: - spider = BinsearchSpider() + spider = BinsearchSpider(random.choice(iplist)) results = spider.find(release_name) except NotFoundException, e: sys.stderr.write("Could not find release %s\n" % release_name) diff --git a/shared.py b/shared.py index 86af281..3daa11e 100644 --- a/shared.py +++ b/shared.py @@ -1,4 +1,4 @@ -import requests, random +import requests, random, socket # These are just some random useragents, you can replace these with a different list user_agents = [ @@ -16,10 +16,47 @@ user_agents = [ class NotFoundException(Exception): pass +# Very nasty monkeypatching ahead! +socket.real_create_connection = socket.create_connection + class ModifiedSession(requests.Session): def __init__(self, *args, **kwargs): + try: + self.bound_ip = kwargs['bound_ip'] + del kwargs['bound_ip'] + except KeyError, e: + self.bound_ip = "" + requests.Session.__init__(self, *args, **kwargs) self.headers['user-agent'] = random.choice(user_agents) + + def patch_socket(self): + socket.create_connection = get_patched_func(self.bound_ip) + + def unpatch_socket(self): + socket.create_connection = socket.real_create_connection + + def get(self, *args, **kwargs): + self.patch_socket() + response = requests.Session.get(self, *args, **kwargs) + self.unpatch_socket() + return response + + def post(self, *args, **kwargs): + self.patch_socket() + response = requests.Session.get(self, *args, **kwargs) + self.unpatch_socket() + return response + +def get_patched_func(bind_addr): + def set_src_addr(*args): + address, timeout = args[0], args[1] + source_address = (bind_addr, 0) + return socket.real_create_connection(address, timeout, source_address) + return set_src_addr + +# You're looking at duct tape and tie-wraps. It's like your local Home +# Depot, except in Python. def download_file(request, target): if request.status_code == 200: diff --git a/sources/binsearch.py b/sources/binsearch.py index 0402018..61d1b38 100644 --- a/sources/binsearch.py +++ b/sources/binsearch.py @@ -2,9 +2,12 @@ from shared import NotFoundException, ModifiedSession, download_file import requests, re, HTMLParser class BinsearchSpider(object): + def __init__(self, bound_ip): + self.bound_ip = bound_ip + def find(self, name): parser = HTMLParser.HTMLParser() - self.session = ModifiedSession() + self.session = ModifiedSession(bound_ip=self.bound_ip) response = self.session.get("https://binsearch.info/index.php", params={ "q": name, diff --git a/sources/nzbindex.py b/sources/nzbindex.py index 9d4f427..6a1f4ff 100644 --- a/sources/nzbindex.py +++ b/sources/nzbindex.py @@ -2,9 +2,12 @@ from shared import NotFoundException, ModifiedSession, download_file import requests, re, HTMLParser class NzbindexSpider(object): + def __init__(self, bound_ip): + self.bound_ip = bound_ip + def find(self, name): parser = HTMLParser.HTMLParser() - self.session = ModifiedSession() + self.session = ModifiedSession(bound_ip=self.bound_ip) self.session.post("https://nzbindex.com/agree/", data={"agree": "I agree"}, verify=False) response = self.session.get("https://nzbindex.com/search/", params={ From eee7a0d25332f3d1f87e9eadbfb8554c5f9e4c77 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 19:43:49 +0200 Subject: [PATCH 14/17] Update docs for --iplist --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3437804..a3e778b 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,10 @@ main.py. ## Usage You can use nzbspider with either a release list or a configuration -file. +file. Using `--iplist` you can specify a newline-delimited file that +contains all the available IPs on your machine. nzbspider will randomly +pick one for every search query. If not specified, the OS default is +used. ### Release list From 0644f182013b04aae89b10f22a7859282b2a3f46 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 20:13:27 +0200 Subject: [PATCH 15/17] Fix bugs, set correct headers, etc. --- main.py | 1 + shared.py | 10 ++++++++-- sources/binsearch.py | 13 +++++++++---- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index 49e5f3d..f9f78e9 100644 --- a/main.py +++ b/main.py @@ -137,6 +137,7 @@ for release in releases: except Exception, e: errors += 1 sys.stderr.write("Downloading NZB for %s failed: %s\n" % (release_name, repr(e))) + continue sys.stdout.write("Downloaded NZB for %s.\n" % release_name) downloaded += 1 diff --git a/shared.py b/shared.py index 3daa11e..4bbce69 100644 --- a/shared.py +++ b/shared.py @@ -11,11 +11,15 @@ user_agents = [ "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0", "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.63 Safari/537.36" ] class NotFoundException(Exception): pass +class DownloadException(Exception): + pass + # Very nasty monkeypatching ahead! socket.real_create_connection = socket.create_connection @@ -28,7 +32,7 @@ class ModifiedSession(requests.Session): self.bound_ip = "" requests.Session.__init__(self, *args, **kwargs) - self.headers['user-agent'] = random.choice(user_agents) + self.headers['User-Agent'] = random.choice(user_agents) def patch_socket(self): socket.create_connection = get_patched_func(self.bound_ip) @@ -44,7 +48,7 @@ class ModifiedSession(requests.Session): def post(self, *args, **kwargs): self.patch_socket() - response = requests.Session.get(self, *args, **kwargs) + response = requests.Session.post(self, *args, **kwargs) self.unpatch_socket() return response @@ -66,3 +70,5 @@ def download_file(request, target): f.write(chunk) f.close() + else: + raise DownloadException("Status code was %s" % request.status_code) diff --git a/sources/binsearch.py b/sources/binsearch.py index 61d1b38..932d720 100644 --- a/sources/binsearch.py +++ b/sources/binsearch.py @@ -21,7 +21,8 @@ class BinsearchSpider(object): "adv_col": "on", "adv_nfo": "on", "font": "", - "postdate": "" + "postdate": "", + "server": "" }, verify=False) search_results = [] @@ -45,7 +46,7 @@ class BinsearchSpider(object): match = re.search(']*type="checkbox"[^>]*name="([0-9]+)"[^>]*>', result) if match is not None: - search_results.append(BinsearchResult(name, title, match.group(1), self)) + search_results.append(BinsearchResult(name, title, match.group(1), self, response.url)) if len(search_results) == 0: raise NotFoundException("No results were found.") @@ -53,11 +54,12 @@ class BinsearchSpider(object): return search_results class BinsearchResult(object): - def __init__(self, name, title, id_, spider): + def __init__(self, name, title, id_, spider, searchurl): self.name = name self.title = title self.id_ = id_ self.spider = spider + self.searchurl = searchurl def show(self): print "%s -> %s (%s)" % (self.title, self.id_, self.name) @@ -66,6 +68,8 @@ class BinsearchResult(object): data_dict = {"action": "nzb"} data_dict[self.id_] = "on" + self.spider.session.headers['Referer'] = self.searchurl + response = self.spider.session.post("https://www.binsearch.info/fcgi/nzb.fcgi", params={ "q": self.name, "m": "", @@ -78,7 +82,8 @@ class BinsearchResult(object): "adv_col": "on", "adv_nfo": "on", "font": "", - "postdate": "" + "postdate": "", + "server": "" }, data=data_dict) download_file(response, target_path) From e85eeb3bb23919f2ac434d44a917ad3b4aa8e881 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 21:15:06 +0200 Subject: [PATCH 16/17] Let a custom limit be specified as parameter --- README.md | 7 ++++++- main.py | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a3e778b..99fdd87 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,9 @@ main.py. ## Usage You can use nzbspider with either a release list or a configuration -file. Using `--iplist` you can specify a newline-delimited file that +file. + +Using `--iplist` you can specify a newline-delimited file that contains all the available IPs on your machine. nzbspider will randomly pick one for every search query. If not specified, the OS default is used. @@ -68,6 +70,9 @@ Note that these searches are run against your own database, not directly against the NZB indexing sites! You'll still need a list of valid release names pre-filled in your database. +Using `--limit` you can override the default limit of matched results. +The default is the 250 newest results. + ## Notes The script will assume that all releasenames in your database are safe diff --git a/main.py b/main.py index f9f78e9..939bcc4 100644 --- a/main.py +++ b/main.py @@ -9,6 +9,7 @@ parser.add_argument("--config", dest="config", action="store", help="Use a confi parser.add_argument("--list", dest="list", action="store", help="Use a newline-delimited list of releases as source") parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)") parser.add_argument("--iplist", dest="iplist", action="store", help="Bind every request to a random IP from a newline-delimited list") +parser.add_argument("--limit", dest="limit", action="store", help="How many records to select in configuration file mode, at most (default: 250)", default=250) args = parser.parse_args() if args.config is not None: @@ -64,10 +65,12 @@ if mode == "config": fields.append("`section` LIKE ?") values.append("%" + section + "%") + values.append(args.limit) + if len(fields) == 0: - db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT 250" % conf['db']['table'] + db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % conf['db']['table'] else: - db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT 250" % (conf['db']['table'], " AND ".join(fields)) + db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % (conf['db']['table'], " AND ".join(fields)) c.execute(db_query, values) From 974d28973dd20232e33cba058e05afbeaf91a5d7 Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Sun, 30 Jun 2013 21:17:37 +0200 Subject: [PATCH 17/17] Allow for a list of to-be-skipped release names --- README.md | 4 ++++ main.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/README.md b/README.md index 99fdd87..64b7f87 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,10 @@ contains all the available IPs on your machine. nzbspider will randomly pick one for every search query. If not specified, the OS default is used. +Using `--skip` you can specify a newline-delimited file that contains +all release names that should be skipped, no matter what. This works in +both modes. + ### Release list This is a text file, specified with the `--list` parameter, that diff --git a/main.py b/main.py index 939bcc4..92631fc 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,7 @@ parser.add_argument("--list", dest="list", action="store", help="Use a newline-d parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)") parser.add_argument("--iplist", dest="iplist", action="store", help="Bind every request to a random IP from a newline-delimited list") parser.add_argument("--limit", dest="limit", action="store", help="How many records to select in configuration file mode, at most (default: 250)", default=250) +parser.add_argument("--skip", dest="skip", action="store", help="Optionally, a path to a newline-delimited list of release names to always skip") args = parser.parse_args() if args.config is not None: @@ -25,6 +26,12 @@ if args.iplist is not None: iplist = iplist_file.read().splitlines() else: iplist = [""] + +if args.skip is not None: + skip_file = open(args.skip, "r") + skiplist = skip_file.read().splitlines() +else: + skiplist = [""] if mode == "config": try: @@ -112,6 +119,11 @@ for release in releases: # This NZB couldn't be found before notfound += 1 continue + + if release_name in skiplist: + # This release should be skipped + skipped += 1 + continue try: os.makedirs(target_dir)