diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..936a3c8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.pyc
+config.json
diff --git a/README.md b/README.md
index fea08af..64b7f87 100644
--- a/README.md
+++ b/README.md
@@ -11,8 +11,77 @@ extend. Contributions welcome.
 
 ## Installing
 
-You'll need to `pip install oursql` (this will require having the MySQL
-development libraries installed). Other than that, just run main.py.
+You'll need to `pip install oursql requests` (this will require having
+the MySQL development libraries installed). Other than that, just run
+main.py.
+
+## Usage
+
+You can use nzbspider with either a release list or a configuration
+file. Example invocations for both modes are shown under Notes below.
+
+Using `--iplist` you can specify a newline-delimited file that
+contains all the available IPs on your machine. nzbspider will randomly
+pick one for every search query. If not specified, the OS default is
+used.
+
+Using `--skip` you can specify a newline-delimited file that contains
+all release names that should be skipped, no matter what. This works in
+both modes.
+
+### Release list
+
+This is a text file, specified with the `--list` parameter, that
+contains a newline-delimited list of release names to search for. You
+will need to use the `--target` parameter to specify what directory to
+download the NZBs to.
+
+### Configuration file
+
+This is a text file using a specific configuration syntax to select
+the releases to search for from a pre-filled MySQL database. Use
+the `--config` parameter to specify the path of the configuration file
+you wish to use.
+
+To use this mode, you will need to copy config.json.example to
+config.json and change the database details to match yours. A (basic)
+database schema is included. Only results that are at least 24 hours old
+will be matched, regardless of your configuration.
+
+The configuration file format is as follows:
+
+* Newline-delimited, a new predicate on every line.
+* Three whitespace-delimited fields: release name, section, and target
+  directory.
+* Enter `-` for either or both of the first two fields to match
+  regardless of the release name or section (depending on which you
+  fill in as `-`).
+* The `%` character is used to denote a multi-character wildcard
+  anywhere in the first two fields.
+* The first two fields are enclosed in wildcard characters by default.
+* The target directory does not have to exist; it will be created if it
+  doesn't.
+* You must enclose a field value in `"` quotes if it contains a space.
+
+An example configuration file (the real configuration format doesn't
+allow comments, so don't copy this verbatim!):
+
+    - MP3 ./mp3s             # Will select everything in section 'MP3'
+    - - ./everything         # Will select absolutely everything
+    IMMERSE - ./immerse      # Will select everything labeled 'IMMERSE'
+    Mad.Men%720p - ./madmen  # Will select every 720p episode of Mad Men
+
+Note that these searches are run against your own database, not directly
+against the NZB indexing sites! You'll still need a list of valid
+release names pre-filled in your database.
+
+Using `--limit` you can override the default limit of matched results.
+The default is the 250 newest results.
+
+## Notes
+
+The script will assume that all release names in your database are safe
+to use as filenames. No sanitization or conversion of the filenames will
+take place.
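+
+Example invocations (the file and directory names here are
+hypothetical):
+
+    python main.py --list releases.txt --target ./nzbs
+    python main.py --config searches.conf --skip skip.txt --limit 100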
 
 ## License
diff --git a/config.json.example b/config.json.example
new file mode 100644
index 0000000..0bb93dd
--- /dev/null
+++ b/config.json.example
@@ -0,0 +1,9 @@
+{
+    "db": {
+        "host": "localhost",
+        "user": "nzbspider",
+        "pass": "sekrit",
+        "db": "nzbspider",
+        "table": "releases"
+    }
+}
diff --git a/main.py b/main.py
index 822e2df..92631fc 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,160 @@
-import re, oursql
+import re, oursql, requests, sys, json, shlex, argparse, os, random
 from sources.nzbindex import NzbindexSpider
 from sources.binsearch import BinsearchSpider
+from shared import NotFoundException
 
+parser = argparse.ArgumentParser(description="Automatically download NZBs for releases")
+parser.add_argument("--config", dest="config", action="store", help="Use a configuration file to match against the database as source")
+parser.add_argument("--list", dest="list", action="store", help="Use a newline-delimited list of releases as source")
+parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)")
+parser.add_argument("--iplist", dest="iplist", action="store", help="Bind every request to a random IP from a newline-delimited list")
+parser.add_argument("--limit", dest="limit", action="store", type=int, help="How many records to select in configuration file mode, at most (default: 250)", default=250)
+parser.add_argument("--skip", dest="skip", action="store", help="Optionally, a path to a newline-delimited list of release names to always skip")
+args = parser.parse_args()
+
+if args.config is not None:
+    mode = "config"
+elif args.list is not None:
+    mode = "list"
+else:
+    sys.stderr.write("You must specify either a configuration file or a release list.\n")
+    exit(1)
+
+if args.iplist is not None:
+    iplist_file = open(args.iplist, "r")
+    iplist = iplist_file.read().splitlines()
+    iplist_file.close()
+else:
+    # An empty string means "let the OS pick the source address".
+    iplist = [""]
+
+if args.skip is not None:
+    skip_file = open(args.skip, "r")
+    skiplist = skip_file.read().splitlines()
+    skip_file.close()
+else:
+    skiplist = []
+
+if mode == "config":
+    try:
+        conf = json.load(open("config.json", "r"))
+    except (IOError, ValueError), e:
+        sys.stderr.write("You must have a valid config.json.\n")
+        exit(1)
+
+    if not re.match("^[a-zA-Z0-9_-]+$", conf['db']['table']):
+        sys.stderr.write("Table name must be a-z, A-Z, 0-9, _, -\n")
+        exit(1)
+
+    try:
+        searchconf_file = open(args.config, "r")
+    except IOError, e:
+        sys.stderr.write("The specified configuration file doesn't exist.\n")
+        exit(1)
+
+    queries = searchconf_file.read().splitlines()
+    searchconf_file.close()
+
+    db = oursql.connect(host=conf['db']['host'], user=conf['db']['user'], passwd=conf['db']['pass'], db=conf['db']['db'], autoreconnect=True)
+    c = db.cursor()
+
+    releases = []
+
+    for query in queries:
+        if query.strip() == "":
+            continue
+
+        title, section, target = shlex.split(query)
+
+        fields = []
+        values = []
+
+        if title != "-":
+            fields.append("`release` LIKE ?")
+            values.append("%" + title + "%")
+
+        if section != "-":
+            fields.append("`section` LIKE ?")
+            values.append("%" + section + "%")
+
+        values.append(args.limit)
+
+        # The table name was validated above; everything else is passed as a
+        # query parameter.
+        if len(fields) == 0:
+            db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % conf['db']['table']
+        else:
+            db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % (conf['db']['table'], " AND ".join(fields))
+
+        c.execute(db_query, values)
+
+        for row in c:
+            releases.append((row[0], target))
+elif mode == "list":
+    if args.target is None:
+        sys.stderr.write("You did not specify a target directory with --target.\n")
+        exit(1)
+
+    try:
+        list_file = open(args.list, "r")
+    except IOError, e:
+        sys.stderr.write("The specified list file doesn't exist.\n")
+        exit(1)
+
+    releases = [(release, args.target) for release in list_file.read().splitlines() if release.strip() != ""]
+    list_file.close()
+
+sys.stdout.write("Found %d releases.\n" % len(releases))
+
+downloaded = 0
+skipped = 0
+errors = 0
+notfound = 0
+
+notfound_list = []
+
+for release in releases:
+    release_name, target_dir = release
+    target_path = os.path.join(target_dir, "%s.nzb" % release_name)
+
+    if os.path.exists(target_path):
+        # This NZB was already downloaded.
+        skipped += 1
+        continue
+
+    if release_name in notfound_list:
+        # This NZB couldn't be found before.
+        notfound += 1
+        continue
+
+    if release_name in skiplist:
+        # This release should be skipped.
+        skipped += 1
+        continue
+
+    try:
+        os.makedirs(target_dir)
+    except OSError, e:
+        # Target directory already exists.
+        pass
+
+    try:
+        spider = NzbindexSpider(random.choice(iplist))
+        results = spider.find(release_name)
+    except NotFoundException, e:
+        try:
+            spider = BinsearchSpider(random.choice(iplist))
+            results = spider.find(release_name)
+        except NotFoundException, e:
+            sys.stderr.write("Could not find release %s\n" % release_name)
+            notfound_list.append(release_name)
+            notfound += 1
+            continue
+
+    # Use the first (newest) result.
+    result = results[0]
+
+    try:
+        result.download(target_path)
+    except Exception, e:
+        errors += 1
+        sys.stderr.write("Downloading NZB for %s failed: %s\n" % (release_name, repr(e)))
+        continue
+
+    sys.stdout.write("Downloaded NZB for %s.\n" % release_name)
+    downloaded += 1
+
+sys.stdout.write("Finished. %d downloaded, %d skipped, %d errors and %d not found.\n" % (downloaded, skipped, errors, notfound))
diff --git a/shared.py b/shared.py
new file mode 100644
index 0000000..4bbce69
--- /dev/null
+++ b/shared.py
@@ -0,0 +1,74 @@
+import requests, random, socket
+
+# These are just some random user agents; you can replace them with a
+# different list.
+user_agents = [
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
+    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.63 Safari/537.36"
+]
+
+class NotFoundException(Exception):
+    pass
+
+class DownloadException(Exception):
+    pass
+
+# Very nasty monkeypatching ahead!
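+# requests has no option to choose the source address in this version, so
+# each request briefly swaps socket.create_connection for a wrapper that
+# binds to the chosen local IP, then restores the original.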
+socket.real_create_connection = socket.create_connection
+
+class ModifiedSession(requests.Session):
+    def __init__(self, *args, **kwargs):
+        try:
+            self.bound_ip = kwargs['bound_ip']
+            del kwargs['bound_ip']
+        except KeyError, e:
+            self.bound_ip = ""
+
+        requests.Session.__init__(self, *args, **kwargs)
+        self.headers['User-Agent'] = random.choice(user_agents)
+
+    def patch_socket(self):
+        socket.create_connection = get_patched_func(self.bound_ip)
+
+    def unpatch_socket(self):
+        socket.create_connection = socket.real_create_connection
+
+    def get(self, *args, **kwargs):
+        self.patch_socket()
+        try:
+            return requests.Session.get(self, *args, **kwargs)
+        finally:
+            # Always restore the original, even if the request fails.
+            self.unpatch_socket()
+
+    def post(self, *args, **kwargs):
+        self.patch_socket()
+        try:
+            return requests.Session.post(self, *args, **kwargs)
+        finally:
+            self.unpatch_socket()
+
+def get_patched_func(bind_addr):
+    # The default timeout mirrors socket.create_connection's own signature.
+    def set_src_addr(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *args, **kwargs):
+        source_address = (bind_addr, 0)
+        return socket.real_create_connection(address, timeout, source_address)
+    return set_src_addr
+
+# You're looking at duct tape and tie-wraps. It's like your local Home
+# Depot, except in Python.
+
+def download_file(response, target):
+    if response.status_code == 200:
+        f = open(target, "wb")
+
+        for chunk in response.iter_content(chunk_size=4096):
+            f.write(chunk)
+
+        f.close()
+    else:
+        raise DownloadException("Status code was %s" % response.status_code)
diff --git a/sources/__init__.py b/sources/__init__.py
index 77f1e4a..8b13789 100644
--- a/sources/__init__.py
+++ b/sources/__init__.py
@@ -1,2 +1 @@
-class NzbindexSpider(object):
-    pass
+
diff --git a/sources/binsearch.py b/sources/binsearch.py
index ed9d53d..932d720 100644
--- a/sources/binsearch.py
+++ b/sources/binsearch.py
@@ -1,2 +1,89 @@
+from shared import NotFoundException, ModifiedSession, download_file
+import requests, re, HTMLParser
+
 class BinsearchSpider(object):
-    pass
+    def __init__(self, bound_ip):
+        self.bound_ip = bound_ip
+
+    def find(self, name):
+        parser = HTMLParser.HTMLParser()
+        self.session = ModifiedSession(bound_ip=self.bound_ip)
+
+        response = self.session.get("https://binsearch.info/index.php", params={
+            "q": name,
+            "m": "",
+            "adv_age": "600",
+            "max": "100",
+            "adv_g": "",
+            "adv_sort": "date",
+            "minsize": "100",
+            "maxsize": "",
+            "adv_col": "on",
+            "adv_nfo": "on",
+            "font": "",
+            "postdate": "",
+            "server": ""
+        }, verify=False)
+
+        search_results = []
+
+        # Nice try, corrupting your HTML to deter scrapers. Not going to stop me, though.
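+        # The result rows are picked apart with deliberately permissive
+        # regexes instead of a strict HTML parser, since the markup is
+        # malformed on purpose.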
+        results = re.findall('<tr[^>]+>(.*?)<\/tr>', response.text, re.DOTALL)
+
+        for result in results:
+            match = re.search('<span[^>]*>(.*?)<\/span>', result, re.DOTALL)
+
+            if match is None:
+                continue
+
+            title = parser.unescape(re.sub("<[^>]+>", "", match.group(1)))
+
+            if name.lower() in title.lower():
+                match = re.search('<input[^>]*type="checkbox"[^>]*name="([0-9]+)"[^>]*>', result)
+
+                if match is not None:
+                    search_results.append(BinsearchResult(name, title, match.group(1), self, response.url))
+
+        if len(search_results) == 0:
+            raise NotFoundException("No results were found.")
+
+        return search_results
+
+class BinsearchResult(object):
+    def __init__(self, name, title, id_, spider, searchurl):
+        self.name = name
+        self.title = title
+        self.id_ = id_
+        self.spider = spider
+        self.searchurl = searchurl
+
+    def show(self):
+        print "%s -> %s (%s)" % (self.title, self.id_, self.name)
+
+    def download(self, target_path):
+        data_dict = {"action": "nzb"}
+        data_dict[self.id_] = "on"
+
+        self.spider.session.headers['Referer'] = self.searchurl
+
+        response = self.spider.session.post("https://www.binsearch.info/fcgi/nzb.fcgi", params={
+            "q": self.name,
+            "m": "",
+            "adv_age": "600",
+            "max": "100",
+            "adv_g": "",
+            "adv_sort": "date",
+            "minsize": "100",
+            "maxsize": "",
+            "adv_col": "on",
+            "adv_nfo": "on",
+            "font": "",
+            "postdate": "",
+            "server": ""
+        }, data=data_dict)
+
+        download_file(response, target_path)
diff --git a/sources/nzbindex.py b/sources/nzbindex.py
index 8b13789..6a1f4ff 100644
--- a/sources/nzbindex.py
+++ b/sources/nzbindex.py
@@ -1 +1,67 @@
+from shared import NotFoundException, ModifiedSession, download_file
+import requests, re, HTMLParser
+
+class NzbindexSpider(object):
+    def __init__(self, bound_ip):
+        self.bound_ip = bound_ip
+
+    def find(self, name):
+        parser = HTMLParser.HTMLParser()
+        self.session = ModifiedSession(bound_ip=self.bound_ip)
+        # nzbindex shows a disclaimer page first; agree to it once per session.
+        self.session.post("https://nzbindex.com/agree/", data={"agree": "I agree"}, verify=False)
+
+        response = self.session.get("https://nzbindex.com/search/", params={
+            "q": name,
+            "age": "",
+            "max": "50",
+            "minage": "",
+            "sort": "agedesc",
+            "minsize": "100",
+            "maxsize": "",
+            "dq": "",
+            "poster": "",
+            "nfo": "",
+            "hasnfo": "1",
+            "complete": "1",
+            "hidespam": "1",
+            "more": "1"
+        }, verify=False)
+
+        search_results = []
+
+        results = re.findall("<tr[^>]*>(.*?)<\/tr>", response.text, re.DOTALL)
+
+        for result in results:
+            if 'class="threat"' in result:
+                # Password-protected or otherwise unsuitable for download.
+                continue
+
+            match = re.search("<label[^>]*>(.*?)<\/label>", result, re.DOTALL)
+
+            if match is None:
+                continue
+
+            title = parser.unescape(re.sub("<[^>]*>", "", match.group(1)))
+
+            if name.lower() in title.lower():
+                match = re.search('https?:\/\/nzbindex\.com\/download\/[^"]+\.nzb', result)
+
+                if match is not None:
+                    search_results.append(NzbindexResult(title, match.group(0), self))
+
+        if len(search_results) == 0:
+            raise NotFoundException("No results were found.")
+
+        return search_results
+
+class NzbindexResult(object):
+    def __init__(self, title, url, spider):
+        self.title = title
+        self.url = url
+        self.spider = spider
+
+    def show(self):
+        print "%s -> %s" % (self.title, self.url)
+
+    def download(self, target_path):
+        download_file(self.spider.session.get(self.url), target_path)
diff --git a/structure.sql b/structure.sql
new file mode 100644
index 0000000..ea5efd5
--- /dev/null
+++ b/structure.sql
@@ -0,0 +1,11 @@
+SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO";
+SET time_zone = "+00:00";
+
+CREATE TABLE IF NOT EXISTS `releases` (
+  `releaseid` int(11) NOT NULL AUTO_INCREMENT,
+  `time` int(11) NOT NULL,
+  `section` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
+  `release` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
+  PRIMARY KEY (`releaseid`),
+  UNIQUE KEY `release` (`release`)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
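+
+-- main.py only matches rows that are at least 24 hours old, so `time` must
+-- hold a UNIX timestamp. A hypothetical example row (values illustrative):
+--
+--   INSERT INTO `releases` (`time`, `section`, `release`)
+--   VALUES (UNIX_TIMESTAMP() - 86400, 'MP3', 'Some.Album-2013-GRP');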