Release 1.0

master
Sven Slootweg 11 years ago
commit 4786ea0443

2
.gitignore vendored

@ -0,0 +1,2 @@
*.pyc
config.json

@ -11,8 +11,77 @@ extend. Contributions welcome.
## Installing
You'll need to `pip install oursql` (this will require having the MySQL
development libraries installed). Other than that, just run main.py.
You'll need to `pip install oursql requests` (this will require having
the MySQL development libraries installed). Other than that, just run
main.py.
## Usage
You can use nzbspider with either a release list or a configuration
file.
Using `--iplist` you can specify a newline-delimited file that
contains all the available IPs on your machine. nzbspider will randomly
pick one for every search query. If not specified, the OS default is
used.
Using `--skip` you can specify a newline-delimited file that contains
all release names that should be skipped, no matter what. This works in
both modes.
### Release list
This is a text file, specified with the `--list` parameter, that
contains a newline-delimited list of release names to search for. You
will need to use the `--target` parameter to specify what directory to
download the NZBs to.
### Configuration file
This is a text file using a specific configuration syntax to select
specific releases from a pre-filled MySQL database, to search for. Use
the `--config` parameter to specify the path of the configuration file
you wish to use.
To use this mode, you will need to copy config.json.example to
config.json and change the database details to match yours. A (basic)
database schema is included. Only results that are at least 24 hours old
will be matched, regardless of your configuration.
The configuration file format is as follows:
* Newline-delimited, a new predicate on every line.
* Three whitespace-delimited fields: release name, section, and target
directory.
* Enter `-` for either or both of the first two fields to match regardless
of the release name or section (depending on which you fill in as `-`).
* The `%` character is used to denote a multi-character wildcard
anywhere in the first two fields.
* The first two fields are enclosed in wildcard characters by default.
* The target directory does not have to exist; it will be created if it
doesn't.
* You must enclose a field value in `"` quotes if it contains a space.
An example configuration file (the real configuration format doesn't
allow comments, so don't copy this verbatim!):
- MP3 ./mp3s # Will select everything in section 'MP3'
- - ./everything # Will select absolutely everything
IMMERSE - ./immerse # Will select everything labeled 'IMMERSE'
Mad.Men%720p - ./madmen # Will select every 720p episode of Mad Men
Note that these searches are run against your own database, not directly
against the NZB indexing sites! You'll still need a list of valid
release names pre-filled in your database.
Using `--limit` you can override the default limit of matched results.
The default is the 250 newest results.
## Notes
The script will assume that all release names in your database are safe
to use as filenames. No sanitization or conversion of the filenames will
take place.
## License

@ -0,0 +1,9 @@
{
"db": {
"host": "localhost",
"user": "nzbspider",
"pass": "sekrit",
"db": "nzbspider",
"table": "releases"
}
}

@ -1,4 +1,160 @@
import re, oursql, requests, sys, json, shlex, argparse, os, random

from sources.nzbindex import NzbindexSpider
from sources.binsearch import BinsearchSpider
from shared import NotFoundException

# Automatically downloads NZB files for a list of release names, either from a
# flat text file (--list) or by matching predicates in a configuration file
# against a pre-filled MySQL database (--config).

parser = argparse.ArgumentParser(description="Automatically download NZBs for releases")
parser.add_argument("--config", dest="config", action="store", help="Use a configuration file to match against the database as source")
parser.add_argument("--list", dest="list", action="store", help="Use a newline-delimited list of releases as source")
parser.add_argument("--target", dest="target", action="store", help="Where to save the NZBs (only needed in list mode)")
parser.add_argument("--iplist", dest="iplist", action="store", help="Bind every request to a random IP from a newline-delimited list")
# type=int: without it a CLI-supplied limit arrives as a string and gets bound
# to the SQL LIMIT placeholder as text.
parser.add_argument("--limit", dest="limit", action="store", type=int, help="How many records to select in configuration file mode, at most (default: 250)", default=250)
parser.add_argument("--skip", dest="skip", action="store", help="Optionally, a path to a newline-delimited list of release names to always skip")

args = parser.parse_args()

if args.config is not None:
    mode = "config"
elif args.list is not None:
    mode = "list"
else:
    sys.stderr.write("You must specify either a configuration file or a release list.\n")
    exit(1)

if args.iplist is not None:
    # One source IP per line; a random one is picked for every request.
    with open(args.iplist, "r") as iplist_file:
        iplist = iplist_file.read().splitlines()
else:
    iplist = [""]  # "" means "let the OS pick the source address"

if args.skip is not None:
    with open(args.skip, "r") as skip_file:
        skiplist = skip_file.read().splitlines()
else:
    # Empty list (not [""]) so a release with an empty name is never skipped
    # by accident.
    skiplist = []

if mode == "config":
    try:
        with open("config.json", "r") as config_file:
            conf = json.load(config_file)
    except IOError as e:
        sys.stderr.write("You must have a valid config.json.\n")
        exit(1)

    # The table name is interpolated directly into the query string below, so
    # restrict it to a safe character set to prevent SQL injection via
    # config.json.
    if not re.match("^[a-zA-Z0-9_-]+$", conf['db']['table']):
        sys.stderr.write("Table name must be a-z, A-Z, 0-9, _, -\n")
        exit(1)

    try:
        searchconf_file = open(args.config, "r")
    except IOError as e:
        sys.stderr.write("The specified configuration file doesn't exist.\n")
        exit(1)

    queries = searchconf_file.read().splitlines()
    searchconf_file.close()

    db = oursql.connect(host=conf['db']['host'], user=conf['db']['user'], passwd=conf['db']['pass'], db=conf['db']['db'], autoreconnect=True)
    c = db.cursor()

    releases = []

    for query in queries:
        # Each predicate line: release-name section target-dir ("-" = any).
        title, section, target = shlex.split(query)

        fields = []
        values = []

        if title != "-":
            fields.append("`release` LIKE ?")
            values.append("%" + title + "%")

        if section != "-":
            fields.append("`section` LIKE ?")
            values.append("%" + section + "%")

        values.append(args.limit)

        # Only rows at least 24 hours old are matched, per the README.
        if len(fields) == 0:
            db_query = "SELECT `release` FROM %s WHERE `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % conf['db']['table']
        else:
            db_query = "SELECT `release` FROM %s WHERE %s AND `time` < (UNIX_TIMESTAMP(NOW()) - 86400) ORDER BY `time` DESC LIMIT ?" % (conf['db']['table'], " AND ".join(fields))

        c.execute(db_query, values)

        for row in c:
            releases.append((row[0], target))
elif mode == "list":
    if args.target is None:
        sys.stderr.write("You did not specify a target directory with --target.\n")
        exit(1)

    try:
        list_file = open(args.list, "r")
    except IOError as e:
        sys.stderr.write("The specified list file doesn't exist.\n")
        exit(1)

    releases = [(release, args.target) for release in list_file.read().splitlines()]
    list_file.close()

sys.stdout.write("Found %d releases.\n" % len(releases))

downloaded = 0
skipped = 0
errors = 0
notfound = 0
notfound_list = set()  # set, not list: O(1) membership tests in the loop

for release_name, target_dir in releases:
    target_path = os.path.join(target_dir, "%s.nzb" % release_name)

    if os.path.exists(target_path):
        # This NZB was already downloaded.
        skipped += 1
        continue

    if release_name in notfound_list:
        # This NZB couldn't be found before
        notfound += 1
        continue

    if release_name in skiplist:
        # This release should be skipped
        skipped += 1
        continue

    try:
        os.makedirs(target_dir)
    except OSError as e:
        # Target directory already exists
        pass

    try:
        spider = NzbindexSpider(random.choice(iplist))
        results = spider.find(release_name)
    except NotFoundException as e:
        # Fall back to binsearch when nzbindex comes up empty.
        try:
            spider = BinsearchSpider(random.choice(iplist))
            results = spider.find(release_name)
        except NotFoundException as e:
            sys.stderr.write("Could not find release %s\n" % release_name)
            notfound_list.add(release_name)
            notfound += 1
            continue

    # Process the first (newest) result only.
    result = results[0]

    try:
        result.download(target_path)
    except Exception as e:
        errors += 1
        sys.stderr.write("Downloading NZB for %s failed: %s\n" % (release_name, repr(e)))
        continue

    sys.stdout.write("Downloaded NZB for %s.\n" % release_name)
    downloaded += 1

sys.stdout.write("Finished. %d downloaded, %d skipped, %d errors and %d not found.\n" % (downloaded, skipped, errors, notfound))

@ -0,0 +1,74 @@
import requests, random, socket

# These are just some random useragents, you can replace these with a different list
# (one is picked at random per session, making scraping requests look like
# ordinary browser traffic).
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5 Safari/536.30.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.63 Safari/537.36"
]
class NotFoundException(Exception):
    """Raised by spiders when a search yields no usable results."""
    pass

class DownloadException(Exception):
    """Raised by download_file when an NZB download returns a non-200 status."""
    pass
# Very nasty monkeypatching ahead!
# Remember the untouched implementation so patch/unpatch can flip between
# them; the hasattr guard prevents a re-import from saving an already-patched
# function as the "real" one.
if not hasattr(socket, "real_create_connection"):
    socket.real_create_connection = socket.create_connection

class ModifiedSession(requests.Session):
    """requests.Session that sends a randomized User-Agent and can bind
    outgoing connections to a specific local IP (``bound_ip``; "" lets the
    OS choose the source address).

    NOTE(review): patch_socket/unpatch_socket mutate the *global*
    socket.create_connection, so concurrent sessions with different bound
    IPs would race. The surrounding code uses one session at a time.
    """

    def __init__(self, *args, **kwargs):
        # kwargs.pop with a default replaces the Python-2-only
        # `except KeyError, e` form and is equivalent to the old
        # try/del/except dance.
        self.bound_ip = kwargs.pop("bound_ip", "")
        requests.Session.__init__(self, *args, **kwargs)
        self.headers['User-Agent'] = random.choice(user_agents)

    def patch_socket(self):
        # Make every new connection bind its local side to bound_ip.
        socket.create_connection = get_patched_func(self.bound_ip)

    def unpatch_socket(self):
        socket.create_connection = socket.real_create_connection

    def get(self, *args, **kwargs):
        self.patch_socket()
        try:
            # try/finally guarantees the global patch is undone even when the
            # request raises (the original leaked the patch on error).
            return requests.Session.get(self, *args, **kwargs)
        finally:
            self.unpatch_socket()

    def post(self, *args, **kwargs):
        self.patch_socket()
        try:
            return requests.Session.post(self, *args, **kwargs)
        finally:
            self.unpatch_socket()
def get_patched_func(bind_addr):
    """Return a drop-in replacement for socket.create_connection that binds
    the local side of every connection to ``bind_addr`` (port 0 = any port).

    The original version unpacked ``args[1]`` unconditionally and crashed
    with IndexError when the caller omitted the timeout or passed keyword
    arguments; this version tolerates both.
    """
    def set_src_addr(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *args, **kwargs):
        # Any source_address the caller supplied is deliberately discarded
        # and replaced with the configured bind address.
        return socket.real_create_connection(address, timeout, (bind_addr, 0))
    return set_src_addr
# You're looking at duct tape and tie-wraps. It's like your local Home
# Depot, except in Python.
def download_file(request, target):
    """Stream the body of ``request`` (a requests.Response-like object) to
    the file at path ``target``.

    Raises DownloadException when the response status is not 200.
    """
    if request.status_code != 200:
        raise DownloadException("Status code was %s" % request.status_code)
    # `with` guarantees the file handle is closed even if a chunk read
    # raises mid-stream (the original leaked the handle in that case).
    with open(target, "wb") as f:
        for chunk in request.iter_content():
            f.write(chunk)

@ -1,2 +1 @@
class NzbindexSpider(object):
    # NOTE(review): empty placeholder — presumably superseded by the full
    # implementation in sources/nzbindex.py; confirm this stub is unused.
    pass

@ -1,2 +1,89 @@
from shared import NotFoundException, ModifiedSession, download_file
import requests, re, HTMLParser
class BinsearchSpider(object):
    """Searches binsearch.info for NZBs matching a release name."""

    def __init__(self, bound_ip):
        # Source IP to bind outgoing requests to ("" = OS default).
        self.bound_ip = bound_ip

    def find(self, name):
        """Search binsearch for ``name`` and return a list of
        BinsearchResult objects (newest first, as served by the site).

        Raises NotFoundException when no matching, downloadable result
        exists.
        """
        parser = HTMLParser.HTMLParser()
        self.session = ModifiedSession(bound_ip=self.bound_ip)

        response = self.session.get("https://binsearch.info/index.php", params={
            "q": name,
            "m": "",
            "adv_age": "600",
            "max": "100",
            "adv_g": "",
            "adv_sort": "date",
            "minsize": "100",
            "maxsize": "",
            "adv_col": "on",
            "adv_nfo": "on",
            "font": "",
            "postdate": "",
            "server": ""
        }, verify=False)

        search_results = []

        # Nice try, corrupting your HTML to deter scrapers. Not going to stop me, though.
        # Raw strings so the backslash escapes reach the regex engine
        # untouched instead of triggering invalid-escape warnings.
        results = re.findall(r'<tr[^>]+>(.*?)<a href="browse\.php', response.text, re.DOTALL)

        for result in results:
            if 'requires password' in result:
                # Password protected
                continue

            match = re.search(r'<span[^>]*class="s"[^>]*>(.*?)<\/span>', result, re.DOTALL)

            if match is None:
                continue

            # Strip tags, then unescape HTML entities to get the plain title.
            title = parser.unescape(re.sub("<[^>]+>", "", match.group(1)))

            if name.lower() in title.lower():
                # The checkbox name is the post id used by the NZB export form.
                match = re.search(r'<input[^>]*type="checkbox"[^>]*name="([0-9]+)"[^>]*>', result)

                if match is not None:
                    search_results.append(BinsearchResult(name, title, match.group(1), self, response.url))

        if len(search_results) == 0:
            raise NotFoundException("No results were found.")

        return search_results
class BinsearchResult(object):
    """A single NZB search hit from binsearch, downloadable via its post id."""

    def __init__(self, name, title, id_, spider, searchurl):
        self.name = name            # release name that was searched for
        self.title = title          # title parsed from the result row
        self.id_ = id_              # binsearch checkbox id identifying the post
        self.spider = spider        # owning spider, reused for its session
        self.searchurl = searchurl  # search results URL, sent as Referer

    def show(self):
        # Parenthesized single-argument print behaves identically as a
        # Python 2 statement and a Python 3 function call.
        print("%s -> %s (%s)" % (self.title, self.id_, self.name))

    def download(self, target_path):
        """POST the NZB-export form and stream the .nzb to ``target_path``."""
        data_dict = {"action": "nzb"}
        data_dict[self.id_] = "on"

        # binsearch checks the Referer against the originating search page.
        self.spider.session.headers['Referer'] = self.searchurl

        response = self.spider.session.post("https://www.binsearch.info/fcgi/nzb.fcgi", params={
            "q": self.name,
            "m": "",
            "adv_age": "600",
            "max": "100",
            "adv_g": "",
            "adv_sort": "date",
            "minsize": "100",
            "maxsize": "",
            "adv_col": "on",
            "adv_nfo": "on",
            "font": "",
            "postdate": "",
            "server": ""
        }, data=data_dict)

        download_file(response, target_path)

@ -1 +1,67 @@
from shared import NotFoundException, ModifiedSession, download_file
import requests, re, HTMLParser
class NzbindexSpider(object):
    """Searches nzbindex.com for NZBs matching a release name."""

    def __init__(self, bound_ip):
        # Source IP to bind outgoing requests to ("" = OS default).
        self.bound_ip = bound_ip

    def find(self, name):
        """Search nzbindex for ``name`` and return a list of NzbindexResult
        objects.

        Raises NotFoundException when no matching, downloadable result
        exists.
        """
        parser = HTMLParser.HTMLParser()
        self.session = ModifiedSession(bound_ip=self.bound_ip)

        # nzbindex requires agreeing to the disclaimer once per session
        # before search results are served.
        self.session.post("https://nzbindex.com/agree/", data={"agree": "I agree"}, verify=False)

        response = self.session.get("https://nzbindex.com/search/", params={
            "q": name,
            "age": "",
            "max": "50",
            "minage": "",
            "sort": "agedesc",
            "minsize": "100",
            "maxsize": "",
            "dq": "",
            "poster": "",
            "nfo": "",
            "hasnfo": "1",
            "complete": "1",
            "hidespam": "1",
            "more": "1"
        }, verify=False)

        search_results = []

        # Raw strings so the backslash escapes reach the regex engine
        # untouched instead of triggering invalid-escape warnings.
        results = re.findall(r"<tr[^>]*>(.*?)<\/tr>", response.text, re.DOTALL)

        for result in results:
            if 'class="threat"' in result:
                # Password protected or otherwise unsuitable for download
                continue

            match = re.search(r"<label[^>]*>(.*?)<\/label>", result, re.DOTALL)

            if match is None:
                continue

            # Strip tags, then unescape HTML entities to get the plain title.
            title = parser.unescape(re.sub("<[^>]*>", "", match.group(1)))

            if name.lower() in title.lower():
                match = re.search(r'https?:\/\/nzbindex\.com\/download\/[^"]+\.nzb', result)

                if match is not None:
                    search_results.append(NzbindexResult(title, match.group(0), self))

        if len(search_results) == 0:
            raise NotFoundException("No results were found.")

        return search_results
class NzbindexResult(object):
    """A single NZB search hit from nzbindex, with a direct download URL."""

    def __init__(self, title, url, spider):
        self.title = title    # human-readable title parsed from the result row
        self.url = url        # direct .nzb download URL
        self.spider = spider  # owning spider, reused for its session

    def show(self):
        # Parenthesized single-argument print behaves identically as a
        # Python 2 statement and a Python 3 function call.
        print("%s -> %s" % (self.title, self.url))

    def download(self, target_path):
        """Fetch the NZB over the spider's session and save it to
        ``target_path``."""
        download_file(self.spider.session.get(self.url), target_path)

@ -0,0 +1,11 @@
SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";

-- Release index queried by nzbspider's configuration mode
-- (see the `db.table` setting in config.json).
CREATE TABLE IF NOT EXISTS `releases` (
  `releaseid` int(11) NOT NULL AUTO_INCREMENT,
  `time` int(11) NOT NULL, -- unix timestamp; only rows older than 24h are matched
  `section` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
  `release` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
  PRIMARY KEY (`releaseid`),
  UNIQUE KEY `release` (`release`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
Loading…
Cancel
Save