From 88e6819bf2200ce0f1b3af65778001876a2937db Mon Sep 17 00:00:00 2001 From: Sven Slootweg Date: Fri, 26 Oct 2012 17:51:27 +0200 Subject: [PATCH 01/17] Update the shared code and all resolvers for the new class-based model --- resolv/__init__.py | 29 ++++-- resolv/resolvers/dummy.py | 11 +- resolv/resolvers/filebox.py | 109 ++++++++----------- resolv/resolvers/mediafire.py | 62 ++++++----- resolv/resolvers/onechannel.py | 31 +++--- resolv/resolvers/pastebin.py | 70 +++++++------ resolv/resolvers/putlocker.py | 132 ++++++++++++----------- resolv/resolvers/sockshare.py | 132 ++++++++++++----------- resolv/resolvers/youtube.py | 185 +++++++++++++++++++-------------- resolv/shared.py | 41 ++++++++ 10 files changed, 461 insertions(+), 341 deletions(-) diff --git a/resolv/__init__.py b/resolv/__init__.py index 6c0ac33..aceb7e9 100644 --- a/resolv/__init__.py +++ b/resolv/__init__.py @@ -1,21 +1,28 @@ import re -from resolvers import * +import resolvers def resolve(url): if re.match("https?:\/\/(www\.)?putlocker\.com", url) is not None: - return putlocker.resolve(url) + task = resolvers.PutlockerTask(url) + return task.run() elif re.match("https?:\/\/(www\.)?sockshare\.com", url) is not None: - return sockshare.resolve(url) + task = resolvers.SockshareTask(url) + return task.run() elif re.match("https?:\/\/(www\.)?1channel\.ch\/external\.php", url) is not None: - return onechannel.resolve(url) + task = resolvers.OneChannelTask(url) + return task.run() elif re.match("https?:\/\/(www\.)?youtube\.com\/watch\?", url) is not None: - return youtube.resolve(url) + task = resolvers.YoutubeTask(url) + return task.run() elif re.match("https?:\/\/(www\.)?filebox\.com\/[a-zA-Z0-9]+", url) is not None: - return filebox.resolve(url) + task = resolvers.FileboxTask(url) + return task.run() elif re.match("https?:\/\/(www\.)?pastebin\.com\/[a-zA-Z0-9]+", url) is not None: - return pastebin.resolve(url) + task = resolvers.PastebinTask(url) + return task.run() elif 
re.match("https?:\/\/(www\.)?mediafire\.com\/\?[a-z0-9]+", url) is not None: - return mediafire.resolve(url) + task = resolvers.MediafireTask(url) + return task.run() else: return {} @@ -25,10 +32,10 @@ def recurse(url): while True: result = resolve(url) - if result == {}: + if result.state != "finished": return previous_result - elif 'url' not in result: + elif result.result_type != "url": return result - url = result['url'] + url = result.results['url'] previous_result = result diff --git a/resolv/resolvers/dummy.py b/resolv/resolvers/dummy.py index deeb57f..152a78c 100644 --- a/resolv/resolvers/dummy.py +++ b/resolv/resolvers/dummy.py @@ -1,2 +1,9 @@ -def resolve(input): - return {'dummy': input} +from resolv.shared import Task + +class DummyTask(Task): + result_type = "dummy" + + def run(self): + self.results = {'dummy': self.url} + self.state = "finished" + return self diff --git a/resolv/resolvers/filebox.py b/resolv/resolvers/filebox.py index c8bdff1..cc6c27a 100644 --- a/resolv/resolvers/filebox.py +++ b/resolv/resolvers/filebox.py @@ -1,70 +1,43 @@ import re, time, urllib2 -from resolv.shared import ResolverError +from resolv.shared import ResolverError, Task -def resolve(url): - matches = re.search("https?:\/\/(www\.)?filebox\.com\/([a-zA-Z0-9]+)", url) - - if matches is None: - raise ResolverError("The provided URL is not a valid Filebox.com URL.") - - video_id = matches.group(2) - - try: - contents = urllib2.urlopen("http://www.filebox.com/embed-%s-970x543.html" % video_id).read() - except: - raise ResolverError("Could not retrieve the video page.") - - matches = re.search("url: '([^']+)',", contents) - - if matches is None: - raise ResolverError("No video was found on the specified URL.") - - video_file = matches.group(1) - - stream_dict = { - 'url' : video_file, - 'quality' : "unknown", - 'priority' : 1, - 'format' : "unknown" - } - - return { 'title': "", 'videos': [stream_dict] } - -def resolve2(url): - # This is a fallback function in case no video 
could be found through the resolve() method. - # It's not recommended to use it, as it introduces a 5 second wait. - - try: - import mechanize - except ImportError: - raise ResolverError("The Python mechanize module is required to resolve Filebox.com URLs.") - - matches = re.search("https?:\/\/(www\.)?filebox\.com\/([a-zA-Z0-9]+)", url) - - if matches is None: - raise ResolverError("The provided URL is not a valid Filebox.com URL.") - - try: - browser = mechanize.Browser() - browser.set_handle_robots(False) - browser.open(url) - except: - raise ResolverError("The Filebox.com site could not be reached.") - - time.sleep(6) - - try: - browser.select_form(nr=0) - result = browser.submit() - page = result.read() - except Exception, e: - raise ResolverError("The file was removed, or the URL is incorrect.") - - matches = re.search("this\.play\('([^']+)'\)", page) - - if matches is None: - raise ResolverError("No video file was found on the given URL; the Filebox.com server for this file may be in maintenance mode, or the given URL may not be a video file. The Filebox.com resolver currently only supports video links.") - - video_file = matches.group(1) - - return { 'title': "", 'videos': { 'video': video_file } } +class FileboxTask(Task): + result_type = "video" + + def run(self): + matches = re.search("https?:\/\/(www\.)?filebox\.com\/([a-zA-Z0-9]+)", self.url) + + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid Filebox.com URL.") + + video_id = matches.group(2) + + try: + contents = self.fetch_page("http://www.filebox.com/embed-%s-970x543.html" % video_id) + except urllib2.URLError, e: + self.state = "failed" + raise ResolverError("Could not retrieve the video page.") + + matches = re.search("url: '([^']+)',", contents) + + if matches is None: + self.state = "invalid" + raise ResolverError("No video was found on the specified URL. 
The Filebox.com resolver currently only supports videos.") + + video_file = matches.group(1) + + stream_dict = { + 'url' : video_file, + 'quality' : "unknown", + 'priority' : 1, + 'format' : "unknown" + } + + self.results = { + 'title': "", + 'videos': [stream_dict] + } + + self.state = "finished" + return self diff --git a/resolv/resolvers/mediafire.py b/resolv/resolvers/mediafire.py index 1e3641a..a3452af 100644 --- a/resolv/resolvers/mediafire.py +++ b/resolv/resolvers/mediafire.py @@ -1,28 +1,40 @@ import re, urllib2 -from resolv.shared import ResolverError, unescape +from resolv.shared import ResolverError, unescape, Task -def resolve(url): - try: - contents = urllib2.urlopen(url).read() - except: - raise ResolverError("Could not retrieve the specified URL.") +class MediafireTask(Task): + result_type = "file" - matches = re.search('kNO = "([^"]+)";', contents) - - if matches is None: - raise ResolverError("No download was found on the given URL; the server for this file may be in maintenance mode, or the given URL may not be valid. It is also possible that you have been blocked - CAPTCHA support is not yet present.") - - file_url = matches.group(1) - - try: - file_title = unescape(re.search('([^<]+)<\/title>', contents).group(1)) - except: - raise ResolverError("Could not find the download title.") - - file_dict = { - 'url' : file_url, - 'priority' : 1, - 'format' : "unknown" - } - - return { 'title': file_title, 'files': [file_dict] } + def run(self): + try: + contents = self.fetch_page(self.url) + except urllib2.URLError, e: + self.state = "failed" + raise ResolverError("Could not retrieve the specified URL.") + + matches = re.search('kNO = "([^"]+)";', contents) + + if matches is None: + self.state = "failed" + raise ResolverError("No download was found on the given URL; the server for this file may be in maintenance mode, or the given URL may not be valid. 
It is also possible that you have been blocked - CAPTCHA support is not yet present.") + + file_url = matches.group(1) + + try: + file_title = unescape(re.search('<title>([^<]+)<\/title>', contents).group(1)) + except: + self.state = "failed" + raise ResolverError("Could not find the download title.") + + file_dict = { + 'url' : file_url, + 'priority' : 1, + 'format' : "unknown" + } + + self.results = { + 'title': file_title, + 'files': [file_dict] + } + + self.state = "finished" + return self diff --git a/resolv/resolvers/onechannel.py b/resolv/resolvers/onechannel.py index d94ed7d..068096e 100644 --- a/resolv/resolvers/onechannel.py +++ b/resolv/resolvers/onechannel.py @@ -1,15 +1,22 @@ import re, base64 -from resolv.shared import ResolverError +from resolv.shared import ResolverError, Task -def resolve(url): - matches = re.search("https?:\/\/(www\.)?1channel\.ch\/external\.php\?.*url=([^&]+)", url) - - if matches is None: - raise ResolverError("The provided URL is not a valid external 1channel URL.") - - try: - real_url = base64.b64decode(matches.group(2)).strip() - except TypeError: - raise ResolverError("The provided URL is malformed.") +class OneChannelTask(Task): + result_type = "url" - return { 'url': real_url } + def run(self): + matches = re.search("https?:\/\/(www\.)?1channel\.ch\/external\.php\?.*url=([^&]+)", self.url) + + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid external 1channel URL.") + + try: + real_url = base64.b64decode(matches.group(2)).strip() + except TypeError: + self.state = "failed" + raise ResolverError("The provided URL is malformed.") + + self.results = { 'url': real_url } + self.state = "finished" + return self diff --git a/resolv/resolvers/pastebin.py b/resolv/resolvers/pastebin.py index 4f2089f..061b24b 100644 --- a/resolv/resolvers/pastebin.py +++ b/resolv/resolvers/pastebin.py @@ -1,30 +1,42 @@ -import re, urllib, urllib2 -from resolv.shared import ResolverError, unescape 
+import re, urllib2 +from resolv.shared import ResolverError, unescape, Task -def resolve(url): - matches = re.search("https?:\/\/(www\.)?pastebin\.com\/([a-zA-Z0-9]+)", url) - - if matches is None: - raise ResolverError("The provided URL is not a valid Pastebin URL.") - - paste_id = matches.group(2) - - try: - contents = urllib2.urlopen(url).read() - except: - raise ResolverError("Could not retrieve the specified URL. The specified paste may not exist.") - - matches = re.search("<h1>([^<]+)</h1>", contents) - - if matches is None: - raise ResolverError("The provided URL is not a valid paste.") - - paste_title = unescape(matches.group(1)) - - file_dict = { - 'url' : "http://pastebin.com/download.php?i=%s" % paste_id, - 'priority' : 1, - 'format' : "text" - } - - return { 'title': paste_title, 'files': [file_dict] } +class PastebinTask(Task): + result_type = "text" + + def run(self): + matches = re.search("https?:\/\/(www\.)?pastebin\.com\/([a-zA-Z0-9]+)", self.url) + + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid Pastebin URL.") + + paste_id = matches.group(2) + + try: + contents = self.fetch_page(self.url) + except urllib2.URLError, e: + self.state = "failed" + raise ResolverError("Could not retrieve the specified URL. 
The paste may not exist.") + + matches = re.search("<h1>([^<]+)</h1>", contents) + + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid paste.") + + paste_title = unescape(matches.group(1)) + + resolved = { + 'url' : "http://pastebin.com/download.php?i=%s" % paste_id, + 'priority' : 1, + 'format' : "text" + } + + self.results = { + 'title': paste_title, + 'files': [resolved] + } + + self.state = "finished" + return self diff --git a/resolv/resolvers/putlocker.py b/resolv/resolvers/putlocker.py index b603e3a..dc75ce7 100644 --- a/resolv/resolvers/putlocker.py +++ b/resolv/resolvers/putlocker.py @@ -1,62 +1,78 @@ import re -from resolv.shared import ResolverError, unescape +from resolv.shared import ResolverError, unescape, Task -def resolve(url): - try: - import mechanize - except ImportError: - raise ResolverError("The Python mechanize module is required to resolve PutLocker URLs.") +class PutlockerTask(Task): + result_type = "video" - matches = re.search("https?:\/\/(www\.)?putlocker\.com\/(file|embed)\/([A-Z0-9]+)", url) + def run(self): + try: + import mechanize + except ImportError: + self.state = "failed" + raise ResolverError("The Python mechanize module is required to resolve PutLocker URLs.") + + matches = re.search("https?:\/\/(www\.)?putlocker\.com\/(file|embed)\/([A-Z0-9]+)", self.url) - if matches is None: - raise ResolverError("The provided URL is not a valid PutLocker URL.") - - video_id = matches.group(3) - - try: - browser = mechanize.Browser() - browser.set_handle_robots(False) - browser.open("http://putlocker.com/embed/%s" % video_id) - except: - raise ResolverError("The PutLocker site could not be reached.") - - try: - browser.select_form(nr=0) - result = browser.submit() - page = result.read() - except Exception, e: - raise ResolverError("The file was removed, or the URL is incorrect.") - - matches = re.search("playlist: '([^']+)'", page) - - if matches is None: - raise ResolverError("No playlist 
was found on the given URL; the PutLocker server for this file may be in maintenance mode, or the given URL may not be a video file. The PutLocker resolver currently only supports video links.") - - playlist = matches.group(1) - - try: - browser.open("http://www.putlocker.com%s" % playlist) - except: - raise ResolverError("The playlist file for the given URL could not be loaded.") - - matches = re.search("url=\"([^\"]+)\" type=\"video\/x-flv\"", browser.response().read()) - - if matches is None: - raise ResolverError("The playlist file does not contain any video URLs. The PutLocker resolver currently only supports video links.") - - video_file = matches.group(1) - - try: - video_title = unescape(re.search('<a href="\/file\/[^"]+"[^>]*><strong>([^<]*)<\/strong><\/a>', page).group(1)) - except: - raise ResolverError("Could not find the video title.") - - stream_dict = { - 'url' : video_file, - 'quality' : "unknown", - 'priority' : 1, - 'format' : "unknown" - } - - return { 'title': video_title, 'videos': [stream_dict] } + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid PutLocker URL.") + + video_id = matches.group(3) + + try: + browser = mechanize.Browser() + browser.set_handle_robots(False) + browser.open("http://putlocker.com/embed/%s" % video_id) + except: + self.state = "failed" + raise ResolverError("The PutLocker site could not be reached.") + + try: + browser.select_form(nr=0) + result = browser.submit() + page = result.read() + except Exception, e: + self.state = "nonexistent" + raise ResolverError("The file was removed, or the URL is incorrect.") + + matches = re.search("playlist: '([^']+)'", page) + + if matches is None: + raise ResolverError("No playlist was found on the given URL; the PutLocker server for this file may be in maintenance mode, or the given URL may not be a video file. 
The PutLocker resolver currently only supports video links.") + + playlist = matches.group(1) + + try: + browser.open("http://www.putlocker.com%s" % playlist) + except: + self.state = "failed" + raise ResolverError("The playlist file for the given URL could not be loaded.") + + matches = re.search("url=\"([^\"]+)\" type=\"video\/x-flv\"", browser.response().read()) + + if matches is None: + self.state = "failed" + raise ResolverError("The playlist file does not contain any video URLs. The PutLocker resolver currently only supports video links.") + + video_file = matches.group(1) + + try: + video_title = unescape(re.search('<a href="\/file\/[^"]+"[^>]*><strong>([^<]*)<\/strong><\/a>', page).group(1)) + except: + self.state = "failed" + raise ResolverError("Could not find the video title.") + + stream_dict = { + 'url' : video_file, + 'quality' : "unknown", + 'priority' : 1, + 'format' : "unknown" + } + + self.results = { + 'title': video_title, + 'videos': [stream_dict] + } + + self.state = "finished" + return self diff --git a/resolv/resolvers/sockshare.py b/resolv/resolvers/sockshare.py index 079b110..a13fca6 100644 --- a/resolv/resolvers/sockshare.py +++ b/resolv/resolvers/sockshare.py @@ -1,62 +1,78 @@ import re -from resolv.shared import ResolverError, unescape +from resolv.shared import ResolverError, unescape, Task -def resolve(url): - try: - import mechanize - except ImportError: - raise ResolverError("The Python mechanize module is required to resolve SockShare URLs.") +class SockshareTask(Task): + result_type = "video" - matches = re.search("https?:\/\/(www\.)?sockshare\.com\/(file|embed)\/([A-Z0-9]+)", url) + def run(self): + try: + import mechanize + except ImportError: + self.state = "failed" + raise ResolverError("The Python mechanize module is required to resolve Sockshare URLs.") + + matches = re.search("https?:\/\/(www\.)?sockshare\.com\/(file|embed)\/([A-Z0-9]+)", self.url) - if matches is None: - raise ResolverError("The provided URL is not a valid 
SockShare URL.") - - video_id = matches.group(3) - - try: - browser = mechanize.Browser() - browser.set_handle_robots(False) - browser.open("http://sockshare.com/embed/%s" % video_id) - except: - raise ResolverError("The SockShare site could not be reached.") - - try: - browser.select_form(nr=0) - result = browser.submit() - page = result.read() - except Exception, e: - raise ResolverError("The file was removed, or the URL is incorrect.") - - matches = re.search("playlist: '([^']+)'", page) - - if matches is None: - raise ResolverError("No playlist was found on the given URL; the SockShare server for this file may be in maintenance mode, or the given URL may not be a video file. The SockShare resolver currently only supports video links.") - - playlist = matches.group(1) - - try: - browser.open("http://www.sockshare.com%s" % playlist) - except: - raise ResolverError("The playlist file for the given URL could not be loaded.") - - matches = re.search("url=\"([^\"]+)\" type=\"video\/x-flv\"", browser.response().read()) - - if matches is None: - raise ResolverError("The playlist file does not contain any video URLs. 
The SockShare resolver currently only supports video links.") - - video_file = matches.group(1) - - try: - video_title = unescape(re.search('<a href="\/file\/[^"]+"[^>]*><strong>([^<]*)<\/strong><\/a>', page).group(1)) - except: - raise ResolverError("Could not find the video title.") - - stream_dict = { - 'url' : video_file, - 'quality' : "unknown", - 'priority' : 1, - 'format' : "unknown" - } - - return { 'title': video_title, 'videos': [stream_dict] } + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid Sockshare URL.") + + video_id = matches.group(3) + + try: + browser = mechanize.Browser() + browser.set_handle_robots(False) + browser.open("http://sockshare.com/embed/%s" % video_id) + except: + self.state = "failed" + raise ResolverError("The Sockshare site could not be reached.") + + try: + browser.select_form(nr=0) + result = browser.submit() + page = result.read() + except Exception, e: + self.state = "nonexistent" + raise ResolverError("The file was removed, or the URL is incorrect.") + + matches = re.search("playlist: '([^']+)'", page) + + if matches is None: + raise ResolverError("No playlist was found on the given URL; the Sockshare server for this file may be in maintenance mode, or the given URL may not be a video file. The Sockshare resolver currently only supports video links.") + + playlist = matches.group(1) + + try: + browser.open("http://www.sockshare.com%s" % playlist) + except: + self.state = "failed" + raise ResolverError("The playlist file for the given URL could not be loaded.") + + matches = re.search("url=\"([^\"]+)\" type=\"video\/x-flv\"", browser.response().read()) + + if matches is None: + self.state = "failed" + raise ResolverError("The playlist file does not contain any video URLs. 
The Sockshare resolver currently only supports video links.") + + video_file = matches.group(1) + + try: + video_title = unescape(re.search('<a href="\/file\/[^"]+"[^>]*><strong>([^<]*)<\/strong><\/a>', page).group(1)) + except: + self.state = "failed" + raise ResolverError("Could not find the video title.") + + stream_dict = { + 'url' : video_file, + 'quality' : "unknown", + 'priority' : 1, + 'format' : "unknown" + } + + self.results = { + 'title': video_title, + 'videos': [stream_dict] + } + + self.state = "finished" + return self diff --git a/resolv/resolvers/youtube.py b/resolv/resolvers/youtube.py index 6ffd2e1..e76aea7 100644 --- a/resolv/resolvers/youtube.py +++ b/resolv/resolvers/youtube.py @@ -1,88 +1,117 @@ -import re, urllib, urllib2 -from resolv.shared import ResolverError, unescape +import re, urllib, urllib2, urlparse +from resolv.shared import ResolverError, unescape, Task -def resolve(url): - try: - contents = urllib2.urlopen(url).read() - except: - raise ResolverError("Could not retrieve the specified URL.") +class YoutubeTask(Task): + result_type = "video" - map_start = "url_encoded_fmt_stream_map=" - map_end = "\\u0026amp;" + extra_headers = { + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-us,en;q=0.5' + } - try: - pos_start = contents.index(map_start) + len(map_start) + 6 - snippet = contents[pos_start:] - except ValueError: - raise ResolverError("The starting position for the YouTube player configuration could not be found. 
Is the URL really a valid video page?") - - try: - pos_end = snippet.index(map_end) - stream_map = snippet[:pos_end] - except ValueError: - raise ResolverError("The ending position for the YouTube player configuration could not be found.") - - try: - stream_map = urllib.unquote(stream_map) - streams = stream_map.split(',url=') - except: - raise ResolverError("The YouTube player configuration is corrupted.") - - stream_pool = [] - - for stream in streams: - fields = stream.split('&') + def run(self): + try: + contents = self.fetch_page(self.url) + except urllib2.URLError, e: + self.state = "failed" + raise ResolverError("Could not retrieve the specified URL.") + + map_start = "url_encoded_fmt_stream_map=" + map_end = "\\u0026amp;" - if len(fields) < 5: - raise ResolverError("The amount of fields in the YouTube player configuration is incorrect.") + try: + pos_start = contents.index(map_start) + len(map_start) + snippet = contents[pos_start:] + except ValueError: + self.state = "failed" + raise ResolverError("The starting position for the YouTube player configuration could not be found. 
Is the URL really a valid video page?") - video_url = urllib.unquote(fields[0]) - quality = fields[1].split("=")[1] - fallback_host = fields[2].split("=")[1] - mimetype = urllib.unquote(fields[3].split("=")[1]) - itag = fields[4].split("=", 2)[1] + try: + pos_end = snippet.index(map_end) + stream_map = snippet[:pos_end] + except ValueError: + self.state = "failed" + raise ResolverError("The ending position for the YouTube player configuration could not be found.") - if mimetype.startswith("video/mp4"): - video_format = "mp4" - elif mimetype.startswith("video/x-flv"): - video_format = "flv" - elif mimetype.startswith("video/3gpp"): - video_format = "3gp" - elif mimetype.startswith("video/webm"): - video_format = "webm" - else: - video_format = "unknown" + try: + stream_map = urllib.unquote(stream_map) + streams = stream_map.split(',') + except: + self.state = "failed" + raise ResolverError("The YouTube player configuration is corrupted.") - if quality == "small": - video_quality = "240p" - video_priority = 5 - elif quality == "medium": - video_quality = "360p" - video_priority = 4 - elif quality == "large": - video_quality = "480p" - video_priority = 3 - elif quality == "hd720": - video_quality = "720p" - video_priority = 2 - elif quality == "hd1080": - video_quality = "1080p" - video_priority = 1 - else: - video_quality = "unknown" + stream_pool = [] - stream_dict = { - 'url' : video_url, - 'quality' : video_quality, - 'priority' : video_priority, - 'format' : video_format + for stream in streams: + fields = urlparse.parse_qs(stream) + + if len(fields) < 6: + self.state = "failed" + raise ResolverError("The amount of fields in the YouTube player configuration is incorrect.") + + signature = fields['sig'][0] + video_url = "%s&signature=%s" % (fields['url'][0], signature) + quality = fields['quality'][0] + fallback_host = fields['fallback_host'][0] + mimetype = fields['type'][0] + itag = fields['itag'][0] + + if mimetype.startswith("video/mp4"): + video_format = 
"mp4" + elif mimetype.startswith("video/x-flv"): + video_format = "flv" + elif mimetype.startswith("video/3gpp"): + video_format = "3gp" + elif mimetype.startswith("video/webm"): + video_format = "webm" + else: + video_format = "unknown" + + if quality == "small": + video_quality = "240p" + video_priority = 5 + elif quality == "medium": + video_quality = "360p" + video_priority = 4 + elif quality == "large": + video_quality = "480p" + video_priority = 3 + elif quality == "hd720": + video_quality = "720p" + video_priority = 2 + elif quality == "hd1080": + video_quality = "1080p" + video_priority = 1 + else: + video_quality = "unknown" + video_priority = 0 + print "UNKNOWN: %s" % quality + + stream_dict = { + 'url' : video_url, + 'quality' : video_quality, + 'priority' : video_priority, + 'format' : video_format, + 'extra' : { + 'itag': itag, + 'mimetype': mimetype, + 'fallback_host': fallback_host + } + } + + stream_pool.append(stream_dict) + + try: + video_title = unescape(re.search('<meta property="og:title" content="([^"]*)">', contents).group(1)) + except: + self.state = "failed" + raise ResolverError("Could not find the video title.") + + self.results = { + 'title': video_title, + 'videos': stream_pool } - stream_pool.append(stream_dict) - - try: - video_title = unescape(re.search('<meta property="og:title" content="([^"]*)">', contents).group(1)) - except: - raise ResolverError("Could not find the video title.") - - return { 'title': video_title, 'videos': stream_pool } + self.state = "finished" + return self diff --git a/resolv/shared.py b/resolv/shared.py index a668870..5598bb4 100644 --- a/resolv/shared.py +++ b/resolv/shared.py @@ -1,4 +1,5 @@ from HTMLParser import HTMLParser +import cookielib, urllib2 import sys reload(sys) @@ -11,5 +12,45 @@ class ResolverError(Exception): def __str__(self): return repr(self.val) +class Task(): + captcha = None + cookiejar = None + useragent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.11 (KHTML, like Gecko) 
Chrome/20.0.1132.47 Safari/536.11" + opener = None + results = None + state = "none" + url = "" + result_type = "none" + extra_headers = {} + + def __init__(self, url): + self.cookiejar = cookielib.CookieJar() + + self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar)) + self.opener.addheaders = [] + + self.extra_headers['User-agent'] = self.useragent + + for header, payload in self.extra_headers.iteritems(): + self.opener.addheaders.append((header, payload)) + + self.url = url + + def run(self): + self.state = "finished" + self.results = self.url + return self + + def fetch_page(self, url): + return self.opener.open(url).read() + +class Captcha(): + image = "" + audio = "" + + def __init__(image="", audio=""): + self.image = image + self.audio = audio + def unescape(s): return HTMLParser.unescape.__func__(HTMLParser, s) From 1d8332a7766ef36394ceb1cf396c4ce5bd4ba56f Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 18:49:53 +0200 Subject: [PATCH 02/17] Fixes --- resolv/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/resolv/__init__.py b/resolv/__init__.py index aceb7e9..8b58b0f 100644 --- a/resolv/__init__.py +++ b/resolv/__init__.py @@ -1,6 +1,8 @@ import re import resolvers +from resolv.shared import ResolverError + def resolve(url): if re.match("https?:\/\/(www\.)?putlocker\.com", url) is not None: task = resolvers.PutlockerTask(url) @@ -24,7 +26,7 @@ def resolve(url): task = resolvers.MediafireTask(url) return task.run() else: - return {} + raise ResolverError("No suitable resolver found for %s" % url) def recurse(url): previous_result = {} @@ -32,7 +34,7 @@ def recurse(url): while True: result = resolve(url) - if result.state != "finished": + if result.state == "failed": return previous_result elif result.result_type != "url": return result From 1fb10f2a559073fa8049e68f778c78fb7e2808f9 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> 
Date: Fri, 26 Oct 2012 18:50:28 +0200 Subject: [PATCH 03/17] Pass on the correct referer and add support for POST requests --- resolv/shared.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/resolv/shared.py b/resolv/shared.py index 5598bb4..3a0c5c9 100644 --- a/resolv/shared.py +++ b/resolv/shared.py @@ -1,5 +1,5 @@ from HTMLParser import HTMLParser -import cookielib, urllib2 +import cookielib, urllib, urllib2 import sys reload(sys) @@ -22,6 +22,7 @@ class Task(): url = "" result_type = "none" extra_headers = {} + last_url = "" def __init__(self, url): self.cookiejar = cookielib.CookieJar() @@ -42,7 +43,23 @@ class Task(): return self def fetch_page(self, url): - return self.opener.open(url).read() + request = urllib2.Request(url) + + if self.last_url != "": + request.add_header("Referer", self.last_url) + + self.last_url = url + return self.opener.open(request).read() + + def post_page(self, url, data): + payload = urllib.urlencode(data) + request = urllib2.Request(url, payload) + + if self.last_url != "": + request.add_header("Referer", self.last_url) + + self.last_url = url + return self.opener.open(request).read() class Captcha(): image = "" From 0238b5b31debb7f28ff860027656eae9c12de422 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 18:50:49 +0200 Subject: [PATCH 04/17] Add support for password-protected downloads --- resolv/resolvers/mediafire.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/resolv/resolvers/mediafire.py b/resolv/resolvers/mediafire.py index a3452af..228c32d 100644 --- a/resolv/resolvers/mediafire.py +++ b/resolv/resolvers/mediafire.py @@ -11,10 +11,28 @@ class MediafireTask(Task): self.state = "failed" raise ResolverError("Could not retrieve the specified URL.") + if '<form name="form_password"' in contents: + # The file is password-protected + self.state = "need_password" + return self + else: + return self._find_link(contents) 
+ + def verify_password(self, password): + contents = self.post_page(self.url, {'downloadp': password}) + + if '<form name="form_password"' in contents: + self.state = "password_invalid" + return self + else: + return self._find_link(contents) + + def _find_link(self, contents): matches = re.search('kNO = "([^"]+)";', contents) if matches is None: self.state = "failed" + print contents raise ResolverError("No download was found on the given URL; the server for this file may be in maintenance mode, or the given URL may not be valid. It is also possible that you have been blocked - CAPTCHA support is not yet present.") file_url = matches.group(1) From bf986393b992365183417fac213d0c3151726b32 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 19:06:00 +0200 Subject: [PATCH 05/17] Add testing script --- test.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 0000000..18d5ca9 --- /dev/null +++ b/test.py @@ -0,0 +1,76 @@ +import resolv, urllib, urllib2, argparse + +# TODO: +# http://www.mediafire.com/view/?vxltbkr2l9ycmah => http://www.mediafire.com/?vxltbkr2l9ycmah + +suites = { + '1channel': { + "1channel + PutLocker (video)": "http://www.1channel.ch/external.php?title=Big+Buck+Bunny&url=aHR0cDovL3d3dy5wdXRsb2NrZXIuY29tL2ZpbGUvOTg3RkVCRjVEQjY0NUUyRQ==&domain=cHV0bG9ja2VyLmNvbQ==&loggedin=0" + }, + 'putlocker': { + "PutLocker (video)": "http://www.putlocker.com/file/987FEBF5DB645E2E", + "SockShare (video)": "http://www.sockshare.com/file/88DF2133C85521BD" + }, + 'filebox': { + "Filebox (video)": "http://www.filebox.com/p0rp8nabrcfk" + }, + 'pastebin': { + "Pastebin": "http://pastebin.com/imyEc26g" + }, + 'mediafire': { + "MediaFire": "http://www.mediafire.com/?vxltbkr2l9ycmah", + "MediaFire with password (mfddl)": "http://www.mediafire.com/?traa1p0lki9611h" + }, + 'youtube': { + "YouTube": 
"http://www.youtube.com/watch?v=XSGBVzeBUbk" + } +} + +parser = argparse.ArgumentParser(description='Testing script for the resolv library.') + +parser.add_argument('suites', metavar='SUITE', type=str, nargs='*', + help='suites to test (leave empty to test all suites)') + +args = parser.parse_args() +options = vars(args) + +to_test = {} + +if len(options['suites']) == 0: + for suite in suites: + for description, url in suite.iteritems(): + to_test[description] = url +else: + for suite in options['suites']: + for description, url in suites[suite].iteritems(): + to_test[description] = url + +def process_result(res): + if res.state == "finished": + print "Successful!\nType: %s\nResults: %s\nCookie jar: %s" % (res.result_type, str(res.results), str(res.cookiejar)) + elif res.state == "failed": + print "Failed." + elif res.state == "invalid": + print "Invalid URL." + elif res.state == "need_password": + pw = raw_input("Password required. Enter password: ") + res.verify_password(pw) + process_result(res) + elif res.state == "password_invalid": + pw = raw_input("Password invalid! 
Try again: ") + res.verify_password(pw) + process_result(res) + else: + print "Unknown result state: %s" % res.state + +for title, url in to_test.iteritems(): + print "============ %s ============" % title + print "RESOLVE:" + res = resolv.resolve(url) + process_result(res) + print "" + print "RECURSE:" + res = resolv.recurse(url) + process_result(res) + print "" + From b1c0bf1a26517dd14cfbe1f9553e9684f2779552 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 19:14:09 +0200 Subject: [PATCH 06/17] Make Captcha class more sensible --- resolv/shared.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/resolv/shared.py b/resolv/shared.py index 3a0c5c9..43dbd7c 100644 --- a/resolv/shared.py +++ b/resolv/shared.py @@ -62,12 +62,14 @@ class Task(): return self.opener.open(request).read() class Captcha(): - image = "" - audio = "" + image = None + audio = None + text = None - def __init__(image="", audio=""): + def __init__(image=None, audio=None, text=None): self.image = image self.audio = audio + self.text = text def unescape(s): return HTMLParser.unescape.__func__(HTMLParser, s) From 414aa6ba52ec8744fc00bc2ca4668c011a8a4380 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 19:16:21 +0200 Subject: [PATCH 07/17] Add functions to Captcha class for retrieving the audio and image captcha representations using the corresponding Tasks cookie jar --- resolv/shared.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/resolv/shared.py b/resolv/shared.py index 43dbd7c..32ba4a8 100644 --- a/resolv/shared.py +++ b/resolv/shared.py @@ -65,11 +65,19 @@ class Captcha(): image = None audio = None text = None + task = None - def __init__(image=None, audio=None, text=None): + def __init__(self, task, image=None, audio=None, text=None): self.image = image self.audio = audio self.text = text + self.task = task + + def get_image(self): + return 
self.task.fetch_page(self.image) + + def get_audio(self): + return self.task.fetch_page(self.audio) def unescape(s): return HTMLParser.unescape.__func__(HTMLParser, s) From 974be1cb132d61776b7bccc541f5920c75eadcce Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 23:30:49 +0200 Subject: [PATCH 08/17] Fix video priority --- resolv/resolvers/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resolv/resolvers/youtube.py b/resolv/resolvers/youtube.py index e76aea7..8f766b2 100644 --- a/resolv/resolvers/youtube.py +++ b/resolv/resolvers/youtube.py @@ -85,11 +85,12 @@ class YoutubeTask(Task): video_priority = 1 else: video_quality = "unknown" - video_priority = 0 + video_priority = 10 print "UNKNOWN: %s" % quality stream_dict = { 'url' : video_url, + 'method' : "GET", 'quality' : video_quality, 'priority' : video_priority, 'format' : video_format, From 4ef042b603f150c67492457d534e6f25dd4d6c08 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 23:31:09 +0200 Subject: [PATCH 09/17] Add placeholder functions for CAPTCHAs and passwords --- resolv/shared.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/resolv/shared.py b/resolv/shared.py index 32ba4a8..4c583a9 100644 --- a/resolv/shared.py +++ b/resolv/shared.py @@ -60,6 +60,22 @@ class Task(): self.last_url = url return self.opener.open(request).read() + + def verify_password(password): + # Has to be overridden by inherited classes. + pass + + def verify_image_captcha(solution): + # Has to be overridden by inherited classes. + pass + + def verify_audio_captcha(solution): + # Has to be overridden by inherited classes. + pass + + def verify_text_captcha(solution): + # Has to be overridden by inherited classes. 
+ pass class Captcha(): image = None From a8186e46ed7499349c31295bb854bb8bc3ac04c0 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 23:33:09 +0200 Subject: [PATCH 10/17] Fix bug where specifying no testing suites would break the test script --- test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.py b/test.py index 18d5ca9..5d6b356 100644 --- a/test.py +++ b/test.py @@ -37,7 +37,7 @@ options = vars(args) to_test = {} if len(options['suites']) == 0: - for suite in suites: + for key, suite in suites.iteritems(): for description, url in suite.iteritems(): to_test[description] = url else: From d6dd91cd853e99b524b97ebe9e2db8cd3ec44ed5 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Fri, 26 Oct 2012 23:33:17 +0200 Subject: [PATCH 11/17] Add metadata and HTTP methods --- resolv/resolvers/dummy.py | 4 ++++ resolv/resolvers/filebox.py | 5 +++++ resolv/resolvers/mediafire.py | 5 +++++ resolv/resolvers/onechannel.py | 4 ++++ resolv/resolvers/pastebin.py | 5 +++++ resolv/resolvers/putlocker.py | 5 +++++ resolv/resolvers/sockshare.py | 5 +++++ resolv/resolvers/youtube.py | 4 ++++ 8 files changed, 37 insertions(+) diff --git a/resolv/resolvers/dummy.py b/resolv/resolvers/dummy.py index 152a78c..7adb959 100644 --- a/resolv/resolvers/dummy.py +++ b/resolv/resolvers/dummy.py @@ -3,6 +3,10 @@ from resolv.shared import Task class DummyTask(Task): result_type = "dummy" + name = "Dummy Resolver" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + def run(self): self.results = {'dummy': self.url} self.state = "finished" diff --git a/resolv/resolvers/filebox.py b/resolv/resolvers/filebox.py index cc6c27a..4400e5a 100644 --- a/resolv/resolvers/filebox.py +++ b/resolv/resolvers/filebox.py @@ -4,6 +4,10 @@ from resolv.shared import ResolverError, Task class FileboxTask(Task): result_type = "video" + name = "Filebox.com" + author = "Sven Slootweg" + author_url = 
"http://cryto.net/~joepie91" + def run(self): matches = re.search("https?:\/\/(www\.)?filebox\.com\/([a-zA-Z0-9]+)", self.url) @@ -29,6 +33,7 @@ class FileboxTask(Task): stream_dict = { 'url' : video_file, + 'method' : "GET", 'quality' : "unknown", 'priority' : 1, 'format' : "unknown" diff --git a/resolv/resolvers/mediafire.py b/resolv/resolvers/mediafire.py index 228c32d..bc5f405 100644 --- a/resolv/resolvers/mediafire.py +++ b/resolv/resolvers/mediafire.py @@ -4,6 +4,10 @@ from resolv.shared import ResolverError, unescape, Task class MediafireTask(Task): result_type = "file" + name = "MediaFire" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + def run(self): try: contents = self.fetch_page(self.url) @@ -45,6 +49,7 @@ class MediafireTask(Task): file_dict = { 'url' : file_url, + 'method' : "GET", 'priority' : 1, 'format' : "unknown" } diff --git a/resolv/resolvers/onechannel.py b/resolv/resolvers/onechannel.py index 068096e..ad9f5ad 100644 --- a/resolv/resolvers/onechannel.py +++ b/resolv/resolvers/onechannel.py @@ -4,6 +4,10 @@ from resolv.shared import ResolverError, Task class OneChannelTask(Task): result_type = "url" + name = "1channel" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + def run(self): matches = re.search("https?:\/\/(www\.)?1channel\.ch\/external\.php\?.*url=([^&]+)", self.url) diff --git a/resolv/resolvers/pastebin.py b/resolv/resolvers/pastebin.py index 061b24b..02a5a22 100644 --- a/resolv/resolvers/pastebin.py +++ b/resolv/resolvers/pastebin.py @@ -4,6 +4,10 @@ from resolv.shared import ResolverError, unescape, Task class PastebinTask(Task): result_type = "text" + name = "Pastebin" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + def run(self): matches = re.search("https?:\/\/(www\.)?pastebin\.com\/([a-zA-Z0-9]+)", self.url) @@ -29,6 +33,7 @@ class PastebinTask(Task): resolved = { 'url' : "http://pastebin.com/download.php?i=%s" % paste_id, + 'method' : "GET", 'priority' : 
1, 'format' : "text" } diff --git a/resolv/resolvers/putlocker.py b/resolv/resolvers/putlocker.py index dc75ce7..96e8058 100644 --- a/resolv/resolvers/putlocker.py +++ b/resolv/resolvers/putlocker.py @@ -4,6 +4,10 @@ from resolv.shared import ResolverError, unescape, Task class PutlockerTask(Task): result_type = "video" + name = "PutLocker" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + def run(self): try: import mechanize @@ -64,6 +68,7 @@ class PutlockerTask(Task): stream_dict = { 'url' : video_file, + 'method' : "GET", 'quality' : "unknown", 'priority' : 1, 'format' : "unknown" diff --git a/resolv/resolvers/sockshare.py b/resolv/resolvers/sockshare.py index a13fca6..3e34866 100644 --- a/resolv/resolvers/sockshare.py +++ b/resolv/resolvers/sockshare.py @@ -4,6 +4,10 @@ from resolv.shared import ResolverError, unescape, Task class SockshareTask(Task): result_type = "video" + name = "SockShare" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + def run(self): try: import mechanize @@ -64,6 +68,7 @@ class SockshareTask(Task): stream_dict = { 'url' : video_file, + 'method' : "GET", 'quality' : "unknown", 'priority' : 1, 'format' : "unknown" diff --git a/resolv/resolvers/youtube.py b/resolv/resolvers/youtube.py index 8f766b2..a1cde41 100644 --- a/resolv/resolvers/youtube.py +++ b/resolv/resolvers/youtube.py @@ -4,6 +4,10 @@ from resolv.shared import ResolverError, unescape, Task class YoutubeTask(Task): result_type = "video" + name = "YouTube" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + extra_headers = { 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', From ed940b4ee4c6516fd665672d0e11a3e6d3c1dca5 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Sat, 27 Oct 2012 07:57:23 +0200 Subject: [PATCH 12/17] Improve error handling --- resolv/resolvers/filebox.py | 4 ++-- 
resolv/resolvers/mediafire.py | 6 +++--- resolv/resolvers/onechannel.py | 4 ++-- resolv/resolvers/putlocker.py | 11 ++++++----- resolv/resolvers/sockshare.py | 11 ++++++----- resolv/resolvers/youtube.py | 18 +++++++++++------- resolv/shared.py | 7 +++++++ setup.py | 2 +- 8 files changed, 38 insertions(+), 25 deletions(-) diff --git a/resolv/resolvers/filebox.py b/resolv/resolvers/filebox.py index 4400e5a..0fc14ed 100644 --- a/resolv/resolvers/filebox.py +++ b/resolv/resolvers/filebox.py @@ -1,5 +1,5 @@ import re, time, urllib2 -from resolv.shared import ResolverError, Task +from resolv.shared import ResolverError, TechnicalError, Task class FileboxTask(Task): result_type = "video" @@ -21,7 +21,7 @@ class FileboxTask(Task): contents = self.fetch_page("http://www.filebox.com/embed-%s-970x543.html" % video_id) except urllib2.URLError, e: self.state = "failed" - raise ResolverError("Could not retrieve the video page.") + raise TechnicalError("Could not retrieve the video page.") matches = re.search("url: '([^']+)',", contents) diff --git a/resolv/resolvers/mediafire.py b/resolv/resolvers/mediafire.py index bc5f405..a891ab1 100644 --- a/resolv/resolvers/mediafire.py +++ b/resolv/resolvers/mediafire.py @@ -1,5 +1,5 @@ import re, urllib2 -from resolv.shared import ResolverError, unescape, Task +from resolv.shared import ResolverError, TechnicalError, unescape, Task class MediafireTask(Task): result_type = "file" @@ -13,7 +13,7 @@ class MediafireTask(Task): contents = self.fetch_page(self.url) except urllib2.URLError, e: self.state = "failed" - raise ResolverError("Could not retrieve the specified URL.") + raise TechnicalError("Could not retrieve the specified URL.") if '<form name="form_password"' in contents: # The file is password-protected @@ -45,7 +45,7 @@ class MediafireTask(Task): file_title = unescape(re.search('<title>([^<]+)<\/title>', contents).group(1)) except: self.state = "failed" - raise ResolverError("Could not find the download title.") + raise 
TechnicalError("Could not find the download title.") file_dict = { 'url' : file_url, diff --git a/resolv/resolvers/onechannel.py b/resolv/resolvers/onechannel.py index ad9f5ad..164df22 100644 --- a/resolv/resolvers/onechannel.py +++ b/resolv/resolvers/onechannel.py @@ -1,5 +1,5 @@ import re, base64 -from resolv.shared import ResolverError, Task +from resolv.shared import ResolverError, TechnicalError, Task class OneChannelTask(Task): result_type = "url" @@ -19,7 +19,7 @@ class OneChannelTask(Task): real_url = base64.b64decode(matches.group(2)).strip() except TypeError: self.state = "failed" - raise ResolverError("The provided URL is malformed.") + raise TechnicalError("The provided URL is malformed.") self.results = { 'url': real_url } self.state = "finished" diff --git a/resolv/resolvers/putlocker.py b/resolv/resolvers/putlocker.py index 96e8058..8a66af4 100644 --- a/resolv/resolvers/putlocker.py +++ b/resolv/resolvers/putlocker.py @@ -1,5 +1,5 @@ import re -from resolv.shared import ResolverError, unescape, Task +from resolv.shared import ResolverError, TechnicalError, unescape, Task class PutlockerTask(Task): result_type = "video" @@ -13,7 +13,7 @@ class PutlockerTask(Task): import mechanize except ImportError: self.state = "failed" - raise ResolverError("The Python mechanize module is required to resolve PutLocker URLs.") + raise TechnicalError("The Python mechanize module is required to resolve PutLocker URLs.") matches = re.search("https?:\/\/(www\.)?putlocker\.com\/(file|embed)\/([A-Z0-9]+)", self.url) @@ -29,7 +29,7 @@ class PutlockerTask(Task): browser.open("http://putlocker.com/embed/%s" % video_id) except: self.state = "failed" - raise ResolverError("The PutLocker site could not be reached.") + raise TechnicalError("The PutLocker site could not be reached.") try: browser.select_form(nr=0) @@ -42,6 +42,7 @@ class PutlockerTask(Task): matches = re.search("playlist: '([^']+)'", page) if matches is None: + self.state = "failed" raise ResolverError("No 
playlist was found on the given URL; the PutLocker server for this file may be in maintenance mode, or the given URL may not be a video file. The PutLocker resolver currently only supports video links.") playlist = matches.group(1) @@ -50,7 +51,7 @@ class PutlockerTask(Task): browser.open("http://www.putlocker.com%s" % playlist) except: self.state = "failed" - raise ResolverError("The playlist file for the given URL could not be loaded.") + raise TechnicalError("The playlist file for the given URL could not be loaded.") matches = re.search("url=\"([^\"]+)\" type=\"video\/x-flv\"", browser.response().read()) @@ -64,7 +65,7 @@ class PutlockerTask(Task): video_title = unescape(re.search('<a href="\/file\/[^"]+"[^>]*><strong>([^<]*)<\/strong><\/a>', page).group(1)) except: self.state = "failed" - raise ResolverError("Could not find the video title.") + raise TechnicalError("Could not find the video title.") stream_dict = { 'url' : video_file, diff --git a/resolv/resolvers/sockshare.py b/resolv/resolvers/sockshare.py index 3e34866..6d9e080 100644 --- a/resolv/resolvers/sockshare.py +++ b/resolv/resolvers/sockshare.py @@ -1,5 +1,5 @@ import re -from resolv.shared import ResolverError, unescape, Task +from resolv.shared import ResolverError, TechnicalError, unescape, Task class SockshareTask(Task): result_type = "video" @@ -13,7 +13,7 @@ class SockshareTask(Task): import mechanize except ImportError: self.state = "failed" - raise ResolverError("The Python mechanize module is required to resolve Sockshare URLs.") + raise TechnicalError("The Python mechanize module is required to resolve Sockshare URLs.") matches = re.search("https?:\/\/(www\.)?sockshare\.com\/(file|embed)\/([A-Z0-9]+)", self.url) @@ -29,7 +29,7 @@ class SockshareTask(Task): browser.open("http://sockshare.com/embed/%s" % video_id) except: self.state = "failed" - raise ResolverError("The Sockshare site could not be reached.") + raise TechnicalError("The Sockshare site could not be reached.") try: 
browser.select_form(nr=0) @@ -42,6 +42,7 @@ class SockshareTask(Task): matches = re.search("playlist: '([^']+)'", page) if matches is None: + self.state = "failed" raise ResolverError("No playlist was found on the given URL; the Sockshare server for this file may be in maintenance mode, or the given URL may not be a video file. The Sockshare resolver currently only supports video links.") playlist = matches.group(1) @@ -50,7 +51,7 @@ class SockshareTask(Task): browser.open("http://www.sockshare.com%s" % playlist) except: self.state = "failed" - raise ResolverError("The playlist file for the given URL could not be loaded.") + raise TechnicalError("The playlist file for the given URL could not be loaded.") matches = re.search("url=\"([^\"]+)\" type=\"video\/x-flv\"", browser.response().read()) @@ -64,7 +65,7 @@ class SockshareTask(Task): video_title = unescape(re.search('<a href="\/file\/[^"]+"[^>]*><strong>([^<]*)<\/strong><\/a>', page).group(1)) except: self.state = "failed" - raise ResolverError("Could not find the video title.") + raise TechnicalError("Could not find the video title.") stream_dict = { 'url' : video_file, diff --git a/resolv/resolvers/youtube.py b/resolv/resolvers/youtube.py index a1cde41..d6f5b7c 100644 --- a/resolv/resolvers/youtube.py +++ b/resolv/resolvers/youtube.py @@ -1,5 +1,5 @@ import re, urllib, urllib2, urlparse -from resolv.shared import ResolverError, unescape, Task +from resolv.shared import ResolverError, TechnicalError, unescape, Task class YoutubeTask(Task): result_type = "video" @@ -19,7 +19,11 @@ class YoutubeTask(Task): contents = self.fetch_page(self.url) except urllib2.URLError, e: self.state = "failed" - raise ResolverError("Could not retrieve the specified URL.") + raise TechnicalError("Could not retrieve the specified URL.") + + if '<meta property="og:video:type"' not in contents: + self.state = "invalid" + raise ResolverError("The specified URL is not a valid YouTube video.") map_start = "url_encoded_fmt_stream_map=" 
map_end = "\\u0026amp;" @@ -29,21 +33,21 @@ class YoutubeTask(Task): snippet = contents[pos_start:] except ValueError: self.state = "failed" - raise ResolverError("The starting position for the YouTube player configuration could not be found. Is the URL really a valid video page?") + raise TechnicalError("The starting position for the YouTube player configuration could not be found. Is the URL really a valid video page?") try: pos_end = snippet.index(map_end) stream_map = snippet[:pos_end] except ValueError: self.state = "failed" - raise ResolverError("The ending position for the YouTube player configuration could not be found.") + raise TechnicalError("The ending position for the YouTube player configuration could not be found.") try: stream_map = urllib.unquote(stream_map) streams = stream_map.split(',') except: self.state = "failed" - raise ResolverError("The YouTube player configuration is corrupted.") + raise TechnicalError("The YouTube player configuration is corrupted.") stream_pool = [] @@ -52,7 +56,7 @@ class YoutubeTask(Task): if len(fields) < 6: self.state = "failed" - raise ResolverError("The amount of fields in the YouTube player configuration is incorrect.") + raise TechnicalError("The amount of fields in the YouTube player configuration is incorrect.") signature = fields['sig'][0] video_url = "%s&signature=%s" % (fields['url'][0], signature) @@ -111,7 +115,7 @@ class YoutubeTask(Task): video_title = unescape(re.search('<meta property="og:title" content="([^"]*)">', contents).group(1)) except: self.state = "failed" - raise ResolverError("Could not find the video title.") + raise TechnicalError("Could not find the video title.") self.results = { 'title': video_title, diff --git a/resolv/shared.py b/resolv/shared.py index 4c583a9..94ba755 100644 --- a/resolv/shared.py +++ b/resolv/shared.py @@ -11,6 +11,13 @@ class ResolverError(Exception): def __str__(self): return repr(self.val) + +class TechnicalError(Exception): + def __init__(self, value): + 
self.val = value + + def __str__(self): + return repr(self.val) class Task(): captcha = None diff --git a/setup.py b/setup.py index 639f629..93b19fa 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='resolv', - version='1.1.0', + version='1.2.0', description='Module for resolving URLs from filehosters, video hosters, and other content hosters', author='Sven Slootweg', author_email='resolv@cryto.net', From e93339d1655853be6710cef4c0049b1ea6cdc97e Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Sun, 28 Oct 2012 15:43:28 +0100 Subject: [PATCH 13/17] Add documentation --- docs/developers.md | 357 +++++++++++++++++++++++++++++++++++++++++++++ docs/structures.md | 305 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 662 insertions(+) create mode 100644 docs/developers.md create mode 100644 docs/structures.md diff --git a/docs/developers.md b/docs/developers.md new file mode 100644 index 0000000..4a018d3 --- /dev/null +++ b/docs/developers.md @@ -0,0 +1,357 @@ +# Documentation for developers + +The majority of this document will apply to both third-party developers ("users") of the resolv library, and core/plugin developers writing code for resolv itself. Where necessary, a distinction is made using the terms "users" (for third-party developers that use resolv in their project as a library) and "developers" (for developers that work on either the core or the plugins for the resolv library). + +## Purpose + +The purpose of python-resolv is quite simple: to provide a reusable library for resolving URLs. "Resolving" in this context refers to various things; for example: + + * The resolution of an obfuscated 'external' 1channel URL to the real URL of a stream. + * The resolution of a YouTube URL to directly streamable video files that can for example be downloaded via wget, or streamed via VLC Media Player (in various qualities). 
+ * The resolution of a Mediafire page URL to a wget-able direct URL. + * The resolution of a Pastebin URL to a 'raw' version, including the fetching of the title. + * And so on, and so on... + +Basically, resolv's purpose is to turn any kind of URL into the most 'direct' URL that can be acquired for either streaming or downloading, in such a way that it can easily be integrated into third-party software (such as a download manager, media player, etc.). + +## Technical summary + +The resolv library is a Python module - this means it can be imported like any other module and used in any Python application or application that supports Python scripting. Each "resolver" - a 'plugin' to resolve URLs for a certain service - is its own class, inheriting from the Task base class. A task may either be finished immediately, or require further user input (for example, a password or a CAPTCHA solution). The final result is a nested dictionary with the information that is necessary for downloading or streaming. The library can be kept up to date independently via its PyPI packages.
+ +## Currently supported services: + +<table> + <tr> + <th>Name</th> + <th>Supported URL types</th> + <th>Supports CAPTCHAs</th> + <th>Class name</th> + </tr> + <tr> + <td>Pastebin</td> + <td>All URLs</td> + <td>n/a</td> + <td>PastebinTask</td> + </tr> + <tr> + <td>YouTube</td> + <td>All URLs</td> + <td>No</td> + <td>YoutubeTask</td> + </tr> + <tr> + <td>Mediafire</td> + <td>Files, password-protected files</td> + <td>No</td> + <td>MediafireTask</td> + </tr> + <tr> + <td>PutLocker</td> + <td>Videos</td> + <td>n/a</td> + <td>PutlockerTask</td> + </tr> + <tr> + <td>SockShare</td> + <td>Videos</td> + <td>n/a</td> + <td>SockshareTask</td> + </tr> + <tr> + <td>FileBox.com</td> + <td>Videos</td> + <td>n/a</td> + <td>FileboxTask</td> + </tr> + <tr> + <td>1channel</td> + <td>Obfuscated external URLs</td> + <td>n/a</td> + <td>OneChannelTask</td> + </tr> + <tr> + <td>VidX Den</td> + <td>All URLs</td> + <td>n/a</td> + <td>VidxdenTask</td> + </tr> + <tr> + <td>VidBux</td> + <td>All URLs</td> + <td>n/a</td> + <td>VidbuxTask</td> + </tr> +</table> + +## Getting started + +To install resolv, you can use `pip install resolv` or the PyPI-using package manager of your choice. To update resolv, run `pip install --upgrade resolv`. + +To start using the resolv library in your code (or in a Python shell), simply `import resolv`. + +## Resolving + +The resolv library can be used in two ways: either by using a specific resolver directly, or by letting the library figure out what resolver to use for a URL. + +### Automatically detecting the needed resolver + +If you want to have the library automatically figure out what resolver to use, there are two functions available for you that only differ slightly: + +#### resolv.resolve(url) +*Returns:* an instance of a Task-derived class, depending on the resolver used. + +This function finds the needed resolver, attempts to complete resolution, and returns the newly created task.
+ +Example: + + >>> import resolv + >>> task = resolv.resolve("http://www.1channel.ch/external.php?title=Big+Buck+Bunny&url=aHR0cDovL3d3dy5wdXRsb2NrZXIuY29tL2ZpbGUvOTg3RkVCRjVEQjY0NUUyRQ==&domain=cHV0bG9ja2VyLmNvbQ==&loggedin=0") + >>> task.state + 'finished' + >>> task.results + {'url': 'http://www.putlocker.com/file/987FEBF5DB645E2E'} + +#### resolv.recurse(url) +*Returns:* an instance of a Task-derived class, depending on the resolver used. + +This function does the same as resolv.resolve(), but will only return the result if it is not a deobfuscated URL, or if the 'next hop' failed to resolve. This means that, for example, running resolv.recurse() on an obfuscated 1channel link will not return the URL behind that obfuscated link, but a *resolved* version of that URL. + +Example: + + >>> import resolv + >>> task = resolv.recurse("http://www.1channel.ch/external.php?title=Big+Buck+Bunny&url=aHR0cDovL3d3dy5wdXRsb2NrZXIuY29tL2ZpbGUvOTg3RkVCRjVEQjY0NUUyRQ==&domain=cHV0bG9ja2VyLmNvbQ==&loggedin=0") + >>> task.state + 'finished' + >>> task.results + {'videos': [{'url': "http://media-a9.putlocker.com/download/41/1246281_61c1d.flv?h=fMVE5HhbDbqv_WuXJZHSXw&e=1351276025&f='1246281_61c1d.flv'", 'priority': 1, 'quality': 'unknown', 'method': 'GET', 'format': 'unknown'}], 'title': 'Big Buck Bunny 39'} + +### Manually picking a resolver + +If you wish to only resolve a certain type of URL, you can manually pick a resolver for a certain site. Simply create a new instance of resolv.resolvers.*classname* with the target URL as argument, and call the run() method. + +Let's say that we want to, for example, resolve a specific Putlocker URL. 
We would do this: + + >>> import resolv + >>> task = resolv.resolvers.PutlockerTask("http://www.putlocker.com/file/987FEBF5DB645E2E") + >>> task.run() + >>> task.state + 'finished' + >>> task.results + {'videos': [{'url': "http://media-a9.putlocker.com/download/41/1246281_61c1d.flv?h=OFwiLT3SwZxenCNEt5650g&e=1351276212&f='1246281_61c1d.flv'", 'priority': 1, 'quality': 'unknown', 'method': 'GET', 'format': 'unknown'}], 'title': 'Big Buck Bunny 39'} + +It's really that simple! + +## Catching exceptions + +All user-related exceptions (deleted files, and such) thrown by the resolv module are `resolv.shared.ResolverError` exceptions. It's recommended to do the following to make error-catching easier: + + from resolv.shared import ResolverError + +After doing this, you can simply refer to `ResolverError` directly, instead of `resolv.shared.ResolverError`. + +The error message will always have a user-friendly description explaining what went wrong. It is feasible to directly show these error messages to the end user when they occur. + +In a similar way, `resolv.shared.TechnicalError` is used for technical failures that are likely to indicate a broken resolver. + +Technical errors should typically be logged, and a generic 'broken' message should be shown to the user. + +## Dealing with results + +The run() method will return the relevant task (this will usually be itself, although it is technically possible to create a new task and return that instead). This instance of a Task-derived class will have certain information and functions available. + +### Task.name + +The name of the resolver. This is usually the name of the site. + +### Task.author + +The author of the resolver. + +### Task.author_url + +The URL for this resolver (this will typically be the site of the author, a repository, etc.) + +### Task.state + +The state that the task is in - this is guaranteed to be set after calling run(). 
The state may be any of the following: + +<table> + <tr> + <th>Name</th> + <th>Description</th> + </tr> + <tr> + <td>blank</td> + <td>The run() method has not been called yet.</td> + </tr> + <tr> + <td>finished</td> + <td>The URL was successfully resolved.</td> + </tr> + <tr> + <td>need_password</td> + <td>A password is required to resolve this URL.</td> + </tr> + <tr> + <td>password_invalid</td> + <td>The provided password was incorrect.</td> + </tr> + <tr> + <td>need_captcha</td> + <td>A CAPTCHA needs to be solved to continue resolving.</td> + </tr> + <tr> + <td>captcha_invalid</td> + <td>The given CAPTCHA response was incorrect.</td> + </tr> + <tr> + <td>invalid</td> + <td>The URL is invalid for this resolver.</td> + </tr> + <tr> + <td>unsupported</td> + <td>This type of URL is not supported by this resolver.</td> + </tr> + <tr> + <td>failed</td> + <td>The resolution failed for some other reason.</td> + </tr> +</table> + +How to handle these situations is up to your application. + +### Task.result_type + +This variable holds the type of result that the Task holds. It can be any of `url` (for deobfuscated and un-shortened URLs), `file` (for downloadable files), `text` (for pastebins and such), `video` (for streaming video), `audio` (for streaming audio), and `image` for embeddable images. A special type is `dummy` which is used by the `dummy` resolver, but may also appear in other resolvers for testing purposes. For all practical purposes, `dummy` results should be ignored. + +__Important:__ Do *not* use this variable to determine whether resolution was successful. A resolver may set this variable before doing any resolution, if the resolver only supports one kind of result. + +### Task.results + +This variable holds the results of the resolution. The format of these results will differ depending on the result type. When successfully resolving a URL, the results will always be in the form of a dictionary. 
+ +Further documentation on the structure of these dictionaries for each result type can be found in structures.md. + +### Task.captcha + +If solving a CAPTCHA is required (as indicated by the `need_captcha` state), this variable will hold a Captcha object. The Captcha class is documented further down in this document. + +### Task.cookiejar + +The cookielib Cookie Jar that is used for this task. + +### Task.run() + +*Returns:* An instance of a Task-derived class, usually itself. + +Runs the task. + +### Task.fetch_page(url) + +*Returns:* A string containing the resulting data. + +Does a GET request to the specified `url`, using the Cookie Jar for the task. When manually making GET requests related to a task, always use this function to ensure that session information is retained. + +### Task.post_page(url, data) + +*Returns:* A string containing the resulting data. + +Does a POST request to the specified `url`, using the Cookie Jar for the task. The `data` argument should be a dictionary of POST fields. When manually making POST requests related to a task, always use this function to ensure that session information is retained. + +### Task.verify_password(password) + +*Returns:* An instance of a Task-derived class, usually itself. + +Continues the task, using the provided password. Essentially works the same as the run() method. Password validity is checked via the `state` variable. This function is only available for resolvers that support password-protected URLs. + +### Task.verify_image_captcha(solution) + +*Returns:* An instance of a Task-derived class, usually itself. + +Continues the task, using the provided image CAPTCHA solution. Essentially works the same as the run() method. CAPTCHA solution validity is checked via the `state` variable. This function is only available for resolvers that support CAPTCHA handling. + +### Task.verify_audio_captcha(solution) + +*Returns:* An instance of a Task-derived class, usually itself.
+ +Continues the task, using the provided audio CAPTCHA solution. Essentially works the same as the run() method. CAPTCHA solution validity is checked via the `state` variable. This function is only available for resolvers that support CAPTCHA handling. + +### Task.verify_text_captcha(solution) + +*Returns:* An instance of a Task-derived class, usually itself. + +Continues the task, using the provided text CAPTCHA solution. Essentially works the same as the run() method. CAPTCHA solution validity is checked via the `state` variable. This function is only available for resolvers that support CAPTCHA handling. + +## CAPTCHA handling + +If a site requires a CAPTCHA to be solved before you can fully resolve the URL, the state will be set to `need_captcha`. The resolv library does not process CAPTCHAs itself; it simply provides you with the CAPTCHA data so that you can figure out some way to solve it. The `Task.captcha` variable will hold a Captcha object that has everything you will need. To provide a solution for a CAPTCHA, use the appropriate method in the Task instance (see above). + +### Captcha.task + +This variable will hold a reference to the `Task` this CAPTCHA belongs to. + +### Captcha.text + +This variable will either be `None` (if no text version of the CAPTCHA was available) or the text challenge as a string. + +### Captcha.image + +This variable holds `None` or the URL for the image CAPTCHA. __Do NOT use this variable unless you know what you're doing - the majority of image CAPTCHAs are tied to an IP address and set of cookies. You should use the get_image() method for this.__ + +### Captcha.audio + +This variable holds `None` or the URL for the audio CAPTCHA. __Do NOT use this variable unless you know what you're doing - the majority of audio CAPTCHAs are tied to an IP address and set of cookies. You should use the get_audio() method for this.__ + +### Captcha.get_image() + +*Returns:* a tuple containing (file type, binary image data). 
+ +You can save the output of this method to a file, or send it elsewhere, to further process the image CAPTCHA. + +### Captcha.get_audio() + +*Returns:* a tuple containing (file type, binary audio data). + +You can save the output of this method to a file, or send it elsewhere, to further process the audio CAPTCHA. + +### Some ideas for terminal-based CAPTCHA solving + +When writing a terminal-based download application, you often can't just display a CAPTCHA to the end user. A few suggestions to work around this: + +* Use a third-party CAPTCHA solving service to cover whatever CAPTCHAs can be covered. +* Implement a web interface for the application in its entirety. +* Convert the image CAPTCHA to colored text (the ASCII art approach) to display it on a terminal. +* Start a temporary HTTP daemon that serves the CAPTCHA and terminates when the CAPTCHA has been solved. + +## Resolver-specific documentation + +### YouTube + +The YouTube resolver provides some specific custom keys for each video result: `itag` (a format identifier used by YouTube internally), `fallback_host`, and a YouTube-supplied `mimetype` definition containing encoding details. + +## Documentation specific to plugin (resolver) developers + +### Getting started + +1. Clone the repository. +2. Look at existing resolvers, especially dummy.py to see the basic format for a resolver. +3. Modify a resolver or make your own. +4. Create a pull request to have your changes merged into the main repository (if you want to). + +### Things to keep in mind + +* ResolverError exceptions must always contain a user-friendly description. +* TechnicalError exceptions do not have to be user-friendly, but they must be clear. +* Don't forget to set metadata in your resolver class! 
+* Adhere to the standard formats for results - if you want to return something for which no suitable format exists, change the documentation to add your format and make a pull request to have it added in - this way you can be sure that applications can handle your format in the future. +* For the sake of consistency, all code, comments, and error messages should be in English. +* Always set the state of a Task to `failed`, `unsupported` or `invalid` depending on the problem, before raising an exception. +* When specifying an HTTP method, always use *uppercase* characters (GET, POST). + +### Whether to use the failed, unsupported or invalid state + +The `invalid` state is intended for situations where it is *certain* that the input (URL) was invalid. For example, the homepage of a filehost instead of a URL to a certain file, or an entirely different site altogether. If the URL is malformed in some way, you may also use this state. If you cannot be entirely sure whether the URL is invalid or whether there was another problem, use the `failed` state. An example of this would be a 'not authorized' page - the URL may be invalid, but it may also be possible that there is simply no public access. + +The `unsupported` state is intended for situations where the URL that is provided cannot be resolved because a certain feature needed for this is not available. Examples include a CAPTCHA on a site for which the resolver has no CAPTCHA handling, or a file download on a site for which the resolver only supports resolving video streams. Use of this state should always be temporary - at some point the required functionality should be implemented. + +The `failed` state is for everything else.
\ No newline at end of file diff --git a/docs/structures.md b/docs/structures.md new file mode 100644 index 0000000..3e6b30b --- /dev/null +++ b/docs/structures.md @@ -0,0 +1,305 @@ +## URLs + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>url</td> + <td>The deobfuscated or un-shortened URL.</td> + </tr> +</table> + +## Dummy data + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>dummy</td> + <td>The dummy data.</td> + </tr> +</table> + +## Video + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>title</td> + <td>Title of the video.</td> + </tr> + <tr> + <td>videos</td> + <td>A list of all available video files.</td> + </tr> +</table> + +The list of videos can contain multiple dictionaries, each of which has the following fields: + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>url</td> + <td>URL of the video file.</td> + </tr> + <tr> + <td>method</td> + <td>The method to be used for retrieving this URL (either GET or POST).</td> + </tr> + <tr> + <td>postdata</td> + <td>(optional) The POST data to send if the method to be used is POST. This data is in dictionary form.</td> + </tr> + <tr> + <td>quality</td> + <td>A textual description of the video quality (this will typically be along the lines of `360p`, `720p`, `1080p`, `low`, `medium`, `high`, etc, but any value + is possible). If the quality is not specified, this will be set to `unknown`. Don't parse this programmatically - use the `priority` field instead.</td> + </tr> + <tr> + <td>format</td> + <td>The name of the file format for this video, along the lines of `webm`, `mp4`, `3gp`, `flv`, `wmv`, etc. While this value should typically be pretty consistent, + different abbreviations may be used for different resolvers. It's probably not a good idea to automatically parse these unless you know the exact values + a resolver will return. 
This may be set to `unknown`.</td> + </tr> + <tr> + <td>priority</td> + <td>The priority for this video file. Higher quality video has a lower 'priority'. To always get the highest quality video, go for the URL with the lowest + priority (this may not always be 1).</td> + </tr> + <tr> + <td>extra</td> + <td>This is a dictionary that may contain any custom data provided by the specific resolver that is used. Refer to the resolver-specific documentation for this.</td> + </tr> +</table> + +## Audio + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>title</td> + <td>Title of the audio file.</td> + </tr> + <tr> + <td>audiofiles</td> + <td>A list of all available audio files.</td> + </tr> +</table> + +The list of audio files can contain multiple dictionaries, each of which has the following fields: + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>url</td> + <td>URL of the audio file.</td> + </tr> + <tr> + <td>method</td> + <td>The method to be used for retrieving this URL (either GET or POST).</td> + </tr> + <tr> + <td>postdata</td> + <td>(optional) The POST data to send if the method to be used is POST. This data is in dictionary form.</td> + </tr> + <tr> + <td>quality</td> + <td>A textual description of the audio quality (this will typically be along the lines of `low`, `medium`, `high`, `lossless`, etc, but any value is possible). If + the quality is not specified, this will be set to `unknown`. Don't parse this programmatically - use the `priority` field instead.</td> + </tr> + <tr> + <td>format</td> + <td>The name of the file format for this audio file, along the lines of `mp3`, `flac`, `midi`, `ogg`, etc. While this value should typically be pretty consistent, + different abbreviations may be used for different resolvers. It's probably not a good idea to automatically parse these unless you know the exact values + a resolver will return. 
This may be set to `unknown`.</td> + </tr> + <tr> + <td>priority</td> + <td>The priority for this audio file. Higher quality audio has a lower 'priority'. To always get the highest quality audio file, go for the URL with the lowest + priority (this may not always be 1).</td> + </tr> + <tr> + <td>extra</td> + <td>This is a dictionary that may contain any custom data provided by the specific resolver that is used. Refer to the resolver-specific documentation for this.</td> + </tr> +</table> + +## Images + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>title</td> + <td>Title of the image.</td> + </tr> + <tr> + <td>images</td> + <td>A list of all available image files.</td> + </tr> +</table> + +The list of images can contain multiple dictionaries, each of which has the following fields: + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>url</td> + <td>URL of the image.</td> + </tr> + <tr> + <td>method</td> + <td>The method to be used for retrieving this URL (either GET or POST).</td> + </tr> + <tr> + <td>postdata</td> + <td>(optional) The POST data to send if the method to be used is POST. This data is in dictionary form.</td> + </tr> + <tr> + <td>quality</td> + <td>A textual description of the image quality (this will typically be along the lines of `low`, `medium`, `high`, `lossless`, etc, but any value is possible). If + the quality is not specified, this will be set to `unknown`. Don't parse this programmatically - use the `priority` field instead.</td> + </tr> + <tr> + <td>format</td> + <td>The name of the file format for this image, along the lines of `jpg`, `png`, `psd`, `svg`, etc. While this value should typically be pretty consistent, + different abbreviations may be used for different resolvers. It's probably not a good idea to automatically parse these unless you know the exact values + a resolver will return. 
This may be set to `unknown`.</td> + </tr> + <tr> + <td>priority</td> + <td>The priority for this image. Higher quality images have a lower 'priority'. To always get the highest quality image, go for the URL with the lowest + priority (this may not always be 1).</td> + </tr> + <tr> + <td>extra</td> + <td>This is a dictionary that may contain any custom data provided by the specific resolver that is used. Refer to the resolver-specific documentation for this.</td> + </tr> +</table> + +## Files + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>title</td> + <td>Title of the file.</td> + </tr> + <tr> + <td>files</td> + <td>A list of all available URLs for this file.</td> + </tr> +</table> + +The list of files can contain multiple dictionaries, each of which has the following fields: + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>url</td> + <td>URL of the file.</td> + </tr> + <tr> + <td>method</td> + <td>The method to be used for retrieving this URL (either GET or POST).</td> + </tr> + <tr> + <td>postdata</td> + <td>(optional) The POST data to send if the method to be used is POST. This data is in dictionary form.</td> + </tr> + <tr> + <td>format</td> + <td>The name of the file format, along the lines of `zip`, `mp3`, `pdf`, `doc`, etc. While this value should typically be pretty consistent, + different abbreviations may be used for different resolvers. It's probably not a good idea to automatically parse these unless you know the exact values + a resolver will return. This may be set to `unknown`.</td> + </tr> + <tr> + <td>priority</td> + <td>The priority for this URL. More important or faster URLs have a lower 'priority'. To always get the best result, go for the URL with the lowest + priority (this may not always be 1).</td> + </tr> + <tr> + <td>extra</td> + <td>This is a dictionary that may contain any custom data provided by the specific resolver that is used. 
Refer to the resolver-specific documentation for this.</td> + </tr> +</table> + +## Text + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>title</td> + <td>Title of the text file.</td> + </tr> + <tr> + <td>files</td> + <td>A list of all available URLs for this file.</td> + </tr> +</table> + +The list of text files can contain multiple dictionaries, each of which has the following fields: + +<table> + <tr> + <th>Key</th> + <th>Description</th> + </tr> + <tr> + <td>url</td> + <td>URL of the file.</td> + </tr> + <tr> + <td>method</td> + <td>The method to be used for retrieving this URL (either GET or POST).</td> + </tr> + <tr> + <td>postdata</td> + <td>(optional) The POST data to send if the method to be used is POST. This data is in dictionary form.</td> + </tr> + <tr> + <td>format</td> + <td>The name of the file format, along the lines of `zip`, `mp3`, `pdf`, `doc`, etc. While this value should typically be pretty consistent, + different abbreviations may be used for different resolvers. It's probably not a good idea to automatically parse these unless you know the exact values + a resolver will return. This may be set to `unknown`.</td> + </tr> + <tr> + <td>priority</td> + <td>The priority for this URL. More important or faster URLs have a lower 'priority'. To always get the best result, go for the URL with the lowest + priority (this may not always be 1).</td> + </tr> + <tr> + <td>extra</td> + <td>This is a dictionary that may contain any custom data provided by the specific resolver that is used. 
Refer to the resolver-specific documentation for this.</td> + </tr> +</table> \ No newline at end of file From 9522543a7dd534380a3af04c64da09f5c8088f28 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Sun, 28 Oct 2012 15:43:51 +0100 Subject: [PATCH 14/17] Add Javascript unpacker --- resolv/shared.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/resolv/shared.py b/resolv/shared.py index 94ba755..a490af0 100644 --- a/resolv/shared.py +++ b/resolv/shared.py @@ -1,5 +1,5 @@ from HTMLParser import HTMLParser -import cookielib, urllib, urllib2 +import cookielib, urllib, urllib2, re import sys reload(sys) @@ -104,3 +104,24 @@ class Captcha(): def unescape(s): return HTMLParser.unescape.__func__(HTMLParser, s) + +def str_base(num, base): + # Thanks to http://code.activestate.com/recipes/65212/#c7 + return ((num == 0) and "0" ) or ( str_base(num // base, base).lstrip("0") + "0123456789abcdefghijklmnopqrstuvwxyz"[num % base]) + +def unpack_js(packed): + positions = re.search("return p\}\('(.+[^\\\\])',", packed).group(1) + base, counter, strings = re.search(",([0-9]+),([0-9]+),'([^']+)'", packed).groups(1) + + counter = int(counter) + base = int(base) + strings = strings.split("|") + + for i in reversed(xrange(0, int(counter))): + target = str_base(i, base) + positions = re.sub(r"\b%s\b" % target, strings[i], positions) + + # Fix escaped apostrophes. 
+ positions = re.sub(r"(?<!\\)\\'", "'", positions) + + return positions From 6e6489a04b87e816c91799f9ebc9396f0746202d Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Sun, 28 Oct 2012 15:44:23 +0100 Subject: [PATCH 15/17] Add VidBux and VidX Den support --- resolv/__init__.py | 6 +++ resolv/resolvers/__init__.py | 2 + resolv/resolvers/vidbux.py | 85 ++++++++++++++++++++++++++++++++++++ resolv/resolvers/vidxden.py | 85 ++++++++++++++++++++++++++++++++++++ test.py | 6 +++ 5 files changed, 184 insertions(+) create mode 100644 resolv/resolvers/vidbux.py create mode 100644 resolv/resolvers/vidxden.py diff --git a/resolv/__init__.py b/resolv/__init__.py index 8b58b0f..21ca5d6 100644 --- a/resolv/__init__.py +++ b/resolv/__init__.py @@ -19,6 +19,12 @@ def resolve(url): elif re.match("https?:\/\/(www\.)?filebox\.com\/[a-zA-Z0-9]+", url) is not None: task = resolvers.FileboxTask(url) return task.run() + elif re.match("https?:\/\/(www\.)?vidxden\.com\/[a-zA-Z0-9]+", url) is not None: + task = resolvers.VidxdenTask(url) + return task.run() + elif re.match("https?:\/\/(www\.)?vidbux\.com\/[a-zA-Z0-9]+", url) is not None: + task = resolvers.VidbuxTask(url) + return task.run() elif re.match("https?:\/\/(www\.)?pastebin\.com\/[a-zA-Z0-9]+", url) is not None: task = resolvers.PastebinTask(url) return task.run() diff --git a/resolv/resolvers/__init__.py b/resolv/resolvers/__init__.py index b514ffd..6c0e8b2 100644 --- a/resolv/resolvers/__init__.py +++ b/resolv/resolvers/__init__.py @@ -6,3 +6,5 @@ from youtube import * from filebox import * from pastebin import * from mediafire import * +from vidxden import * +from vidbux import * diff --git a/resolv/resolvers/vidbux.py b/resolv/resolvers/vidbux.py new file mode 100644 index 0000000..0777a69 --- /dev/null +++ b/resolv/resolvers/vidbux.py @@ -0,0 +1,85 @@ +import re, time, urllib2 +from resolv.shared import ResolverError, TechnicalError, Task, unpack_js + +# No such file or the file has been removed due 
to copyright infringement issues. + +class VidbuxTask(Task): + result_type = "video" + + name = "VidBux" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + + def run(self): + matches = re.search("https?:\/\/(www\.)?vidbux\.com\/([a-zA-Z0-9]+)", self.url) + + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid VidBux URL.") + + video_id = matches.group(2) + + try: + contents = self.fetch_page(self.url) + except urllib2.URLError, e: + self.state = "failed" + raise TechnicalError("Could not retrieve the video page.") + + if 'Human Verification' not in contents: + self.state = "invalid" + raise ResolverError("The provided URL does not exist.") + + matches = re.search('<input name="fname" type="hidden" value="([^"]+)">', contents) + + if matches is None: + self.state = "failed" + raise TechnicalError("Could not find filename.") + + filename = matches.group(1) + + matches = re.search('<input name="referer" type="hidden" value="([^"]*)">', contents) + + if matches is None: + self.state = "failed" + raise TechnicalError("Could not find referer.") + + referer = matches.group(1) + + try: + contents = self.post_page(self.url, { + 'op': "download1", + 'usr_login': "", + 'id': video_id, + 'filename': filename, + 'referer': referer, + 'method_free': "Continue to Video" + }) + except urllib2.URLError, e: + self.state = "failed" + raise TechnicalError("Could not complete human verification") + + script = unpack_js(contents) + + matches = re.search("'file','([^']+)'", script) + + if matches is None: + self.state = "failed" + raise TechnicalError("No video was found on the specified URL.") + + video_file = matches.group(1) + + stream_dict = { + 'url' : video_file, + 'method' : "GET", + 'quality' : "unknown", + 'priority' : 1, + 'format' : "unknown" + } + + self.results = { + 'title': "", + 'videos': [stream_dict] + } + + self.state = "finished" + return self diff --git a/resolv/resolvers/vidxden.py 
b/resolv/resolvers/vidxden.py new file mode 100644 index 0000000..683ee0c --- /dev/null +++ b/resolv/resolvers/vidxden.py @@ -0,0 +1,85 @@ +import re, time, urllib2 +from resolv.shared import ResolverError, TechnicalError, Task, unpack_js + +# No such file or the file has been removed due to copyright infringement issues. + +class VidxdenTask(Task): + result_type = "video" + + name = "VidX Den" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + + def run(self): + matches = re.search("https?:\/\/(www\.)?vidxden\.com\/([a-zA-Z0-9]+)", self.url) + + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid VidX Den URL.") + + video_id = matches.group(2) + + try: + contents = self.fetch_page(self.url) + except urllib2.URLError, e: + self.state = "failed" + raise TechnicalError("Could not retrieve the video page.") + + if 'Human Verification' not in contents: + self.state = "invalid" + raise ResolverError("The provided URL does not exist.") + + matches = re.search('<input name="fname" type="hidden" value="([^"]+)">', contents) + + if matches is None: + self.state = "failed" + raise TechnicalError("Could not find filename.") + + filename = matches.group(1) + + matches = re.search('<input name="referer" type="hidden" value="([^"]*)">', contents) + + if matches is None: + self.state = "failed" + raise TechnicalError("Could not find referer.") + + referer = matches.group(1) + + try: + contents = self.post_page(self.url, { + 'op': "download1", + 'usr_login': "", + 'id': video_id, + 'filename': filename, + 'referer': referer, + 'method_free': "Continue to Video" + }) + except urllib2.URLError, e: + self.state = "failed" + raise TechnicalError("Could not complete human verification") + + script = unpack_js(contents) + + matches = re.search("'file','([^']+)'", script) + + if matches is None: + self.state = "failed" + raise TechnicalError("No video was found on the specified URL.") + + video_file = matches.group(1) 
+ + stream_dict = { + 'url' : video_file, + 'method' : "GET", + 'quality' : "unknown", + 'priority' : 1, + 'format' : "unknown" + } + + self.results = { + 'title': "", + 'videos': [stream_dict] + } + + self.state = "finished" + return self diff --git a/test.py b/test.py index 5d6b356..333b3d0 100644 --- a/test.py +++ b/test.py @@ -23,6 +23,12 @@ suites = { }, 'youtube': { "YouTube": "http://www.youtube.com/watch?v=XSGBVzeBUbk" + }, + 'vidxden': { + "VidX Den": "http://www.vidxden.com/l404fifyhfn1" + }, + 'vidbux': { + "VidBux": "http://www.vidbux.com/5ovunjri3fqq" } } From acd96523aae8bc51075562360ee564fab1e23e56 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Sun, 28 Oct 2012 16:02:46 +0100 Subject: [PATCH 16/17] Add Filenuke support --- resolv/__init__.py | 3 ++ resolv/resolvers/__init__.py | 1 + resolv/resolvers/filenuke.py | 93 ++++++++++++++++++++++++++++++++++++ test.py | 3 ++ 4 files changed, 100 insertions(+) create mode 100644 resolv/resolvers/filenuke.py diff --git a/resolv/__init__.py b/resolv/__init__.py index 21ca5d6..90507c7 100644 --- a/resolv/__init__.py +++ b/resolv/__init__.py @@ -25,6 +25,9 @@ def resolve(url): elif re.match("https?:\/\/(www\.)?vidbux\.com\/[a-zA-Z0-9]+", url) is not None: task = resolvers.VidbuxTask(url) return task.run() + elif re.match("https?:\/\/(www\.)?filenuke\.com\/[a-zA-Z0-9]+", url) is not None: + task = resolvers.FilenukeTask(url) + return task.run() elif re.match("https?:\/\/(www\.)?pastebin\.com\/[a-zA-Z0-9]+", url) is not None: task = resolvers.PastebinTask(url) return task.run() diff --git a/resolv/resolvers/__init__.py b/resolv/resolvers/__init__.py index 6c0e8b2..732ff75 100644 --- a/resolv/resolvers/__init__.py +++ b/resolv/resolvers/__init__.py @@ -8,3 +8,4 @@ from pastebin import * from mediafire import * from vidxden import * from vidbux import * +from filenuke import * diff --git a/resolv/resolvers/filenuke.py b/resolv/resolvers/filenuke.py new file mode 100644 index 
0000000..bc6096d --- /dev/null +++ b/resolv/resolvers/filenuke.py @@ -0,0 +1,93 @@ +import re, time, urllib2 +from resolv.shared import ResolverError, TechnicalError, Task, unpack_js + +# No such file or the file has been removed due to copyright infringement issues. + +class FilenukeTask(Task): + result_type = "video" + + name = "Filenuke" + author = "Sven Slootweg" + author_url = "http://cryto.net/~joepie91" + + def run(self): + matches = re.search("https?:\/\/(www\.)?filenuke\.com\/([a-zA-Z0-9]+)", self.url) + + if matches is None: + self.state = "invalid" + raise ResolverError("The provided URL is not a valid Filenuke URL.") + + video_id = matches.group(2) + + try: + contents = self.fetch_page(self.url) + except urllib2.URLError, e: + self.state = "failed" + raise TechnicalError("Could not retrieve the video page.") + + if 'Choose how to download' not in contents: + self.state = "invalid" + raise ResolverError("The provided URL does not exist.") + + matches = re.search('<input type="hidden" name="fname" value="([^"]+)">', contents) + + if matches is None: + self.state = "failed" + raise TechnicalError("Could not find filename.") + + filename = matches.group(1) + + matches = re.search('<input type="hidden" name="referer" value="([^"]*)">', contents) + + if matches is None: + self.state = "failed" + raise TechnicalError("Could not find referer.") + + referer = matches.group(1) + + try: + contents = self.post_page(self.url, { + 'op': "download1", + 'usr_login': "", + 'id': video_id, + 'filename': filename, + 'referer': referer, + 'method_free': "Free" + }) + except urllib2.URLError, e: + self.state = "failed" + raise TechnicalError("Could not continue to download") + + matches = re.search('<div id="player_code">(.*?)</div>', contents, re.DOTALL) + + if matches is None: + self.state = "unsupported" + raise ResolverError("No player was found. 
The Filenuke resolver currently only supports video links.") + + player_code = matches.group(1) + + script = unpack_js(player_code) + + matches = re.search("'file','([^']+)'", script) + + if matches is None: + self.state = "failed" + raise TechnicalError("No video was found on the specified URL.") + + video_file = matches.group(1) + + stream_dict = { + 'url' : video_file, + 'method' : "GET", + 'quality' : "unknown", + 'priority' : 1, + 'format' : "unknown" + } + + self.results = { + 'title': "", + 'videos': [stream_dict] + } + + self.state = "finished" + return self diff --git a/test.py b/test.py index 333b3d0..0055110 100644 --- a/test.py +++ b/test.py @@ -29,6 +29,9 @@ suites = { }, 'vidbux': { "VidBux": "http://www.vidbux.com/5ovunjri3fqq" + }, + 'filenuke': { + "Filenuke": "http://filenuke.com/osk9yi7vbtq0" } } From d352f1d05348edf6a40367dd0769289ffaa70e04 Mon Sep 17 00:00:00 2001 From: Sven Slootweg <jamsoftgamedev@gmail.com> Date: Sun, 28 Oct 2012 16:18:15 +0100 Subject: [PATCH 17/17] Add documentation entry on Filenuke --- docs/developers.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/developers.md b/docs/developers.md index 4a018d3..bf4d93d 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -81,6 +81,12 @@ The resolv library is a Python module - this means it can be imported like any o <td>n/a</td> <td>VidbuxTask</td> </tr> + <tr> + <td>Filenuke</td> + <td>Videos</td> + <td>n/a</td> + <td>FilenukeTask</td> + </tr> </table> ## Getting started