diff --git a/.gitignore b/.gitignore index f86fa8e..23981d5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ # https://git-scm.com/docs/gitignore # https://help.github.com/articles/ignoring-files -# Example .gitignore files: https://github.com/github/gitignore \ No newline at end of file +# Example .gitignore files: https://github.com/github/gitignore +/node_modules/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..e46a50b --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +# hma-proxy-parse + +Good game, HideMyAss, but I win :) + +This module extracts (parses) proxy details including the IP address from [http://proxylist.hidemyass.com/](http://proxylist.hidemyass.com/) and anything else using the same system, despite their (rather heavy) CSS-based obfuscation. + +Note that this module only does the parsing; you're responsible for fetching the source HTML yourself. + +## License + +[WTFPL](http://www.wtfpl.net/txt/copying/) or [CC0](https://creativecommons.org/publicdomain/zero/1.0/), whichever you prefer. A donation and/or attribution are appreciated, but not required. + +## Donate + +My income consists entirely of donations for my projects. If this module is useful to you, consider [making a donation](http://cryto.net/~joepie91/donate.html)! + +You can donate using Bitcoin, PayPal, Gratipay, Flattr, cash-in-mail, SEPA transfers, and pretty much anything else. + +Bitcoins can also be sent to `1KafMHn6YEDFkUSoHK6pKkqqmfJUF5Wd1C` directly :) + +## Contributing + +Pull requests welcome. Please make sure your modifications are in line with the overall code style, and ensure that you're editing the `.coffee` files, not the `.js` files. + +Build tool of choice is `gulp`; simply run `gulp` while developing, and it will watch for changes. + +Be aware that by making a pull request, you agree to release your modifications under the licenses stated above. 
+ +## Usage + +You can input HTML from any source, but this example uses [`bhttp`](https://www.npmjs.com/package/bhttp) in Promises mode. + +```javascript +var hmaProxyParse = require("hma-proxy-parse"); +var bhttp = require("bhttp"); +var Promise = require("bluebird"); + +Promise.try(function(){ + return bhttp.get("http://proxylist.hidemyass.com/"); +}).then(function(response){ + console.log(hmaProxyParse(response.body.toString())); +}); +``` + +## API + +### hmaProxyParse(html) + +Parses the specified `html`, and returns an array of objects with proxy information. The objects look something like this: + +```javascript +{ + updateTimestamp: '1422645602', + ip: '187.108.223.204', + port: '8080', + country: 'br', + speed: '2441', + connectionTime: '235', + protocol: 'HTTP', + anonymity: 'Low' +} +``` + +It doesn't attempt to interpret the data - values are returned as strings, just as they appear in the list, and you'll have to figure out what to do with them. I have no idea, for example, what the bounds on `speed` or `connectionTime` are, or what the possible options for `anonymity` are. 
+
+If this helped you, don't forget to donate ;)
diff --git a/gulpfile.js b/gulpfile.js
new file mode 100644
index 0000000..bb7f05f
--- /dev/null
+++ b/gulpfile.js
@@ -0,0 +1,28 @@
+var gulp = require('gulp');
+
+/* CoffeeScript compile deps */
+var path = require('path');
+var gutil = require('gulp-util');
+var concat = require('gulp-concat');
+var rename = require('gulp-rename');
+var coffee = require('gulp-coffee');
+var cache = require('gulp-cached');
+var remember = require('gulp-remember');
+var plumber = require('gulp-plumber');
+
+var source = ["lib/**/*.coffee", "index.coffee"]
+
+gulp.task('coffee', function() {
+  return gulp.src(source, {base: "."})
+    .pipe(plumber())
+    .pipe(cache("coffee"))
+    .pipe(coffee({bare: true}).on('error', gutil.log)).on('data', gutil.log)
+    .pipe(remember("coffee"))
+    .pipe(gulp.dest("."));
+});
+
+gulp.task('watch', function () {
+  gulp.watch(source, ['coffee']);
+});
+
+gulp.task('default', ['coffee', 'watch']);
\ No newline at end of file
diff --git a/index.coffee b/index.coffee
new file mode 100644
index 0000000..1248281
--- /dev/null
+++ b/index.coffee
@@ -0,0 +1 @@
+module.exports = require "./lib/hma-proxy-parse"
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..12384e4
--- /dev/null
+++ b/index.js
@@ -0,0 +1 @@
+module.exports = require("./lib/hma-proxy-parse");
diff --git a/lib/hma-proxy-parse.coffee b/lib/hma-proxy-parse.coffee
new file mode 100644
index 0000000..68077f0
--- /dev/null
+++ b/lib/hma-proxy-parse.coffee
@@ -0,0 +1,67 @@
+cheerio = require "cheerio"
+
+# Splits one piece of visible text on "." and appends every non-empty part to
+# `segments`. A single node may hold no octet, one octet or several octets
+# (".", "187", ".187", "187.108"), so each part becomes its own segment.
+addSegments = (segments, text) ->
+  for part in text.split(".")
+    part = part.trim()
+    segments.push part if part != ""
+  return
+
+# Parses the HTML of a HideMyAss-style proxy list.
+#
+# html - the page source as a string.
+#
+# Returns an array with one object per `tbody > tr` row, holding the keys
+# updateTimestamp, ip, port, country, speed, connectionTime, protocol and
+# anonymity. Values are taken from the markup as-is; the text fields are only
+# trimmed of surrounding whitespace.
+module.exports = (html) ->
+  # The page's CSS hides junk fragments of the IP cell with rules such as
+  # `.abc{display:none}`; collect those class names (whitespace is tolerated).
+  junkRegex = /\.([a-zA-Z0-9_-]+)\s*\{\s*display\s*:\s*none\s*\}/g
+
+  junk = []
+  proxies = []
+
+  while match = junkRegex.exec html
+    junk.push match[1]
+
+  $ = cheerio.load html
+
+  $("tbody > tr").each ->
+    element = $(this)
+    indicators = element.find("td .progress-indicator")
+    ipSegments = []
+
+    element.find("td:nth-of-type(2) > span").contents().each ->
+      ipElement = $(this)
+
+      if not this.tagName?
+        # Nodes without a tag name (text nodes) are always visible.
+        addSegments ipSegments, ipElement.text()
+      else if this.tagName in ["div", "span"]
+        classNames = (ipElement.attr("class") or "").split(/\s+/)
+        hiddenByClass = classNames.some (className) -> className in junk
+        hiddenByStyle = ipElement.attr("style")? and ipElement.css("display") == "none"
+
+        unless hiddenByClass or hiddenByStyle
+          addSegments ipSegments, ipElement.text()
+
+      # Return nothing: cheerio stops iterating when a callback returns false.
+      return
+
+    proxies.push
+      updateTimestamp: element.children(".timestamp").attr("rel")
+      ip: ipSegments.join(".")
+      port: element.children("td:nth-of-type(3)").text().trim()
+      country: element.find("td .country").parent().attr("rel")
+      speed: indicators.eq(0).attr("value")
+      connectionTime: indicators.eq(1).attr("value")
+      protocol: element.children("td:nth-of-type(7)").text().trim()
+      anonymity: element.children("td:nth-of-type(8)").text().trim()
+
+    return
+
+  return proxies
diff --git a/lib/hma-proxy-parse.js b/lib/hma-proxy-parse.js
new file mode 100644
index 0000000..6d99622
--- /dev/null
+++ b/lib/hma-proxy-parse.js
@@ -0,0 +1,92 @@
+// Keep in sync with hma-proxy-parse.coffee, which is the source of truth; the
+// gulp `coffee` task regenerates this file from it.
+var cheerio = require("cheerio");
+
+/**
+ * Splits one piece of visible text on "." and appends every non-empty part
+ * to `segments`, so that ".", "187", ".187" and "187.108" all yield clean
+ * octets.
+ *
+ * @param {string[]} segments - Array that receives the parts (mutated).
+ * @param {string} text - Text content of a visible node.
+ */
+function addSegments(segments, text) {
+  text.split(".").forEach(function(part) {
+    var trimmed = part.trim();
+    if (trimmed !== "") {
+      segments.push(trimmed);
+    }
+  });
+}
+
+/**
+ * Parses the HTML of a HideMyAss-style proxy list.
+ *
+ * @param {string} html - The page source.
+ * @returns {Object[]} One object per `tbody > tr` row with the keys
+ *   updateTimestamp, ip, port, country, speed, connectionTime, protocol and
+ *   anonymity. Values are taken from the markup as-is; the text fields are
+ *   only trimmed of surrounding whitespace.
+ */
+module.exports = function(html) {
+  // The page's CSS hides junk fragments of the IP cell with rules such as
+  // `.abc{display:none}`; collect those class names (whitespace is tolerated).
+  var junkRegex = /\.([a-zA-Z0-9_-]+)\s*\{\s*display\s*:\s*none\s*\}/g;
+  var junk = [];
+  var proxies = [];
+  var match;
+  var $;
+
+  while ((match = junkRegex.exec(html)) !== null) {
+    junk.push(match[1]);
+  }
+
+  $ = cheerio.load(html);
+
+  $("tbody > tr").each(function() {
+    var element = $(this);
+    var indicators = element.find("td .progress-indicator");
+    var ipSegments = [];
+
+    element.find("td:nth-of-type(2) > span").contents().each(function() {
+      var ipElement = $(this);
+      var classNames;
+      var hiddenByClass;
+      var hiddenByStyle;
+
+      if (this.tagName == null) {
+        // Nodes without a tag name (text nodes) are always visible.
+        addSegments(ipSegments, ipElement.text());
+        return;
+      }
+
+      if (this.tagName !== "div" && this.tagName !== "span") {
+        return;
+      }
+
+      classNames = (ipElement.attr("class") || "").split(/\s+/);
+      hiddenByClass = classNames.some(function(className) {
+        return junk.indexOf(className) !== -1;
+      });
+      hiddenByStyle = ipElement.attr("style") != null &&
+        ipElement.css("display") === "none";
+
+      if (!hiddenByClass && !hiddenByStyle) {
+        addSegments(ipSegments, ipElement.text());
+      }
+    });
+
+    proxies.push({
+      updateTimestamp: element.children(".timestamp").attr("rel"),
+      ip: ipSegments.join("."),
+      port: element.children("td:nth-of-type(3)").text().trim(),
+      country: element.find("td .country").parent().attr("rel"),
+      speed: indicators.eq(0).attr("value"),
+      connectionTime: indicators.eq(1).attr("value"),
+      protocol: element.children("td:nth-of-type(7)").text().trim(),
+      anonymity: element.children("td:nth-of-type(8)").text().trim()
+    });
+  });
+
+  return proxies;
+};
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..585e9cc
--- /dev/null
+++ b/package.json
@@ -0,0 +1,37 @@
+{
+  "name": "hma-proxy-parse",
+  "version": "1.0.0",
+  "description": "Parses proxies out of HideMyAss' public proxy list.",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git://github.com/joepie91/hma-proxy-parse"
+  },
+  "keywords": [
+    "hidemyass",
+    "proxy",
+    "parser"
+  ],
+  "author": "Sven Slootweg",
+  "license": "WTFPL",
+  "dependencies": {
+    "cheerio": "^0.18.0"
+  },
+  "devDependencies": {
+    "bhttp": "^1.0.2",
+    "bluebird": "^2.9.4",
+    "gulp": "~3.8.0",
+    "gulp-cached": "~0.0.3",
+    "gulp-coffee": "~2.0.1",
+    "gulp-concat": "~2.2.0",
+    "gulp-livereload": "~2.1.0",
+    "gulp-nodemon": "~1.0.4",
+    "gulp-plumber": "~0.6.3",
+    "gulp-remember": "~0.2.0",
+    "gulp-rename": "~1.2.0",
+    "gulp-util": "~2.2.17"
+  }
+}
diff --git a/test.coffee b/test.coffee
new file mode 100644
index 0000000..a8242b5
--- /dev/null
+++ b/test.coffee
@@ -0,0 +1,13 @@
+# Manual smoke test: fetches the proxy list page over HTTP and prints whatever
+# the parser extracts from it.
+parseProxyList = require "./"
+http = require "bhttp"
+Promise = require "bluebird"
+
+listUrl = "http://proxylist.hidemyass.com/"
+
+Promise.try ->
+  http.get listUrl
+.then (response) ->
+  pageHtml = response.body.toString()
+  console.log parseProxyList(pageHtml)