"use strict";

const Promise = require("bluebird");
const express = require("express");
const expressReactViews = require("@joepie91/express-react-views");
const morgan = require("morgan");
const path = require("path");
const bodyParser = require("body-parser");
const expressSession = require("express-session");
const browserify = require("browserify");
const watchifyMiddleware = require("watchify-middleware");
const fs = require("fs");
const defaultValue = require("default-value");
const url = require("url");
const tinyLr = require("tiny-lr");
const chokidar = require("chokidar");

const createUrlRewriter = require("./url-rewriter");
const createSessionManager = require("./session-manager");
const rewriteCssUrls = require("./rewrite-css-urls");
const rewriteHtmlUrls = require("./rewrite-html-urls");
const appendHtml = require("./append-html");

/* HTML snippet that gets appended to every proxied HTML document; read once at startup. */
let injectorHtml = fs.readFileSync(path.join(__dirname, "./data/injector.html"), "utf8");

/* Hands out and looks up the per-visitor sessions used for outgoing requests (see the /scrape/ route). */
let sessionManager = createSessionManager();

/* Development only: run a tiny-lr reload server and notify it whenever watched source or CSS files change. */
if (process.env.NODE_ENV === "development") {
	let reloadServer = tinyLr();

	/* Tells the reload server that `files` changed; without an argument, "*" is reported. */
	let reloadPage = function reloadPage(files) {
		reloadServer.changed({
			body: {
				files: defaultValue(files, [ "*" ])
			}
		});
	};

	reloadServer.listen(35729);

	let firstReloadDone = false;

	/* Trigger exactly one reload the first time this event fires.
	   NOTE(review): presumably "MSG /create" is emitted when a reload client connects -- confirm against tiny-lr. */
	reloadServer.on("MSG /create", () => {
		if (firstReloadDone === false) {
			firstReloadDone = true;
			reloadPage();
		}
	});

	/* Any event on a JS/JSX source file results in an unscoped ("*") reload. */
	chokidar.watch(path.join(__dirname, "src/**/*.{js,jsx}")).on("all", () => {
		reloadPage();
	});

	/* For stylesheets, only the changed file is reported, as a path relative to the public directory. */
	chokidar.watch(path.join(__dirname, "public/*.css")).on("all", (event, changedPath) => {
		let relativeChangedPath = path.relative(path.join(__dirname, "public"), changedPath);
		reloadPage(relativeChangedPath);
	});
}

let app = express();

app.engine("jsx", expressReactViews.createEngine({}));
app.set("view engine", "jsx");
app.set("views", path.join(__dirname, "views"));

/* Development only: build the injector bundle on request via browserify + babelify. */
if (process.env.NODE_ENV === "development") {
	app.use("/scraping-tool-bundle.js", watchifyMiddleware(browserify("src/injector/index.jsx", {
		basedir: __dirname,
		debug: true,
		cache: {},
		extensions: [".jsx"],
		transform: [
			["babelify", {
				presets: ["@babel/preset-env", "@babel/preset-react"],
			}]
		]
	})));
}

app.use(express.static(path.join(__dirname, "public")));
app.use(morgan("dev"));
app.use(bodyParser.urlencoded({ extended: false }));

/* FIXME: The session secret is hard-coded; load it from configuration before exposing this server to anyone else. */
app.use(expressSession({
	secret: "foobar",
	resave: false,
	saveUninitialized: false
}));

/* Landing page. */
app.get("/", (req, res) => {
	res.render("index");
});

/* Form target: redirects to the /scrape/ route for the submitted `url` field. */
app.post("/browse", (req, res) => {
	res.redirect(`/scrape/${encodeURIComponent(req.body.url)}`);
});

/*
 * Proxy route: everything after "/scrape/" is treated as the target URL.
 *
 * The target is fetched through the visitor's session from the session manager,
 * URLs in HTML and CSS responses are rewritten so that they point back at this
 * route, and the injector HTML is appended to HTML documents. Any failure is
 * forwarded to Express through next(), so the request is answered by the error
 * handler instead of hanging on an unhandled rejection.
 */
app.get(/^\/scrape\/(.+)/, (req, res, next) => {
	return Promise.try(() => {
		/* Test the same property that is assigned below; testing any other property would create (and leak) a fresh session on every single request, and cookies would never carry over between requests. */
		if (req.session.bhttpSessionId == null) {
			/* NOTE: We don't store the session directly in req.session because it will not survive express-session's serialization and deserialization. We don't really care about cookie persistence across restarts right now. */
			req.session.bhttpSessionId = sessionManager.createSession();

			/* Persist the session explicitly before continuing with the request. */
			return Promise.promisify(req.session.save.bind(req.session))();
		}
	}).then(() => {
		let targetUrl = req.params[0];
		let rewriteUrl = createUrlRewriter(targetUrl, `http://${req.get("host")}/scrape/`);

		return Promise.try(() => {
			if (url.parse(targetUrl).hostname == null) {
				throw new Error(`Attempted to load a relative URL (${targetUrl}); this means that something was not correctly rewritten.`);
			} else {
				/* Pass the visitor's own User-Agent along to the target site. */
				return sessionManager.getSession(req.session.bhttpSessionId).get(targetUrl, {
					headers: {
						"User-Agent": req.headers['user-agent']
					}
				});
			}
		}).then((response) => {
			let contentType = response.headers['content-type'];
			let contentLength = response.headers['content-length'];

			if (contentType != null) {
				res.setHeader("Content-Type", contentType);
			}

			/* NOTE(review): this is the length of the *original* body, which may differ from the rewritten one; confirm that res.send() replaces it with the actual length. */
			if (contentLength != null) {
				res.setHeader("Content-Length", contentLength);
			}

			let rewrittenBody;

			/* FIXME: Use an actual content-type parser */
			if (contentType == null) {
				rewrittenBody = response.body;
			} else if (contentType.includes("text/html")) {
				rewrittenBody = rewriteHtmlUrls(response.body, rewriteUrl);
				rewrittenBody = appendHtml(rewrittenBody, injectorHtml);
			} else if (contentType.includes("text/css")) {
				rewrittenBody = rewriteCssUrls(response.body.toString(), rewriteUrl);
			} else {
				/* Anything that is not HTML or CSS is passed through untouched. */
				rewrittenBody = response.body;
			}

			res.send(rewrittenBody);
		});
	}).catch((error) => {
		/* Express 4 ignores the promise returned from a handler, so errors must be handed over explicitly. */
		next(error);
	});
});

/* TODO: Maybe make a 404 a hard error, as it is likely to occur when URLs are incorrectly rewritten? How to deal with crawlers trying nonsense URLs? */

app.listen(3000, () => {
	console.log("Listening on port 3000...");
});