You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

153 lines
4.5 KiB
JavaScript

5 years ago
"use strict";
const Promise = require("bluebird");
const express = require("express");
const expressReactViews = require("@joepie91/express-react-views");
const morgan = require("morgan");
const path = require("path");
const bodyParser = require("body-parser");
const expressSession = require("express-session");
const browserify = require("browserify");
const watchifyMiddleware = require("watchify-middleware");
const fs = require("fs");
const defaultValue = require("default-value");
const url = require("url");
const tinyLr = require("tiny-lr");
const chokidar = require("chokidar");
const createUrlRewriter = require("./url-rewriter");
const createSessionManager = require("./session-manager");
const rewriteCssUrls = require("./rewrite-css-urls");
const rewriteHtmlUrls = require("./rewrite-html-urls");
const appendHtml = require("./append-html");
/* Markup that the scrape route appends to every proxied HTML page; read synchronously, once, when the module loads. */
const injectorHtmlPath = path.join(__dirname, "./data/injector.html");
let injectorHtml = fs.readFileSync(injectorHtmlPath, "utf8");

/* Registry from ./session-manager; the scrape route uses it to create and look up the sessions it makes outgoing requests with. */
let sessionManager = createSessionManager();
/* Development-only live reload: runs a tiny-lr server on port 35729 and tells it about changes to watched source files and stylesheets. */
if (process.env.NODE_ENV === "development") {
    const reloadServer = tinyLr();
    reloadServer.listen(35729);

    /*
     * Reports changed files to the live-reload server.
     * `files` is an optional array of paths; when omitted, [ "*" ] is sent instead
     * (presumably understood by clients as "reload everything" - confirm against tiny-lr).
     */
    const reloadPage = (files) => {
        reloadServer.changed({ body: { files: defaultValue(files, [ "*" ]) } });
    };

    let firstReloadDone = false;

    /* Fire a single reload the first time a "MSG /create" event arrives (presumably when a client registers, e.g. a tab that stayed open across a server restart). */
    reloadServer.on("MSG /create", () => {
        if (!firstReloadDone) {
            firstReloadDone = true;
            reloadPage();
        }
    });

    /* Any change to the client-side sources triggers a full reload. */
    chokidar.watch(path.join(__dirname, "src/**/*.{js,jsx}")).on("all", () => {
        reloadPage();
    });

    /* Stylesheet changes are reported by their path relative to the public directory. */
    chokidar.watch(path.join(__dirname, "public/*.css")).on("all", (event, changedPath) => {
        const relativeChangedPath = path.relative(path.join(__dirname, "public"), changedPath);

        /* Always pass an array, matching the default above; a bare string would be split on whitespace and commas, mangling such filenames. */
        reloadPage([ relativeChangedPath ]);
    });
}
/* Create the Express application and render "jsx" views from the ./views directory through the express-react-views engine. */
let app = express();

const jsxViewEngine = expressReactViews.createEngine({});
const viewsDirectory = path.join(__dirname, "views");

app.engine("jsx", jsxViewEngine);
app.set("view engine", "jsx");
app.set("views", viewsDirectory);
/* Development only: serve the browserify bundle of src/injector/index.jsx at /scraping-tool-bundle.js through watchify-middleware. */
if (process.env.NODE_ENV === "development") {
    /* Babel transform applied to the bundled sources (env + React presets). */
    const babelifyTransform = ["babelify", {
        presets: ["@babel/preset-env", "@babel/preset-react"],
    }];

    const injectorBundler = browserify("src/injector/index.jsx", {
        basedir: __dirname,
        debug: true,
        cache: {},
        extensions: [".jsx"],
        transform: [
            babelifyTransform
        ]
    });

    app.use("/scraping-tool-bundle.js", watchifyMiddleware(injectorBundler));
}
/* Global middleware, in order: static files from ./public, request logging, urlencoded form parsing, cookie-based sessions. */
app.use(express.static(path.join(__dirname, "public")));
app.use(morgan("dev"));
app.use(bodyParser.urlencoded({ extended: false }));
app.use(expressSession({
    /*
     * The session cookie is signed with this secret. Deployments should provide their own through the
     * SESSION_SECRET environment variable; the hard-coded value is only a fallback for local development.
     * `||` (rather than a null check) is deliberate: an empty string is not a usable secret either.
     */
    secret: process.env.SESSION_SECRET || "foobar",
    resave: false,
    saveUninitialized: false
}));
/* Landing page: renders the "index" view. */
app.get("/", function renderIndexPage(request, response) {
    response.render("index");
});
/*
 * Accepts a urlencoded form post with a `url` field and redirects to the matching /scrape/ route.
 * The URL is percent-encoded so that it can be carried inside the request path.
 * Responds with 400 when the field is missing, empty, or not a single string.
 */
app.post("/browse", (req, res) => {
    const submittedUrl = req.body.url;

    /* Without this guard a missing field would redirect to "/scrape/undefined". */
    if (typeof submittedUrl !== "string" || submittedUrl.trim() === "") {
        res.status(400).send("A URL to browse is required.");
        return;
    }

    res.redirect(`/scrape/${encodeURIComponent(submittedUrl)}`);
});
/*
 * Proxy route: fetches the absolute URL captured after "/scrape/" through the visitor's outgoing HTTP session
 * and relays the response. URLs inside HTML and CSS responses are rewritten with `rewriteUrl` (built from this
 * server's own /scrape/ prefix), and `injectorHtml` is appended to HTML pages. Other content is passed through as-is.
 * Any failure is handed to Express through next() so the request does not hang.
 */
app.get(/^\/scrape\/(.+)/, (req, res, next) => {
    return Promise.try(() => {
        /* Create an outgoing session only for visitors that do not have one yet; the id stored here is the one looked up below. */
        if (req.session.bhttpSessionId == null) {
            /* NOTE: We don't store the session directly in req.session because it will not survive express-session's serialization and deserialization. We don't really care about cookie persistence across restarts right now. */
            req.session.bhttpSessionId = sessionManager.createSession();

            /* Persist the new id before continuing. */
            return Promise.promisify(req.session.save.bind(req.session))();
        }
    }).then(() => {
        let targetUrl = req.params[0];
        let rewriteUrl = createUrlRewriter(targetUrl, `http://${req.get("host")}/scrape/`);

        return Promise.try(() => {
            if (url.parse(targetUrl).hostname == null) {
                throw new Error(`Attempted to load a relative URL (${targetUrl}); this means that something was not correctly rewritten.`);
            } else {
                /* NOTE(review): assumes getSession() always finds the id stored in the session - confirm how ./session-manager treats unknown ids. */
                return sessionManager.getSession(req.session.bhttpSessionId).get(targetUrl, {
                    headers: {
                        "User-Agent": req.headers['user-agent']
                    }
                });
            }
        }).then((response) => {
            let contentType = response.headers['content-type'];
            let contentLength = response.headers['content-length'];

            if (contentType != null) {
                res.setHeader("Content-Type", contentType);
            }

            /* NOTE(review): this is the upstream length; the HTML/CSS branches below change the body size, so this presumably relies on res.send() recomputing the header - confirm. */
            if (contentLength != null) {
                res.setHeader("Content-Length", contentLength);
            }

            let rewrittenBody;

            /* FIXME: Use an actual content-type parser */
            if (contentType == null) {
                rewrittenBody = response.body;
            } else if (contentType.includes("text/html")) {
                rewrittenBody = rewriteHtmlUrls(response.body, rewriteUrl);
                rewrittenBody = appendHtml(rewrittenBody, injectorHtml);
            } else if (contentType.includes("text/css")) {
                rewrittenBody = rewriteCssUrls(response.body.toString(), rewriteUrl);
            } else {
                rewrittenBody = response.body;
            }

            res.send(rewrittenBody);
        });
    }).catch((error) => {
        /* Express does not observe the returned promise, so rejections must be forwarded explicitly. */
        next(error);
    });
});
/* TODO: Maybe make a 404 a hard error, as it is likely to occur when URLs are incorrectly rewritten? How to deal with crawlers trying nonsense URLs? */
/* Logs a confirmation once the HTTP server is accepting connections. */
function announceListening() {
    console.log("Listening on port 3000...");
}

/* Start the HTTP server on port 3000. */
app.listen(3000, announceListening);