You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
153 lines
4.5 KiB
JavaScript
153 lines
4.5 KiB
JavaScript
5 years ago
|
"use strict";
|
||
|
|
||
|
const Promise = require("bluebird");
|
||
|
const express = require("express");
|
||
|
const expressReactViews = require("@joepie91/express-react-views");
|
||
|
const morgan = require("morgan");
|
||
|
const path = require("path");
|
||
|
const bodyParser = require("body-parser");
|
||
|
const expressSession = require("express-session");
|
||
|
const browserify = require("browserify");
|
||
|
const watchifyMiddleware = require("watchify-middleware");
|
||
|
const fs = require("fs");
|
||
|
const defaultValue = require("default-value");
|
||
|
const url = require("url");
|
||
|
|
||
|
const tinyLr = require("tiny-lr");
|
||
|
const chokidar = require("chokidar");
|
||
|
|
||
|
const createUrlRewriter = require("./url-rewriter");
|
||
|
const createSessionManager = require("./session-manager");
|
||
|
const rewriteCssUrls = require("./rewrite-css-urls");
|
||
|
const rewriteHtmlUrls = require("./rewrite-html-urls");
|
||
|
const appendHtml = require("./append-html");
|
||
|
|
||
|
/* Markup that gets appended to every proxied HTML page; it is read from disk once, synchronously, when the module loads. */
const injectorHtml = fs.readFileSync(
	path.join(__dirname, "data", "injector.html"),
	"utf8"
);

/* Hands out the per-visitor sessions that are used for the outgoing requests in the /scrape/ route. */
const sessionManager = createSessionManager();
|
||
|
|
||
|
/* Development only: run a tiny-lr server and tell connected browsers to reload whenever the client-side sources or stylesheets change. */
if (process.env.NODE_ENV === "development") {
	const liveReloadServer = tinyLr();
	const publicDirectory = path.join(__dirname, "public");

	/* Notifies the live-reload clients about changed files; without an argument, everything ("*") is reported as changed. */
	const reloadPage = (files) => {
		liveReloadServer.changed({ body: { files: defaultValue(files, [ "*" ]) } });
	};

	liveReloadServer.listen(35729);

	/* Trigger exactly one full reload the first time the "MSG /create" event fires (presumably when a browser client connects - verify against tiny-lr). */
	let hasReloadedOnce = false;

	liveReloadServer.on("MSG /create", () => {
		if (hasReloadedOnce) {
			return;
		}

		hasReloadedOnce = true;
		reloadPage();
	});

	/* Any change to the scripts results in a full reload. */
	chokidar.watch(path.join(__dirname, "src/**/*.{js,jsx}")).on("all", () => {
		reloadPage();
	});

	/* Stylesheet changes are reported by their path relative to the public directory. */
	chokidar.watch(path.join(publicDirectory, "*.css")).on("all", (_eventName, changedPath) => {
		reloadPage(path.relative(publicDirectory, changedPath));
	});
}
|
||
|
|
||
|
/* The Express application; views are .jsx files in ./views, rendered through the express-react-views engine. */
const app = express();
const viewsDirectory = path.join(__dirname, "views");
const jsxEngine = expressReactViews.createEngine({});

app.engine("jsx", jsxEngine);
app.set("views", viewsDirectory);
app.set("view engine", "jsx");
|
||
|
|
||
|
/* Development only: serve the injector bundle at /scraping-tool-bundle.js, built by browserify (with babelify) through watchify-middleware. */
if (process.env.NODE_ENV === "development") {
	const bundlerOptions = {
		basedir: __dirname,
		debug: true,
		cache: {},
		extensions: [".jsx"],
		transform: [
			["babelify", { presets: ["@babel/preset-env", "@babel/preset-react"] }]
		]
	};

	const injectorBundler = browserify("src/injector/index.jsx", bundlerOptions);

	app.use("/scraping-tool-bundle.js", watchifyMiddleware(injectorBundler));
}
|
||
|
|
||
|
/* Static assets are registered before the logger, so requests that are answered from ./public never reach morgan. */
app.use(express.static(path.join(__dirname, "public")));

app.use(morgan("dev"));
app.use(bodyParser.urlencoded({ extended: false }));

/* The session signing secret can be supplied through the SESSION_SECRET environment variable. "foobar" remains the fallback so that existing setups keep working.
   FIXME: The fallback is publicly known; anything that is reachable by other people should set SESSION_SECRET. */
const configuredSessionSecret = process.env.SESSION_SECRET;

app.use(expressSession({
	/* An empty string is treated like an unset variable, since express-session requires a usable secret. */
	secret: (configuredSessionSecret == null || configuredSessionSecret === "") ? "foobar" : configuredSessionSecret,
	resave: false,
	saveUninitialized: false
}));
|
||
|
|
||
|
/* Front page: renders the "index" view. */
app.get("/", (_request, response) => {
	response.render("index");
});
|
||
|
|
||
|
/* Form target of the front page: takes the submitted `url` field and redirects to the matching /scrape/ address.
   Responds with a 400 when the field is missing, empty, or not a single string (a repeated field is parsed into an array), instead of redirecting to a nonsensical address such as /scrape/undefined. */
app.post("/browse", (req, res) => {
	let submittedUrl = (req.body == null) ? undefined : req.body.url;

	if (typeof submittedUrl !== "string" || submittedUrl.trim() === "") {
		res.status(400).send("Missing or invalid 'url' field.");
		return;
	}

	res.redirect(`/scrape/${encodeURIComponent(submittedUrl)}`);
});
|
||
|
|
||
|
/* Proxy route: everything after /scrape/ is treated as the target URL. The target is fetched through the visitor's session from the session manager; HTML and CSS bodies are passed through the URL rewriters (and HTML additionally gets the injector markup appended), every other body is passed on untouched.
   Any failure along the way is handed to Express through next(), so that the request gets an error response instead of hanging. */
app.get(/^\/scrape\/(.+)/, (req, res, next) => {
	return Promise.try(() => {
		/* Only create an upstream session when this visitor does not have one yet; otherwise cookies set by the target site would be thrown away on every request, and a new session would pile up in the session manager each time. */
		if (req.session.bhttpSessionId == null) {
			/* NOTE: We don't store the session directly in req.session because it will not survive express-session's serialization and deserialization. We don't really care about cookie persistence across restarts right now. */
			req.session.bhttpSessionId = sessionManager.createSession();

			/* Persist the new ID before continuing. */
			return Promise.promisify(req.session.save.bind(req.session))();
		}
	}).then(() => {
		let targetUrl = req.params[0];

		/* Rewritten URLs point back at this route, using the protocol and host that the visitor used to reach us. */
		let rewriteUrl = createUrlRewriter(targetUrl, `${req.protocol}://${req.get("host")}/scrape/`);

		return Promise.try(() => {
			if (url.parse(targetUrl).hostname == null) {
				throw new Error(`Attempted to load a relative URL (${targetUrl}); this means that something was not correctly rewritten.`);
			} else {
				return sessionManager.getSession(req.session.bhttpSessionId).get(targetUrl, {
					headers: {
						/* Present ourselves to the target with the visitor's own user agent. */
						"User-Agent": req.headers["user-agent"]
					}
				});
			}
		}).then((response) => {
			let contentType = response.headers["content-type"];
			let contentLength = response.headers["content-length"];

			if (contentType != null) {
				res.setHeader("Content-Type", contentType);
			}

			let rewrittenBody;

			/* FIXME: Use an actual content-type parser */
			if (contentType != null && contentType.includes("text/html")) {
				rewrittenBody = rewriteHtmlUrls(response.body, rewriteUrl);
				rewrittenBody = appendHtml(rewrittenBody, injectorHtml);
			} else if (contentType != null && contentType.includes("text/css")) {
				rewrittenBody = rewriteCssUrls(response.body.toString(), rewriteUrl);
			} else {
				rewrittenBody = response.body;

				/* The upstream length only describes the body as it was received, so it is forwarded for untouched bodies only; for rewritten bodies the length is left to res.send. */
				if (contentLength != null) {
					res.setHeader("Content-Length", contentLength);
				}
			}

			res.send(rewrittenBody);
		});
	}).catch(next);
});
|
||
|
|
||
|
/* TODO: Maybe make a 404 a hard error, as it is likely to occur when URLs are incorrectly rewritten? How to deal with crawlers trying nonsense URLs? */
|
||
|
|
||
|
/* Start accepting connections on port 3000 and log a line once the server is listening. */
app.listen(3000, function onListening() {
	console.log("Listening on port 3000...");
});
|