master
Sven Slootweg 5 years ago
commit 67e45697d2

1
.gitignore vendored

@ -0,0 +1 @@
node_modules

152
app.js

@ -0,0 +1,152 @@
"use strict";
const Promise = require("bluebird");
const express = require("express");
const expressReactViews = require("@joepie91/express-react-views");
const morgan = require("morgan");
const path = require("path");
const bodyParser = require("body-parser");
const expressSession = require("express-session");
const browserify = require("browserify");
const watchifyMiddleware = require("watchify-middleware");
const fs = require("fs");
const defaultValue = require("default-value");
const url = require("url");
const tinyLr = require("tiny-lr");
const chokidar = require("chokidar");
const createUrlRewriter = require("./url-rewriter");
const createSessionManager = require("./session-manager");
const rewriteCssUrls = require("./rewrite-css-urls");
const rewriteHtmlUrls = require("./rewrite-html-urls");
const appendHtml = require("./append-html");
/* Markup that gets appended to every proxied HTML page; read synchronously, once, when this module loads. */
let injectorHtmlPath = path.join(__dirname, "./data/injector.html");
let injectorHtml = fs.readFileSync(injectorHtmlPath, "utf8");

/* Holds the outgoing HTTP sessions; the scrape route looks them up by the ID it stores in the Express session. */
let sessionManager = createSessionManager();
/* Development only: run a livereload server and notify it whenever injector sources or public stylesheets change. */
if (process.env.NODE_ENV === "development") {
	let liveReload = tinyLr();
	let initialReloadSent = false;

	/* Tells connected clients that `files` changed; without an argument, everything is reported as changed. */
	let notifyChanged = (files) => {
		liveReload.changed({ body: { files: defaultValue(files, [ "*" ]) } });
	};

	liveReload.listen(35729);

	/* Trigger one full reload for the first "MSG /create" event that is seen after startup. */
	liveReload.on("MSG /create", () => {
		if (!initialReloadSent) {
			initialReloadSent = true;
			notifyChanged();
		}
	});

	let sourceWatcher = chokidar.watch(path.join(__dirname, "src/**/*.{js,jsx}"));
	/* The wrapper is deliberate: chokidar passes the event name as first argument, which must not end up as `files`. */
	sourceWatcher.on("all", () => {
		notifyChanged();
	});

	let publicRoot = path.join(__dirname, "public");
	let stylesheetWatcher = chokidar.watch(path.join(__dirname, "public/*.css"));
	stylesheetWatcher.on("all", (event, changedPath) => {
		/* Stylesheet changes are reported by their path relative to the public directory. */
		notifyChanged(path.relative(publicRoot, changedPath));
	});
}
/* The Express application; views are JSX files from ./views, rendered through express-react-views. */
let app = express();
let jsxEngine = expressReactViews.createEngine({});
app.engine("jsx", jsxEngine);
app.set("views", path.join(__dirname, "views"));
app.set("view engine", "jsx");
/* Development only: build the injector bundle on the fly (browserify + babelify) and serve it at /scraping-tool-bundle.js. */
if (process.env.NODE_ENV === "development") {
	let babelifyTransform = ["babelify", {
		presets: ["@babel/preset-env", "@babel/preset-react"],
	}];

	let injectorBundler = browserify("src/injector/index.jsx", {
		basedir: __dirname,
		debug: true,
		cache: {},
		extensions: [".jsx"],
		transform: [ babelifyTransform ]
	});

	app.use("/scraping-tool-bundle.js", watchifyMiddleware(injectorBundler));
}
/* Static assets are registered before the logger, so requests they answer are not logged. */
app.use(express.static(path.join(__dirname, "public")));
app.use(morgan("dev"));
app.use(bodyParser.urlencoded({ extended: false }));
app.use(expressSession({
	/* The signing secret can be supplied through the SESSION_SECRET environment variable. The old hard-coded value is only kept as a fallback so that existing development setups keep working; it must not be relied upon for a real deployment. */
	secret: defaultValue(process.env.SESSION_SECRET, "foobar"),
	resave: false,
	saveUninitialized: false
}));
/* Landing page: renders the "index" view, which contains the URL entry form. */
function showIndex(_req, res) {
	res.render("index");
}

app.get("/", showIndex);
/* Receives the URL entry form and redirects to the proxied version of the requested URL. */
app.post("/browse", (req, res) => {
	let requestedUrl = req.body.url;

	/* A missing field (undefined) or a repeated field (array) used to be interpolated as-is, producing a redirect to a nonsensical /scrape/ URL. */
	if (typeof requestedUrl !== "string" || requestedUrl.trim() === "") {
		res.status(400).send("A URL is required.");
	} else {
		res.redirect(`/scrape/${encodeURIComponent(requestedUrl.trim())}`);
	}
});
/* Returns the body to send for a proxied response: HTML gets its URLs rewritten and the injector markup appended, CSS gets its URLs rewritten, anything else is passed through untouched. */
function rewriteResponseBody(contentType, body, rewriteUrl) {
	/* FIXME: Use an actual content-type parser */
	if (contentType == null) {
		return body;
	} else if (contentType.includes("text/html")) {
		return appendHtml(rewriteHtmlUrls(body, rewriteUrl), injectorHtml);
	} else if (contentType.includes("text/css")) {
		return rewriteCssUrls(body.toString(), rewriteUrl);
	} else {
		return body;
	}
}

/* Proxy route: fetches the URL that is embedded (URL-encoded) in the path, using the visitor's own outgoing HTTP session, and serves a rewritten copy of the response. Any failure is handed to Express' error handling through next(). */
app.get(/^\/scrape\/(.+)/, (req, res, next) => {
	return Promise.try(() => {
		/* This used to test `req.session.cookieJar`, which is never assigned anywhere; as a result every single request created (and leaked) a fresh outgoing session, and cookies never carried over between requests. */
		if (req.session.bhttpSessionId == null) {
			/* NOTE: We don't store the session directly in req.session because it will not survive express-session's serialization and deserialization. We don't really care about cookie persistence across restarts right now. */
			req.session.bhttpSessionId = sessionManager.createSession();
			return Promise.promisify(req.session.save.bind(req.session))();
		}
	}).then(() => {
		let targetUrl = req.params[0];

		if (url.parse(targetUrl).hostname == null) {
			throw new Error(`Attempted to load a relative URL (${targetUrl}); this means that something was not correctly rewritten.`);
		}

		/* Rewritten URLs point back at this server, using the scheme the visitor actually connected with. */
		let rewriteUrl = createUrlRewriter(targetUrl, `${req.protocol}://${req.get("host")}/scrape/`);

		return sessionManager.getSession(req.session.bhttpSessionId).get(targetUrl, {
			headers: {
				"User-Agent": req.headers['user-agent']
			}
		}).then((response) => {
			let contentType = response.headers['content-type'];

			if (contentType != null) {
				res.setHeader("Content-Type", contentType);
			}

			/* The upstream Content-Length is deliberately not forwarded: rewriting changes the size of the body, and res.send() sets the header based on what is actually sent. */
			res.send(rewriteResponseBody(contentType, response.body, rewriteUrl));
		});
	}).catch(next); /* Without this, a failed request would never receive a response. */
});
/* TODO: Maybe make a 404 a hard error, as it is likely to occur when URLs are incorrectly rewritten? How to deal with crawlers trying nonsense URLs? */
/* Start accepting connections, and report on the console once the server is up. */
function announceListening() {
	console.log("Listening on port 3000...");
}

app.listen(3000, announceListening);

@ -0,0 +1,11 @@
"use strict";
const cheerio = require("cheerio");
module.exports = function appendHtml(body, htmlToAppend) {
let $ = cheerio.load(body);
$("body").append($(htmlToAppend));
return $.html();
};

@ -0,0 +1,7 @@
<div class="___scraping___tool___">This is a box that isn't on the original website</div>
<link rel="stylesheet" href="/scraping-tool-stylesheet.css">
<script src="/scraping-tool-bundle.js"></script>
<div class="___scraping___tool___overlay active">
</div>

@ -0,0 +1,41 @@
{
"name": "scraping-tool-poc",
"version": "1.0.0",
"main": "index.js",
"repository": "git@git.cryto.net:joepie91/scraping-tool-poc.git",
"author": "Sven Slootweg <admin@cryto.net>",
"license": "MIT",
"dependencies": {
"@joepie91/express-react-views": "^1.0.1",
"bhttp": "^1.2.4",
"bluebird": "^3.5.5",
"body-parser": "^1.19.0",
"cheerio": "^1.0.0-rc.3",
"dataprog": "^0.1.0",
"debounce": "^1.2.0",
"default-value": "^1.0.0",
"express": "^4.17.1",
"express-session": "^1.16.2",
"morgan": "^1.9.1",
"nanoid": "^2.0.3",
"rewrite-css-urls": "^1.0.4",
"tough-cookie": "^2.3.1"
},
"devDependencies": {
"@babel/core": "^7.5.5",
"@babel/preset-env": "^7.5.5",
"@babel/preset-react": "^7.0.0",
"babelify": "^10.0.0",
"browserify": "^16.5.0",
"chokidar": "^3.0.2",
"document-ready-promise": "^3.0.1",
"nodemon": "^1.19.1",
"react": "^16.8.6",
"react-dom": "^16.8.6",
"tiny-lr": "^1.1.1",
"watchify-middleware": "^1.8.2"
},
"scripts": {
"dev": "NODE_ENV=development yarn nodemon --ignore src/injector app.js"
}
}

@ -0,0 +1,78 @@
/* Stylesheet for the UI that the scraping tool injects into proxied pages. All tool classes carry the ___scraping___tool___ prefix. */

/* Applies to every element on the page, not just the tool's own; presumably so that links and inputs don't show their usual cursors while elements are being picked - TODO confirm. */
* {
cursor: default;
}
/* The marker box pinned to the top-right corner of the viewport; it has the highest z-index of all tool elements. */
.___scraping___tool___ {
position: fixed;
right: 32px;
top: 32px;
background-color: red;
color: white;
padding: 8px;
z-index: 999999999;
}
/* Full-viewport container for the highlights. It ignores pointer events itself, so the page underneath still receives them. */
.___scraping___tool___overlay {
pointer-events: none;
/* opacity: 0.4; */
position: fixed;
top: 0;
bottom: 0;
left: 0;
right: 0;
z-index: 999999990;
}
/* Font settings for everything rendered inside an active overlay. */
.___scraping___tool___overlay.active {
/* background-color: rgba(240, 0, 0, 0.4); */
font-family: sans-serif;
font-weight: normal;
}
/* Shared positioning for highlight boxes and tooltips: absolutely positioned inside the overlay, one z-index level above it. */
.___scraping___tool___hover, .___scraping___tool___selection, .___scraping___tool___secondarySelection, .___scraping___tool___tooltip {
position: absolute;
z-index: 999999991;
}
/* Translucent purple box over the hovered element. */
.___scraping___tool___hover {
background-color: rgba(216, 61, 255, 0.4);
}
/* Translucent green box over the selected element. */
.___scraping___tool___selection {
background-color: rgba(0, 255, 0, 0.4);
}
/* Label that shows the generated selector for a highlighted element. */
.___scraping___tool___tooltip {
background-color: black;
color: white;
padding: .2em .4em;
font-family: sans-serif;
font-weight: normal;
}
/* Opaque turquoise box for secondary selections. */
.___scraping___tool___secondarySelection {
background-color: rgb(104, 241, 230);
}
/* Candidate list shown while picking. It re-enables pointer events (the overlay disables them) so that its entries can be hovered, and sits above the highlights. */
.___scraping___tool___candidatePicker {
pointer-events: initial;
z-index: 999999992;
position: absolute;
left: 16px;
top: 16px;
padding: .6em 1em;
background-color: rgba(43, 43, 43, 0.9);
color: white;
}
/* A single entry in the candidate list. */
.___scraping___tool___candidate {
padding: .2em .5em;
font-size: 15px;
}
/* Entries light up while the pointer is over them. */
.___scraping___tool___candidate:hover {
background-color: rgb(88, 88, 88);
}

@ -0,0 +1,13 @@
"use strict";
const rewriteCssUrlsLib = require("rewrite-css-urls");
module.exports = function rewriteCssUrls(css, rewriteUrl) {
let rewritten1 = rewriteCssUrlsLib.findAndReplace(css, { replaceUrl: (ref) => rewriteUrl(ref.url) });
let rewritten2 = rewritten1.replace(/sourceMappingURL=([^ ]+)/, (_match, url) => {
return `sourceMappingURL=${rewriteUrl(url)}`;
});
return rewritten2;
};

@ -0,0 +1,55 @@
"use strict";
const cheerio = require("cheerio");
const rewriteCssUrls = require("./rewrite-css-urls");
module.exports = function (body, rewriteUrl) {
function patchAttribute(elements, attribute) {
elements.get().forEach((element) => {
let $element = $(element);
let value = $element.attr(attribute);
if (value != null) {
$element.attr(attribute, rewriteUrl(value));
}
});
}
function removeAttribute(elements, attribute) {
elements.get().forEach((element) => {
let $element = $(element);
$element.removeAttr(attribute);
});
}
let $ = cheerio.load(body);
patchAttribute($("a"), "href");
patchAttribute($("img"), "src"); /* FIXME: Responsive versions? */
patchAttribute($("link"), "href");
patchAttribute($("script"), "src");
patchAttribute($("form"), "action");
patchAttribute($("iframe"), "src");
patchAttribute($("source"), "src");
/* NOTE: The below is necessary because we're rewriting the contents of CSS and potentially JS files, intentionally. TODO: In the future, just to be safe, we should actually verify the received content against the hashes first, before trying to forward the content to the user. */
removeAttribute($("link"), "integrity");
removeAttribute($("script"), "integrity");
$("style").get().forEach((element) => {
let $element = $(element);
$element.text(rewriteCss($element.text()));
});
$("*[style]").get().forEach((element) => {
let $element = $(element);
$element.attr("style", rewriteCssUrls($element.attr("style"), rewriteUrl));
});
return $.html();
};

@ -0,0 +1,28 @@
"use strict";
const bhttp = require("bhttp");
const nanoid = require("nanoid");
module.exports = function createSessionManager() {
let map = new Map();
return {
createSession: function (options) {
let id = nanoid();
let session = bhttp.session(options);
map.set(id, session);
return id;
},
getSession: function (id) {
let session = map.get(id);
if (session == null) {
throw new Error("No such session exists; this should never happen");
} else {
return session;
}
}
};
};

@ -0,0 +1,16 @@
"use strict";
module.exports = function elementsFromPoint(x, y) {
if (typeof document.elementsFromPoint === "function") {
return document.elementsFromPoint(x, y);
} else if (typeof document.msElementsFromPoint === "function") {
/* Fix for IE/Edge */
let result = document.msElementsFromPoint(x, y);
if (result != null) {
return Array.from(result);
} else {
return [];
}
}
};

@ -0,0 +1,43 @@
"use strict";
/* Returns the given element or its nearest ancestor (following parentElement) that has a non-empty id; returns undefined when there is no such element. */
function findIdRoot(element) {
	for (let candidate = element; candidate != null; candidate = candidate.parentElement) {
		let hasId = (candidate.id != null && candidate.id !== "");

		if (hasId) {
			return candidate;
		}
	}
}
module.exports = function generateSelector(element) {
/* FIXME: check that element != null */
let root = findIdRoot(element);
let segments = [
(root != null) ? `#${root.id}` : null,
(element.classList.length > 0)
? Array.from(element.classList).map((className) => `.${className}`).join("")
: element.tagName.toLowerCase()
].filter((segment) => segment != null);
return segments.join(" ");
// console.log(root);
// let className = element.classList.item(0);
// if (root === element) {
// return `#${root.id}`;
// } else if (root != null) {
// if (className != null) {
// return `#${root.id} .${className}`;
// } else {
// return `#${root.id} ${element.tagName}`;
// }
// } else {
// return `stuff`;
// }
};

@ -0,0 +1,152 @@
"use strict";
const React = require("react");
const ReactDOM = require("react-dom");
const Promise = require("bluebird");
const documentReadyPromise = require("document-ready-promise");
const debounce = require("debounce");
const { expression } = require("dataprog");
const generateSelector = require("./generate-selector");
const useStateRef = require("./use-state-ref");
const useMemoizedPosition = require("./use-memoized-position");
const elementsFromPoint = require("./elements-from-point");
const uniqueElementId = require("./unique-element-id");
function Overlay() {
let [ scrollX, setScrollX ] = React.useState();
let [ scrollY, setScrollY ] = React.useState();
let [ hoveredElement, setHoveredElement ] = React.useState();
let [ isHovering, setIsHovering, isHoveringRef ] = useStateRef(true);
let [ isPicking, setIsPicking, isPickingRef ] = useStateRef(false);
let [ elementList, setElementList ] = React.useState([]);
let [ selectedElement, setSelectedElement ] = React.useState();
let [ pickHoveredElement, setPickHoveredElement ] = React.useState();
let [ pickedElement, setPickedElement ] = React.useState();
let enabledRef = React.useRef(true);
React.useEffect(() => {
window.addEventListener("scroll", debounce((event) => {
setScrollX(window.scrollX);
setScrollY(window.scrollY);
}), 20);
let allElements = document.querySelectorAll("*");
for (let element of allElements) {
/* TODO: Investigate whether switching to mousemove + elementFromPoint is more performant */
element.addEventListener("mouseover", (event) => {
event.stopPropagation();
if (isHoveringRef.current) {
setHoveredElement(element);
}
});
function clickHandler(event) {
if (enabledRef.current) {
event.preventDefault();
event.stopPropagation();
if (isHoveringRef.current) {
setIsHovering(false);
setIsPicking(true);
let candidateElements = elementsFromPoint(event.clientX, event.clientY);
setElementList(candidateElements);
setSelectedElement(candidateElements[0]);
}
}
}
element.addEventListener("click", clickHandler);
element.addEventListener("mousedown", clickHandler);
element.addEventListener("mouseup", clickHandler);
}
}, []);
let hoveredPosition = useMemoizedPosition(hoveredElement, [ scrollX, scrollY ]);
let selectedPosition = useMemoizedPosition(selectedElement, [ scrollX, scrollY ]);
let pickedPosition = useMemoizedPosition(pickedElement, [ scrollX, scrollY ]);
let pickHoveredPosition = useMemoizedPosition(pickHoveredElement, [ scrollX, scrollY ]);
return (
<div className="___scraping___tool___overlay active">
{(hoveredPosition != null && isHovering)
? <HoverHighlight element={hoveredElement} {... hoveredPosition} />
: null
}
{(selectedPosition != null)
? <PrimarySelectionHighlight element={selectedElement} {... selectedPosition} />
: null
}
{(pickedPosition != null && isPicking)
? <HoverHighlight element={pickedElement} {... pickedPosition} />
: null
}
{(pickHoveredPosition != null && isPicking)
? <HoverHighlight element={pickHoveredElement} {... pickHoveredPosition} />
: null
}
{(isPicking)
? <Picker candidates={elementList} onHover={setPickHoveredElement} />
: null
}
</div>
);
}
function Picker({ candidates, onHover }) {
return (
<div className="___scraping___tool___candidatePicker">
{candidates.map((candidate) => {
return <PickerCandidate
key={uniqueElementId(candidate)}
element={candidate}
onEnter={() => onHover(candidate)}
onLeave={() => onHover(null)}
/>;
})}
</div>
);
}
function PickerCandidate({ element, onEnter, onLeave }) {
return (
<div className="___scraping___tool___candidate" onMouseEnter={onEnter} onMouseLeave={onLeave}>
{generateSelector(element)}
</div>
);
}
function HoverHighlight({ x, y, width, height, element }) {
let boxStyle = { left: x, top: y, width: width, height: height };
let tooltipStyle = { left: x, top: y + height };
return (<>
<div className="___scraping___tool___hover" style={boxStyle} />
{(element != null)
? <div className="___scraping___tool___tooltip" style={tooltipStyle}>{ generateSelector(element) }</div>
: null
}
</>);
}
function PrimarySelectionHighlight({ x, y, width, height, element }) {
let boxStyle = { left: x, top: y, width: width, height: height };
return (<>
<div className="___scraping___tool___selection" style={boxStyle} />
</>);
}
Promise.try(() => {
return documentReadyPromise();
}).then(() => {
let $overlay = document.querySelector(".___scraping___tool___overlay");
let $tool = document.querySelector(".___scraping___tool");
ReactDOM.render(<Overlay />, $overlay);
});

@ -0,0 +1,11 @@
"use strict";
let map = new WeakMap();
module.exports = function uniqueElementId(element) {
if (!map.has(element)) {
map.set(element, Math.random().toString());
}
return map.get(element);
};

@ -0,0 +1,20 @@
"use strict";
const React = require("react");
module.exports = function useMemoizedPosition(element, extraDependencies = []) {
return React.useMemo(() => {
if (element != null) {
let rect = element.getBoundingClientRect();
return {
x: rect.left,
y: rect.top,
width: rect.width,
height: rect.height
};
} else {
return {};
}
}, [ element, ... extraDependencies ]);
};

@ -0,0 +1,17 @@
"use strict";
const React = require("react");
module.exports = function useStateRef(initialState) {
let [ state, setState ] = React.useState(initialState);
let ref = React.useRef(initialState);
return [
state,
(newState) => {
ref.current = newState;
setState(newState);
},
ref
];
};

@ -0,0 +1,9 @@
"use strict";
const url = require("url");
module.exports = function createUrlRewriter(originUrl, prefix) {
return function rewrite(path) {
return `${prefix}${encodeURIComponent(url.resolve(originUrl, path))}`;
};
};

@ -0,0 +1,12 @@
"use strict";
const React = require("react");
module.exports = function Index() {
return (
<form action="/browse" method="post">
<input type="text" name="url" />
<button type="submit">Go!</button>
</form>
);
};

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save