diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c1e8837
--- /dev/null
+++ b/README.md
@@ -0,0 +1,85 @@
+# cdx
+
+A simple streaming CDX file parser. Parses CDX files (in particular, those corresponding to WARC files) that follow the format specified by the [Internet Archive](https://archive.org/web/researcher/cdx_file_format.php). All items in the CDX field legend are supported, plus the `S` field.
+
+## Scope and development status
+
+`cdx` currently only reads (compliant) CDX streams. It will likely be expanded to also write CDX streams in the future, but this is not supported yet. Error handling is currently nearly non-existent; you are expected to provide a compliant CDX stream.
+
+## Installation
+
+```
+npm install --save cdx
+```
+
+## Usage
+
+`cdx` is a streaming parser. It takes a CDX byte stream as input (regardless of the source), and outputs an object stream of `CDXRecord` objects, with the named attributes set to the corresponding values from the CDX stream. Additionally, a plain object containing these attributes is available as the `data` attribute, for easy (JSON) serialization.
+
+The signature is automatically parsed from the first line of the CDX data. Specifying a custom signature is not currently supported.
+
+```javascript
+var cdx = require("cdx"),
+    fs = require("fs");
+
+fs.createReadStream("sample.cdx")
+	.pipe(cdx())
+	.pipe(...);
+```
+
+An example that parses the sample CDX file, 'picks out' the serializable data, and then outputs it to `stdout` as serialized JSON, can be found in `test.coffee` (you'll need to install the devDependencies first to actually run that file, though).
+
+## Fields
+
+All field names are hopefully self-explanatory. They are adapted from [the legend provided by the Internet Archive](https://archive.org/web/researcher/cdx_legend.php), so I really have no idea what most of these do.
+ +* `compressedRecordSize` (for `.warc.gz`, this is the gzipped size of the record) +* `compressedDATFileOffset` +* `compressedARCFileOffset` (for `.warc.gz`, this is the gzipped starting position of the record, combine with size to get the ending position) +* `uncompressedDATFileOffset` +* `uncompressedARCFileOffset` +* `ARCDocumentLength` +* `oldStyleChecksum` +* `newStyleChecksum` +* `canonicalizedUrl` +* `canonicalizedFrame` +* `canonicalizedHost` +* `canonicalizedImage` +* `canonicalizedJumpPoint` +* `canonicalizedLink` +* `canonicalizedPath` +* `canonicalizedRedirect` +* `canonicalizedHrefURL` +* `canonicalizedSrcURL` +* `canonicalizedScriptURL` +* `originalMimeType` (for `.warc.gz`, this is the original mimetype of the document as specified by the origin webserver) +* `originalURL` (for `.warc.gz`, this is the original URL that the document was retrieved from) +* `originalFrame` +* `originalHost` +* `originalImage` +* `originalJumpPoint` +* `originalLink` +* `originalPath` +* `originalRedirect` +* `originalHrefURL` +* `originalSrcURL` +* `originalScriptURL` +* `date` (for `.warc.gz`, this is the retrieval date of the record) +* `IP` +* `fileName` (for `.warc.gz`, this is the path to the WARC file that this record lives in - may not be useful, as it may refer to a path on a different filesystem) +* `port` +* `responseCode` (for `.warc.gz`, this is the HTTP status code encountered when retrieving the document) +* `title` +* `metaTags` +* `massagedURL` +* `languageString` +* `uniqueness` +* `newsGroup` +* `rulespaceCategory` +* `multiColumnLanguageDescription` +* `someWeirdFBISWhatsChangedKindaThing` (don't ask...) +* `comment` + +## Contributing + +Contributions welcome! Please file bugs [on GitHub](http://github.com/joepie91/node-cdx), and target pull requests at the `develop` branch. Thank you! 
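To make the Usage and Fields sections of the README above more concrete, here is a minimal sketch of consuming the record stream directly rather than piping it onwards. `originalURL` and `responseCode` are just two attributes picked from the field list; which attributes are actually populated depends on the signature line of the CDX file being parsed.

```javascript
var cdx = require("cdx"),
    fs = require("fs");

fs.createReadStream("sample.cdx")
	.pipe(cdx())
	.on("data", function (record) {
		// `record` is a CDXRecord; attribute names follow the field list above.
		// Only the fields named in the file's signature line will actually be set.
		console.log(record.originalURL, record.responseCode);

		// `record.data` holds the same attributes as a plain serializable object:
		// process.stdout.write(JSON.stringify(record.data) + "\n");
	});
```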
diff --git a/index.coffee b/index.coffee index f11d524..03dbad0 100644 --- a/index.coffee +++ b/index.coffee @@ -1,18 +1,9 @@ -fs = require "fs" CDXRecordCollection = require "./lib/CDXRecordCollection" -JSONStream = require "JSONStream" -stream = require "stream" -adhocStream = require "adhoc-stream" -methods = - parseFile: (file) -> - collection = new CDXRecordCollection() +spawnFunc = -> + return new CDXRecordCollection(); - fs.createReadStream file - .pipe collection.stream - .pipe adhocStream.transformSync objectMode: true, (obj) -> - @push obj.data - .pipe JSONStream.stringify(false) - .pipe process.stdout +spawnFunc.CDXRecordCollection = CDXRecordCollection +spawnFunc.CDXRecord = require "./lib/CDXRecord" -methods.parseFile "./sample.cdx" +module.exports = spawnFunc diff --git a/lib/CDXRecordCollection.coffee b/lib/CDXRecordCollection.coffee index 4674056..0fafc13 100644 --- a/lib/CDXRecordCollection.coffee +++ b/lib/CDXRecordCollection.coffee @@ -2,7 +2,16 @@ _ = require "lodash" CDXRecord = require "./CDXRecord" stream = require "stream" -module.exports = class CDXRecordCollection +module.exports = class CDXRecordCollection extends stream.Transform + constructor: (@signature = []) -> + super + @_writableState.objectMode = false + @_readableState.objectMode = true + @_headerFound = false + @_buffer = "" + + @reverseSignatureMap = _.invert @signatureMap + records: [] signature: null delimiter: null @@ -59,24 +68,6 @@ module.exports = class CDXRecordCollection signatureMarkers = signatureData[5...].split(" ") @signature = (@reverseSignatureMap[marker] for marker in signatureMarkers) - constructor: (@signature = []) -> - @reverseSignatureMap = _.invert @signatureMap - @stream = new RecordStreamer(this) - - createRecord: (data) -> - record = new CDXRecord(@delimiter, @signature) - record.parseRecord data - @records.push record - return record - -class RecordStreamer extends stream.Transform - constructor: (@collection) -> - super - @_writableState.objectMode = false - @_readableState.objectMode = true - @_headerFound = false - @_buffer = "" - _transform: (chunk, encoding, done) -> @_buffer += chunk lines = @_buffer.split(/\r?\n/) @@ -85,14 +76,20 @@ class RecordStreamer extends stream.Transform for line in lines if not @_headerFound # The first line is the header. 
-				@collection._parseSignature line
+				@_parseSignature line
 				@_headerFound = true
 			else
 				try
-					record = @collection.createRecord(line)
+					record = @createRecord(line)
 					this.push record
 				catch err
 					this.emit "error", err
 		return done()
+
+	createRecord: (data) ->
+		record = new CDXRecord(@delimiter, @signature)
+		record.parseRecord data
+		@records.push record
+		return record
diff --git a/lib/cdx.coffee b/lib/cdx.coffee
deleted file mode 100644
index cc8ca3e..0000000
--- a/lib/cdx.coffee
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-module.exports = null
diff --git a/package.json b/package.json
index 89ccb2b..ccfd752 100644
--- a/package.json
+++ b/package.json
@@ -2,13 +2,14 @@
   "name": "cdx",
   "version": "0.0.1",
   "description": "A parser and generator for (Internet Archive) CDX files.",
+  "homepage": "https://github.com/joepie91/node-cdx",
   "main": "index.js",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
   },
   "repository": {
     "type": "git",
-    "url": "git@git.cryto.net:projects/joepie91/node-cdx"
+    "url": "git://github.com/joepie91/node-cdx.git"
   },
   "keywords": [
     "archiving",
@@ -26,11 +27,11 @@
     "gulp-plumber": "~0.6.3",
     "gulp-remember": "~0.2.0",
     "gulp-rename": "~1.2.0",
-    "gulp-util": "~2.2.17"
-  },
-  "dependencies": {
+    "gulp-util": "~2.2.17",
     "JSONStream": "^0.9.0",
-    "adhoc-stream": "^0.0.1",
+    "adhoc-stream": "^0.0.1"
+  },
+  "dependencies": {
     "lodash": "^2.4.1",
     "moment": "^2.8.3"
   }
diff --git a/test.coffee b/test.coffee
new file mode 100644
index 0000000..6bd69ed
--- /dev/null
+++ b/test.coffee
@@ -0,0 +1,11 @@
+cdx = require "./"
+fs = require "fs"
+adhocStream = require "adhoc-stream"
+JSONStream = require "JSONStream"
+
+fs.createReadStream "./sample.cdx"
+	.pipe cdx()
+	.pipe adhocStream.transformSync objectMode: true, (chunk) ->
+		@push chunk.data
+	.pipe JSONStream.stringify(false)
+	.pipe process.stdout
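For context on the `sample.cdx` input that `test.coffee` reads: the file itself is not part of this diff, but a compliant CDX stream is plain text whose first line is a space-separated signature starting with ` CDX ` (the line the parser uses to map columns to the field names from the README), followed by one record per line. A commonly used Wayback-style signature and a record line, shown purely as an illustration and not as the actual contents of `sample.cdx`, look like this:

```
 CDX N b a m s k r M S V g
org,example)/ 20140101120000 http://example.org/ text/html 200 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - - 1043 2311 example-crawl.warc.gz
```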