diff --git a/lib/Parser.js b/lib/Parser.js
index fe746b1..bed817d 100644
--- a/lib/Parser.js
+++ b/lib/Parser.js
@@ -3,8 +3,7 @@ var EventEmitter = require('events').EventEmitter,
     inherits = require('util').inherits,
     inspect = require('util').inspect,
     utf7 = require('utf7').imap,
-    iconv = require('iconv-lite'),
-    jsencoding/*lazy-loaded*/;
+    jsencoding; // lazy-loaded
 
 var CH_LF = 10,
     LITPLACEHOLDER = String.fromCharCode(0),
@@ -24,7 +23,8 @@ var CH_LF = 10,
     RE_ENCWORD_END = /=\?([^?]*?)\?([qb])\?(.*?)\?=$/i,
     RE_ENCWORD_BEGIN = /^[ \t]=\?([^?]*?)\?([qb])\?(.*?)\?=/i,
     RE_QENC = /(?:=([a-fA-F0-9]{2}))|_/g,
-    RE_SEARCH_MODSEQ = /^(.+) \(MODSEQ (.+?)\)$/i;
+    RE_SEARCH_MODSEQ = /^(.+) \(MODSEQ (.+?)\)$/i,
+    RE_LWS_ONLY = /^[ \t]*$/;
 
 function Parser(stream, debug) {
   if (!(this instanceof Parser))
@@ -37,12 +37,15 @@ function Parser(stream, debug) {
   this._literallen = 0;
   this._literals = [];
   this._buffer = '';
+  this._ignoreReadable = false;
   this.debug = debug;
 
   this.setStream(stream);
 
   var self = this;
   this._cbReadable = function() {
+    if (self._ignoreReadable)
+      return;
     if (self._literallen > 0 && !self._body)
       self._tryread(self._literallen);
     else
@@ -78,8 +81,7 @@ Parser.prototype._parse = function(data) {
       var body = this._body;
       if (datalen > this._literallen) {
         var litlen = this._literallen;
-
-        i = this._literallen;
+        i = litlen;
         this._literallen = 0;
         body.push(data.slice(0, litlen));
       } else {
@@ -128,8 +130,10 @@ Parser.prototype._parse = function(data) {
       this._resContinue();
 
     if (this._literallen > 0 && i < datalen) {
+      this._ignoreReadable = true;
       // literal data included in this chunk -- put it back onto stream
       this._stream.unshift(data.slice(i));
+      this._ignoreReadable = false;
       i = datalen;
       if (!this._body) {
         // check if unshifted contents satisfies non-body literal length
@@ -674,47 +678,154 @@ function convStr(str, literals) {
   return str;
 }
 
-function decodeBytes(buf, encoding) {
-  if (iconv.encodingExists(encoding))
-    return iconv.decode(buf, encoding);
-  else {
-    if (!jsencoding)
-      jsencoding = require('../deps/encoding/encoding');
-    if (jsencoding.encodingExists(encoding))
-      return jsencoding.TextDecoder(encoding).decode(buf);
-    else
-      return buf.toString('binary');
-  }
+function repeat(chr, len) {
+  var s = '';
+  for (var i = 0; i < len; ++i)
+    s += chr;
+  return s;
 }
 
-function decodeWords(str) {
-  return str.replace(RE_ENCWORD,
-    function(match, charset, encoding, word) {
-      encoding = encoding.toLowerCase();
-      if (encoding === 'q') {
-        // q-encoding, similar to quoted-printable
-        return decodeBytes(new Buffer(word.replace(RE_QENC,
-          function(match2, byte) {
-            if (match2 === '_')
-              return ' ';
-            else
-              return String.fromCharCode(parseInt(byte, 16));
-          }
-        ), 'binary'), charset);
+function decodeBytes(buf, encoding, offset, mlen, state) {
+  if (!jsencoding)
+    jsencoding = require('../deps/encoding/encoding');
+  if (jsencoding.encodingExists(encoding)) {
+    if (state.buffer !== undefined) {
+      if (state.encoding === encoding && state.consecutive) {
+        // concatenate buffer + current bytes in hopes of finally having
+        // something that's decodable
+        var newbuf = new Buffer(state.buffer.length + buf.length);
+        state.buffer.copy(newbuf, 0);
+        buf.copy(newbuf, state.buffer.length);
+        buf = newbuf;
+      } else {
+        // either:
+        //  - the current encoded word is not separated by the previous partial
+        //    encoded word by linear whitespace, OR
+        //  - the current encoded word and the previous partial encoded word
+        //    use different encodings
+        state.buffer = state.encoding = undefined;
+        state.curReplace = undefined;
+      }
+    }
+    var ret, isPartial = false;
+    try {
+      ret = jsencoding.TextDecoder(encoding).decode(buf);
+    } catch (e) {
+      if (e.message.indexOf('Seeking') === 0)
+        isPartial = true;
+    }
+    if (ret !== undefined) {
+      if (state.curReplace) {
+        // we have some previous partials which were finally "satisfied" by the
+        // current encoded word, so replace from the beginning of the first
+        // partial to the end of the current encoded word
+        state.replaces.push({
+          fromOffset: state.curReplace[0].fromOffset,
+          toOffset: offset + mlen,
+          val: ret
+        });
+        state.replaces.splice(state.replaces.indexOf(state.curReplace), 1);
+        state.curReplace = undefined;
       } else {
-        // base64
-        return decodeBytes(new Buffer(word, 'base64'), charset);
+        // normal case where there are no previous partials and we successfully
+        // decoded a single encoded word
+        state.replaces.push({
+          fromOffset: offset,
+          toOffset: offset + mlen,
+          val: ret
+        });
       }
+      state.buffer = state.encoding = undefined;
+      return;
+    } else if (isPartial) {
+      // RFC2047 says that each decoded encoded word "MUST represent an integral
+      // number of characters. A multi-octet character may not be split across
+      // adjacent encoded-words." However, some MUAs appear to go against this,
+      // so we join broken encoded words separated by linear white space until
+      // we can successfully decode or we see a change in encoding
+      state.encoding = encoding;
+      state.buffer = buf;
+      if (!state.curReplace)
+        state.replaces.push(state.curReplace = []);
+      state.curReplace.push({
+        fromOffset: offset,
+        toOffset: offset + mlen,
+        // the value we replace this encoded word with if it doesn't end up
+        // becoming part of a successful decode
+        val: repeat('\uFFFD', buf.length)
+      });
+      return;
+    }
+  }
+  // in case of unexpected error or unsupported encoding, just substitute the
+  // raw bytes
+  state.replaces.push({
+    fromOffset: offset,
+    toOffset: offset + mlen,
+    val: buf.toString('binary')
   });
 }
 
+function qEncReplacer(match, byte) {
+  if (match === '_')
+    return ' ';
+  else
+    return String.fromCharCode(parseInt(byte, 16));
+}
+function decodeWords(str, state) {
+  var pendoffset = -1;
+  state.replaces = [];
+
+  var bytes, m, i, len, j, lenj, seq;
+  // generate replacement substrings and their positions
+  while (m = RE_ENCWORD.exec(str)) {
+    state.consecutive = (pendoffset > -1
+                         ? RE_LWS_ONLY.test(str.substring(pendoffset, m.index))
+                         : false);
+    pendoffset = m.index + m[0].length;
+    if (m[2].toLowerCase() === 'q') {
+      // q-encoding, similar to quoted-printable
+      bytes = new Buffer(m[3].replace(RE_QENC, qEncReplacer), 'binary');
+    } else {
+      // base64
+      bytes = new Buffer(m[3], 'base64');
+    }
+    decodeBytes(bytes, m[1].toLowerCase(), m.index, m[0].length, state);
+  }
+
+  // perform the actual replacements
+  for (i = state.replaces.length - 1; i >= 0; --i) {
+    seq = state.replaces[i];
+    if (Array.isArray(seq)) {
+      for (j = 0, lenj = seq.length; j < lenj; ++j) {
+        str = str.substring(0, seq[j].fromOffset)
+              + seq[j].val
+              + str.substring(seq[j].toOffset);
+      }
+    } else {
+      str = str.substring(0, seq.fromOffset)
+            + seq.val
+            + str.substring(seq.toOffset);
+    }
+  }
+
+  return str;
+}
+
 function parseHeader(str, noDecode) {
   var lines = str.split(RE_CRLF),
       len = lines.length,
       header = {},
-      m, h, val;
-
-  for (var i = 0; i < len; ++i) {
+      state = {
+        buffer: undefined,
+        encoding: undefined,
+        consecutive: false,
+        replaces: undefined,
+        curReplace: undefined
+      },
+      m, h, i, val;
+
+  for (i = 0; i < len; ++i) {
     if (lines[i].length === 0)
       break; // empty line separates message's header and body
     if (lines[i][0] === '\t' || lines[i][0] === ' ') {
@@ -727,7 +838,6 @@ function parseHeader(str, noDecode) {
           // for adjacent encoded-words ...
           val = val.substring(1);
         }
-        val = decodeWords(val);
       }
       header[h][header[h].length - 1] += val;
     } else {
@@ -735,9 +845,6 @@ m = RE_HDR.exec(lines[i]);
       if (m) {
         h = m[1].toLowerCase().trim();
         if (m[2]) {
-          if (!noDecode)
-            m[2] = decodeWords(m[2]);
-
           if (header[h] === undefined)
             header[h] = [m[2]];
           else
@@ -748,6 +855,14 @@
         break;
     }
   }
+  if (!noDecode) {
+    var hvs;
+    for (h in header) {
+      hvs = header[h];
+      for (i = 0, len = header[h].length; i < len; ++i)
+        hvs[i] = decodeWords(hvs[i], state);
+    }
+  }
 
   return header;
 }
diff --git a/package.json b/package.json
index f35adae..d44b3d0 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,6 @@
   "main": "./lib/Connection",
   "dependencies": {
     "utf7": "1.0.0",
-    "iconv-lite": "0.2.11",
     "readable-stream": "1.0.15"
   },
   "scripts": {
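
Illustration (not part of the diff): a minimal sketch of the case the new partial-decode path targets, assuming parseHeader is exported from lib/Parser.js as it is used elsewhere in this module. A multi-byte UTF-8 character is split across two base64 encoded-words separated by a single space, which RFC 2047 forbids but some MUAs emit; when the bundled decoder reports the truncated first word with a 'Seeking' error, decodeBytes buffers its bytes and retries after the second word's bytes are appended, replacing the whole span with the joined result.

// sketch.js -- hypothetical test script, not included in this change
var parseHeader = require('./lib/Parser').parseHeader;

// 'café' as two encoded-words: the 0xC3 0xA9 pair for 'é' is split across
// the word boundary (first word ends with 0xC3, second word holds 0xA9)
var raw = 'Subject: =?utf-8?B?Y2Fmww==?= =?utf-8?B?qQ==?=\r\n\r\n';

console.log(parseHeader(raw).subject);
// expected with this change: [ 'café' ] (previously each word was decoded in isolation)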