diff --git a/lib/Parser.js b/lib/Parser.js index bf1ebfd..fb7b066 100644 --- a/lib/Parser.js +++ b/lib/Parser.js @@ -744,7 +744,7 @@ function repeat(chr, len) { return s; } -function decodeBytes(buf, encoding, offset, mlen, pendoffset, state) { +function decodeBytes(buf, encoding, offset, mlen, pendoffset, state, nextBuf) { if (!jsencoding) jsencoding = require('../deps/encoding/encoding'); if (jsencoding.encodingExists(encoding)) { @@ -767,11 +767,39 @@ function decodeBytes(buf, encoding, offset, mlen, pendoffset, state) { } } var ret, isPartial = false; - try { - ret = jsencoding.TextDecoder(encoding).decode(buf); - } catch (e) { - if (e.message.indexOf('Seeking') === 0) - isPartial = true; + if (state.remainder !== undefined) { + // use cached remainder from the previous lookahead + ret = state.remainder; + state.remainder = undefined; + } else { + try { + ret = jsencoding.TextDecoder(encoding).decode(buf); + } catch (e) { + if (e.message.indexOf('Seeking') === 0) + isPartial = true; + } + } + if (!isPartial && nextBuf) { + // try to decode a lookahead buffer (current buffer + next buffer) + // and see if it starts with the decoded value of the current buffer. + // if not, the current buffer is partial + var lookahead, lookaheadBuf = new Buffer(buf.length + nextBuf.length); + buf.copy(lookaheadBuf); + nextBuf.copy(lookaheadBuf, buf.length); + try { + lookahead = jsencoding.TextDecoder(encoding).decode(lookaheadBuf); + } catch(e) { + // cannot decode the lookahead, do nothing + } + if (lookahead !== undefined) { + if (lookahead.indexOf(ret) === 0) { + // the current buffer is whole, cache the lookahead's remainder + state.remainder = lookahead.substring(ret.length); + } else { + isPartial = true; + ret = undefined; + } + } } if (ret !== undefined) { if (state.curReplace) { @@ -836,7 +864,7 @@ function decodeWords(str, state) { var pendoffset = -1; state.replaces = []; - var bytes, m, i, j, leni, lenj, seq, replaces = [], lastReplace = {}; + var bytes, m, next, i, j, leni, lenj, seq, replaces = [], lastReplace = {}; // join consecutive q-encoded words that have the same charset first while (m = RE_ENCWORD.exec(str)) { @@ -850,6 +878,7 @@ function decodeWords(str, state) { index: m.index, length: m[0].length, pendoffset: pendoffset, + buf: undefined }; lastReplace = replaces.length && replaces[replaces.length - 1]; if (seq.consecutive @@ -872,11 +901,20 @@ function decodeWords(str, state) { if (m.encoding === 'q') { // q-encoding, similar to quoted-printable bytes = new Buffer(m.chunk.replace(RE_QENC, qEncReplacer), 'binary'); + next = undefined; } else { // base64 - bytes = new Buffer(m.chunk, 'base64'); + bytes = m.buf || new Buffer(m.chunk, 'base64'); + next = replaces[i + 1]; + if (next && next.consecutive && next.encoding === m.encoding + && next.charset === m.charset) { + // we use the next base64 chunk, if any, to determine the integrity + // of the current chunk + next.buf = new Buffer(next.chunk, 'base64'); + } } - decodeBytes(bytes, m.charset, m.index, m.length, m.pendoffset, state); + decodeBytes(bytes, m.charset, m.index, m.length, m.pendoffset, state, + next && next.buf); } // perform the actual replacements @@ -907,7 +945,8 @@ function parseHeader(str, noDecode) { encoding: undefined, consecutive: false, replaces: undefined, - curReplace: undefined + curReplace: undefined, + remainder: undefined }, m, h, i, val; diff --git a/test/test-parse-header.js b/test/test-parse-header.js index 2f7cc2c..d86c137 100644 --- a/test/test-parse-header.js +++ b/test/test-parse-header.js @@ -62,7 +62,13 @@ var CRLF = '\r\n'; ' =?utf-8?B?4Liy4LmB4Lib4Lil4LiBIOC5hiDguKPguK3=?=', CRLF, ' =?utf-8?Q?=E0=B8=9A=E0=B9=82=E0=B8=A5=E0=B8=81?=', CRLF], expected: { subject: [ 'FW: สิ่งมีชีวิตหน้าตาแปลก ๆ รอบโลก' ] }, - what: 'Folded header value (consecutive base64-encoded words)' + what: 'Folded header value (consecutive complete base64-encoded words)' + }, + { source: ['Subject: =?utf-8?B?4Lij4Li54Lib4Lig4Liy4Lie4LiX4Li14LmIIGVtYmVkIOC5g+C4meC5gOC4?=', CRLF, + ' =?utf-8?B?meC4t+C5ieC4reC5gOC4oeC4peC4peC5jOC5hOC4oeC5iOC5geC4quC4lOC4?=', CRLF, + ' =?utf-8?B?hw==?=', CRLF], + expected: { subject: [ 'รูปภาพที่ embed ในเนื้อเมลล์ไม่แสดง' ] }, + what: 'Folded header value (consecutive partial base64-encoded words)' }, // header with body { source: ['Subject: test subject', CRLF,