Parser: fix base64 decoding of multi-byte characters split across consecutive encoded words

11 years ago · 4d3da4ab0f
parent bf6672d64e
commit 4d3da4ab0f
2 changed files with 56 additions and 11 deletions
--- a/lib/Parser.js
+++ b/lib/Parser.js
@ -744,7 +744,7 @@ function repeat(chr, len) {
  return s;
 }

-function decodeBytes(buf, encoding, offset, mlen, pendoffset, state) {
+function decodeBytes(buf, encoding, offset, mlen, pendoffset, state, nextBuf) {
  if (!jsencoding)
    jsencoding = require('../deps/encoding/encoding');
  if (jsencoding.encodingExists(encoding)) {
@ -767,11 +767,39 @@ function decodeBytes(buf, encoding, offset, mlen, pendoffset, state) {
      }
    }
    var ret, isPartial = false;
-    try {
-      ret = jsencoding.TextDecoder(encoding).decode(buf);
-    } catch (e) {
-      if (e.message.indexOf('Seeking') === 0)
-        isPartial = true;
+    if (state.remainder !== undefined) {
+      // use cached remainder from the previous lookahead
+      ret = state.remainder;
+      state.remainder = undefined;
+    } else {
+      try {
+        ret = jsencoding.TextDecoder(encoding).decode(buf);
+      } catch (e) {
+        if (e.message.indexOf('Seeking') === 0)
+          isPartial = true;
+      }
+    }
+    if (!isPartial && nextBuf) {
+      // try to decode a lookahead buffer (current buffer + next buffer)
+      // and see if it starts with the decoded value of the current buffer.
+      // if not, the current buffer is partial
+      var lookahead, lookaheadBuf = new Buffer(buf.length + nextBuf.length);
+      buf.copy(lookaheadBuf);
+      nextBuf.copy(lookaheadBuf, buf.length);
+      try {
+        lookahead = jsencoding.TextDecoder(encoding).decode(lookaheadBuf);
+      } catch(e) {
+        // cannot decode the lookahead, do nothing
+      }
+      if (lookahead !== undefined) {
+        if (lookahead.indexOf(ret) === 0) {
+          // the current buffer is whole, cache the lookahead's remainder
+          state.remainder = lookahead.substring(ret.length);
+        } else {
+          isPartial = true;
+          ret = undefined;
+        }
+      }
    }
    if (ret !== undefined) {
      if (state.curReplace) {
@ -836,7 +864,7 @@ function decodeWords(str, state) {
  var pendoffset = -1;
  state.replaces = [];

-  var bytes, m, i, j, leni, lenj, seq, replaces = [], lastReplace = {};
+  var bytes, m, next, i, j, leni, lenj, seq, replaces = [], lastReplace = {};
  
  // join consecutive q-encoded words that have the same charset first
  while (m = RE_ENCWORD.exec(str)) {
@ -850,6 +878,7 @@ function decodeWords(str, state) {
      index: m.index,
      length: m[0].length,
      pendoffset: pendoffset,
+      buf: undefined
    };
    lastReplace = replaces.length && replaces[replaces.length - 1];
    if (seq.consecutive
@ -872,11 +901,20 @@ function decodeWords(str, state) {
    if (m.encoding === 'q') {
      // q-encoding, similar to quoted-printable
      bytes = new Buffer(m.chunk.replace(RE_QENC, qEncReplacer), 'binary');
+      next = undefined;
    } else {
      // base64
-      bytes = new Buffer(m.chunk, 'base64');
+      bytes = m.buf || new Buffer(m.chunk, 'base64');
+      next = replaces[i + 1];
+      if (next && next.consecutive && next.encoding === m.encoding
+        && next.charset === m.charset) {
+        // we use the next base64 chunk, if any, to determine the integrity
+        // of the current chunk
+        next.buf = new Buffer(next.chunk, 'base64');
+      }
    }
-    decodeBytes(bytes, m.charset, m.index, m.length, m.pendoffset, state);
+    decodeBytes(bytes, m.charset, m.index, m.length, m.pendoffset, state,
+      next && next.buf);
  }

  // perform the actual replacements
@ -907,7 +945,8 @@ function parseHeader(str, noDecode) {
        encoding: undefined,
        consecutive: false,
        replaces: undefined,
-        curReplace: undefined
+        curReplace: undefined,
+        remainder: undefined
      },
      m, h, i, val;

--- a/test/test-parse-header.js
+++ b/test/test-parse-header.js
@ -62,7 +62,13 @@ var CRLF = '\r\n';
             ' =?utf-8?B?4Liy4LmB4Lib4Lil4LiBIOC5hiDguKPguK3=?=', CRLF,
             ' =?utf-8?Q?=E0=B8=9A=E0=B9=82=E0=B8=A5=E0=B8=81?=', CRLF],
    expected: { subject: [ 'FW: สิ่งมีชีวิตหน้าตาแปลก ๆ รอบโลก' ] },
-    what: 'Folded header value (consecutive base64-encoded words)'
+    what: 'Folded header value (consecutive complete base64-encoded words)'
+  },
+  { source: ['Subject: =?utf-8?B?4Lij4Li54Lib4Lig4Liy4Lie4LiX4Li14LmIIGVtYmVkIOC5g+C4meC5gOC4?=', CRLF,
+             ' =?utf-8?B?meC4t+C5ieC4reC5gOC4oeC4peC4peC5jOC5hOC4oeC5iOC5geC4quC4lOC4?=', CRLF,
+             ' =?utf-8?B?hw==?=', CRLF],
+    expected: { subject: [ 'รูปภาพที่ embed ในเนื้อเมลล์ไม่แสดง' ] },
+    what: 'Folded header value (consecutive partial base64-encoded words)'
  },
  // header with body
  { source: ['Subject: test subject', CRLF,