Parser: attempt workaround for broken MIME encoded words

Some MUAs appear to split multi-byte characters across encoded-word boundaries, which violates RFC 2047. This change attempts to work around that by concatenating the bytes of broken encoded words that are separated only by linear white space and decoding them together.
11 years ago · a3a1d158c5
parent f2010896e9
commit a3a1d158c5
2 changed files with 154 additions and 40 deletions
--- a/lib/Parser.js
+++ b/lib/Parser.js
@ -3,8 +3,7 @@ var EventEmitter = require('events').EventEmitter,
    inherits = require('util').inherits,
    inspect = require('util').inspect,
    utf7 = require('utf7').imap,
-    iconv = require('iconv-lite'),
-    jsencoding/*lazy-loaded*/;
+    jsencoding; // lazy-loaded

 var CH_LF = 10,
    LITPLACEHOLDER = String.fromCharCode(0),
@ -24,7 +23,8 @@ var CH_LF = 10,
    RE_ENCWORD_END = /=\?([^?]*?)\?([qb])\?(.*?)\?=$/i,
    RE_ENCWORD_BEGIN = /^[ \t]=\?([^?]*?)\?([qb])\?(.*?)\?=/i,
    RE_QENC = /(?:=([a-fA-F0-9]{2}))|_/g,
-    RE_SEARCH_MODSEQ = /^(.+) \(MODSEQ (.+?)\)$/i;
+    RE_SEARCH_MODSEQ = /^(.+) \(MODSEQ (.+?)\)$/i,
+    RE_LWS_ONLY = /^[ \t]*$/;

 function Parser(stream, debug) {
  if (!(this instanceof Parser))
@ -37,12 +37,15 @@ function Parser(stream, debug) {
  this._literallen = 0;
  this._literals = [];
  this._buffer = '';
+  this._ignoreReadable = false;
  this.debug = debug;

  this.setStream(stream);

  var self = this;
  this._cbReadable = function() {
+    if (self._ignoreReadable)
+      return;
    if (self._literallen > 0 && !self._body)
      self._tryread(self._literallen);
    else
@ -78,8 +81,7 @@ Parser.prototype._parse = function(data) {
      var body = this._body;
      if (datalen > this._literallen) {
        var litlen = this._literallen;
-
-        i = this._literallen;
+        i = litlen;
        this._literallen = 0;
        body.push(data.slice(0, litlen));
      } else {
@ -128,8 +130,10 @@ Parser.prototype._parse = function(data) {
          this._resContinue();

        if (this._literallen > 0 && i < datalen) {
+          this._ignoreReadable = true;
          // literal data included in this chunk -- put it back onto stream
          this._stream.unshift(data.slice(i));
+          this._ignoreReadable = false;
          i = datalen;
          if (!this._body) {
            // check if unshifted contents satisfies non-body literal length
@ -674,47 +678,154 @@ function convStr(str, literals) {
  return str;
 }

-function decodeBytes(buf, encoding) {
-  if (iconv.encodingExists(encoding))
-    return iconv.decode(buf, encoding);
-  else {
-    if (!jsencoding)
-      jsencoding = require('../deps/encoding/encoding');
-    if (jsencoding.encodingExists(encoding))
-      return jsencoding.TextDecoder(encoding).decode(buf);
-    else
-      return buf.toString('binary');
-  }
+// Builds a string made of `chr` repeated `len` times (empty when `len` is
+// not greater than zero).
+function repeat(chr, len) {
+  var out = '',
+      remaining = len;
+  while (remaining-- > 0)
+    out += chr;
+  return out;
 }

-function decodeWords(str) {
-  return str.replace(RE_ENCWORD,
-    function(match, charset, encoding, word) {
-      encoding = encoding.toLowerCase();
-      if (encoding === 'q') {
-        // q-encoding, similar to quoted-printable
-        return decodeBytes(new Buffer(word.replace(RE_QENC,
-          function(match2, byte) {
-            if (match2 === '_')
-              return ' ';
-            else
-              return String.fromCharCode(parseInt(byte, 16));
-          }
-        ), 'binary'), charset);
+// Decodes the raw bytes of a single encoded word and records the text that
+// should replace it in `state.replaces`. Nothing is returned; the caller picks
+// the results up from `state`.
+//
+//   buf      - Buffer with the bytes of the encoded word (the caller has
+//              already undone the q-/base64 transfer encoding)
+//   encoding - charset name of the encoded word
+//   offset   - index of the encoded word within the string being decoded
+//   mlen     - length of the encoded word's source text, so that
+//              `offset + mlen` is the end of the span to replace
+//   state    - shared decoder state:
+//                buffer/encoding - bytes and charset of a pending partial word
+//                consecutive     - read here to decide whether the pending
+//                                  bytes may be joined with `buf`
+//                curReplace      - placeholder entries for the pending partials
+//                replaces        - list of all recorded replacements
+//
+// Outcomes:
+//   - decode succeeds: one replacement is recorded (spanning any pending
+//     partial words as well) and the pending state is cleared
+//   - decode fails with a 'Seeking...' error: the bytes are kept in
+//     `state.buffer` and a U+FFFD placeholder is recorded for this word
+//   - unsupported charset or any other decode error: the bytes are substituted
+//     as a 'binary' string
+function decodeBytes(buf, encoding, offset, mlen, state) {
+  // the decoder library is only loaded the first time it is needed
+  if (!jsencoding)
+    jsencoding = require('../deps/encoding/encoding');
+  if (jsencoding.encodingExists(encoding)) {
+    if (state.buffer !== undefined) {
+      if (state.encoding === encoding && state.consecutive) {
+        // concatenate buffer + current bytes in hopes of finally having
+        // something that's decodable
+        var newbuf = new Buffer(state.buffer.length + buf.length);
+        state.buffer.copy(newbuf, 0);
+        buf.copy(newbuf, state.buffer.length);
+        buf = newbuf;
+      } else {
+        // either:
+        //   - the current encoded word is not separated by the previous partial
+        //     encoded word by linear whitespace, OR
+        //   - the current encoded word and the previous partial encoded word
+        //     use different encodings
+        state.buffer = state.encoding = undefined;
+        state.curReplace = undefined;
+      }
+    }
+    var ret, isPartial = false;
+    try {
+      ret = jsencoding.TextDecoder(encoding).decode(buf);
+    } catch (e) {
+      // presumably the decoder reports input that ends in the middle of a
+      // character with a message starting with 'Seeking' -- confirm against
+      // the bundled encoding library
+      // NOTE(review): any other error is silently dropped here and ends up in
+      // the raw-bytes fallback at the bottom of the function
+      if (e.message.indexOf('Seeking') === 0)
+        isPartial = true;
+    }
+    if (ret !== undefined) {
+      if (state.curReplace) {
+        // we have some previous partials which were finally "satisfied" by the
+        // current encoded word, so replace from the beginning of the first
+        // partial to the end of the current encoded word
+        state.replaces.push({
+          fromOffset: state.curReplace[0].fromOffset,
+          toOffset: offset + mlen,
+          val: ret
+        });
+        // the merged replacement supersedes the list of placeholders
+        state.replaces.splice(state.replaces.indexOf(state.curReplace), 1);
+        state.curReplace = undefined;
       } else {
-        // base64
-        return decodeBytes(new Buffer(word, 'base64'), charset);
+        // normal case where there are no previous partials and we successfully
+        // decoded a single encoded word
+        state.replaces.push({
+          fromOffset: offset,
+          toOffset: offset + mlen,
+          val: ret
+        });
       }
+      state.buffer = state.encoding = undefined;
+      return;
+    } else if (isPartial) {
+      // RFC2047 says that each decoded encoded word "MUST represent an integral
+      // number of characters. A multi-octet character may not be split across 
+      // adjacent encoded-words." However, some MUAs appear to go against this,
+      // so we join broken encoded words separated by linear white space until
+      // we can successfully decode or we see a change in encoding
+      state.encoding = encoding;
+      state.buffer = buf;
+      // the first partial of a run creates the placeholder list and registers
+      // it (as an array) in state.replaces; later partials append to it
+      if (!state.curReplace)
+        state.replaces.push(state.curReplace = []);
+      state.curReplace.push({
+        fromOffset: offset,
+        toOffset: offset + mlen,
+        // the value we replace this encoded word with if it doesn't end up
+        // becoming part of a successful decode
+        // NOTE(review): `buf` may be the concatenation built above, in which
+        // case the placeholder length also counts the bytes of the earlier
+        // partial words -- confirm this is intended
+        val: repeat('\uFFFD', buf.length)
+      });
+      return;
+    }
+  }
+  // in case of unexpected error or unsupported encoding, just substitute the
+  // raw bytes
+  state.replaces.push({
+    fromOffset: offset,
+    toOffset: offset + mlen,
+    val: buf.toString('binary')
   });
 }

+// Replacement callback used with RE_QENC: an underscore stands for a space,
+// anything else is an "=XX" escape whose two hex digits arrive in `byte`.
+function qEncReplacer(match, byte) {
+  return (match === '_'
+          ? ' '
+          : String.fromCharCode(parseInt(byte, 16)));
+}
+// Decodes every RFC 2047 encoded word found in `str`.
+//
+// Each RE_ENCWORD match is turned into raw bytes (q-encoding or base64) and
+// handed to decodeBytes(), which records the replacement text together with
+// the span it covers in `state.replaces`. Once the whole string has been
+// scanned the recorded replacements are applied to `str`.
+//
+//   str   - header value that may contain encoded words
+//   state - decoder state shared with decodeBytes(); `state.replaces` is reset
+//           on every call and `state.consecutive` is updated for every match
+//
+// Returns `str` with all recorded replacements applied.
+function decodeWords(str, state) {
+  var pendoffset = -1,
+      bytes, m, i, j, seq;
+
+  state.replaces = [];
+
+  // the exec() loop below keeps its position in RE_ENCWORD.lastIndex; start
+  // from the beginning in case an earlier scan was cut short by an exception
+  RE_ENCWORD.lastIndex = 0;
+
+  // generate replacement substrings and their positions
+  while (m = RE_ENCWORD.exec(str)) {
+    // is there nothing but linear white space between the previous encoded
+    // word and this one?
+    state.consecutive = (pendoffset > -1
+                         ? RE_LWS_ONLY.test(str.substring(pendoffset, m.index))
+                         : false);
+    pendoffset = m.index + m[0].length;
+    if (m[2].toLowerCase() === 'q') {
+      // q-encoding, similar to quoted-printable
+      bytes = new Buffer(m[3].replace(RE_QENC, qEncReplacer), 'binary');
+    } else {
+      // base64
+      bytes = new Buffer(m[3], 'base64');
+    }
+    decodeBytes(bytes, m[1].toLowerCase(), m.index, m[0].length, state);
+  }
+
+  // perform the actual replacements. Every recorded offset refers to the
+  // original string, so the replacements are applied back to front: that way
+  // the text in front of the current position is never shifted.
+  for (i = state.replaces.length - 1; i >= 0; --i) {
+    seq = state.replaces[i];
+    // an array entry is a run of partial encoded words that never became
+    // decodable; its placeholders differ in length from the words they
+    // replace, so the run has to be walked backwards as well
+    if (!Array.isArray(seq))
+      seq = [seq];
+    for (j = seq.length - 1; j >= 0; --j) {
+      str = str.substring(0, seq[j].fromOffset)
+            + seq[j].val
+            + str.substring(seq[j].toOffset);
+    }
+  }
+
+  return str;
+}
+
 function parseHeader(str, noDecode) {
  var lines = str.split(RE_CRLF),
      len = lines.length,
      header = {},
-      m, h, val;
-
-  for (var i = 0; i < len; ++i) {
+      state = {
+        buffer: undefined,
+        encoding: undefined,
+        consecutive: false,
+        replaces: undefined,
+        curReplace: undefined
+      },
+      m, h, i, val;
+
+  for (i = 0; i < len; ++i) {
    if (lines[i].length === 0)
      break; // empty line separates message's header and body
    if (lines[i][0] === '\t' || lines[i][0] === ' ') {
@ -727,7 +838,6 @@ function parseHeader(str, noDecode) {
          // for adjacent encoded-words ...
          val = val.substring(1);
        }
-        val = decodeWords(val);
      }
      header[h][header[h].length - 1] += val;
    } else {
@ -735,9 +845,6 @@ function parseHeader(str, noDecode) {
      if (m) {
        h = m[1].toLowerCase().trim();
        if (m[2]) {
-          if (!noDecode)
-            m[2] = decodeWords(m[2]);
-
          if (header[h] === undefined)
            header[h] = [m[2]];
          else
@ -748,6 +855,14 @@ function parseHeader(str, noDecode) {
        break;
    }
  }
+  if (!noDecode) {
+    var hvs;
+    for (h in header) {
+      hvs = header[h];
+      for (i = 0, len = header[h].length; i < len; ++i)
+        hvs[i] = decodeWords(hvs[i], state);
+    }
+  }

  return header;
 }
--- a/package.json
+++ b/package.json
@ -5,7 +5,6 @@
  "main": "./lib/Connection",
  "dependencies": {
    "utf7": "1.0.0",
-    "iconv-lite": "0.2.11",
    "readable-stream": "1.0.15"
  },
  "scripts": {