Parser: attempt workaround for broken MIME encoded words

Some MUAs appear to split multi-byte characters across encoded-word boundaries, which violates RFC 2047.

This change works around the problem by concatenating the bytes of such broken encoded words when they are separated only by linear white space, and then decoding the combined bytes.
fork
Brian White 11 years ago
parent f2010896e9
commit a3a1d158c5

@ -3,8 +3,7 @@ var EventEmitter = require('events').EventEmitter,
inherits = require('util').inherits,
inspect = require('util').inspect,
utf7 = require('utf7').imap,
iconv = require('iconv-lite'),
jsencoding/*lazy-loaded*/;
jsencoding; // lazy-loaded
var CH_LF = 10,
LITPLACEHOLDER = String.fromCharCode(0),
@ -24,7 +23,8 @@ var CH_LF = 10,
RE_ENCWORD_END = /=\?([^?]*?)\?([qb])\?(.*?)\?=$/i,
RE_ENCWORD_BEGIN = /^[ \t]=\?([^?]*?)\?([qb])\?(.*?)\?=/i,
RE_QENC = /(?:=([a-fA-F0-9]{2}))|_/g,
RE_SEARCH_MODSEQ = /^(.+) \(MODSEQ (.+?)\)$/i;
RE_SEARCH_MODSEQ = /^(.+) \(MODSEQ (.+?)\)$/i,
RE_LWS_ONLY = /^[ \t]*$/;
function Parser(stream, debug) {
if (!(this instanceof Parser))
@ -37,12 +37,15 @@ function Parser(stream, debug) {
this._literallen = 0;
this._literals = [];
this._buffer = '';
this._ignoreReadable = false;
this.debug = debug;
this.setStream(stream);
var self = this;
this._cbReadable = function() {
if (self._ignoreReadable)
return;
if (self._literallen > 0 && !self._body)
self._tryread(self._literallen);
else
@ -78,8 +81,7 @@ Parser.prototype._parse = function(data) {
var body = this._body;
if (datalen > this._literallen) {
var litlen = this._literallen;
i = this._literallen;
i = litlen;
this._literallen = 0;
body.push(data.slice(0, litlen));
} else {
@ -128,8 +130,10 @@ Parser.prototype._parse = function(data) {
this._resContinue();
if (this._literallen > 0 && i < datalen) {
this._ignoreReadable = true;
// literal data included in this chunk -- put it back onto stream
this._stream.unshift(data.slice(i));
this._ignoreReadable = false;
i = datalen;
if (!this._body) {
// check if unshifted contents satisfies non-body literal length
@ -674,47 +678,154 @@ function convStr(str, literals) {
return str;
}
function decodeBytes(buf, encoding) {
if (iconv.encodingExists(encoding))
return iconv.decode(buf, encoding);
else {
if (!jsencoding)
jsencoding = require('../deps/encoding/encoding');
if (jsencoding.encodingExists(encoding))
return jsencoding.TextDecoder(encoding).decode(buf);
else
return buf.toString('binary');
}
// Builds a string made of `chr` appended once per loop step, counting `len`
// down to zero. Yields '' when `len` is not greater than zero.
function repeat(chr, len) {
  var out = '',
      remaining = len;
  while (remaining > 0) {
    out += chr;
    --remaining;
  }
  return out;
}
function decodeWords(str) {
return str.replace(RE_ENCWORD,
function(match, charset, encoding, word) {
encoding = encoding.toLowerCase();
if (encoding === 'q') {
// q-encoding, similar to quoted-printable
return decodeBytes(new Buffer(word.replace(RE_QENC,
function(match2, byte) {
if (match2 === '_')
return ' ';
else
return String.fromCharCode(parseInt(byte, 16));
}
), 'binary'), charset);
// Decodes the bytes of one encoded word and records the outcome as a
// replacement in `state.replaces`. Nothing is returned.
//
//   buf      - Buffer with the bytes of the current encoded word
//   encoding - charset label of the current encoded word
//   offset   - index of the encoded word within the string being decoded
//   mlen     - length of the encoded word's source text
//   state    - decode state shared between calls:
//                buffer/encoding - bytes and charset of pending partial words
//                consecutive     - set by the caller; true when only linear
//                                  white space separates this word from the
//                                  previous one
//                replaces        - list that replacement records are pushed to
//                curReplace      - array (also stored in `replaces`) holding the
//                                  placeholder records of the pending partials
function decodeBytes(buf, encoding, offset, mlen, state) {
  // bytes of this encoded word only; `buf` may be swapped for a concatenation
  // with earlier partial bytes below
  var wordBytes = buf;

  if (!jsencoding)
    jsencoding = require('../deps/encoding/encoding');

  if (jsencoding.encodingExists(encoding)) {
    if (state.buffer !== undefined) {
      if (state.encoding === encoding && state.consecutive) {
        // concatenate buffer + current bytes in hopes of finally having
        // something that's decodable
        buf = Buffer.concat([state.buffer, buf]);
      } else {
        // either:
        //   - the current encoded word is not separated from the previous
        //     partial encoded word by linear whitespace, OR
        //   - the current encoded word and the previous partial encoded word
        //     use different encodings
        state.buffer = state.encoding = undefined;
        state.curReplace = undefined;
      }
    }

    var ret, isPartial = false;
    try {
      ret = jsencoding.TextDecoder(encoding).decode(buf);
    } catch (e) {
      // an error message starting with 'Seeking' is treated as "these bytes
      // end in the middle of a character" (presumably how the bundled decoder
      // reports running out of input -- verify against deps/encoding)
      if (e && typeof e.message === 'string'
          && e.message.indexOf('Seeking') === 0)
        isPartial = true;
    }

    if (ret !== undefined) {
      if (state.curReplace) {
        // we have some previous partials which were finally "satisfied" by the
        // current encoded word, so replace from the beginning of the first
        // partial to the end of the current encoded word
        state.replaces.push({
          fromOffset: state.curReplace[0].fromOffset,
          toOffset: offset + mlen,
          val: ret
        });
        // the placeholder records are superseded by the record just pushed
        state.replaces.splice(state.replaces.indexOf(state.curReplace), 1);
        state.curReplace = undefined;
      } else {
        // normal case where there are no previous partials and we successfully
        // decoded a single encoded word
        state.replaces.push({
          fromOffset: offset,
          toOffset: offset + mlen,
          val: ret
        });
      }
      state.buffer = state.encoding = undefined;
      return;
    } else if (isPartial) {
      // RFC2047 says that each decoded encoded word "MUST represent an integral
      // number of characters. A multi-octet character may not be split across
      // adjacent encoded-words." However, some MUAs appear to go against this,
      // so we join broken encoded words separated by linear white space until
      // we can successfully decode or we see a change in encoding
      state.encoding = encoding;
      state.buffer = buf;
      if (!state.curReplace)
        state.replaces.push(state.curReplace = []);
      state.curReplace.push({
        fromOffset: offset,
        toOffset: offset + mlen,
        // the value we replace this encoded word with if it doesn't end up
        // becoming part of a successful decode
        val: repeat('\uFFFD', buf.length)
      });
      return;
    }
  }

  // in case of unexpected error or unsupported encoding, just substitute the
  // raw bytes of this word. Any pending partials are abandoned here (their
  // placeholder records stay in `state.replaces`) so that a later word is not
  // joined onto stale bytes across this one.
  state.buffer = state.encoding = undefined;
  state.curReplace = undefined;
  state.replaces.push({
    fromOffset: offset,
    toOffset: offset + mlen,
    val: wordBytes.toString('binary')
  });
}
// Replacement callback used with RE_QENC when decoding q-encoded words:
// an underscore becomes a space, anything else is turned into the character
// whose code is the captured two-digit hex value.
function qEncReplacer(match, byte) {
  return (match === '_'
          ? ' '
          : String.fromCharCode(parseInt(byte, 16)));
}
// Replaces each encoded word that RE_ENCWORD finds in `str` with the value
// recorded for it by decodeBytes() and returns the resulting string.
//
//   str   - header value that may contain encoded words
//   state - decode state object shared with decodeBytes(); `replaces` is reset
//           here and `consecutive` is updated for every encoded word found
//
// NOTE(review): the exec() loop relies on RE_ENCWORD (defined elsewhere in the
// file) carrying the global flag -- confirm.
function decodeWords(str, state) {
  var pendoffset = -1,
      bytes, m, i, j, seq;

  state.replaces = [];

  // generate replacement substrings and their positions
  while (m = RE_ENCWORD.exec(str)) {
    // the word is "consecutive" when only linear white space lies between the
    // end of the previous encoded word and the start of this one
    state.consecutive = (pendoffset > -1
                         ? RE_LWS_ONLY.test(str.substring(pendoffset, m.index))
                         : false);
    pendoffset = m.index + m[0].length;

    if (m[2].toLowerCase() === 'q') {
      // q-encoding, similar to quoted-printable
      bytes = new Buffer(m[3].replace(RE_QENC, qEncReplacer), 'binary');
    } else {
      // base64
      bytes = new Buffer(m[3], 'base64');
    }
    decodeBytes(bytes, m[1].toLowerCase(), m.index, m[0].length, state);
  }

  // perform the actual replacements. All offsets refer to the original string,
  // so every replacement is applied back to front; that way a replacement
  // never shifts text that a not-yet-applied record still points at.
  for (i = state.replaces.length - 1; i >= 0; --i) {
    seq = state.replaces[i];
    if (Array.isArray(seq)) {
      // group of partial words that never became decodable: its records are
      // in ascending offset order, so walk them in reverse as well
      for (j = seq.length - 1; j >= 0; --j) {
        str = str.substring(0, seq[j].fromOffset)
              + seq[j].val
              + str.substring(seq[j].toOffset);
      }
    } else {
      str = str.substring(0, seq.fromOffset)
            + seq.val
            + str.substring(seq.toOffset);
    }
  }

  return str;
}
function parseHeader(str, noDecode) {
var lines = str.split(RE_CRLF),
len = lines.length,
header = {},
m, h, val;
for (var i = 0; i < len; ++i) {
state = {
buffer: undefined,
encoding: undefined,
consecutive: false,
replaces: undefined,
curReplace: undefined
},
m, h, i, val;
for (i = 0; i < len; ++i) {
if (lines[i].length === 0)
break; // empty line separates message's header and body
if (lines[i][0] === '\t' || lines[i][0] === ' ') {
@ -727,7 +838,6 @@ function parseHeader(str, noDecode) {
// for adjacent encoded-words ...
val = val.substring(1);
}
val = decodeWords(val);
}
header[h][header[h].length - 1] += val;
} else {
@ -735,9 +845,6 @@ function parseHeader(str, noDecode) {
if (m) {
h = m[1].toLowerCase().trim();
if (m[2]) {
if (!noDecode)
m[2] = decodeWords(m[2]);
if (header[h] === undefined)
header[h] = [m[2]];
else
@ -748,6 +855,14 @@ function parseHeader(str, noDecode) {
break;
}
}
if (!noDecode) {
var hvs;
for (h in header) {
hvs = header[h];
for (i = 0, len = header[h].length; i < len; ++i)
hvs[i] = decodeWords(hvs[i], state);
}
}
return header;
}

@ -5,7 +5,6 @@
"main": "./lib/Connection",
"dependencies": {
"utf7": "1.0.0",
"iconv-lite": "0.2.11",
"readable-stream": "1.0.15"
},
"scripts": {

Loading…
Cancel
Save