// Mirror of https://github.com/torappinfo/uweb.git (synced 2025-01-15 16:40:57 +01:00).
// 939 lines, 37 KiB, JavaScript.
//
//define(['pako', 'lzo', 'ripemd128', 'bluebird', 'mdict-parseXml', 'mdict-MCommon']
//pako, lzo, ripemd128, Promise, parseXml, MCommon
/**
 * Parse an XML string into a DOM document.
 * @param str XML source text
 * @return the document returned by DOMParser#parseFromString for type 'text/xml'
 */
var parseXml = function (str) {
  var parser = new DOMParser();
  return parser.parseFromString(str, 'text/xml');
};
var MParser = (function (){
|
||
|
// The undefined value, obtained through `void 0`.
var UNDEFINED = void 0;

// A shared UTF-16LE text decoder used to read the dictionary header string.
var UTF_16LE = new TextDecoder('utf-16le');

/**
 * Return the first argument as result.
 * Used to express a sequence in a single expression: all arguments are evaluated
 * left to right by the caller (e.g. "read a value", then "move forward"), and the
 * value of the first one is handed back.
 * @param first any value or function call result; further arguments are ignored
 * @return the first argument
 */
function conseq(first /*, ignored... */) {
  return first;
}

/**
 * Decrypt encrypted data block of keyword index (attrs.Encrypted = "2").
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-index-encryption
 * @param buf indexable byte sequence (the Scanner passes a Uint8Array) holding the source data;
 *            it is decrypted in place
 * @param key byte sequence holding the decryption key, which is passed through ripemd128() first
 * @return buf itself, now carrying the decrypted data
 */
function decrypt(buf, key) {
  key = ripemd128(key);
  var keylen = key.length,
      prev = 0x36,          // seed for the "previous encrypted byte" chain
      len = buf.length,
      cur, swapped, i;

  for (i = 0; i < len; i++) {
    cur = buf[i];
    // Swap the two nibbles. Mask explicitly so the result is a byte even when
    // buf is not a typed array that would truncate on store.
    swapped = ((cur >> 4) | (cur << 4)) & 0xFF;
    buf[i] = swapped ^ prev ^ (i & 0xFF) ^ key[i % keylen];
    prev = cur;             // chain on the *encrypted* byte, not the decrypted one
  }
  return buf;
}

/**
 * For sliceThen(..).exec(proc, ..): mark that what proc returns is a list of multiple
 * values to be passed on to a further Promise#spread(..) call.
 * @param any number of values
 * @return an array of the given values, tagged with a truthy `_spreadus_` property
 */
function spreadus() {
  var values = Array.prototype.slice.call(arguments);
  values._spreadus_ = true;
  return values;
}

/**
 * Slice part of a file/blob object, return a promise object which will resolve to an
 * ArrayBuffer to feed subsequent process.
 * The returned promise object is extended with an exec(proc, args...) method which can be
 * chained with further process.
 * @param file file or blob object
 * @param offset start position to slice
 * @param len length to slice
 * @return a promise object which will resolve to an ArrayBuffer containing the data read,
 *         or reject with the reader's error if the read fails
 */
function sliceThen(file, offset, len) {
  var p = new Promise(function (_resolve, _reject) {
    var reader = new FileReader();
    reader.onload = function () { _resolve(reader.result); };
    // Without this handler a failed read would leave the promise pending forever.
    reader.onerror = function () { _reject(reader.error); };
    reader.readAsArrayBuffer(file.slice(offset, offset + len));
  });

  /**
   * Call proc with the sliced file/blob data (ArrayBuffer) followed by the given arguments.
   * @param proc function to be executed
   * @param other optional arguments are passed to proc after the auto supplied ArrayBuffer
   * @return a promise resolving to an array: proc's result itself when it was built with
   *         spreadus(), otherwise a one-element array wrapping the result, so that it can
   *         be chained through a spread() method
   */
  p.exec = function (proc /*, args... */) {
    var extra = Array.prototype.slice.call(arguments, 1);
    return p.then(function (data) {
      var ret = proc.apply(null, [data].concat(extra));
      // `!= null` also covers a proc that returns null, which must be wrapped, not dereferenced
      return (ret != null && ret._spreadus_) ? ret : [ret];
    });
  };

  return p;
}

/**
 * Wrap value as a resolved promise.
 * @param value any value
 * @return the result of Promise.resolve(value)
 */
function resolve(value) {
  return Promise.resolve(value);
}

/**
 * Wrap value as a rejected promise.
 * @param reason the rejection reason
 * @return the result of Promise.reject(reason)
 */
function reject(reason) {
  return Promise.reject(reason);
}

/**
 * Harvest any fulfilled promises; if all failed then return the reasons instead.
 * Relies on Promise.settle(), i.e. a Bluebird-style Promise implementation.
 * @param outcomes array of promises
 * @return a promise resolving to the array of fulfilled values when there is at least one,
 *         otherwise to the array of rejection reasons; rejects with "** NOT FOUND **"
 *         when outcomes is empty
 */
function harvest(outcomes) {
  return Promise.settle(outcomes).then(function (results) {
    if (results.length === 0) {
      return Promise.reject("** NOT FOUND **");
    }

    var solved = [], failed = [];
    results.forEach(function (inspection) {
      // isFulfilled() (not isResolved()) so that rejected inspections are routed
      // to reason() instead of value().
      if (inspection.isFulfilled()) {
        solved.push(inspection.value());
      } else {
        failed.push(inspection.reason());
      }
    });
    return solved.length ? solved : failed;
  });
}

/**
 * Create a Record Block Table object to load record block info from record section in mdx/mdd file.
 * Retrieved data is stored in an Uint32Array which contains N pairs of (offset_comp, offset_decomp)
 * values, where N is the number of entries put into the table.
 *
 * When looking up a given key for its definition:
 *   1. Search KEY_INDEX to locate keyword block containing the given key.
 *   2. Scanning the found keyword block to get its record offset and size.
 *   3. Search RECORD_BLOCK_TABLE to get record block containing the record.
 *   4. Load the found record block, using its offset and size to retrieve record content.
 *
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section
 * @return an object with alloc(len), put(offset_comp, offset_decomp) and find(keyAt) methods
 */
function createRecordBlockTable() {
  var pos = 0,   // next free slot in arr
      arr;       // backing Uint32Array

  return {
    // Allocate storage for len (offset_comp, offset_decomp) pairs and restart filling from
    // the first slot, so that a table can be allocated more than once.
    alloc: function (len) {
      arr = new Uint32Array(len * 2);
      pos = 0;
    },

    // Store offset pair value (compressed & decompressed) for a record block.
    // NOTE: offset_comp is absolute offset counted from start of mdx/mdd file.
    put: function (offset_comp, offset_decomp) {
      arr[pos++] = offset_comp;
      arr[pos++] = offset_decomp;
    },

    // Given offset of a keyword after decompression, return a record block info containing it,
    // else undefined if not found (including when nothing has been allocated yet).
    // Sizes are computed as the difference to the following pair, so the last stored pair
    // only acts as an end marker.
    find: function (keyAt) {
      if (!arr) {
        return;
      }
      var lo = 0,
          hi = (arr.length >> 1) - 1,   // index of the last pair
          mid, base;

      if (keyAt < 0 || keyAt > arr[(hi << 1) + 1]) {
        return;
      }

      // Binary search on decompressed offsets, narrowing [lo, hi] to adjacent pairs.
      while (hi - lo > 1) {
        mid = (lo + hi) >> 1;
        if (keyAt < arr[(mid << 1) + 1]) {
          hi = mid;
        } else {
          lo = mid;
        }
      }

      if (lo >= hi) {
        return;                          // fewer than two pairs: no block can be described
      }

      base = lo << 1;
      return {
        block_no:      lo,
        comp_offset:   arr[base],
        comp_size:     arr[base + 2] - arr[base],
        decomp_offset: arr[base + 1],
        decomp_size:   arr[base + 3] - arr[base + 1]
      };
    }
  };
}

/**
 * Test if a value of dictionary attribute is true or not.
 * @param v attribute value; falsy values are treated as the text "false"
 * @return true when the value, as lower-cased text, is 'yes' or 'true'
 */
function isTrue(v) {
  var text = ((v || false) + '').toLowerCase();
  return text === 'yes' || text === 'true';
}

/**
|
||
|
* Parse a MDict dictionary/resource file (mdx/mdd).
|
||
|
* @param file a File/Blob object
|
||
|
* @param ext file extension, mdx/mdd
|
||
|
* @return a Promise object which will resolve to a lookup function.
|
||
|
*/
|
||
|
function parse_mdict(file, ext) {
|
||
|
|
||
|
var KEY_INDEX,                                        // keyword index array
    RECORD_BLOCK_TABLE = createRecordBlockTable();    // record block table

var attrs = {},         // storing dictionary attributes
    _v2,                // true if engine version >= 2 (set by config())
    _bpu,               // bytes per unit when converting text size to byte length for text data
    _tail,              // number of extra tail bytes to skip after decoding sized text
    _decoder,           // text decoder for the dictionary encoding

    // [keyword_header_decryptor, keyword_index_decryptor], only keyword_index_decryptor is supported
    _decryptors = [false, false],

    _searchTextLen,     // function searching the terminating NUL to get text length

    // Read a "short" number representing keyword text size, 8-bit for version < 2, 16-bit for version >= 2
    _readShort = function (scanner) { return scanner.readUint8(); },

    // Read a number representing offset or data block size: one 32-bit value by default;
    // config() replaces it for version >= 2 with a reader that skips 4 bytes first.
    _readNum = function (scanner) { return scanner.readInt(); },

    // Version >= 2.0 only checksum; a no-op until config() replaces it
    _checksum_v2 = function () {},

    // Adapt key by converting to lower case or stripping punctuations according to dictionary
    // attributes (KeyCaseSensitive, StripKey); identity until config() replaces it
    _adaptKey = function (key) { return key; },

    // sliceThen() bound with the file argument
    _slice = sliceThen.bind(null, file);

/**
 * Config scanner according to dictionary attributes.
 * Reads attrs (Encoding, GeneratedByEngineVersion, Encrypted, KeyCaseSensitive, StripKey) and
 * sets _searchTextLen, _decoder, _bpu, _v2, _tail, _readNum, _readShort, _checksum_v2,
 * _decryptors[1] and _adaptKey accordingly.
 */
function config() {
  attrs.Encoding = attrs.Encoding || 'UTF-16';

  // Accept any spelling of the UTF-16 label ('UTF-16', 'utf-16', 'UTF-16LE', ...), since all
  // of them need 2-byte units and a 16-bit NUL search.
  var isUtf16 = /^utf-?16(le)?$/i.test(attrs.Encoding);

  _searchTextLen = isUtf16
    ? function (dv, offset) {
        var start = offset;
        while (dv.getUint16(offset)) { offset += _bpu; }   // scan for \u0000
        return offset - start;
      }
    : function (dv, offset) {
        var start = offset;
        while (dv.getUint8(offset)) { offset++; }          // scan for NUL
        return offset - start;
      };

  _decoder = new TextDecoder(attrs.Encoding);

  _bpu = isUtf16 ? 2 : 1;

  if (parseInt(attrs.GeneratedByEngineVersion, 10) >= 2.0) {
    _v2 = true;
    _tail = _bpu;

    // HUGE dictionary file (>4G) is not supported, take only lower 32-bit
    _readNum = function (scanner) {
      scanner.forward(4);
      return scanner.readInt();
    };
    _readShort = function (scanner) { return scanner.readUint16(); };
    _checksum_v2 = function (scanner) { return scanner.checksum(); };
  } else {
    _tail = 0;
  }

  // keyword index encrypted?
  if (attrs.Encrypted & 0x02) {
    _decryptors[1] = decrypt;
  }

  var regexp = MCommon.REGEXP_STRIPKEY[ext];
  if (isTrue(attrs.KeyCaseSensitive)) {
    _adaptKey = isTrue(attrs.StripKey)
      ? function (key) { return key.replace(regexp, '$1'); }
      : function (key) { return key; };
  } else {
    // When StripKey is absent it is treated as 'yes' for version < 2 and as unset otherwise.
    _adaptKey = isTrue(attrs.StripKey || (_v2 ? '' : 'yes'))
      ? function (key) { return key.toLowerCase().replace(regexp, '$1'); }
      : function (key) { return key.toLowerCase(); };
  }
}

/**
 * Create a scanner reading data at a moving offset from a target ArrayBuffer.
 * @param buf ArrayBuffer holding the data
 * @param len optional logical size in bytes reported by size(); defaults to buf.byteLength
 * @return a scanner object; every read method returns the value read and moves the offset forward
 */
function Scanner(buf, len) {
  var offset = 0,
      dv = new DataView(buf);

  var methods = {
    // target data size in bytes
    size: function () { return len || buf.byteLength; },
    // update offset to new position, returning the new offset
    forward: function (len) { return offset += len; },
    // return current offset
    offset: function () { return offset; },

    // MDict file format uses big endian to store number

    // 32-bit unsigned int
    readInt: function () {
      var value = dv.getUint32(offset, false);
      this.forward(4);
      return value;
    },
    // 16-bit unsigned int
    readUint16: function () {
      var value = dv.getUint16(offset, false);
      this.forward(2);
      return value;
    },
    // 8-bit unsigned int
    readUint8: function () {
      var value = dv.getUint8(offset);
      this.forward(1);
      return value;
    },

    // Read a "short" number representing keyword text size, 8-bit for version < 2, 16-bit for version >= 2
    readShort: function () { return _readShort(this); },
    // Read a number representing offset or data block size, as configured through _readNum
    readNum: function () { return _readNum(this); },

    // Decode len bytes at the current offset as UTF-16LE text.
    readUTF16: function (len) {
      var text = UTF_16LE.decode(new Uint8Array(buf, offset, len));
      this.forward(len);
      return text;
    },

    // Read data to an Uint8Array and decode it to text with specified encoding.
    // Text length in bytes is determined by searching the terminating NUL, which is then
    // skipped as well (_bpu bytes).
    readText: function () {
      var len = _searchTextLen(dv, offset),
          text = _decoder.decode(new Uint8Array(buf, offset, len));
      this.forward(len + _bpu);
      return text;
    },

    // Read data to an Uint8Array and decode it to text with specified encoding.
    // @param len length in basic unit, multiplied by bytes per unit to get length in bytes
    // NOTE: After decoding the text, extra "tail" bytes (_tail) are skipped.
    readTextSized: function (len) {
      len *= _bpu;
      var text = _decoder.decode(new Uint8Array(buf, offset, len));
      this.forward(len + _tail);
      return text;
    },

    // Skip checksum, just ignore it anyway.
    checksum: function () { this.forward(4); },
    // Version >= 2.0 only
    checksum_v2: function () { return _checksum_v2(this); },

    // Read data block of keyword index, key block or record content.
    // These data block are maybe in compressed (gzip or lzo) format, while keyword index maybe be encrypted.
    // @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#compression (with typo mistake)
    // @param len size in bytes of the stored block, including its 8-byte head
    // @param expectedBufSize decompressed size, handed to the lzo decompressor
    // @param decryptor optional function(data, passkey) applied before decompression
    // @return this scanner for an uncompressed block, otherwise a new Scanner over the decompressed data
    readBlock: function (len, expectedBufSize, decryptor) {
      var comp_type = dv.getUint8(offset);   // compression type, 0 = none, 2 = gzip, anything else handled as lzo
      if (comp_type === 0) {
        if (_v2) {
          this.forward(8);   // for version >= 2, skip comp_type (4 bytes with trailing \x00) and checksum (4 bytes)
        }
        // NOTE(review): for version < 2 nothing is skipped here -- confirm against the format description
        return this;
      }

      // skip comp_type (4 bytes with trailing \x00) and checksum (4 bytes)
      offset += 8;
      len -= 8;
      var data = new Uint8Array(buf, offset, len);
      if (decryptor) {
        var passkey = new Uint8Array(8);
        passkey.set(new Uint8Array(buf, offset - 4, 4));   // key part 1: checksum
        passkey.set([0x95, 0x36, 0x00, 0x00], 4);          // key part 2: fixed data
        data = decryptor(data, passkey);
      }

      data = comp_type === 2 ? pako.inflate(data) : lzo.decompress(data, expectedBufSize, 1308672);
      this.forward(len);
      // assumes the decompressed view starts at byte 0 of its own buffer -- TODO confirm
      return Scanner(data.buffer, data.length);
    },

    // Read raw data as Uint8Array from current offset with specified length in bytes;
    // without a length, everything up to the end of the buffer is taken.
    readRaw: function (len) {
      // An ArrayBuffer has byteLength (not length); using it keeps the offset a number.
      var consumed = len === void 0 ? buf.byteLength - offset : len,
          raw = new Uint8Array(buf, offset, len);
      this.forward(consumed);
      return raw;
    }
  };

  return Object.create(methods);
}

/**
 * Read the first 4 bytes of mdx/mdd file to get length of header_str.
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#file-structure
 * @param input sliced file (start = 0, length = 4)
 * @return length of header_str
 */
function read_file_head(input) {
  var scanner = Scanner(input);
  return scanner.readInt();
}

/**
 * Read header section, parse dictionary attributes and config scanner according to engine version attribute.
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#header-section
 * @param input sliced file (start = 4, length = len + 48), header string + header section (max length 48)
 * @param len length of header_str
 * @return [remained length of header section (header_str and checksum, = len + 4), original input]
 * @throws Error when the header XML has no <Dictionary> or <Library_Data> element
 */
function read_header_sect(input, len) {
  var scanner = Scanner(input),
      header_str = scanner.readUTF16(len).replace(/\0$/, '');   // need to remove trailing NUL

  // parse dictionary attributes
  var root = parseXml(header_str).querySelector('Dictionary, Library_Data');
  if (!root) {
    throw new Error('Invalid MDict header: no <Dictionary> or <Library_Data> element found');
  }

  var xml = root.attributes;
  for (var i = 0; i < xml.length; i++) {
    var item = xml.item(i);
    attrs[item.nodeName] = item.nodeValue;
  }

  attrs.Encrypted = parseInt(attrs.Encrypted, 10) || 0;

  MCommon.log('dictionary attributes: ', attrs);
  config();
  return spreadus(len + 4, input);
}

/**
 * Read keyword summary at the beginning of keyword section.
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-section
 * @param input sliced file, same as input passed to read_header_sect()
 * @param offset start position of keyword section in sliced file, equals to length of header string plus checksum.
 * @return keyword_sect object
 */
function read_keyword_summary(input, offset) {
  var scanner = Scanner(input),
      summary = {};

  scanner.forward(offset);

  // The fields are stored back to back, so they must be read in exactly this order.
  summary.num_blocks = scanner.readNum();
  summary.num_entries = scanner.readNum();
  summary.key_index_decomp_len = _v2 && scanner.readNum();   // Ver >= 2.0 only
  summary.key_index_comp_len = scanner.readNum();
  summary.key_blocks_len = scanner.readNum();
  summary.chksum = scanner.checksum_v2();
  // extra field: actual length of keyword summary, varying with engine version attribute
  summary.len = scanner.offset() - offset;

  return summary;
}

/**
 * Read keyword index part of keyword section.
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-header-encryption
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-index
 * @param input sliced file, remained part of keyword section after keyword summary which can also
 *              be used to read following key blocks.
 * @param keyword_summary
 * @return [keyword_summary, array of keyword index]
 */
function read_keyword_index(input, keyword_summary) {
  var scanner = Scanner(input).readBlock(keyword_summary.key_index_comp_len,
                                         keyword_summary.key_index_decomp_len,
                                         _decryptors[1]),
      total = keyword_summary.num_blocks,
      keyword_index = Array(total),
      offset = 0;   // running sum of comp_size of the key blocks read so far

  for (var i = 0; i < total; i++) {
    // The fields are stored back to back, so they must be read in exactly this order.
    var num_entries = scanner.readNum(),
        first_size = scanner.readShort(),
        first_word = scanner.readTextSized(first_size),
        last_size = scanner.readShort(),
        last_word = scanner.readTextSized(last_size),
        comp_size = scanner.readNum(),
        decomp_size = scanner.readNum();

    keyword_index[i] = {
      num_entries: num_entries,
      first_word: first_word,
      last_word: last_word,
      comp_size: comp_size,
      decomp_size: decomp_size,
      // extra fields
      offset: offset,   // start of this key block, counted from the first key block
      index: i          // index of this key index, used to search previous/next block
    };
    offset += comp_size;
  }
  return spreadus(keyword_summary, keyword_index);
}

/**
 * Read through all keyword entries inside a keyword block, discarding them.
 * @param scanner scanner object to read key entries, which starts at beginning of target key block
 * @param kdx corresponding keyword index object
 * NOTE: no need to read keyword block anymore, for debug only.
 */
function read_key_block(scanner, kdx) {
  var block = scanner.readBlock(kdx.comp_size, kdx.decomp_size);
  for (var i = 0; i < kdx.num_entries; i++) {
    block.readNum();    // first field of the entry (value discarded)
    block.readText();   // keyword text (value discarded)
  }
}

/**
 * Delay to scan key table, for debug only.
 * @param slicedKeyBlock a promise object which will resolve to an ArrayBuffer containing keyword blocks
 *                       sliced from mdx/mdd file.
 * @param num_entries number of keyword entries (unused)
 * @param keyword_index array of keyword index
 * @param delay time to delay for scanning key table
 * @return the promise of the delayed scan, so that callers may observe completion or failure
 */
function willScanKeyTable(slicedKeyBlock, num_entries, keyword_index, delay) {
  return slicedKeyBlock.delay(delay).then(function (input) {
    MCommon.log('scan key table...');
    var scanner = Scanner(input);
    keyword_index.forEach(function (kdx) {
      read_key_block(scanner, kdx);
    });
    MCommon.log('KEY_TABLE loaded.');
  });
}

/**
 * Read record summary at the beginning of record section.
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section
 * @param input sliced file, start = beginning of record section, length = 32 (max length of record summary)
 * @param pos beginning of record section
 * @return record summary object
 */
function read_record_summary(input, pos) {
  var scanner = Scanner(input),
      record_summary = {};

  // The fields are stored back to back, so they must be read in exactly this order.
  record_summary.num_blocks = scanner.readNum();
  record_summary.num_entries = scanner.readNum();
  record_summary.index_len = scanner.readNum();
  record_summary.blocks_len = scanner.readNum();
  // extra field: actual length of record summary (excluding record block index),
  // varying with engine version attribute
  record_summary.len = scanner.offset();

  // start position of record block from head of mdx/mdd file
  record_summary.block_pos = pos + record_summary.index_len + record_summary.len;

  return record_summary;
}

/**
 * Read record block index part in record section, and fill RECORD_BLOCK_TABLE.
 * One (compressed offset, decompressed offset) pair is stored per record block, followed by a
 * final pair marking the end of the last block.
 * @see https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section
 * @param input sliced file, start = beginning of record block index, length = record_summary.index_len
 * @param record_summary record summary object
 */
function read_record_block(input, record_summary) {
  var scanner = Scanner(input),
      size = record_summary.num_blocks,
      p0 = record_summary.block_pos,   // running compressed offset, counted from head of file
      p1 = 0;                          // running decompressed offset

  RECORD_BLOCK_TABLE.alloc(size + 1);
  for (var i = 0; i < size; i++) {
    var comp_size = scanner.readNum(),
        decomp_size = scanner.readNum();
    RECORD_BLOCK_TABLE.put(p0, p1);
    p0 += comp_size;
    p1 += decomp_size;
  }
  RECORD_BLOCK_TABLE.put(p0, p1);
}

/**
 * Read definition in text for given keyinfo object.
 * @param input record block sliced from the file
 * @param block record block index
 * @param keyinfo a object with property of record's offset and optional size for the given keyword
 * @return definition in text
 */
function read_definition(input, block, keyinfo) {
  var scanner = Scanner(input).readBlock(block.comp_size, block.decomp_size);
  // keyinfo.offset counts from the first record; make it relative to this block
  scanner.forward(keyinfo.offset - block.decomp_offset);
  return scanner.readText();
}

/**
 * Following link to find actual definition of keyword.
 * @param definition maybe starts with "@@@LINK=" which links to another keyword
 * @param lookup search function, called with the text following "@@@LINK="
 * @return the definition itself when it is not a link, otherwise whatever lookup returns
 */
function followLink(definition, lookup) {
  var prefix = '@@@LINK=';
  if (definition.substring(0, prefix.length) === prefix) {
    return lookup(definition.substring(prefix.length));
  }
  return definition;
}

/**
 * Read content in ArrayBuffer for given keyinfo object.
 * @param input record block sliced from the file
 * @param block record block index
 * @param keyinfo a object with property of record's offset and optional size for the given keyword
 * @return an Uint8Array containing resource of image/audio/css/font etc.
 * @throws a string message when input is empty
 */
function read_object(input, block, keyinfo) {
  if (!(input.byteLength > 0)) {
    throw '* OUT OF FILE RANGE * ' + keyinfo + ' @offset=' + block.comp_offset;
  }
  var scanner = Scanner(input).readBlock(block.comp_size, block.decomp_size);
  // keyinfo.offset counts from the first record; make it relative to this block
  scanner.forward(keyinfo.offset - block.decomp_offset);
  return scanner.readRaw(keyinfo.size);
}

/**
 * Find word definition for given keyinfo object.
 * @param keyinfo a object with property of record's offset and optional size for the given keyword
 * @return a promise object which will resolve to definition in text. Link to other keyword is
 *         followed to get actual definition. The promise is rejected when no record block
 *         contains the record offset.
 */
function findWord(keyinfo) {
  var block = RECORD_BLOCK_TABLE.find(keyinfo.offset);
  if (!block) {
    // Reject instead of throwing on `block.comp_offset`, so that harvest() can collect the failure.
    return reject('* RECORD BLOCK NOT FOUND * ' + keyinfo + ' @offset=' + keyinfo.offset);
  }
  return _slice(block.comp_offset, block.comp_size)
    .exec(read_definition, block, keyinfo)
    .spread(function (definition) {
      return followLink(definition, LOOKUP.mdx);
    });
}

/**
 * Find resource (image, sound etc.) for given keyinfo object.
 * @param keyinfo a object with property of record's offset and optional size for the given keyword
 * @return a promise object which will resolve to the raw bytes of image/audio/css/font etc.
 *         The promise is rejected when no record block contains the record offset.
 * TODO: Follow link, maybe it's too expensive and a rarely used feature?
 */
function findResource(keyinfo) {
  var block = RECORD_BLOCK_TABLE.find(keyinfo.offset);
  if (!block) {
    // Reject instead of throwing on `block.comp_offset`.
    return reject('* RECORD BLOCK NOT FOUND * ' + keyinfo + ' @offset=' + keyinfo.offset);
  }
  return _slice(block.comp_offset, block.comp_size)
    .exec(read_object, block, keyinfo)
    .spread(function (blob) {
      return blob;
    });
}

//------------------------------------------------------------------------------------------------
// Implementation for look-up
//------------------------------------------------------------------------------------------------
var slicedKeyBlock,        // promise resolving to the data holding the key blocks (see loadKeys)
    _cached_keys,          // cache latest keys
    _trail,                // store latest visited record block & position when search for candidate keys
    mutual_ticket = 0;     // a oneway increased ticket used to cancel unfinished pattern match

/**
 * Reduce the key index array to an element which contains or is the nearest one matching a given phrase.
 * Binary search on the adapted last_word of each element, done with indices instead of
 * copying sub-arrays.
 * @param arr array of keyword index objects (each with a last_word)
 * @param phrase adapted phrase to search for
 * @return the selected element, or undefined when arr is empty
 */
function reduce(arr, phrase) {
  var lo = 0,
      hi = arr.length,   // exclusive upper bound of the candidate range
      half;

  while (hi - lo > 1) {
    half = (hi - lo) >> 1;
    if (phrase > _adaptKey(arr[lo + half - 1].last_word)) {
      lo += half;          // phrase lies beyond the lower half
    } else {
      hi = lo + half;
    }
  }
  return arr[lo];
}

/**
 * Reduce the array to index of an element which contains or is the nearest one matching a given phrase.
 * Binary search on the adapted elements, done with indices instead of copying sub-arrays.
 * @param arr array of keys; an optional numeric `pos` property on it is added to the result
 * @param phrase adapted phrase to search for
 * @return index of the selected element, plus one when phrase sorts after that element
 */
function shrink(arr, phrase) {
  var base = arr.pos || 0,
      lo = 0,
      hi = arr.length,   // exclusive upper bound of the candidate range
      half;

  while (hi - lo > 1) {
    half = (hi - lo) >> 1;
    if (phrase < _adaptKey(arr[lo + half])) {
      hi = lo + half;
    } else {
      lo += half;
    }
  }
  return base + lo + (phrase <= _adaptKey(arr[lo]) ? 0 : 1);
}

/**
 * Load keys for a keyword index object from mdx/mdd file.
 * The most recently loaded list is cached and reused when the same block is requested again.
 * @param kdx keyword index object
 * @return a promise resolving to an array of String objects (the keywords), each carrying an
 *         `offset` property and, except for the last one, a `size` property
 */
function loadKeys(kdx) {
  // Match on the block index as well as on first_word, since two blocks may share a first word.
  if (_cached_keys && _cached_keys.index === kdx.index && _cached_keys.pilot === kdx.first_word) {
    return resolve(_cached_keys.list);
  }

  return slicedKeyBlock.then(function (input) {
    var scanner = Scanner(input),
        list = Array(kdx.num_entries),
        prev = null;

    scanner.forward(kdx.offset);
    scanner = scanner.readBlock(kdx.comp_size, kdx.decomp_size);

    for (var i = 0; i < kdx.num_entries; i++) {
      var offset = scanner.readNum();
      // A String object (not a primitive) so that properties can be attached to the keyword.
      var entry = new Object(scanner.readText());
      entry.offset = offset;
      if (prev) {
        prev.size = offset - prev.offset;   // size is the distance to the next entry's offset
      }
      list[i] = entry;
      prev = entry;
    }

    _cached_keys = {list: list, pilot: kdx.first_word, index: kdx.index};
    return list;
  });
}

/**
 * Search for the first keyword match given phrase.
 * @param phrase phrase to look for; it is passed through _adaptKey() first
 * @return a promise resolving to [keyword index object, position in its key list, the key list]
 */
function seekVanguard(phrase) {
  phrase = _adaptKey(phrase);
  var kdx = reduce(KEY_INDEX, phrase),
      lastKey = _adaptKey(kdx.last_word);

  // look back for the first record block containing keyword for the specified phrase
  if (phrase <= lastKey) {
    var index = kdx.index - 1, prev;
    // Every block stepped onto has the same adapted last_word, so lastKey stays valid.
    while ((prev = KEY_INDEX[index]) && _adaptKey(prev.last_word) === lastKey) {
      kdx = prev;
      index--;
    }
  }

  return loadKeys(kdx).then(function (list) {
    var idx = shrink(list, phrase),
        wanted = _adaptKey(phrase);

    // look back for the first matched keyword position
    while (idx > 0 && _adaptKey(list[idx - 1]) === wanted) {
      idx--;
    }
    return [kdx, Math.min(idx, list.length - 1), list];
  });
}

// TODO: have to restrict max count to improve response
/**
 * Append more to word list according to a filter or expected size.
 * Recursively loads the following key blocks, updating _trail along the way.
 * @param word leading text of the search phrase (used only together with filter)
 * @param list keys collected so far; more keys are pushed onto it
 * @param nextKdx keyword index object of the next block to load, or undefined when there is none
 * @param expectedSize number of keys wanted
 * @param filter optional function(key, index) selecting keys, called with _trail as `this`
 * @param ticket value of mutual_ticket when the search started
 * @return a promise resolving to the list of keys
 * @throws 'force terminated' when a newer search has changed mutual_ticket
 */
function appendMore(word, list, nextKdx, expectedSize, filter, ticket) {
  if (ticket !== mutual_ticket) {
    throw 'force terminated';
  }

  if (filter) {
    // NOTE(review): first_word is compared as stored, without _adaptKey() -- confirm this is intended
    var worthLoading = _trail.count < expectedSize
                    && nextKdx
                    && nextKdx.first_word.slice(0, word.length) === word;
    if (worthLoading) {
      return loadKeys(nextKdx).delay(30).then(function (more) {
        MCommon.log(nextKdx);
        _trail.offset = 0;
        _trail.block = nextKdx.index;
        Array.prototype.push.apply(list, more.filter(filter, _trail));
        return appendMore(word, list, KEY_INDEX[nextKdx.index + 1], expectedSize, filter, ticket);
      });
    }

    if (list.length === 0) {
      _trail.exhausted = true;
    }
    return resolve(list);
  }

  var shortage = expectedSize - list.length;
  if (shortage > 0 && nextKdx) {
    MCommon.log('go next', nextKdx);
    _trail.block = nextKdx.index;
    return loadKeys(nextKdx).then(function (more) {
      _trail.offset = 0;
      _trail.pos = Math.min(shortage, more.length);
      Array.prototype.push.apply(list, more.slice(0, shortage));
      MCommon.log('$$ ' + more[shortage - 1], shortage);
      return appendMore(word, list, KEY_INDEX[nextKdx.index + 1], expectedSize, filter, ticket);
    });
  }

  if (_trail.pos > expectedSize) {
    _trail.pos = expectedSize;
  }
  list = list.slice(0, expectedSize);
  _trail.count = list.length;
  _trail.total += _trail.count;
  return resolve(list);
}

/**
 * Resume from the position recorded in _trail.
 * @return a promise resolving to [keyword index object, position in its key list, the key list],
 *         where the position is _trail.offset + _trail.pos limited to the last list index
 */
function followUp() {
  var kdx = KEY_INDEX[_trail.block];
  return loadKeys(kdx).then(function (list) {
    var resumeAt = Math.min(_trail.offset + _trail.pos, list.length - 1);
    return [kdx, resumeAt, list];
  });
}

/**
 * Collect keys matching a phrase, which may contain the wildcards '*' (any text) and '?' (one character).
 * Progress is recorded in _trail so that a later call with follow = true can continue.
 * @param phrase search phrase; a trailing space allows matched keys to contain spaces
 * @param expectedSize number of keys wanted, at least 10
 * @param follow true to continue the previous search for the same phrase
 * @return a promise resolving to an array of keys
 */
function matchKeys(phrase, expectedSize, follow) {
  expectedSize = Math.max(expectedSize || 0, 10);
  var str = phrase.trim().toLowerCase(),
      m = /([^?*]+)[?*]+/.exec(str),
      word,
      filter;

  if (m) {
    word = m[1];
    // Escape every regular expression metacharacter except the two wildcards
    // ('{', '}' and '|' included), then translate the wildcards.
    // NOTE(review): the pattern is lower case while keys are tested as stored -- confirm
    var pattern = str.replace(/([.\\+\[\^\]$(){}|])/g, '\\$1')
                     .replace(/\*+/g, '.*')
                     .replace(/\?/g, '.'),
        wildcard = new RegExp('^' + pattern + '$'),
        tester = phrase[phrase.length - 1] === ' '
          ? function (s) { return wildcard.test(s); }
          : function (s) { return wildcard.test(s) && !/ /.test(s); };

    filter = function (s, i) {
      if (_trail.count < expectedSize && tester(s)) {
        _trail.count++;
        _trail.total++;
        _trail.pos = i + 1;
        return true;
      }
      return false;
    };
  } else {
    word = phrase.trim();
  }

  // a different phrase cannot continue the previous search
  if (_trail && _trail.phrase !== phrase) {
    follow = false;
  }

  if (follow && _trail && _trail.exhausted) {
    return resolve([]);
  }

  var startFrom = follow && _trail ? followUp() : seekVanguard(word);

  return startFrom.spread(function (kdx, idx, list) {
    MCommon.log('start ', kdx);
    list = list.slice(idx);
    // computed before _trail is replaced, as it reads the previous trail
    var total = follow ? _trail && _trail.total || 0 : 0;
    _trail = {
      phrase: phrase,
      block: kdx.index,
      offset: idx,
      pos: list.length,
      count: 0,
      total: total
    };
    if (filter) {
      list = list.filter(filter, _trail);
    }
    return appendMore(word, list, KEY_INDEX[kdx.index + 1], expectedSize, filter, ++mutual_ticket)
      .then(function (result) {
        var atLastBlock = _trail.block === KEY_INDEX.length - 1;
        if (atLastBlock && _trail.offset + _trail.pos >= KEY_INDEX[_trail.block].num_entries) {
          _trail.exhausted = true;
          MCommon.log('EXHAUSTED!!!!');
        }
        MCommon.log('trail: ', _trail);
        return result;
      });
  });
}

/**
 * Find the first element in list whose `offset` property strictly equals the given offset.
 * @param list   array of elements carrying an `offset` property
 * @param offset the offset value to look for
 * @return a new one-element array holding the first matching element,
 *         or an empty array if no element matches
 */
function matchOffset(list, offset) {
  for (var i = 0, len = list.length; i < len; i++) {
    if (list[i].offset === offset) {
      return [list[i]];
    }
  }
  return [];
}
|
||
|
|
||
|
// Lookup functions, selected by file extension (mdx / mdd) once parsing is done.
var LOOKUP = {
  /**
   * Look up dictionary entries.
   * @param query either
   *          a String (primitive or String object, optionally carrying an `offset` property):
   *            entries are sought by the trimmed, lower-cased word; when `offset` is present
   *            only the first keyword with that offset is used, otherwise every keyword
   *            equal to the word ignoring case; or
   *          a {phrase: .., max: .., follow: true} object, which is forwarded to matchKeys(..)
   * @return for a String, the result of harvest(..) applied to the matched keywords mapped
   *         through findWord; otherwise the result of matchKeys(..)
   */
  mdx: function(query) {
    if (typeof query === 'string' || query instanceof String) {
      // A direct lookup discards any follow-up state left by matchKeys(..).
      _trail = null;
      var word = query.trim().toLowerCase(), offset = query.offset;

      return seekVanguard(word).spread(function(kdx, idx, list) {
        list = list.slice(idx);
        if (offset !== UNDEFINED) {
          list = matchOffset(list, offset);
        } else {
          list = list.filter(function(el) { return el.toLowerCase() === word; });
        }
        return harvest(list.map(findWord));
      });
    } else {
      return matchKeys(query.phrase, query.max, query.follow);
    }
  },

  // TODO: chain multiple mdd files
  /**
   * Look up a resource by its path.
   * The path is trimmed and lower-cased, one leading slash or backslash and one trailing
   * slash are dropped, a single backslash is prepended and every remaining `/` becomes `\`.
   * @param phrase resource path, e.g. '/img/pic.png'
   * @return the result of findResource(..) for the first keyword equal to the normalized
   *         path ignoring case; rejects with the string '*RESOURCE NOT FOUND* ' + phrase
   *         when no keyword matches
   */
  mdd: function(phrase) {
    var word = phrase.trim().toLowerCase();
    // The global flag is required so that BOTH the leading and the trailing separator are
    // stripped; without it only the first match is replaced.
    word = '\\' + word.replace(/(^[/\\])|([/]$)/g, '');
    word = word.replace(/\//g, '\\');
    return seekVanguard(word).spread(function(kdx, idx, list) {
      return list.slice(idx).filter(function(one) {
        return one.toLowerCase() === word;
      });
    }).then(function(candidates) {
      if (candidates.length === 0) {
        // NOTE(review): a plain string is thrown on purpose here to keep the rejection
        // value unchanged for existing callers.
        throw '*RESOURCE NOT FOUND* ' + phrase;
      } else {
        return findResource(candidates[0]);
      }
    });
  }
};
|
||
|
|
||
|
// ------------------------------------------
|
||
|
// start to load mdx/mdd file
|
||
|
// ------------------------------------------
|
||
|
MCommon.log('start to load ' + file.name);
|
||
|
|
||
|
var pos = 0;
|
||
|
|
||
|
// read first 4 bytes to get header length
|
||
|
return _slice(pos, 4).exec(read_file_head).spread(function(len) {
|
||
|
pos += 4; // start of header string in header section
|
||
|
return _slice(pos, len + 48)
|
||
|
.exec(read_header_sect, len);
|
||
|
|
||
|
}).spread(function(header_remain_len, input) {
|
||
|
pos += header_remain_len; // start of keyword section
|
||
|
return read_keyword_summary(input, header_remain_len);
|
||
|
|
||
|
}).then(function(keyword_summary) { MCommon.log(keyword_summary);
|
||
|
pos += keyword_summary.len; // start of key index in keyword section
|
||
|
return _slice(pos, keyword_summary.key_index_comp_len)
|
||
|
.exec(read_keyword_index, keyword_summary);
|
||
|
|
||
|
}).spread(function (keyword_summary, keyword_index) {
|
||
|
pos += keyword_summary.key_index_comp_len; // start of keyword block in keyword section
|
||
|
slicedKeyBlock = _slice(pos, keyword_summary.key_blocks_len);
|
||
|
|
||
|
/*
|
||
|
// Now it's fast enough to look up a word without the key table: keywords are scanned from the specified key blocks in an efficient way.
|
||
|
// No need to scan the whole key table in advance.
|
||
|
willScanKeyTable(slicedKeyBlock, keyword_summary.num_entries, keyword_index, 00);
|
||
|
// */
|
||
|
|
||
|
pos += keyword_summary.key_blocks_len; // start of record section
|
||
|
|
||
|
KEY_INDEX = keyword_index;
|
||
|
|
||
|
}).then(function () {
|
||
|
return _slice(pos, 32)
|
||
|
.exec(read_record_summary, pos);
|
||
|
|
||
|
}).spread(function (record_summary) { MCommon.log(record_summary);
|
||
|
pos += record_summary.len; // start of record blocks in record section
|
||
|
return _slice(pos, record_summary.index_len)
|
||
|
.exec(read_record_block, record_summary);
|
||
|
|
||
|
}).spread(function() { MCommon.log('-- parse done --', file.name);
|
||
|
// resolve and return lookup() function according to file extension (mdx/mdd)
|
||
|
LOOKUP[ext].description = attrs.Description;
|
||
|
return resolve(LOOKUP[ext]);
|
||
|
});
|
||
|
};
|
||
|
|
||
|
// -------------------------
|
||
|
// END OF parse_mdict()
|
||
|
// -------------------------
|
||
|
|
||
|
/**
|
||
|
* Load a set of files which will be parsed as MDict dictionary & resource (mdx/mdd).
|
||
|
*/
|
||
|
return function load(files) {
|
||
|
var resources = [];
|
||
|
Array.prototype.forEach.call(files, function(f) {
|
||
|
var ext = MCommon.getExtension(f.name, 'mdx');
|
||
|
|
||
|
resources.push(resources[ext] = parse_mdict(f, ext));
|
||
|
});
|
||
|
|
||
|
return Promise.all(resources)
|
||
|
.then(function() { return resolve(resources); });
|
||
|
};
|
||
|
|
||
|
}());
|