diff --git a/src/emitter.js b/src/emitter.js index 8cc9e1c..8378fe2 100644 --- a/src/emitter.js +++ b/src/emitter.js @@ -119,6 +119,33 @@ PEG.compiler.emitter = function(ast) { " var rightmostMatchFailuresExpected = [];", " var cache = {};", " ", + /* This needs to be in sync with |padLeft| in utils.js. */ + " function padLeft(input, padding, length) {", + " var result = input;", + " ", + " var padLength = length - input.length;", + " for (var i = 0; i < padLength; i++) {", + " result = padding + result;", + " }", + " ", + " return result;", + " }", + " ", + /* This needs to be in sync with |escape| in utils.js. */ + " function escape(ch) {", + " var charCode = ch.charCodeAt(0);", + " ", + " if (charCode <= 0xFF) {", + " var escapeChar = 'x';", + " var length = 2;", + " } else {", + " var escapeChar = 'u';", + " var length = 4;", + " }", + " ", + " return '\\\\' + escapeChar + padLeft(charCode.toString(16).toUpperCase(), '0', length);", + " }", + " ", /* This needs to be in sync with |quote| in utils.js. 
*/ " function quote(s) {", " /*", @@ -128,12 +155,11 @@ PEG.compiler.emitter = function(ast) { " * Any character may appear in the form of an escape sequence.", " */", " return '\"' + s", - " .replace(/\\\\/g, '\\\\\\\\') // backslash", - " .replace(/\"/g, '\\\\\"') // closing quote character", - " .replace(/\\r/g, '\\\\r') // carriage return", - " .replace(/\\u2028/g, '\\\\u2028') // line separator", - " .replace(/\\u2029/g, '\\\\u2029') // paragraph separator", - " .replace(/\\n/g, '\\\\n') // line feed", + " .replace(/\\\\/g, '\\\\\\\\') // backslash", + " .replace(/\"/g, '\\\\\"') // closing quote character", + " .replace(/\\r/g, '\\\\r') // carriage return", + " .replace(/\\n/g, '\\\\n') // line feed", + " .replace(/[\\x80-\\uFFFF]/g, escape) // non-ASCII characters", " + '\"';", " }", " ", diff --git a/src/parser.js b/src/parser.js index 94002a1..8d8eb3b 100644 --- a/src/parser.js +++ b/src/parser.js @@ -15,6 +15,31 @@ PEG.parser = (function(){ var rightmostMatchFailuresExpected = []; var cache = {}; + function padLeft(input, padding, length) { + var result = input; + + var padLength = length - input.length; + for (var i = 0; i < padLength; i++) { + result = padding + result; + } + + return result; + } + + function escape(ch) { + var charCode = ch.charCodeAt(0); + + if (charCode <= 0xFF) { + var escapeChar = 'x'; + var length = 2; + } else { + var escapeChar = 'u'; + var length = 4; + } + + return '\\' + escapeChar + padLeft(charCode.toString(16).toUpperCase(), '0', length); + } + function quote(s) { /* * ECMA-262, 5th ed., 7.8.4: All characters may appear literally in a @@ -23,12 +48,11 @@ PEG.parser = (function(){ * Any character may appear in the form of an escape sequence. 
*/ return '"' + s - .replace(/\\/g, '\\\\') // backslash - .replace(/"/g, '\\"') // closing quote character - .replace(/\r/g, '\\r') // carriage return - .replace(/\u2028/g, '\\u2028') // line separator - .replace(/\u2029/g, '\\u2029') // paragraph separator - .replace(/\n/g, '\\n') // line feed + .replace(/\\/g, '\\\\') // backslash + .replace(/"/g, '\\"') // closing quote character + .replace(/\r/g, '\\r') // carriage return + .replace(/\n/g, '\\n') // line feed + .replace(/[\x80-\uFFFF]/g, escape) // non-ASCII characters + '"'; } @@ -3404,13 +3428,13 @@ PEG.parser = (function(){ var savedReportMatchFailures = reportMatchFailures; reportMatchFailures = false; - if (input.substr(pos).match(/^[   ᠎ -    ]/) !== null) { + if (input.substr(pos).match(/^[ \xA0\uFEFF\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]/) !== null) { var result0 = input.charAt(pos); pos++; } else { var result0 = null; if (reportMatchFailures) { - matchFailed("[   ᠎ -    ]"); + matchFailed("[ \\xA0\\uFEFF\\u1680\\u180E\\u2000-\\u200A\\u202F\\u205F\\u3000]"); } } reportMatchFailures = savedReportMatchFailures; diff --git a/src/utils.js b/src/utils.js index 14f1018..b705142 100644 --- a/src/utils.js +++ b/src/utils.js @@ -34,6 +34,44 @@ function map(array, callback) { return result; } +/* + * Returns a string padded on the left to a desired length with a character. + * + * The code needs to be in sync with the code template in the compilation + * function for "action" nodes. + */ +function padLeft(input, padding, length) { + var result = input; + + var padLength = length - input.length; + for (var i = 0; i < padLength; i++) { + result = padding + result; + } + + return result; +} + +/* + * Returns an escape sequence for given character. Uses \x for characters <= + * 0xFF to save space, \u for the rest. + * + * The code needs to be in sync with the code template in the compilation + * function for "action" nodes. 
+ */ +function escape(ch) { + var charCode = ch.charCodeAt(0); + + if (charCode <= 0xFF) { + var escapeChar = 'x'; + var length = 2; + } else { + var escapeChar = 'u'; + var length = 4; + } + + return '\\' + escapeChar + padLeft(charCode.toString(16).toUpperCase(), '0', length); +} + /* * Surrounds the string with quotes and escapes characters inside so that the * result is a valid JavaScript string. @@ -47,14 +85,15 @@ function quote(s) { * literal except for the closing quote character, backslash, carriage return, * line separator, paragraph separator, and line feed. Any character may * appear in the form of an escape sequence. + * + * For portability, we also escape all non-ASCII characters. */ return '"' + s - .replace(/\\/g, '\\\\') // backslash - .replace(/"/g, '\\"') // closing quote character - .replace(/\r/g, '\\r') // carriage return - .replace(/\u2028/g, '\\u2028') // line separator - .replace(/\u2029/g, '\\u2029') // paragraph separator - .replace(/\n/g, '\\n') // line feed + .replace(/\\/g, '\\\\') // backslash + .replace(/"/g, '\\"') // closing quote character + .replace(/\r/g, '\\r') // carriage return + .replace(/\n/g, '\\n') // line feed + .replace(/[\x80-\uFFFF]/g, escape) // non-ASCII characters + '"'; }; @@ -63,17 +102,20 @@ function quote(s) { * characters in a character class of a regular expression. */ function quoteForRegexpClass(s) { - /* Based on ECMA-262, 5th ed., 7.8.5 & 15.10.1. */ + /* + * Based on ECMA-262, 5th ed., 7.8.5 & 15.10.1. + * + * For portability, we also escape all non-ASCII characters. 
+ */ return s - .replace(/\\/g, '\\\\') // backslash - .replace(/\0/g, '\\0') // null, IE needs this - .replace(/\//g, '\\/') // closing slash - .replace(/]/g, '\\]') // closing bracket - .replace(/-/g, '\\-') // dash - .replace(/\r/g, '\\r') // carriage return - .replace(/\u2028/g, '\\u2028') // line separator - .replace(/\u2029/g, '\\u2029') // paragraph separator - .replace(/\n/g, '\\n') // line feed + .replace(/\\/g, '\\\\') // backslash + .replace(/\0/g, '\\0') // null, IE needs this + .replace(/\//g, '\\/') // closing slash + .replace(/]/g, '\\]') // closing bracket + .replace(/-/g, '\\-') // dash + .replace(/\r/g, '\\r') // carriage return + .replace(/\n/g, '\\n') // line feed + .replace(/[\x80-\uFFFF]/g, escape); // non-ASCII characters } /* diff --git a/test/parser-test.js b/test/parser-test.js index 2a39ba4..3754eac 100644 --- a/test/parser-test.js +++ b/test/parser-test.js @@ -435,7 +435,7 @@ test("parses bracketDelimitedCharacter", function() { parserParses("start = [\\n]", classGrammar(false, ["\n"], "[\\n]")); parserParses("start = [\\0]", classGrammar(false, ["\0"], "[\\0]")); parserParses("start = [\\x00]", classGrammar(false, ["\0"], "[\\0]")); - parserParses("start = [\\u0120]", classGrammar(false, ["\u0120"], "[\u0120]")); + parserParses("start = [\\u0120]", classGrammar(false, ["\u0120"], "[\\u0120]")); parserParses("start = [\\\n]", classGrammar(false, ["\n"], "[\\n]")); });