From 5e146fce3846f3bf58562c4afe91207c11294b7e Mon Sep 17 00:00:00 2001 From: David Majda Date: Sat, 1 Dec 2012 15:46:14 +0100 Subject: [PATCH] Text nodes: Implement text nodes Implement a new syntax to extract matched strings from expressions. For example, instead of: identifier = first:[a-zA-Z_] rest:[a-zA-Z0-9_]* { return first + rest.join(""); } you can now just write: identifier = $([a-zA-Z_] [a-zA-Z0-9_]*) This is useful mostly for "lexical" rules at the bottom of many grammars. Note that structured match results are still built for the expressions prefixed by "$"; they are just ignored. I plan to optimize this later (sometime after the code generator rewrite). --- README.md | 5 + lib/compiler/passes/allocate-registers.js | 1 + lib/compiler/passes/generate-code.js | 8 ++ lib/compiler/passes/remove-proxy-rules.js | 1 + lib/compiler/passes/report-left-recursion.js | 1 + lib/compiler/passes/report-missing-rules.js | 1 + lib/parser.js | 103 ++++++++++++++---- .../passes/allocate-registers.spec.js | 21 ++++ .../passes/remove-proxy-rules.spec.js | 4 + .../passes/report-left-recursion.spec.js | 4 + .../passes/report-missing-rules.spec.js | 4 + spec/generated-parser.spec.js | 8 ++ spec/parser.spec.js | 4 + src/parser.pegjs | 9 +- 14 files changed, 154 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index ca4b6e7..a46b4ab 100644 --- a/README.md +++ b/README.md @@ -314,6 +314,11 @@ the `options` variable. Note that curly braces in the predicate code must be balanced. +#### $ *expression* + +Try to match the expression. If the match succeeds, return the matched string +instead of the match result. + #### *label* : *expression* Match the expression and remember its match result under given label. 
The label diff --git a/lib/compiler/passes/allocate-registers.js b/lib/compiler/passes/allocate-registers.js index b8a1ee3..7ec8af7 100644 --- a/lib/compiler/passes/allocate-registers.js +++ b/lib/compiler/passes/allocate-registers.js @@ -212,6 +212,7 @@ module.exports = function(ast) { computeExpressionScopedReuseResult(node); }, + text: computeExpressionScopedReuseResultSavePos, simple_and: computeExpressionScopedReuseResultSavePos, simple_not: computeExpressionScopedReuseResultSavePos, semantic_and: computeParams, diff --git a/lib/compiler/passes/generate-code.js b/lib/compiler/passes/generate-code.js index 79e5b97..5b6ec38 100644 --- a/lib/compiler/passes/generate-code.js +++ b/lib/compiler/passes/generate-code.js @@ -604,6 +604,13 @@ module.exports = function(ast, options) { "sequence.inner": [ '#{r(node.resultIndex)} = [#{map(pluck(node.elements, "resultIndex"), r).join(", ")}];' ], + text: [ + '#{r(node.posIndex)} = pos;', + '#block emit(node.expression)', + 'if (#{r(node.resultIndex)} !== null) {', + ' #{r(node.resultIndex)} = input.substring(pos, #{r(node.posIndex)});', + '}' + ], simple_and: [ '#{r(node.posIndex)} = pos;', 'reportFailures++;', @@ -813,6 +820,7 @@ module.exports = function(ast, options) { labeled: function(node) { return emit(node.expression); }, + text: emitSimple("text"), simple_and: emitSimple("simple_and"), simple_not: emitSimple("simple_not"), semantic_and: emitSimple("semantic_and"), diff --git a/lib/compiler/passes/remove-proxy-rules.js b/lib/compiler/passes/remove-proxy-rules.js index 6c2b4bd..b5da0d2 100644 --- a/lib/compiler/passes/remove-proxy-rules.js +++ b/lib/compiler/passes/remove-proxy-rules.js @@ -30,6 +30,7 @@ module.exports = function(ast) { choice: replaceInSubnodes("alternatives"), sequence: replaceInSubnodes("elements"), labeled: replaceInExpression, + text: replaceInExpression, simple_and: replaceInExpression, simple_not: replaceInExpression, semantic_and: nop, diff --git 
a/lib/compiler/passes/report-left-recursion.js b/lib/compiler/passes/report-left-recursion.js index 045a013..1c6a7d9 100644 --- a/lib/compiler/passes/report-left-recursion.js +++ b/lib/compiler/passes/report-left-recursion.js @@ -36,6 +36,7 @@ module.exports = function(ast) { }, labeled: checkExpression, + text: checkExpression, simple_and: checkExpression, simple_not: checkExpression, semantic_and: nop, diff --git a/lib/compiler/passes/report-missing-rules.js b/lib/compiler/passes/report-missing-rules.js index 9289f3a..f600862 100644 --- a/lib/compiler/passes/report-missing-rules.js +++ b/lib/compiler/passes/report-missing-rules.js @@ -18,6 +18,7 @@ module.exports = function(ast) { action: checkExpression, sequence: checkSubnodes("elements"), labeled: checkExpression, + text: checkExpression, simple_and: checkExpression, simple_not: checkExpression, semantic_and: nop, diff --git a/lib/parser.js b/lib/parser.js index e1a244e..fb2ca94 100644 --- a/lib/parser.js +++ b/lib/parser.js @@ -484,9 +484,9 @@ module.exports = (function(){ r1 = pos; r2 = pos; - r3 = parse_and(); + r3 = parse_dollar(); if (r3 !== null) { - r4 = parse_action(); + r4 = parse_suffixed(); if (r4 !== null) { r0 = [r3, r4]; } else { @@ -499,10 +499,10 @@ module.exports = (function(){ } if (r0 !== null) { reportedPos = r1; - r0 = (function(code) { + r0 = (function(expression) { return { - type: "semantic_and", - code: code + type: "text", + expression: expression }; })(r4); } @@ -514,7 +514,7 @@ module.exports = (function(){ r2 = pos; r3 = parse_and(); if (r3 !== null) { - r4 = parse_suffixed(); + r4 = parse_action(); if (r4 !== null) { r0 = [r3, r4]; } else { @@ -527,10 +527,10 @@ module.exports = (function(){ } if (r0 !== null) { reportedPos = r1; - r0 = (function(expression) { + r0 = (function(code) { return { - type: "simple_and", - expression: expression + type: "semantic_and", + code: code }; })(r4); } @@ -540,9 +540,9 @@ module.exports = (function(){ if (r0 === null) { r1 = pos; r2 = pos; - r3 
= parse_not(); + r3 = parse_and(); if (r3 !== null) { - r4 = parse_action(); + r4 = parse_suffixed(); if (r4 !== null) { r0 = [r3, r4]; } else { @@ -555,10 +555,10 @@ module.exports = (function(){ } if (r0 !== null) { reportedPos = r1; - r0 = (function(code) { + r0 = (function(expression) { return { - type: "semantic_not", - code: code + type: "simple_and", + expression: expression }; })(r4); } @@ -570,7 +570,7 @@ module.exports = (function(){ r2 = pos; r3 = parse_not(); if (r3 !== null) { - r4 = parse_suffixed(); + r4 = parse_action(); if (r4 !== null) { r0 = [r3, r4]; } else { @@ -583,10 +583,10 @@ module.exports = (function(){ } if (r0 !== null) { reportedPos = r1; - r0 = (function(expression) { + r0 = (function(code) { return { - type: "simple_not", - expression: expression + type: "semantic_not", + code: code }; })(r4); } @@ -594,7 +594,36 @@ module.exports = (function(){ pos = r1; } if (r0 === null) { - r0 = parse_suffixed(); + r1 = pos; + r2 = pos; + r3 = parse_not(); + if (r3 !== null) { + r4 = parse_suffixed(); + if (r4 !== null) { + r0 = [r3, r4]; + } else { + r0 = null; + pos = r2; + } + } else { + r0 = null; + pos = r2; + } + if (r0 !== null) { + reportedPos = r1; + r0 = (function(expression) { + return { + type: "simple_not", + expression: expression + }; + })(r4); + } + if (r0 === null) { + pos = r1; + } + if (r0 === null) { + r0 = parse_suffixed(); + } } } } @@ -1150,6 +1179,42 @@ module.exports = (function(){ return r0; } + function parse_dollar() { + var r0, r1, r2, r3, r4; + + r1 = pos; + r2 = pos; + if (input.charCodeAt(pos) === 36) { + r3 = "$"; + pos++; + } else { + r3 = null; + if (reportFailures === 0) { + matchFailed("\"$\""); + } + } + if (r3 !== null) { + r4 = parse___(); + if (r4 !== null) { + r0 = [r3, r4]; + } else { + r0 = null; + pos = r2; + } + } else { + r0 = null; + pos = r2; + } + if (r0 !== null) { + reportedPos = r1; + r0 = (function() { return "$"; })(); + } + if (r0 === null) { + pos = r1; + } + return r0; + } + function 
parse_question() { var r0, r1, r2, r3, r4; diff --git a/spec/compiler/passes/allocate-registers.spec.js b/spec/compiler/passes/allocate-registers.spec.js index 645ca42..3e31621 100644 --- a/spec/compiler/passes/allocate-registers.spec.js +++ b/spec/compiler/passes/allocate-registers.spec.js @@ -177,6 +177,27 @@ describe("compiler pass |allocateRegisters|", function() { }); }); + describe("for text", function() { + it("allocates a position register", function() { + expect(pass).toChangeAST('start = $"a"', savePosDetails); + }); + + it("reuses its own result register for the expression", function() { + expect(pass).toChangeAST('start = $"a"', reuseResultDetails); + }); + + it("creates a new scope", function() { + expect(pass).toChangeAST('start = $(a:"a") { }', scopedDetails); + }); + + it("unblocks registers blocked by its children", function() { + expect(pass).toChangeAST( + 'start = ($(a:"a") "b") ("c" "d")', + unblockedDetails + ); + }); + }); + describe("for simple and", function() { it("allocates a position register", function() { expect(pass).toChangeAST('start = &"a"', savePosDetails); diff --git a/spec/compiler/passes/remove-proxy-rules.spec.js b/spec/compiler/passes/remove-proxy-rules.spec.js index 4030239..63b4b0b 100644 --- a/spec/compiler/passes/remove-proxy-rules.spec.js +++ b/spec/compiler/passes/remove-proxy-rules.spec.js @@ -60,6 +60,10 @@ describe("compiler pass |removeProxyRules|", function() { expect(pass).toChangeAST(proxyGrammar('start = label:proxy'), simpleDetails); }); + it("removes proxy rule from a text", function() { + expect(pass).toChangeAST(proxyGrammar('start = $proxy'), simpleDetails); + }); + it("removes proxy rule from a simple and", function() { expect(pass).toChangeAST(proxyGrammar('start = &proxy'), simpleDetails); }); diff --git a/spec/compiler/passes/report-left-recursion.spec.js b/spec/compiler/passes/report-left-recursion.spec.js index 68f8e09..a18807f 100644 --- a/spec/compiler/passes/report-left-recursion.spec.js +++ 
b/spec/compiler/passes/report-left-recursion.spec.js @@ -63,6 +63,10 @@ describe("compiler pass |reportLeftRecursion|", function() { expect(pass).toReportLeftRecursionIn('start = label:start'); }); + it("reports left recursion inside a text", function() { + expect(pass).toReportLeftRecursionIn('start = $start'); + }); + it("reports left recursion inside a simple and", function() { expect(pass).toReportLeftRecursionIn('start = &start'); }); diff --git a/spec/compiler/passes/report-missing-rules.spec.js b/spec/compiler/passes/report-missing-rules.spec.js index 6acaa47..0d8f17b 100644 --- a/spec/compiler/passes/report-missing-rules.spec.js +++ b/spec/compiler/passes/report-missing-rules.spec.js @@ -64,6 +64,10 @@ describe("compiler pass |reportMissingRules|", function() { expect(pass).toReportMissingRuleIn('start = label:missing'); }); + it("reports missing rule referenced from a text", function() { + expect(pass).toReportMissingRuleIn('start = $missing'); + }); + it("reports missing rule referenced from a simple and", function() { expect(pass).toReportMissingRuleIn('start = &missing'); }); diff --git a/spec/generated-parser.spec.js b/spec/generated-parser.spec.js index 09be92e..4a038e2 100644 --- a/spec/generated-parser.spec.js +++ b/spec/generated-parser.spec.js @@ -353,6 +353,14 @@ describe("generated parser", function() { }); }); + describe("text matching", function() { + it("matches correctly", function() { + var parser = PEG.buildParser('start = $("a" "b" "c")', options); + + expect(parser).toParse("abc", "abc"); + }); + }); + describe("simple and matching", function() { it("matches correctly", function() { var parser = PEG.buildParser('start = &"a" "a"', options); diff --git a/spec/parser.spec.js b/spec/parser.spec.js index d8726a0..f07a69d 100644 --- a/spec/parser.spec.js +++ b/spec/parser.spec.js @@ -272,6 +272,10 @@ describe("PEG.js grammar parser", function() { /* Canonical prefixed is "!\"abcd\"". 
*/ it("parses prefixed", function() { + expect('start = $"abcd"?' ).toParseAs(oneRuleGrammar({ + type: "text", + expression: optionalLiteral + })); expect('start = &{ code }').toParseAs(oneRuleGrammar({ type: "semantic_and", code: " code " diff --git a/src/parser.pegjs b/src/parser.pegjs index a37475b..91bac72 100644 --- a/src/parser.pegjs +++ b/src/parser.pegjs @@ -88,7 +88,13 @@ labeled / prefixed prefixed - = and code:action { + = dollar expression:suffixed { + return { + type: "text", + expression: expression + }; + } + / and code:action { return { type: "semantic_and", code: code @@ -169,6 +175,7 @@ semicolon = ";" __ { return ";"; } slash = "/" __ { return "/"; } and = "&" __ { return "&"; } not = "!" __ { return "!"; } +dollar = "$" __ { return "$"; } question = "?" __ { return "?"; } star = "*" __ { return "*"; } plus = "+" __ { return "+"; }