Skip to content

Commit 5eda17f

Browse files
authored
Merge pull request #19 from SWAT-engineering/no-conversion-when-delimiter-occurs-elsewhere-too
No conversion when delimiter occurs elsewhere too
2 parents 899fb3d + 416bc7d commit 5eda17f

File tree

18 files changed

+161
-323
lines changed

18 files changed

+161
-323
lines changed

rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
@synopsis{
2-
Utility functions to work with grammars, productions, and symbols.
2+
Utility functions to work with grammars, productions, and symbols
33
}
44

55
module lang::rascal::grammar::Util
@@ -9,6 +9,8 @@ import Grammar;
99
import ParseTree;
1010
import String;
1111

12+
import util::ListUtil;
13+
1214
@synopsis{
1315
Utility functions for grammars
1416
}
@@ -47,6 +49,62 @@ bool isRecursive(Grammar g, Symbol s) {
4749
return check({}, s);
4850
}
4951
52+
@synopsis{
53+
Representation of a pointer to a symbol in (the list of symbols of) a
54+
production. This is useful to distinguish between different occurrences of
55+
the same symbol in a grammar (i.e., they have different pointers).
56+
}
57+
58+
alias Pointer = tuple[Production p, int index];
59+
60+
@synopsis{
61+
Finds the list of pointers -- a *trace* -- to the first occurrence of symbol
62+
`s`, if any, starting from production `p`, optionally in a particular
63+
direction (default: `forward()`). That is: if `<p1,i>` is followed by
64+
`<p2,_>` in the returned list, then `p1.symbols[i]` is a non-terminal and
65+
`p2` is one of its productions.
66+
}
67+
68+
@description{
69+
For instance, consider the following grammar:
70+
71+
```
72+
lexical X = Y;
73+
lexical Y = alt1: "[" "[" "[" Z1 "]" "]" "]" | alt2: "<" Z2 ">";
74+
lexical Z1 = "foo" "bar";
75+
lexical Z2 = "baz";
76+
```
77+
78+
The list of pointers to `"bar"`, starting from `X`, is:
79+
80+
- `<X,0>`
81+
- `<Y.alt1,3>`
82+
- `<Z1,1>`
83+
84+
The list of pointers to `"qux"` is just empty.
85+
}
86+
87+
list[Pointer] find(Grammar g, Production p, Symbol s, Direction dir = forward()) {
88+
89+
list[Pointer] doFind(set[Production] doing, Production haystack, Symbol needle) {
90+
for (haystack notin doing, i <- reorder([0..size(haystack.symbols)], dir)) {
91+
Symbol ith = delabel(haystack.symbols[i]);
92+
if (ith == needle) {
93+
return [<haystack, i>];
94+
}
95+
for (isNonTerminalType(ith), child <- lookup(g, ith)) {
96+
if (list[Pointer] l: [_, *_] := doFind(doing + haystack, child, s)) {
97+
return [<haystack, i>] + l;
98+
}
99+
}
100+
}
101+
102+
return [];
103+
}
104+
105+
return doFind({}, p, s);
106+
}
107+
50108
@synopsis{
51109
Looks up a list of productions for symbol `s` in grammar `g`, replacing
52110
formal parameters with actual parameters when needed

rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Delimiters.rsc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,9 +193,9 @@ default Maybe[Symbol] unique(set[Maybe[Symbol]] _) = nothing();
193193
}
194194
195195
bool isDelimiter(lit(string))
196-
= /^\w+$/ !:= string;
196+
= /^\W+$/ := string;
197197
bool isDelimiter(cilit(string))
198-
= /^\w+$/ !:= string;
198+
= isDelimiter(lit(string));
199199
200200
default bool isDelimiter(Symbol _)
201201
= false;
@@ -205,9 +205,9 @@ default bool isDelimiter(Symbol _)
205205
}
206206
207207
bool isKeyword(lit(string))
208-
= /^\w+$/ := string;
208+
= /^\w.*$/ := string;
209209
bool isKeyword(cilit(string))
210-
= /^\w+$/ := string;
210+
= isKeyword(lit(string));
211211
212212
default bool isKeyword(Symbol _)
213213
= false;

rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,16 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
199199
// Convert all units in the group to match patterns (including,
200200
// optimistically, multi-line units as-if they are single-line)
201201
for (u <- group, !u.recursive) {
202-
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
202+
203+
// Add the guard (i.e., look-behind condition to match layout) only
204+
// when the units in the group don't begin with a delimiter. Why is
205+
// this? We *don't* want `32` to be highlighted as a number in
206+
// `int aer32 = 34`. However, we *do* want `>bar"` to be highlighted
207+
// as a string in `"foo<x==5>bar"`. As a heuristic, if the token
208+
// starts with a delimiter (e.g., `>`), then it should be allowed
209+
// for its occurrence to not be preceded by layout.
210+
bool guard = nothing() := u.innerDelimiters.begin;
211+
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = guard))
203212
[name = "/inner/single/<u.name>"];
204213
205214
rules = insertIn(rules, (u: r));
@@ -217,6 +226,25 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
217226
218227
// Simple case: each unit does have an `end` inner delimiter
219228
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
229+
230+
// Create a set of pointers to the first (resp. last) occurrence
231+
// of `pivot` in each unit, when `pivot` is a `begin` delimiter
232+
// (resp. an `end` delimiter) of the group. If `pivot` occurs
233+
// elsewhere in the grammar as well, then skip the conversion
234+
// of these multi-line units to a begin/end pattern. This is to
235+
// avoid tokenization mistakes in which the other occurrences of
236+
// `pivot` in the input are mistakenly interpreted as the
237+
// beginning or ending of a unit in the group.
238+
239+
Symbol pivot = key.val;
240+
241+
set[Pointer] pointers = {};
242+
pointers += pivot in begins ? {*find(rsc, u.prod, pivot, dir = forward()) [-1..] | u <- group} : {};
243+
pointers += pivot in ends ? {*find(rsc, u.prod, pivot, dir = backward())[-1..] | u <- group} : {};
244+
245+
if (any(/p: prod(_, [*before, pivot, *_], _) := rsc.rules, <p, size(before)> notin pointers)) {
246+
continue;
247+
}
220248
221249
// Compute a set of segments that need to be consumed between
222250
// the `begin` delimiter and the `end` delimiters. Each of these

rascal-textmate-core/src/main/rascal/lang/textmate/ConversionTests.rsc

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ bool doTransformTest(list[ConversionUnit] units, RepositoryStats expect, str nam
8686
assert actual.include == expect.include : "Actual number of top-level include patterns in repository: <actual.include>. Expected: <expect.include>.";
8787

8888
// Test behavioral properties of the TextMate grammar
89-
9089
loc lTest = lProject + "/src/main/rascal/lang/textmate/conversiontests/<name>.test";
9190
loc lTester = lProject + "/node_modules/vscode-tmgrammar-test";
9291
if (!exists(lTest)) {
@@ -103,7 +102,21 @@ bool doTransformTest(list[ConversionUnit] units, RepositoryStats expect, str nam
103102
resolveLocation(lTest).path[(windows ? 1 : 0)..]
104103
];
105104

106-
if (<output, exitCode> := execWithCode(lExec, args = args) && exitCode != 0) {
105+
// TODO: The following function serves as a workaround for a race
106+
// in (the Java-part of) the implementation of `execWithCode`. A fix is
107+
// already available but not yet released. When it is, this function
108+
// should be removed (and `execWithCode` called directly). See also:
109+
// https://github.com/usethesource/rascal/commit/1ce9e59dfd7098327bbaf55a985c2a643ff52861
110+
tuple[str, int] execWithCodeUntilSuccess() {
111+
try {
112+
return execWithCode(lExec, args = args);
113+
} catch e: {
114+
println("[LOG] Retrying after unexpected exception: <e>");
115+
return execWithCodeUntilSuccess();
116+
}
117+
}
118+
119+
if (<output, exitCode> := execWithCodeUntilSuccess() && exitCode != 0) {
107120
println(output);
108121
assert false : "Actual tokenization does not match expected tokenization (see output above for details)";
109122
}

rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Pico.rsc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ import lang::pico::\syntax::Main;
1313
Grammar rsc = preprocess(grammar(#Program));
1414

1515
list[ConversionUnit] units = [
16-
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\""),lit(";"),lit("nil-type")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
16+
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\""),lit(";")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
1717
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%%"),conditional(\iter-star(\char-class([range(1,9),range(11,1114111)])),{\end-of-line()})],{\tag("category"("Comment"))}), false, false, <nothing(),nothing()>, <just(lit("%%")),nothing()>),
1818
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%"),iter(\char-class([range(1,36),range(38,1114111)])),lit("%")],{\tag("category"("Comment"))}), false, true, <nothing(),nothing()>, <just(lit("%")),just(lit("%"))>),
19-
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
19+
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("nil-type"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
2020
];
2121

2222
test bool analyzeTest() = doAnalyzeTest(rsc, units);

rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,14 @@ Grammar rsc = preprocess(grammar(#Program));
7676
list[ConversionUnit] units = [
7777
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
7878
unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
79-
unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <just(lit("nil-type")),just(lit("nil-type"))>),
79+
unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
8080
unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
8181
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%%"),conditional(\iter-star(\char-class([range(1,9),range(11,1114111)])),{\end-of-line()})],{\tag("category"("comment.line"))}), false, false, <nothing(),nothing()>, <just(lit("%%")),nothing()>),
8282
unit(rsc, prod(lex("WhitespaceAndComment"),[lit("%"),iter(\char-class([range(1,36),range(38,1114111)])),lit("%")],{\tag("category"("comment.block"))}), false, true, <nothing(),nothing()>, <just(lit("%")),just(lit("%"))>),
8383
unit(rsc, prod(label("strcon",sort("Expression")),[label("string",lex("String"))],{\tag("category"("string.quoted.double"))}), false, true, <nothing(),nothing()>, <just(lit("\"")),just(lit("\""))>),
8484
unit(rsc, prod(label("id",sort("Expression")),[label("name",lex("Id"))],{\tag("category"("variable.other"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
8585
unit(rsc, prod(label("natcon",sort("Expression")),[label("natcon",lex("Natural"))],{\tag("category"("constant.numeric"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
86-
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
86+
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("do"),lit("declare"),lit("fi"),lit("else"),lit("end"),lit("od"),lit("nil-type"),lit("begin"),lit("natural"),lit("then"),lit("if"),lit("while"),lit("string")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
8787
];
8888

8989
test bool analyzeTest() = doAnalyzeTest(rsc, units);

rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Rascal.rsc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import lang::rascal::\syntax::Rascal;
1313
Grammar rsc = preprocess(grammar(#Module));
1414

1515
list[ConversionUnit] units = [
16-
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("bottom-up-break"),lit(")"),lit("("),lit("%"),lit("!:="),lit("\<==\>"),lit("\<\<="),lit("!="),lit("\>="),lit("://"),lit("non-assoc"),lit("&="),lit("\<-"),lit("*="),lit("+="),lit("top-down-break"),lit(","),lit("..."),lit("/="),lit("!\<\<"),lit("=\>"),lit("!\>\>"),lit("||"),lit("\>\>"),lit("::"),lit("&&"),lit(":="),lit("#"),lit("?="),lit("\<:"),lit("==\>"),lit("^"),lit(";"),lit("{"),lit("-="),lit("$T")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
16+
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(","),lit(")"),lit("("),lit("%"),lit("\<==\>"),lit("\<\<="),lit("!="),lit("\>="),lit("://"),lit("&="),lit("\<-"),lit("-="),lit("*="),lit("+="),lit("..."),lit("/="),lit("!:="),lit("$"),lit("!\<\<"),lit("=\>"),lit("!\>\>"),lit("||"),lit("\>\>"),lit("::"),lit("&&"),lit(":="),lit("#"),lit("?="),lit("\<:"),lit("==\>"),lit("^"),lit(";"),lit("{")})],{}), false, false, <nothing(),nothing()>, <nothing(),nothing()>),
1717
unit(rsc, prod(label("stderrOutput",lex("Output")),[conditional(lit("⚠"),{\begin-of-line()}),\iter-star(\char-class([range(1,9),range(11,12),range(14,1114111)])),lit("\n")],{\tag("category"("StdErr"))}), false, false, <nothing(),nothing()>, <just(lit("⚠")),just(lit("\n"))>),
1818
unit(rsc, prod(label("stdoutOutput",lex("Output")),[conditional(lit("≫"),{\begin-of-line()}),\iter-star(\char-class([range(1,9),range(11,12),range(14,1114111)])),lit("\n")],{\tag("category"("StdOut"))}), false, false, <nothing(),nothing()>, <just(lit("≫")),just(lit("\n"))>),
1919
unit(rsc, prod(label("resultOutput",lex("Output")),[lit("⇨"),\iter-star(\char-class([range(1,9),range(11,12),range(14,1114111)])),lit("\n")],{\tag("category"("Result"))}), false, false, <nothing(),nothing()>, <just(lit("⇨")),just(lit("\n"))>),
@@ -35,8 +35,8 @@ list[ConversionUnit] units = [
3535
unit(rsc, prod(lex("CaseInsensitiveStringConstant"),[lit("\'"),label("chars",\iter-star(lex("StringCharacter"))),lit("\'")],{\tag("category"("Constant"))}), false, true, <nothing(),nothing()>, <just(lit("\'")),just(lit("\'"))>),
3636
unit(rsc, prod(lex("PreStringChars"),[lit("\""),\iter-star(lex("StringCharacter")),lit("\<")],{\tag("category"("Constant"))}), false, true, <nothing(),nothing()>, <just(lit("\"")),just(lit("\<"))>),
3737
unit(rsc, prod(lex("StringConstant"),[lit("\""),label("chars",\iter-star(lex("StringCharacter"))),lit("\"")],{\tag("category"("Constant"))}), false, true, <nothing(),nothing()>, <just(lit("\"")),just(lit("\""))>),
38-
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("lexical"),lit("loc"),lit("if"),lit("assoc"),lit("test"),lit("lrel"),lit("throws"),lit("clear"),lit("module"),lit("any"),lit("int"),lit("quit"),lit("o"),lit("anno"),lit("true"),lit("public"),lit("keyword"),lit("for"),lit("tuple"),lit("bracket"),lit("bag"),lit("it"),lit("visit"),lit("do"),lit("data"),lit("layout"),lit("bool"),lit("edit"),lit("join"),lit("is"),lit("import"),lit("view"),lit("in"),lit("rat"),lit("modules"),lit("continue"),lit("left"),lit("num"),lit("assert"),lit("throw"),lit("one"),lit("help"),lit("default"),lit("all"),lit("global"),lit("syntax"),lit("false"),lit("finally"),lit("private"),lit("mod"),lit("java"),lit("node"),lit("start"),lit("set"),lit("right"),lit("variable"),lit("map"),lit("10"),lit("on"),lit("break"),lit("dynamic"),lit("solve"),lit("fail"),lit("unimport"),lit("outermost"),lit("real"),lit("list"),lit("insert"),lit("innermost"),lit("declarations"),lit("else"),lit("rel"),lit("function"),lit("notin"),lit("filter"),lit("datetime"),lit("catch"),lit("try"),lit("renaming"),lit("tag"),lit("has"),lit("Z"),lit("when"),lit("type"),lit("append"),lit("extend"),lit("switch"),lit("void"),lit("history"),lit("T"),lit("while"),lit("str"),lit("value"),lit("undeclare"),lit("case"),lit("alias"),lit("return"),lit("0")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
38+
unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("lexical"),lit("loc"),lit("test"),lit("lrel"),lit("throws"),lit("clear"),lit("top-down-break"),lit("module"),lit("any"),lit("int"),lit("quit"),lit("bottom-up-break"),lit("o"),lit("anno"),lit("true"),lit("public"),lit("keyword"),lit("for"),lit("tuple"),lit("bracket"),lit("bag"),lit("it"),lit("visit"),lit("do"),lit("data"),lit("layout"),lit("bool"),lit("edit"),lit("join"),lit("is"),lit("import"),lit("view"),lit("in"),lit("rat"),lit("modules"),lit("continue"),lit("left"),lit("num"),lit("assert"),lit("throw"),lit("one"),lit("help"),lit("default"),lit("all"),lit("global"),lit("syntax"),lit("false"),lit("finally"),lit("private"),lit("mod"),lit("java"),lit("node"),lit("start"),lit("set"),lit("if"),lit("bottom-up"),lit("right"),lit("variable"),lit("map"),lit("10"),lit("on"),lit("break"),lit("dynamic"),lit("solve"),lit("fail"),lit("unimport"),lit("outermost"),lit("real"),lit("list"),lit("insert"),lit("innermost"),lit("declarations"),lit("else"),lit("rel"),lit("function"),lit("notin"),lit("filter"),lit("datetime"),lit("catch"),lit("try"),lit("renaming"),lit("tag"),lit("has"),lit("top-down"),lit("Z"),lit("when"),lit("type"),lit("append"),lit("extend"),lit("non-assoc"),lit("assoc"),lit("switch"),lit("void"),lit("history"),lit("T"),lit("while"),lit("str"),lit("value"),lit("undeclare"),lit("case"),lit("alias"),lit("return"),lit("0")})],{\tag("category"("keyword.control"))}), false, false, <nothing(),nothing()>, <nothing(),nothing()>)
3939
];
4040

4141
test bool analyzeTest() = doAnalyzeTest(rsc, units);
42-
test bool transformTest() = doTransformTest(units, <20, 8, 0>);
42+
test bool transformTest() = doTransformTest(units, <20, 4, 0>, name = "Rascal");
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# SYNTAX TEST "Rascal"
2+
3+
"foo bar"
4+
# ^^^^^^^^^ Constant
5+
6+
"foo<x + 1>bar"
7+
# ^^^^^ ^^^^^ Constant
8+
# ^^^^^ -Constant
9+
10+
### TODO: The following test shows that, currently, multi-line strings are
11+
### disabled. This is because the converter determines that:
12+
### - `>` doesn't uniquely delineate interpolation (it could also be
13+
### greater-than in expressions or prioritize-before in grammars);
14+
### - `"` doesn't uniquely delineate strings (it could also be the end of
15+
### interpolation).
16+
### Therefore, to avoid excessive tokenization errors, the converter doesn't
17+
### generate begin/end patterns that begin with `>` or `"`. This might be
18+
### improved in the future.
19+
20+
"foo
21+
# ^^^^ -Constant
22+
bar"
23+
# ^^^^ -Constant

rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/RascalClass.rsc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,4 @@ list[ConversionUnit] units = [
4848
];
4949

5050
test bool analyzeTest() = doAnalyzeTest(rsc, units);
51-
test bool transformTest() = doTransformTest(units, <5, 1, 0>);
51+
test bool transformTest() = doTransformTest(units, <5, 1, 0>, name = "RascalClass");
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# SYNTAX TEST "RascalClass"

0 commit comments

Comments
 (0)