Skip to content
This repository was archived by the owner on Apr 22, 2020. It is now read-only.

Commit 81a8ec2

Browse files
author
mikesamuel
committed
Fixed issue 12: implemented lexing of regular expression literals using
an approach based on JavaScript's lexical grammar to decide when a / begins a regexp literal. This is more conservative than JavaScript since I don't attempt to handle lexically valid but syntactically invalid JavaScript. There is one case where a regexp literal in syntactically valid JavaScript will not be recognized: for (var fieldName in /foo/) { ... }. I have never seen this in practice. Someone might iterate over a regexp to iterate out parenthetical matches, but they would have to assign the regexp to a variable first, since JavaScript does not allow pooling of regexp literals.
1 parent ca84bd1 commit 81a8ec2

File tree

2 files changed

+178
-7
lines changed

2 files changed

+178
-7
lines changed

src/prettify.js

Lines changed: 76 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,51 @@ function PR_endsWith(s, suffix) {
239239
suffix == s.substring(s.length - suffix.length, s.length);
240240
}
241241

242+
/** a set of tokens that can precede a regular expression literal in javascript.
243+
* http://www.mozilla.org/js/language/js20/rationale/syntax.html has the full
244+
* list, but I've removed ones that might be problematic when seen in languages
245+
* that don't support regular expression literals.
246+
*
247+
* <p>Specifically, I've removed any keywords that can't precede a regexp
248+
* literal in a syntactically legal javascript program, and I've removed the
249+
* "in" keyword since it's not a keyword in many languages, and might be used
250+
* as a count of inches.
251+
* @private
252+
*/
253+
var REGEXP_PRECEDER_PATTERN = (function () {
// Built once, inside an immediately-invoked function, so the helper
// variables below do not leak into the enclosing scope. The resulting RegExp
// is anchored at the END of the tested text: it matches when that text
// finishes with one of the preceder tokens, optionally followed by
// whitespace.
254+
var preceders = [
// Punctuation and keyword tokens after which a '/' is taken to open a
// regexp literal. "+", "-" and the dot forms are commented out of this
// list because they are handled by the hand-written alternatives below.
255+
"!", "!=", "!==", "#", "%", "%=", "&", "&&", "&&=",
256+
"&=", "(", "*", "*=", /* "+", */ "+=", ",", /* "-", */ "-=",
257+
"->", /*".", "..", "...", handled below */ "/", "/=", ":", "::", ";",
258+
"<", "<<", "<<=", "<=", "=", "==", "===", ">",
259+
">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[",
260+
"^", "^=", "^^", "^^=", "{", "|", "|=", "||",
261+
"||=", "~", "break", "case", "continue", "delete",
262+
"do", "else", "finally", "instanceof",
263+
"return", "throw", "try", "typeof"
264+
];
265+
var pattern = '(?:' +
// Three special-case alternatives open the group. Each one inspects the
// character before the operator (or the start of the text) to rule out a
// look-alike token.
266+
'(?:(?:^|[^0-9\.])\\.{1,3})|' + // one to three dots not preceded by a digit or dot, i.e. not part of a number
267+
'(?:(?:^|[^\\+])\\+)|' + // allow + but not ++ (a postfix increment is followed by division)
268+
'(?:(?:^|[^\\-])-)' // allow - but not -- (a postfix decrement is followed by division)
269+
;
270+
for (var i = 0; i < preceders.length; ++i) {
271+
var preceder = preceders[i];
272+
if (PR_isWordChar(preceder.charAt(0))) {
// Keyword preceder: require a word boundary in front so that a longer
// identifier merely ending in the keyword (e.g. "notreturn") does not match.
// NOTE(review): PR_isWordChar is defined outside this view; presumably it
// is true for identifier characters -- confirm.
273+
pattern += '|\\b' + preceder;
274+
} else {
// Punctuation preceder: backslash-escape every character except
// = < > : & so that regexp metacharacters such as ( [ { | ? * ^ are
// matched literally.
275+
pattern += '|' + preceder.replace(/([^=<>:&])/g, '\\$1');
276+
}
277+
}
278+
pattern += ')\\s*$'; // close the group; allow trailing whitespace, then require end of text
279+
return new RegExp(pattern);
// The remaining lines are notes only; nothing after the return executes.
280+
// CAVEAT: this does not properly handle the case where a regular expression
281+
// immediately follows another since a regular expression may have flags
282+
// for case-sensitivity and the like. Having regexp tokens adjacent is not
283+
// valid in any language I'm aware of, so I'm punting.
284+
// TODO: maybe style special characters inside a regexp as punctuation.
285+
})();
286+
242287
/** true iff prefix matches the first prefix characters in chars[0:len].
243288
* @private
244289
*/
@@ -679,6 +724,8 @@ function PR_splitStringAndCommentTokens(chunks) {
679724
var state = 0; // FSM state variable
680725
var delim = -1; // string delimiter
681726
var k = 0; // absolute position of beginning of current chunk
727+
var lookBehind = []; // the last 16 characters processed collapsing space
728+
var lastCh = '';
682729

683730
for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
684731
var chunk = chunks[ci];
@@ -699,8 +746,8 @@ function PR_splitStringAndCommentTokens(chunks) {
699746
} else if (ch == '/') {
700747
state = 3;
701748
} else if (ch == '#') {
702-
tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
703749
state = 4;
750+
tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
704751
}
705752
} else if (1 == state) {
706753
if (ch == delim) {
@@ -719,10 +766,21 @@ function PR_splitStringAndCommentTokens(chunks) {
719766
state = 5;
720767
tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
721768
} else {
722-
state = 0;
723-
// next loop will reenter state 0 without same value of i, so
724-
// ch will be reconsidered as start of new token.
725-
next = i;
769+
// check the last token and see if we should treat this as the start
770+
// of a regular expression literal.
771+
if ((!lookBehind.length ||
772+
REGEXP_PRECEDER_PATTERN.test(lookBehind.join('')))) {
773+
// treat regular expression as a string with delimiter /
774+
state = 1;
775+
delim = '/';
776+
tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
777+
} else {
778+
state = 0;
779+
// next loop will reenter state 0 without same value of i, so
780+
// ch will be reconsidered as start of new token.
781+
next = i;
782+
continue;
783+
}
726784
}
727785
} else if (4 == state) {
728786
if (ch == '\r' || ch == '\n') {
@@ -737,10 +795,23 @@ function PR_splitStringAndCommentTokens(chunks) {
737795
if (ch == '/') {
738796
state = 0;
739797
tokenEnds.push(new PR_TokenEnd(k + next, PR_COMMENT));
798+
continue; // skip lookbehind
740799
} else if (ch != '*') {
741800
state = 5;
742801
}
743802
}
803+
804+
// push char on lookbehind if it's not a comment token. Don't
805+
// waste space on runs of whitespace; just leave enough to indicate
806+
// boundaries.
807+
if (3 > state || state > 6) {
808+
var isSpace = PR_isSpaceChar(ch);
809+
if (!(lastCh === ' ' && isSpace)) {
810+
if (lookBehind.length > 16) { lookBehind.shift(); }
811+
lastCh = isSpace ? ' ' : ch;
812+
lookBehind.push(lastCh);
813+
}
814+
}
744815
}
745816
}
746817
k += s.length;

tests/prettify_test.html

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,47 @@ <h1>Javascript</h1>
150150
document.write(fib(10));
151151
</pre>
152152

153+
<h1>Issue 12 - Javascript Regular Expressions</h1>
154+
<pre class="prettyprint" id="issue12">
155+
/foo/; // a slash starting a line treated as a regexp beginning
156+
"foo".match(/fo+$/);
157+
// this line comment not treated as a regular expressions
158+
"foo /bar/".test(/"baz"/); // test string and regexp boundaries
159+
var division = /\b\d+\/\d+/g; // test char sets and escaping of specials
160+
var allSpecials = /([^\(\)\[\]\{\}\-\?\+\*\.\^\$\/]+)\\/;
161+
162+
// test that slash used in numeric context treated as an operator
163+
1 / 2;
164+
1. / x;
165+
x / y;
166+
(x) / y;
167+
1 /* foo */ / 2;
168+
1 /* foo *// 2;
169+
1/2;
170+
1./x;
171+
x/y;
172+
(x)/y;
173+
174+
// test split over two lines. line comment should not fool it
175+
1//
176+
/2;
177+
178+
x++/y;
179+
x--/y;
180+
x[y] / z;
181+
f() / n;
182+
183+
// test that slash after non postfix operator is start of regexp
184+
log('matches = ' + /foo/.test(foo));
185+
186+
// test keyword preceders
187+
return /a regexp/;
188+
division = notreturn / not_a_regexp / 2; // keyword suffix does not match
189+
190+
// & not used as prefix operator in javascript but this should still work
191+
&/foo/;
192+
</pre>
193+
153194
<h1>Perl</h1>
154195
<pre class="prettyprint" id="perl">
155196
#!/usr/bin/perl
@@ -747,7 +788,7 @@ <h1>Bug 8 - tabs mangled</h1>
747788
'&nbsp; &nbsp; &nbsp; `END`COM// PHP has a plethora of comment types' +
748789
'`END`PLN<br>' +
749790
'&nbsp; &nbsp; &nbsp; `END`COM\/* What is a<br>' +
750-
'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;"plethora"? */`END`PLN<br>' +
791+
'&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;"plethora"? *\/`END`PLN<br>' +
751792
'&nbsp; &nbsp; &nbsp; `END`KWDfunction`END`PLN fib`END`PUN(`END' +
752793
'`PLN$n`END`PUN)`END`PLN `END`PUN{`END`PLN<br>' +
753794
'&nbsp; &nbsp; &nbsp; &nbsp; `END`COM# I don\'t know.`END`PLN<br>' +
@@ -827,7 +868,66 @@ <h1>Bug 8 - tabs mangled</h1>
827868
'`END<b>`PLNeleven`END</b>`PLN &nbsp;`END`TYPTwelve`END`PLN &nbsp;`END' +
828869
'<b>`PLNthirteen`END</b>`PLN &nbsp; &nbsp; &nbsp; &nbsp;`END' +
829870
'`TYPFourteen`END`PLN &nbsp; &nbsp; &nbsp; &nbsp;fifteen `END`' +
830-
'PUN|`END')
871+
'PUN|`END'),
872+
issue12: (
873+
'`STR/foo/`END`PUN;`END`PLN &nbsp;`END`COM// a slash starting a line ' +
874+
'treated as a regexp beginning`END`PLN<br>' +
875+
'`END`STR"foo"`END`PUN.`END`PLNmatch`END`PUN(`END`STR/fo+$/`END' +
876+
'`PUN);`END`PLN<br>' +
877+
'`END`COM// this line comment not treated as a regular expressions`END' +
878+
'`PLN<br>' +
879+
'`END`STR"foo /bar/"`END`PUN.`END`PLNtest`END`PUN(`END`STR/"baz"/`END' +
880+
'`PUN);`END`PLN &nbsp;`END`COM// test string and regexp boundaries' +
881+
'`END`PLN<br>' +
882+
'`END`KWDvar`END`PLN division `END`PUN=`END`PLN `END' +
883+
'`STR/\\b\\d+\\/\\d+/`END`PLNg`END`PUN;`END`PLN &nbsp;`END' +
884+
'`COM// test char sets and escaping of specials`END`PLN<br>' +
885+
'`END`KWDvar`END`PLN allSpecials `END`PUN=`END`PLN `END' +
886+
'`STR/([^\\(\\)\\[\\]\\{\\}\\-\\?\\+\\*\\.\\^\\$\\/]+)\\\\/`END' +
887+
'`PUN;`END`PLN<br>' +
888+
'<br>' +
889+
'`END`COM// test that slash used in numeric context treated as an ' +
890+
'operator`END`PLN<br>' +
891+
'`END`LIT1`END`PLN `END`PUN/`END`PLN `END`LIT2`END`PUN;`END`PLN<br>' +
892+
'`END`LIT1`END`PUN.`END`PLN `END`PUN/`END`PLN x`END`PUN;`END`PLN<br>' +
893+
'x `END`PUN/`END`PLN y`END`PUN;`END`PLN<br>' +
894+
'`END`PUN(`END`PLNx`END`PUN)`END`PLN `END`PUN/`END`PLN y`END`PUN;`END' +
895+
'`PLN<br>' +
896+
'`END`LIT1`END`PLN `END`COM/* foo */`END`PLN `END`PUN/`END`PLN `END' +
897+
'`LIT2`END`PUN;`END`PLN<br>' +
898+
'`END`LIT1`END`PLN `END`COM/* foo */`END`PUN/`END`PLN `END`LIT2`END' +
899+
'`PUN;`END`PLN<br>' +
900+
'`END`LIT1`END`PUN/`END`LIT2`END`PUN;`END`PLN<br>' +
901+
'`END`LIT1`END`PUN./`END`PLNx`END`PUN;`END`PLN<br>' +
902+
'x`END`PUN/`END`PLNy`END`PUN;`END`PLN<br>' +
903+
'`END`PUN(`END`PLNx`END`PUN)/`END`PLNy`END`PUN;`END`PLN<br>' +
904+
'<br>' +
905+
'`END`COM// test split over two lines. &nbsp;line comment should not ' +
906+
'fool it`END`PLN<br>' +
907+
'`END`LIT1`END`COM//`END`PLN<br>' +
908+
'`END`PUN/`END`LIT2`END`PUN;`END`PLN<br>' +
909+
'<br>' +
910+
'x`END`PUN++/`END`PLNy`END`PUN;`END`PLN<br>' +
911+
'x`END`PUN--/`END`PLNy`END`PUN;`END`PLN<br>' +
912+
'x`END`PUN[`END`PLNy`END`PUN]`END`PLN `END`PUN/`END`PLN z`END`PUN;`END' +
913+
'`PLN<br>' +
914+
'f`END`PUN()`END`PLN `END`PUN/`END`PLN n`END`PUN;`END`PLN<br>' +
915+
'<br>' +
916+
'`END`COM// test that slash after non postfix operator is start of ' +
917+
'regexp`END`PLN<br>' +
918+
'log`END`PUN(`END`STR\'matches = \'`END`PLN `END`PUN+`END`PLN `END' +
919+
'`STR/foo/`END`PUN.`END`PLNtest`END`PUN(`END`PLNfoo`END`PUN));`END' +
920+
'`PLN<br>' +
921+
'<br>' +
922+
'`END`COM// test keyword preceders`END`PLN<br>' +
923+
'`END`KWDreturn`END`PLN `END`STR/a regexp/`END`PUN;`END`PLN<br>' +
924+
'division `END`PUN=`END`PLN notreturn `END`PUN/`END`PLN not_a_regexp ' +
925+
'`END`PUN/`END`PLN `END`LIT2`END`PUN;`END`PLN &nbsp;`END`COM// ' +
926+
'keyword suffix does not match`END`PLN<br>' +
927+
'<br>' +
928+
'`END`COM// &amp; not used as prefix operator in javascript but this ' +
929+
'should still work`END`PLN<br>' +
930+
'`END`PUN&amp;`END`STR/foo/`END`PUN;`END')
831931
};
832932

833933

0 commit comments

Comments
 (0)