Skip to content

Commit 5c6adcd

Browse files
committed
Reject literal non-NFC characters
A literal character that is not in NFC may not contain the scalars the author expects, so such characters must be spelled explicitly using `\u{...}` syntax instead.
1 parent df4db32 commit 5c6adcd

File tree

5 files changed

+25
-8
lines changed

5 files changed

+25
-8
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ enum ParseError: Error, Hashable {
4343
case expectedEscape
4444
case invalidEscape(Character)
4545
case confusableCharacter(Character)
46+
case literalCharMustBeNFC(Character)
4647

4748
case quoteMayNotSpanMultipleLines
4849
case unsetExtendedSyntaxMayNotSpanMultipleLines
@@ -145,6 +146,8 @@ extension ParseError: CustomStringConvertible {
145146
return "invalid escape sequence '\\\(c)'"
146147
case .confusableCharacter(let c):
147148
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
149+
case .literalCharMustBeNFC(let c):
150+
return "'\(c)' may not have the expected scalars; specify them explicitly with '\\u{...}' instead"
148151
case .quoteMayNotSpanMultipleLines:
149152
return "quoted sequence may not span multiple lines in multi-line literal"
150153
case .unsetExtendedSyntaxMayNotSpanMultipleLines:

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2055,13 +2055,14 @@ extension Parser {
20552055
return .invalid
20562056
}
20572057

2058-
guard let charLoc = p.tryEatWithLoc() else {
2058+
guard let charWithLoc = p.tryEatWithLoc() else {
20592059
// We check at the beginning of the function for `isEmpty`, so we should
20602060
// not be at the end of the input here.
20612061
p.unreachable("Unexpected end of input")
20622062
return nil
20632063
}
2064-
let char = charLoc.value
2064+
let char = charWithLoc.value
2065+
let charLoc = charWithLoc.location
20652066
switch char {
20662067
case ")", "|":
20672068
if customCC {
@@ -2092,7 +2093,11 @@ extension Parser {
20922093
let scalars = char.unicodeScalars
20932094
if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" &&
20942095
!char.isLetter && !char.isNumber {
2095-
p.error(.confusableCharacter(char), at: charLoc.location)
2096+
p.error(.confusableCharacter(char), at: charLoc)
2097+
}
2098+
// Reject unescaped non-NFC characters.
2099+
if !char.isNFC {
2100+
p.error(.literalCharMustBeNFC(char), at: charLoc)
20962101
}
20972102
break
20982103
}

Tests/RegexTests/CompileTests.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ extension RegexTests {
332332
doesNotContain: [.match, .consumeBy, .matchScalarUnchecked])
333333
// quoted literal is not all ascii -> match scalar when possible, always do boundary checks
334334
expectProgram(
335-
for: "aaa\u{301}",
335+
for: #"aaa\u{301}"#,
336336
contains: [.match, .matchScalar],
337337
doesNotContain: [.consumeBy, .matchScalarUnchecked])
338338
// scalar mode -> always emit match scalar without boundary checks
@@ -347,7 +347,7 @@ extension RegexTests {
347347
contains: [.matchScalarUnchecked],
348348
doesNotContain: [.match, .consumeBy, .matchScalar])
349349
expectProgram(
350-
for: "aaa\u{301}",
350+
for: #"aaa\u{301}"#,
351351
semanticLevel: .unicodeScalar,
352352
contains: [.matchScalarUnchecked],
353353
doesNotContain: [.match, .consumeBy, .matchScalar])

Tests/RegexTests/MatchTests.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ extension RegexTests {
226226

227227
// MARK: Allowed combining characters
228228

229-
firstMatchTest("e\u{301}", input: "e\u{301}", match: "e\u{301}")
229+
firstMatchTest(#"e\u{301}"#, input: "e\u{301}", match: "e\u{301}")
230230
firstMatchTest("1\u{358}", input: "1\u{358}", match: "1\u{358}")
231231
firstMatchTest(#"\ \#u{361}"#, input: " \u{361}", match: " \u{361}")
232232

@@ -774,7 +774,7 @@ extension RegexTests {
774774
firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣")
775775
firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil)
776776
firstMatchTest("[0-2&&1-3]", input: "1️⃣", match: nil)
777-
firstMatchTest("[1-2e\u{301}]", input: "1️⃣", match: nil)
777+
firstMatchTest(#"[1-2e\u{301}]"#, input: "1️⃣", match: nil)
778778

779779
firstMatchTest(#"[\u{3A9}-\u{3A9}]"#, input: "\u{3A9}", match: "\u{3A9}")
780780

Tests/RegexTests/ParseTests.swift

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,19 @@ extension RegexTests {
374374

375375
// MARK: Allowed combining characters
376376

377-
parseTest("e\u{301}", "e\u{301}")
378377
parseTest("1\u{358}", "1\u{358}")
379378
parseTest(#"\ \#u{361}"#, " \u{361}")
380379

380+
// We don't allow non-NFC literal characters.
381+
parseTest(
382+
"e\u{301}", "e\u{301}", throwsError: .literalCharMustBeNFC("e\u{301}")
383+
)
384+
parseTest("\u{E9}", "e\u{301}")
385+
386+
// Escaping it with a backslash is not allowed either; it must be written using `\u{...}`.
387+
parseTest(
388+
"\\e\u{301}", "e\u{301}", throwsError: .invalidEscape("e\u{301}"))
389+
381390
// MARK: Alternations
382391

383392
parseTest(

0 commit comments

Comments
 (0)