diff --git a/Package.swift b/Package.swift
index 4e9f1c24d..18764fcaf 100644
--- a/Package.swift
+++ b/Package.swift
@@ -75,15 +75,17 @@ let package = Package(
       name: "RegexBuilder",
       dependencies: ["_StringProcessing", "_RegexParser"],
       swiftSettings: publicStdlibSettings),
+    .target(name: "TestSupport",
+            swiftSettings: [availabilityDefinition]),
     .testTarget(
       name: "RegexTests",
-      dependencies: ["_StringProcessing"],
+      dependencies: ["_StringProcessing", "TestSupport"],
       swiftSettings: [
         .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
       ]),
     .testTarget(
       name: "RegexBuilderTests",
-      dependencies: ["_StringProcessing", "RegexBuilder"],
+      dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
       swiftSettings: [
         .unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
       ]),
diff --git a/Sources/TestSupport/TestSupport.swift b/Sources/TestSupport/TestSupport.swift
new file mode 100644
index 000000000..b60adb63f
--- /dev/null
+++ b/Sources/TestSupport/TestSupport.swift
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import XCTest
+
+// We need to split this out of the test files, as it needs to be compiled
+// *without* `-disable-availability-checking` to ensure the #available check is
+// not compiled into a no-op.
+
+#if os(Linux)
+public func XCTExpectFailure(
+  _ message: String? = nil, body: () throws -> Void
+) rethrows {}
+#endif
+
+/// Guards certain tests to make sure we have a new stdlib available.
+public func ensureNewStdlib(
+  file: StaticString = #file, line: UInt = #line
+) -> Bool {
+  guard #available(SwiftStdlib 5.7, *) else {
+    XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) }
+    return false
+  }
+  return true
+}
diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
index b03ce8c39..8706327f7 100644
--- a/Sources/_RegexParser/Regex/AST/Atom.swift
+++ b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -755,8 +755,10 @@ extension AST.Atom {
   /// Whether this atom is valid as the operand of a custom character class
   /// range.
   public var isValidCharacterClassRangeBound: Bool {
-    // If we have a literal character value for this, it can be used as a bound.
-    if literalCharacterValue != nil { return true }
+    if let c = literalCharacterValue {
+      // We only match character range bounds that are single scalar NFC.
+      return c.hasExactlyOneScalar && c.isNFC
+    }
     switch kind {
     // \cx, \C-x, \M-x, \M-\C-x, \N{...}
     case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift
index d37dfbd4a..70dc7a7d5 100644
--- a/Sources/_RegexParser/Utility/Misc.swift
+++ b/Sources/_RegexParser/Utility/Misc.swift
@@ -19,6 +19,21 @@ extension Substring {
   var string: String { String(self) }
 }
 
+extension Character {
+  /// Whether this character is made up of exactly one Unicode scalar value.
+  public var hasExactlyOneScalar: Bool {
+    let scalars = unicodeScalars
+    return scalars.index(after: scalars.startIndex) == scalars.endIndex
+  }
+
+  /// Whether the given character is in NFC form.
+  internal var isNFC: Bool {
+    if isASCII { return true }
+    let str = String(self)
+    return str._nfcCodeUnits.elementsEqual(str.utf8)
+  }
+}
+
 extension CustomStringConvertible {
   @_alwaysEmitIntoClient
   public var halfWidthCornerQuoted: String {
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 668d16eb6..c96775500 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -361,38 +361,60 @@ extension DSLTree.CustomCharacterClass.Member {
       }
       return c
     case let .range(low, high):
-      // TODO:
-      guard let lhs = low.literalCharacterValue else {
+      guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else {
         throw Unsupported("\(low) in range")
       }
-      guard let rhs = high.literalCharacterValue else {
+      guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else {
         throw Unsupported("\(high) in range")
       }
+      guard lhs <= rhs else {
+        throw Unsupported("Invalid range \(low)-\(high)")
+      }
 
-      if opts.isCaseInsensitive {
-        let lhsLower = lhs.lowercased()
-        let rhsLower = rhs.lowercased()
-        guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
-          }
-          return nil
+      let isCaseInsensitive = opts.isCaseInsensitive
+      let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
+
+      return { input, bounds in
+        let curIdx = bounds.lowerBound
+        let nextIndex = isCharacterSemantic
+          ? input.index(after: curIdx)
+          : input.unicodeScalars.index(after: curIdx)
+
+        // Under grapheme semantics, we compare based on single NFC scalars. If
+        // such a character is not single scalar under NFC, the match fails. In
+        // scalar semantics, we compare the exact scalar value to the NFC
+        // bounds.
+        let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar
+                                         : input.unicodeScalars[curIdx]
+        guard let scalar = scalar else { return nil }
+        let scalarRange = lhs ... rhs
+        if scalarRange.contains(scalar) {
+          return nextIndex
        }
-      } else {
-        guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
-        return { input, bounds in
-          // TODO: check for out of bounds?
-          let curIdx = bounds.lowerBound
-          if (lhs...rhs).contains(input[curIdx]) {
-            // TODO: semantic level
-            return input.index(after: curIdx)
+
+        // Check for case insensitive matches.
+        func matchesCased(
+          _ cased: (UnicodeScalar.Properties) -> String
+        ) -> Bool {
+          let casedStr = cased(scalar.properties)
+          // In character semantic mode, we need to map to NFC. In scalar
+          // semantics, we should have an exact scalar.
+          let mapped = isCharacterSemantic ? casedStr.singleNFCScalar
+                                           : casedStr.singleScalar
+          guard let mapped = mapped else { return false }
+          return scalarRange.contains(mapped)
+        }
+        if isCaseInsensitive {
+          if scalar.properties.changesWhenLowercased,
+             matchesCased(\.lowercaseMapping) {
+            return nextIndex
+          }
+          if scalar.properties.changesWhenUppercased,
+             matchesCased(\.uppercaseMapping) {
+            return nextIndex
           }
-          return nil
         }
+        return nil
       }
     case let .custom(ccc):
diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift
index 80f6819a6..e0be4e386 100644
--- a/Sources/_StringProcessing/Unicode/CharacterProps.swift
+++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift
@@ -11,10 +11,3 @@
 
 // TODO
-
-extension Character {
-  /// Whether this character is made up of exactly one Unicode scalar value.
-  var hasExactlyOneScalar: Bool {
-    unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
-  }
-}
 
diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift
new file mode 100644
index 000000000..5c2c4aa48
--- /dev/null
+++ b/Sources/_StringProcessing/Unicode/NFC.swift
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+@_spi(_Unicode)
+import Swift
+
+extension UnicodeScalar {
+  /// Checks whether the scalar is in NFC form.
+  var isNFC: Bool { Character(self).singleNFCScalar == self }
+}
+
+extension Character {
+  /// If the given character consists of a single NFC scalar, returns it. If
+  /// there are multiple NFC scalars, returns `nil`.
+  var singleNFCScalar: UnicodeScalar? {
+    // SwiftStdlib is always >= 5.7 for a shipped StringProcessing.
+    guard #available(SwiftStdlib 5.7, *) else { return nil }
+    var nfcIter = String(self)._nfc.makeIterator()
+    guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil }
+    return scalar
+  }
+
+  /// If the given character contains a single scalar, returns it. If none or
+  /// multiple scalars are present, returns `nil`.
+  var singleScalar: UnicodeScalar? {
+    hasExactlyOneScalar ? unicodeScalars.first! : nil
+  }
+}
+
+extension String {
+  /// If the given string consists of a single NFC scalar, returns it. If none
+  /// or multiple NFC scalars are present, returns `nil`.
+  var singleNFCScalar: UnicodeScalar? {
+    guard !isEmpty && index(after: startIndex) == endIndex else { return nil }
+    return first!.singleNFCScalar
+  }
+
+  /// If the given string contains a single scalar, returns it. If none or
+  /// multiple scalars are present, returns `nil`.
+  var singleScalar: UnicodeScalar? {
+    let scalars = unicodeScalars
+    guard !scalars.isEmpty &&
+          scalars.index(after: scalars.startIndex) == scalars.endIndex
+    else { return nil }
+    return scalars.first!
+  }
+}
diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift
index 05375a1f7..1d186e0bc 100644
--- a/Tests/RegexBuilderTests/RegexDSLTests.swift
+++ b/Tests/RegexBuilderTests/RegexDSLTests.swift
@@ -12,10 +12,7 @@
 import XCTest
 import _StringProcessing
 import RegexBuilder
-
-#if os(Linux)
-func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {}
-#endif
+import TestSupport
 
 class RegexDSLTests: XCTestCase {
   func _testDSLCaptures(
@@ -77,6 +74,9 @@ class RegexDSLTests: XCTestCase {
   let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n"
 
   func testCharacterClasses() throws {
+    // Must have new stdlib for character class ranges.
+    guard ensureNewStdlib() else { return }
+
     try _testDSLCaptures(
       ("a c", ("a c", " ", "c")),
       matchType: (Substring, Substring, Substring).self, ==)
@@ -251,6 +251,9 @@ class RegexDSLTests: XCTestCase {
   }
 
   func testCharacterClassOperations() throws {
+    // Must have new stdlib for character class ranges.
+    guard ensureNewStdlib() else { return }
+
     try _testDSLCaptures(
       ("bcdefn1a", "bcdefn1a"),
       ("nbcdef1a", nil), // fails symmetric difference lookahead
@@ -594,6 +597,9 @@ class RegexDSLTests: XCTestCase {
   }
 
   func testQuantificationBehavior() throws {
+    // Must have new stdlib for character class ranges.
+    guard ensureNewStdlib() else { return }
+
     // Eager by default
     try _testDSLCaptures(
       ("abc1def2", ("abc1def2", "2")),
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index c087974a7..f24ae89d9 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -12,6 +12,7 @@
 import XCTest
 @testable import _RegexParser
 @testable import _StringProcessing
+import TestSupport
 
 struct MatchError: Error {
   var message: String
@@ -319,8 +320,6 @@ extension RegexTests {
       input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
      match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
 
-    firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")
-
     // MARK: Quotes
 
     firstMatchTest(
@@ -571,6 +570,9 @@ extension RegexTests {
   }
 
   func testMatchCharacterClasses() {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     // MARK: Character classes
 
     firstMatchTest(#"abc\d"#, input: "xyzabc123", match: "abc1")
@@ -767,10 +769,14 @@ extension RegexTests {
     }
     firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
-    // FIXME: This produces a different result with and without optimizations.
-    firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil, xfail: true)
-    firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil,
-                   validateOptimizations: false)
+    firstMatchTest(#"[12]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil)
+    firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣")
+    firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil)
+    firstMatchTest("[0-2&&1-3]", input: "1️⃣", match: nil)
+    firstMatchTest("[1-2e\u{301}]", input: "1️⃣", match: nil)
+
+    firstMatchTest(#"[\u{3A9}-\u{3A9}]"#, input: "\u{3A9}", match: "\u{3A9}")
 
     // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
@@ -825,6 +831,35 @@ extension RegexTests {
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc",
                    syntax: .experimental)
     firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#)
+
+    for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] {
+      // Case sensitivity and ranges.
+      for ch in "abcD" {
+        firstMatchTest("[a-cD]", input: String(ch), match: String(ch))
+      }
+      for ch in "ABCd" {
+        firstMatchTest("[a-cD]", input: String(ch), match: nil)
+      }
+      for ch in "abcABCdD" {
+        let input = String(ch)
+        firstMatchTest(
+          "(?i)[a-cd]", input: input, match: input, semanticLevel: semantics)
+        firstMatchTest(
+          "(?i)[A-CD]", input: input, match: input, semanticLevel: semantics)
+      }
+      for ch in "XYZ[\\]^_`abcd" {
+        let input = String(ch)
+        firstMatchTest(
+          "[X-cd]", input: input, match: input, semanticLevel: semantics)
+      }
+      for ch in "XYZ[\\]^_`abcxyzABCdD" {
+        let input = String(ch)
+        firstMatchTest(
+          "(?i)[X-cd]", input: input, match: input, semanticLevel: semantics)
+        firstMatchTest(
+          "(?i)[X-cD]", input: input, match: input, semanticLevel: semantics)
+      }
+    }
   }
 
   func testCharacterProperties() {
@@ -1037,6 +1072,9 @@ extension RegexTests {
   }
 
   func testMatchAnchors() throws {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     // MARK: Anchors
     firstMatchTests(
       #"^\d+"#,
@@ -1085,8 +1123,6 @@ extension RegexTests {
       (" 123\n456\n", nil),
       ("123 456", "456"))
 
-    // FIXME: Keep this until _wordIndex and friends are
-#if os(Linux)
     firstMatchTests(
       #"\d+\b"#,
       ("123", "123"),
@@ -1104,7 +1140,6 @@ extension RegexTests {
       ("123", "23"),
       (" 123", "23"),
       ("123 456", "23"))
-#endif
 
     // TODO: \G and \K
     do {
@@ -1135,9 +1170,10 @@ extension RegexTests {
       ("Sol Cafe", nil), xfail: true)
   }
 
-  // FIXME: Keep this until _wordIndex and friends are
-#if os(Linux)
   func testLevel2WordBoundaries() {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     // MARK: Level 2 Word Boundaries
     firstMatchTest(#"\b😊\b"#, input: "🔥😊👍", match: "😊")
     firstMatchTest(#"\b👨🏽\b"#, input: "👩🏻👶🏿👨🏽🧑🏾👩🏼", match: "👨🏽")
@@ -1153,9 +1189,11 @@ extension RegexTests {
     firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't")
     firstMatchTest(#"\b÷\b"#, input: "3 ÷ 3 = 1", match: "÷")
   }
-#endif
-
+
   func testMatchGroups() {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     // MARK: Groups
 
     // Named captures
@@ -1379,6 +1417,9 @@ extension RegexTests {
   }
 
   func testMatchExamples() {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     // Backreferences
     matchTest(
       #"(sens|respons)e and \1ibility"#,
@@ -1428,8 +1469,6 @@ extension RegexTests {
       xfail: true
     )
 
-    // FIXME: Keep this until _wordIndex and friends are
-#if os(Linux)
     // HTML tags
     matchTest(
       #"<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>.*?</\1>"#,
@@ -1447,7 +1486,6 @@ extension RegexTests {
       ("pass me the the kettle", ["the"]),
       ("this doesn't have any", nil)
     )
-#endif
 
     // Floats
     flatCaptureTest(
@@ -1463,8 +1501,79 @@ extension RegexTests {
     firstMatchTest(#".+"#, input: "a\nb", match: "a")
     firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb")
   }
+
+  func testMatchNewlines() {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
+    for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] {
+      firstMatchTest(
+        #"\r\n"#, input: "\r\n", match: "\r\n",
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"\r\n"#, input: "\n", match: nil, semanticLevel: semantics)
+      firstMatchTest(
+        #"\r\n"#, input: "\r", match: nil, semanticLevel: semantics)
+
+      // \r\n is not treated as ASCII.
+      firstMatchTest(
+        #"^\p{ASCII}$"#, input: "\r\n", match: nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"^\r$"#, input: "\r\n", match: nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"^[\r]$"#, input: "\r\n", match: nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"^\n$"#, input: "\r\n", match: nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"^[\n]$"#, input: "\r\n", match: nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"^[\u{0}-\u{7F}]$"#, input: "\r\n", match: nil,
+        semanticLevel: semantics
+      )
+
+      let scalarSemantics = semantics == .unicodeScalar
+      firstMatchTest(
+        #"\p{ASCII}"#, input: "\r\n", match: scalarSemantics ? "\r" : nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"\r"#, input: "\r\n", match: scalarSemantics ? "\r" : nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"[\r]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"\n"#, input: "\r\n", match: scalarSemantics ? "\n" : nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"[\n]"#, input: "\r\n", match: scalarSemantics ? "\n" : nil,
+        semanticLevel: semantics
+      )
+      firstMatchTest(
+        #"[\u{0}-\u{7F}]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil,
+        semanticLevel: semantics
+      )
+    }
+  }
 
   func testCaseSensitivity() {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     matchTest(
       #"c..e"#,
       ("cafe", true),
@@ -1527,6 +1636,9 @@ extension RegexTests {
   }
 
   func testASCIIClasses() {
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     // 'D' ASCII-only digits
     matchTest(
       #"\d+"#,
@@ -1555,8 +1667,6 @@ extension RegexTests {
       ("aeiou", true),
       ("åe\u{301}ïôú", false))
 
-    // FIXME: Keep this until _wordIndex and friends are
-#if os(Linux)
     matchTest(
       #"abcd\b.+"#,
       ("abcd ef", true),
@@ -1572,7 +1682,6 @@ extension RegexTests {
       ("abcd ef", true),
       ("abcdef", false),
       ("abcdéf", false))
-#endif
 
     // 'S' ASCII-only spaces
     matchTest(
@@ -1698,6 +1807,9 @@ extension RegexTests {
   var eComposed: String { "é" }
   var eDecomposed: String { "e\u{301}" }
 
+  var eComposedUpper: String { "É" }
+  var eDecomposedUpper: String { "E\u{301}" }
+
   func testIndividualScalars() {
     // Expectation: A standalone Unicode scalar value in a regex literal
     // can match either that specific scalar value or participate in matching
@@ -1784,31 +1896,62 @@ extension RegexTests {
   }
 
   func testCanonicalEquivalenceCustomCharacterClass() throws {
-    // Expectation: Concatenations with custom character classes should be able
-    // to match within a grapheme cluster. That is, a regex should be able to
-    // match the scalar values that comprise a grapheme cluster in separate,
-    // or repeated, custom character classes.
-
+    // Expectation: Custom character class matches do not cross grapheme
+    // character boundaries by default. When matching with Unicode scalar
+    // semantics, grapheme cluster boundaries are ignored, so matching
+    // sequences of custom character classes can succeed.
+
+    // Must have new stdlib for character class ranges and word boundaries.
+    guard ensureNewStdlib() else { return }
+
     matchTest(
       #"[áéíóú]$"#,
       (eComposed, true),
       (eDecomposed, true))
 
-    // FIXME: Custom char classes don't use canonical equivalence with composed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed,
-                   xfail: true)
+    for input in [eDecomposed, eComposed] {
+      // Unicode scalar semantics means that only the decomposed version can
+      // match here.
+      let match = input.unicodeScalars.count == 2 ? input : nil
+      firstMatchTest(
+        #"e[\u{301}]$"#, input: input, match: match,
+        semanticLevel: .unicodeScalar)
+      firstMatchTest(
+        #"e[\u{300}-\u{320}]$"#, input: input, match: match,
+        semanticLevel: .unicodeScalar)
+      firstMatchTest(
+        #"[e][\u{300}-\u{320}]$"#, input: input, match: match,
+        semanticLevel: .unicodeScalar)
+      firstMatchTest(
+        #"[e-e][\u{300}-\u{320}]$"#, input: input, match: match,
+        semanticLevel: .unicodeScalar)
+      firstMatchTest(
+        #"[a-z][\u{300}-\u{320}]$"#, input: input, match: match,
+        semanticLevel: .unicodeScalar)
+    }
+    for input in [eComposed, eDecomposed] {
+      // Grapheme cluster semantics means that we can't match the 'e' separately
+      // from the accent.
+      firstMatchTest(#"e[\u{301}]$"#, input: input, match: nil)
+      firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: input, match: nil)
+      firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: input, match: nil)
+      firstMatchTest(#"[e-e][\u{300}-\u{320}]$"#, input: input, match: nil)
+      firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: input, match: nil)
+
+      // A range that covers é (U+E9). Inputs are mapped to NFC, so match.
+      firstMatchTest(#"[\u{E8}-\u{EA}]"#, input: input, match: input)
+    }
 
-    // FIXME: Custom char classes don't match decomposed characters
-    firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
-    firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
-    firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed,
-                   xfail: true)
+    // A range that covers É (U+C9). Inputs are mapped to NFC, so match.
+    for input in [eComposedUpper, eDecomposedUpper] {
+      firstMatchTest(#"[\u{C8}-\u{CA}]"#, input: input, match: input)
+      firstMatchTest(#"[\u{C9}-\u{C9}]"#, input: input, match: input)
+    }
+    // Case insensitive matching of É (U+C9).
+    for input in [eComposed, eDecomposed, eComposedUpper, eDecomposedUpper] {
+      firstMatchTest(#"(?i)[\u{C8}-\u{CA}]"#, input: input, match: input)
+      firstMatchTest(#"(?i)[\u{C9}-\u{C9}]"#, input: input, match: input)
+    }
 
     let flag = "🇰🇷"
     firstMatchTest(#"🇰🇷"#, input: flag, match: flag)
@@ -1817,27 +1960,33 @@ extension RegexTests {
     firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag)
 
     // First Unicode scalar followed by CCC of regional indicators
-    firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
-                   xfail: true)
-
-    // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
+    firstMatchTest(
+      #"^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag,
+      semanticLevel: .unicodeScalar
+    )
+    // A CCC of regional indicators followed by the second Unicode scalar
+    firstMatchTest(
+      #"^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag,
+      semanticLevel: .unicodeScalar
+    )
     // A CCC of regional indicators x 2
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag,
-                   xfail: true)
+    firstMatchTest(
+      #"^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag,
+      semanticLevel: .unicodeScalar
+    )
+    // A CCC of N regional indicators
+    firstMatchTest(
+      #"^[\u{1F1E6}-\u{1F1FF}]+$"#, input: flag, match: flag,
+      semanticLevel: .unicodeScalar
+    )
 
-    // FIXME: A single CCC of regional indicators matches the whole flag character
-    // A CCC of regional indicators followed by the second Unicode scalar
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag,
-                   xfail: true)
     // A single CCC of regional indicators
-    firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil,
-                   xfail: true)
-
-    // A single CCC of actual flag emojis / combined regional indicators
-    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag)
-    // This succeeds (correctly) because \u{1F1F0} is lexicographically
-    // within the CCC range
-    firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}")
+    firstMatchTest(
+      #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil)
+    firstMatchTest(
+      #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil,
+      semanticLevel: .unicodeScalar
+    )
   }
 
   func testAnyChar() throws {
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 52a272915..f5e93c2bd 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -374,10 +374,21 @@ extension RegexTests {
 
     // MARK: Allowed combining characters
 
-    parseTest("e\u{301}", "e\u{301}")
     parseTest("1\u{358}", "1\u{358}")
     parseTest(#"\ \#u{361}"#, " \u{361}")
 
+    parseTest("e\u{301}", "e\u{301}")
+    parseTest("[e\u{301}]", charClass("e\u{301}"))
+    parseTest("\u{E9}", "e\u{301}")
+    parseTest("[\u{E9}]", charClass("e\u{301}"))
+
+    parseTest(
+      "\\e\u{301}", "e\u{301}", throwsError: .invalidEscape("e\u{301}"))
+    parseTest(
+      "[\\e\u{301}]", charClass("e\u{301}"),
+      throwsError: .invalidEscape("e\u{301}")
+    )
+
     // MARK: Alternations
 
     parseTest(
@@ -2885,6 +2896,34 @@ extension RegexTests {
     diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence"))
     diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence"))
 
+    diagnosticTest(#"|([🇦🇫-🇿🇼])?"#, .invalidCharacterClassRangeOperand)
+    diagnosticTest(#"|([👨‍👩‍👦-👩‍👩‍👧‍👧])?"#, .invalidCharacterClassRangeOperand)
+
+    // Not single-scalar NFC.
+    diagnosticTest("[e\u{301}-e\u{302}]", .invalidCharacterClassRangeOperand)
+
+    // These scalar values expand under NFC.
+    let nfcExpandingScalars: [UInt32] = [
+      0x344, 0x958, 0x959, 0x95A, 0x95B, 0x95C, 0x95D, 0x95E, 0x95F, 0x9DC,
+      0x9DD, 0x9DF, 0xA33, 0xA36, 0xA59, 0xA5A, 0xA5B, 0xA5E, 0xB5C, 0xB5D,
+      0xF43, 0xF4D, 0xF52, 0xF57, 0xF5C, 0xF69, 0xF73, 0xF75, 0xF76, 0xF78,
+      0xF81, 0xF93, 0xF9D, 0xFA2, 0xFA7, 0xFAC, 0xFB9, 0x2ADC, 0xFB1D, 0xFB1F,
+      0xFB2A, 0xFB2B, 0xFB2C, 0xFB2D, 0xFB2E, 0xFB2F, 0xFB30, 0xFB31, 0xFB32,
+      0xFB33, 0xFB34, 0xFB35, 0xFB36, 0xFB38, 0xFB39, 0xFB3A, 0xFB3B, 0xFB3C,
+      0xFB3E, 0xFB40, 0xFB41, 0xFB43, 0xFB44, 0xFB46, 0xFB47, 0xFB48, 0xFB49,
+      0xFB4A, 0xFB4B, 0xFB4C, 0xFB4D, 0xFB4E, 0x1D15E, 0x1D15F, 0x1D160,
+      0x1D161, 0x1D162, 0x1D163, 0x1D164, 0x1D1BB, 0x1D1BC, 0x1D1BD, 0x1D1BE,
+      0x1D1BF, 0x1D1C0
+    ]
+    for scalar in nfcExpandingScalars {
+      let hex = String(scalar, radix: 16)
+      diagnosticTest(
+        #"[\u{\#(hex)}-\u{\#(hex)}]"#, .invalidCharacterClassRangeOperand)
+    }
+
+    // The NFC form of U+2126 is U+3A9.
+    diagnosticTest(#"[\u{2126}-\u{2126}]"#, .invalidCharacterClassRangeOperand)
+
     diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
     diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
     diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b"))
diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index aa3639ea6..11479bfb6 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -21,6 +21,7 @@ import XCTest
 
 @testable // for internal `matches(of:)`
 import _StringProcessing
+import TestSupport
 
 extension UnicodeScalar {
   var value4Digits: String {
@@ -316,6 +317,9 @@ extension UTS18Tests {
   // surrogate followed by a trailing surrogate shall be handled as a single
   // code point in matching.
   func testSupplementaryCodePoints() {
+    // Must have new stdlib for character class ranges.
+    guard ensureNewStdlib() else { return }
+
     XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#)))
     XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#)))
     XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#)))
@@ -388,6 +392,9 @@ extension UTS18Tests {
   }
 
   func testCharacterClassesWithStrings() {
+    // Must have new stdlib for character class ranges.
+    guard ensureNewStdlib() else { return }
+
     let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#)
     XCTAssertEqual("🧐", "🧐".wholeMatch(of: regex)?.0)
     XCTAssertEqual("🇧🇫", "🇧🇫".wholeMatch(of: regex)?.0)