Skip to content

Fix character class range matching #570

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,17 @@ let package = Package(
name: "RegexBuilder",
dependencies: ["_StringProcessing", "_RegexParser"],
swiftSettings: publicStdlibSettings),
.target(name: "TestSupport",
swiftSettings: [availabilityDefinition]),
.testTarget(
name: "RegexTests",
dependencies: ["_StringProcessing"],
dependencies: ["_StringProcessing", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
]),
.testTarget(
name: "RegexBuilderTests",
dependencies: ["_StringProcessing", "RegexBuilder"],
dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
]),
Expand Down
33 changes: 33 additions & 0 deletions Sources/TestSupport/TestSupport.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import XCTest

// We need to split this out of the test files, as it needs to be compiled
// *without* `-disable-availability-checking` to ensure the #available check is
// not compiled into a no-op.

#if os(Linux)
/// Stand-in for `XCTExpectFailure`, declared only when building for Linux.
///
/// NOTE(review): presumably needed because XCTest on Linux does not provide
/// `XCTExpectFailure` — confirm against the toolchain in use.
///
/// The function body is empty, so `body` is never invoked: any failure the
/// closure would have recorded is dropped rather than reported.
///
/// - Parameters:
///   - message: Optional description of the expected failure; unused here.
///   - body: The closure that would contain the failing code; never called.
public func XCTExpectFailure(
_ message: String? = nil, body: () throws -> Void
) rethrows {}
#endif

/// Reports whether the running standard library is new enough for tests that
/// depend on it.
///
/// When the `SwiftStdlib 5.7` availability check fails, an "Unsupported
/// stdlib" failure is raised inside `XCTExpectFailure`, attributed to the
/// caller's location, and `false` is returned so the caller can bail out
/// early.
///
/// - Parameters:
///   - file: The file to attribute the failure to. Defaults to the caller's.
///   - line: The line to attribute the failure to. Defaults to the caller's.
/// - Returns: `true` if the availability check passes, `false` otherwise.
public func ensureNewStdlib(
  file: StaticString = #file, line: UInt = #line
) -> Bool {
  if #available(SwiftStdlib 5.7, *) {
    return true
  }
  XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) }
  return false
}
6 changes: 4 additions & 2 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -755,8 +755,10 @@ extension AST.Atom {
/// Whether this atom is valid as the operand of a custom character class
/// range.
public var isValidCharacterClassRangeBound: Bool {
// If we have a literal character value for this, it can be used as a bound.
if literalCharacterValue != nil { return true }
if let c = literalCharacterValue {
// We only match character range bounds that are single scalar NFC.
return c.hasExactlyOneScalar && c.isNFC
}
switch kind {
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
Expand Down
15 changes: 15 additions & 0 deletions Sources/_RegexParser/Utility/Misc.swift
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,21 @@ extension Substring {
var string: String { String(self) }
}

extension Character {
  /// `true` when this character consists of a single Unicode scalar value,
  /// `false` when it is composed of several.
  public var hasExactlyOneScalar: Bool {
    // A character always holds at least one scalar, so it has exactly one
    // precisely when nothing remains after skipping the first.
    unicodeScalars.dropFirst().isEmpty
  }

  /// `true` when this character is already in NFC form, i.e. its UTF-8 code
  /// units equal the code units of its NFC normalization.
  internal var isNFC: Bool {
    // ASCII characters are reported as NFC without normalizing.
    guard !isASCII else { return true }
    let text = String(self)
    return text.utf8.elementsEqual(text._nfcCodeUnits)
  }
}

extension CustomStringConvertible {
@_alwaysEmitIntoClient
public var halfWidthCornerQuoted: String {
Expand Down
70 changes: 46 additions & 24 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -361,38 +361,60 @@ extension DSLTree.CustomCharacterClass.Member {
}
return c
case let .range(low, high):
// TODO:
guard let lhs = low.literalCharacterValue else {
guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else {
throw Unsupported("\(low) in range")
}
guard let rhs = high.literalCharacterValue else {
guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else {
throw Unsupported("\(high) in range")
}
guard lhs <= rhs else {
throw Unsupported("Invalid range \(low)-\(high)")
}

if opts.isCaseInsensitive {
let lhsLower = lhs.lowercased()
let rhsLower = rhs.lowercased()
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
return { input, bounds in
// TODO: check for out of bounds?
let curIdx = bounds.lowerBound
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
// TODO: semantic level
return input.index(after: curIdx)
}
return nil
let isCaseInsensitive = opts.isCaseInsensitive
let isCharacterSemantic = opts.semanticLevel == .graphemeCluster

return { input, bounds in
let curIdx = bounds.lowerBound
let nextIndex = isCharacterSemantic
? input.index(after: curIdx)
: input.unicodeScalars.index(after: curIdx)

// Under grapheme semantics, we compare based on single NFC scalars. If
// such a character is not single scalar under NFC, the match fails. In
// scalar semantics, we compare the exact scalar value to the NFC
// bounds.
let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar
: input.unicodeScalars[curIdx]
guard let scalar = scalar else { return nil }
let scalarRange = lhs ... rhs
if scalarRange.contains(scalar) {
return nextIndex
}
} else {
guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
return { input, bounds in
// TODO: check for out of bounds?
let curIdx = bounds.lowerBound
if (lhs...rhs).contains(input[curIdx]) {
// TODO: semantic level
return input.index(after: curIdx)

// Check for case insensitive matches.
func matchesCased(
_ cased: (UnicodeScalar.Properties) -> String
) -> Bool {
let casedStr = cased(scalar.properties)
// In character semantic mode, we need to map to NFC. In scalar
// semantics, we should have an exact scalar.
let mapped = isCharacterSemantic ? casedStr.singleNFCScalar
: casedStr.singleScalar
guard let mapped = mapped else { return false }
return scalarRange.contains(mapped)
}
if isCaseInsensitive {
if scalar.properties.changesWhenLowercased,
matchesCased(\.lowercaseMapping) {
return nextIndex
}
if scalar.properties.changesWhenUppercased,
matchesCased(\.uppercaseMapping) {
return nextIndex
}
return nil
}
return nil
}

case let .custom(ccc):
Expand Down
7 changes: 0 additions & 7 deletions Sources/_StringProcessing/Unicode/CharacterProps.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,3 @@


// TODO

extension Character {
  /// `true` when this character consists of a single Unicode scalar value.
  var hasExactlyOneScalar: Bool {
    let scalars = unicodeScalars
    // Step past the first scalar; landing on the end means it was the only one.
    let afterFirst = scalars.index(after: scalars.startIndex)
    return afterFirst == scalars.endIndex
  }
}
55 changes: 55 additions & 0 deletions Sources/_StringProcessing/Unicode/NFC.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

@_spi(_Unicode)
import Swift

extension UnicodeScalar {
  /// `true` when this scalar is in NFC form: wrapping it in a `Character`
  /// yields a single NFC scalar, and that scalar is this one.
  var isNFC: Bool {
    guard let normalized = Character(self).singleNFCScalar else {
      return false
    }
    return normalized == self
  }
}

extension Character {
  /// The sole scalar of this character's NFC form, or `nil` if that form
  /// contains more than one scalar.
  var singleNFCScalar: UnicodeScalar? {
    // SwiftStdlib is always >= 5.7 for a shipped StringProcessing.
    if #available(SwiftStdlib 5.7, *) {
      var normalized = String(self)._nfc.makeIterator()
      guard let first = normalized.next() else { return nil }
      // A second scalar means the NFC form is not a single scalar.
      return normalized.next() == nil ? first : nil
    }
    return nil
  }

  /// The sole scalar of this character, or `nil` if the character is made up
  /// of multiple scalars.
  var singleScalar: UnicodeScalar? {
    guard hasExactlyOneScalar else { return nil }
    return unicodeScalars.first
  }
}

extension String {
  /// The sole NFC scalar of this string, or `nil` if the string is empty,
  /// holds more than one character, or its only character does not have a
  /// single-scalar NFC form.
  var singleNFCScalar: UnicodeScalar? {
    guard let onlyCharacter = first,
          index(after: startIndex) == endIndex
    else { return nil }
    return onlyCharacter.singleNFCScalar
  }

  /// The sole scalar of this string, or `nil` if the string contains no
  /// scalars or more than one.
  var singleScalar: UnicodeScalar? {
    var remaining = unicodeScalars.makeIterator()
    guard let onlyScalar = remaining.next(), remaining.next() == nil else {
      return nil
    }
    return onlyScalar
  }
}
14 changes: 10 additions & 4 deletions Tests/RegexBuilderTests/RegexDSLTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@
import XCTest
import _StringProcessing
import RegexBuilder

#if os(Linux)
func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {}
#endif
import TestSupport

class RegexDSLTests: XCTestCase {
func _testDSLCaptures<Content: RegexComponent, MatchType>(
Expand Down Expand Up @@ -77,6 +74,9 @@ class RegexDSLTests: XCTestCase {
let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n"

func testCharacterClasses() throws {
// Must have new stdlib for character class ranges.
guard ensureNewStdlib() else { return }

try _testDSLCaptures(
("a c", ("a c", " ", "c")),
matchType: (Substring, Substring, Substring).self, ==)
Expand Down Expand Up @@ -251,6 +251,9 @@ class RegexDSLTests: XCTestCase {
}

func testCharacterClassOperations() throws {
// Must have new stdlib for character class ranges.
guard ensureNewStdlib() else { return }

try _testDSLCaptures(
("bcdefn1a", "bcdefn1a"),
("nbcdef1a", nil), // fails symmetric difference lookahead
Expand Down Expand Up @@ -594,6 +597,9 @@ class RegexDSLTests: XCTestCase {
}

func testQuantificationBehavior() throws {
// Must have new stdlib for character class ranges.
guard ensureNewStdlib() else { return }

// Eager by default
try _testDSLCaptures(
("abc1def2", ("abc1def2", "2")),
Expand Down
Loading