From 021c2fa669defa59f0fbd91cc4481cbe49f0f004 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 10 Apr 2023 14:08:17 -0600 Subject: [PATCH 01/13] Simplify instruction fetching --- Sources/_StringProcessing/Engine/Metrics.swift | 2 +- Sources/_StringProcessing/Engine/Processor.swift | 6 +++++- Sources/_StringProcessing/Utility/Protocols.swift | 10 ---------- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 753c3c3d1..49a669a0b 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -21,7 +21,7 @@ extension Processor { } mutating func measure() { - let (opcode, _) = fetch().destructure + let (opcode, _) = fetch() if metrics.instructionCounts.keys.contains(opcode) { metrics.instructionCounts[opcode]! += 1 } else { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 98aee7803..2d34ab3ea 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -160,6 +160,10 @@ extension Processor { } extension Processor { + func fetch() -> (Instruction.OpCode, Instruction.Payload) { + instructions[controller.pc].destructure + } + var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? 
@@ -449,7 +453,7 @@ extension Processor { } #endif - let (opcode, payload) = fetch().destructure + let (opcode, payload) = fetch() switch opcode { case .invalid: fatalError("Invalid program") diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 7542a17dd..24ffbcf70 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -44,13 +44,3 @@ protocol ProcessorProtocol { var registers: Registers { get } } -extension ProcessorProtocol { - func fetch() -> Instruction { - instructions[currentPC] - } - - var callStack: Array { [] } -// var savePoints: Array { [] } - var registers: Array { [] } - -} From ccf39927b8e8fc9f65ff98cb8688b859ac346c88 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sat, 1 Apr 2023 10:33:24 -0600 Subject: [PATCH 02/13] Refactor metrics out, and void their storage in release builds --- .../_StringProcessing/Engine/Metrics.swift | 65 ++++++++++++++++++- .../_StringProcessing/Engine/Processor.swift | 35 ++++------ .../_StringProcessing/Engine/Tracing.swift | 5 ++ Sources/_StringProcessing/Executor.swift | 4 +- .../_StringProcessing/Utility/Traced.swift | 2 +- 5 files changed, 82 insertions(+), 29 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 49a669a0b..372a7e1b4 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -1,13 +1,71 @@ extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED struct ProcessorMetrics { var instructionCounts: [Instruction.OpCode: Int] = [:] var backtracks: Int = 0 var resets: Int = 0 + var cycleCount: Int = 0 + + var isTracingEnabled: Bool = false + var shouldMeasureMetrics: Bool = false + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { + self.isTracingEnabled = isTracingEnabled + self.shouldMeasureMetrics = shouldMeasureMetrics + } } - 
+#else + struct ProcessorMetrics { + var isTracingEnabled: Bool { false } + var shouldMeasureMetrics: Bool { false } + var cycleCount: Int { 0 } + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { } + } +#endif +} + +extension Processor { + + mutating func startCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + if metrics.cycleCount == 0 { + trace() + measureMetrics() + } +#endif + } + + mutating func endCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + metrics.cycleCount += 1 + trace() + measureMetrics() + _checkInvariants() +#endif + } +} + +extension Processor.ProcessorMetrics { + + mutating func addReset() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.resets += 1 +#endif + } + + mutating func addBacktrack() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.backtracks += 1 +#endif + } +} + +extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED func printMetrics() { print("===") - print("Total cycle count: \(cycleCount)") + print("Total cycle count: \(metrics.cycleCount)") print("Backtracks: \(metrics.backtracks)") print("Resets: \(metrics.resets)") print("Instructions:") @@ -30,8 +88,9 @@ extension Processor { } mutating func measureMetrics() { - if shouldMeasureMetrics { + if metrics.shouldMeasureMetrics { measure() } } +#endif } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 2d34ab3ea..6586d36ad 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -86,11 +86,7 @@ struct Processor { var failureReason: Error? = nil - // MARK: Metrics, debugging, etc. 
- var cycleCount = 0 - var isTracingEnabled: Bool - let shouldMeasureMetrics: Bool - var metrics: ProcessorMetrics = ProcessorMetrics() + var metrics: ProcessorMetrics } extension Processor { @@ -116,8 +112,11 @@ extension Processor { self.subjectBounds = subjectBounds self.searchBounds = searchBounds self.matchMode = matchMode - self.isTracingEnabled = isTracingEnabled - self.shouldMeasureMetrics = shouldMeasureMetrics + + self.metrics = ProcessorMetrics( + isTracingEnabled: isTracingEnabled, + shouldMeasureMetrics: shouldMeasureMetrics) + self.currentPosition = searchBounds.lowerBound // Initialize registers with end of search bounds @@ -144,8 +143,8 @@ extension Processor { self.state = .inProgress self.failureReason = nil - - if shouldMeasureMetrics { metrics.resets += 1 } + + metrics.addReset() _checkInvariants() } @@ -400,8 +399,8 @@ extension Processor { storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters - - if shouldMeasureMetrics { metrics.backtracks += 1 } + + metrics.addBacktrack() } mutating func abort(_ e: Error? 
= nil) { @@ -440,18 +439,8 @@ extension Processor { _checkInvariants() assert(state == .inProgress) -#if PROCESSOR_MEASUREMENTS_ENABLED - if cycleCount == 0 { - trace() - measureMetrics() - } - defer { - cycleCount += 1 - trace() - measureMetrics() - _checkInvariants() - } -#endif + startCycleMetrics() + defer { endCycleMetrics() } let (opcode, payload) = fetch() switch opcode { diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 725319b00..b0ce67555 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -9,7 +9,12 @@ // //===----------------------------------------------------------------------===// + +// TODO: Remove this protocol (and/or reuse it for something like a FastProcessor) extension Processor: TracedProcessor { + var cycleCount: Int { metrics.cycleCount } + var isTracingEnabled: Bool { metrics.isTracingEnabled } + var isFailState: Bool { state == .fail } var isAcceptState: Bool { state == .accept } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 253858d1f..0453fcd80 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -31,7 +31,7 @@ struct Executor { subjectBounds: subjectBounds, searchBounds: searchBounds) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif var low = searchBounds.lowerBound let high = searchBounds.upperBound @@ -60,7 +60,7 @@ struct Executor { var cpu = engine.makeProcessor( input: input, bounds: subjectBounds, matchMode: mode) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } diff --git 
a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index 112a601b1..198564fe1 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -13,7 +13,7 @@ // TODO: Place shared formatting and trace infrastructure here protocol Traced { - var isTracingEnabled: Bool { get set } + var isTracingEnabled: Bool { get } } protocol TracedProcessor: ProcessorProtocol, Traced { From d5ffed3c0d2a1cc512ded370d30b7e9b8c7ac78d Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 15:36:30 -0600 Subject: [PATCH 03/13] put scalar matching on String, consistent few percent --- .../_StringProcessing/Engine/MEQuantify.swift | 7 ++- .../_StringProcessing/Engine/Processor.swift | 61 ++++++++++++++----- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 0e2f3923a..0a58f7ed2 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -5,8 +5,11 @@ extension Processor { case .bitset: next = _doMatchBitset(registers[payload.bitset]) case .asciiChar: - next = _doMatchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) + next = input.matchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), + at: currentPosition, + limitedBy: end, + boundaryCheck: true) case .builtin: // We only emit .quantify if it consumes a single character next = input._matchBuiltinCC( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 6586d36ad..20148f478 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -272,21 +272,10 @@ extension Processor { currentPosition < end ? 
input.unicodeScalars[currentPosition] : nil } - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Input.Index? { - if s == loadScalar(), - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - return idx - } else { - return nil - } - } - mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - guard let next = _doMatchScalar(s, boundaryCheck) else { + guard let next = input.matchScalar( + s, at: currentPosition, limitedBy: end, boundaryCheck: boundaryCheck + ) else { signalFailure() return false } @@ -695,4 +684,48 @@ extension Processor { controller.step() } } + + func sleep() { + var i = 0 + for c in input { + if i > 20 { break } + i += 1 + if c == "C" { + blackHole(c) + } + } + } +} + +@inline(never) +func blackHole(_ t: T) { _ = t } + +extension String { + + // func consumeScalar(_ n: Distance) -> Bool { + + // } + + func matchScalar( + _ scalar: Unicode.Scalar, + at pos: Index, + limitedBy end: String.Index, + boundaryCheck: Bool + ) -> Index? 
{ + assert(end <= endIndex) + + guard pos < end, unicodeScalars[pos] == scalar else { + return nil + } + + let idx = unicodeScalars.index(after: pos) + guard idx <= end else { return nil } + + if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + return nil + } + + return idx + } + } From ba8b57e9f6f23b64cf4d3f809d73ab4e1ea3671e Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:03:50 -0600 Subject: [PATCH 04/13] sink match onto string --- .../_StringProcessing/Engine/Processor.swift | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 20148f478..06725a466 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -232,11 +232,13 @@ extension Processor { // Match against the current input element. Returns whether // it succeeded vs signaling an error. mutating func match(_ e: Element) -> Bool { - guard let cur = load(), cur == e else { + guard let next = input.match( + e, at: currentPosition, limitedBy: end + ) else { signalFailure() return false } - _uncheckedForcedConsumeOne() + currentPosition = next return true } @@ -256,6 +258,7 @@ extension Processor { isScalarMode: Bool ) -> Bool { if isScalarMode { + // TODO: sink to specialized method on string, needs benchmark for s in seq.unicodeScalars { guard matchScalar(s, boundaryCheck: false) else { return false } } @@ -702,6 +705,23 @@ func blackHole(_ t: T) { _ = t } extension String { + func match( + _ char: Character, + at pos: Index, + limitedBy end: String.Index + ) -> Index? 
{ + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + guard pos < end, self[pos] == char else { return nil } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + // func consumeScalar(_ n: Distance) -> Bool { // } From 670ff9b4d97a06f6883045304639739b5cc8e8e0 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:18:29 -0600 Subject: [PATCH 05/13] sink match bitset --- .../_StringProcessing/Engine/MEQuantify.swift | 3 +- .../_StringProcessing/Engine/Processor.swift | 35 ++++++++++++++----- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 0a58f7ed2..09ab71168 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -3,7 +3,8 @@ extension Processor { var next: Input.Index? 
switch payload.type { case .bitset: - next = _doMatchBitset(registers[payload.bitset]) + next = input.matchBitset( + registers[payload.bitset], at: currentPosition, limitedBy: end) case .asciiChar: next = input.matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 06725a466..97dde4e36 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -243,6 +243,7 @@ extension Processor { } mutating func matchCaseInsensitive(_ e: Element) -> Bool { + // TODO: need benchmark coverage guard let cur = load(), cur.lowercased() == e.lowercased() else { signalFailure() return false @@ -290,6 +291,7 @@ extension Processor { _ s: Unicode.Scalar, boundaryCheck: Bool ) -> Bool { + // TODO: needs benchmark coverage guard let curScalar = loadScalar(), s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, let idx = input.unicodeScalars.index( @@ -305,21 +307,15 @@ extension Processor { return true } - func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Input.Index? { - if let cur = load(), bitset.matches(char: cur) { - return input.index(after: currentPosition) - } else { - return nil - } - } - // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { - guard let next = _doMatchBitset(bitset) else { + guard let next = input.matchBitset( + bitset, at: currentPosition, limitedBy: end + ) else { signalFailure() return false } @@ -748,4 +744,25 @@ extension String { return idx } + func matchBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + at pos: Index, + limitedBy end: Index + ) -> Index? 
{ + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + + assert(end <= endIndex) + + guard pos < end, bitset.matches(char: self[pos]) else { + return nil + } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + + } From 1a4cde7a5004c9eb09e70781d70c5e2a1d41b8ba Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:35:01 -0600 Subject: [PATCH 06/13] sink matchSeq --- .../_StringProcessing/Engine/Processor.swift | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 97dde4e36..e8b521693 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -180,6 +180,7 @@ extension Processor { // Returns whether the advance succeeded. On failure, our // save point was restored mutating func consume(_ n: Distance) -> Bool { + // TODO: need benchmark coverage guard let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -192,6 +193,7 @@ extension Processor { // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { + // TODO: need benchmark coverage guard let idx = input.unicodeScalars.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -223,11 +225,6 @@ extension Processor { func load() -> Element? { currentPosition < end ? input[currentPosition] : nil } - func load(count: Int) -> Input.SubSequence? { - let slice = self.slice[currentPosition...].prefix(count) - guard slice.count == count else { return nil } - return slice - } // Match against the current input element. Returns whether // it succeeded vs signaling an error. 
@@ -259,16 +256,21 @@ extension Processor { isScalarMode: Bool ) -> Bool { if isScalarMode { - // TODO: sink to specialized method on string, needs benchmark + // TODO: needs benchmark coverage for s in seq.unicodeScalars { guard matchScalar(s, boundaryCheck: false) else { return false } } return true } - for e in seq { - guard match(e) else { return false } + guard let next = input.matchSeq( + seq, at: currentPosition, limitedBy: end + ) else { + signalFailure() + return false } + + currentPosition = next return true } @@ -327,6 +329,7 @@ extension Processor { mutating func matchBitsetScalar( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { + // TODO: needs benchmark coverage guard let curScalar = loadScalar(), bitset.matches(scalar: curScalar), let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { @@ -718,6 +721,25 @@ extension String { return idx } + func matchSeq( + _ seq: Substring, + at pos: Index, + limitedBy end: Index + ) -> Index? 
{ + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + var cur = pos + for e in seq { + guard cur < end, self[cur] == e else { return nil } + self.formIndex(after: &cur) + } + + guard cur <= end else { return nil } + return cur + } + // func consumeScalar(_ n: Distance) -> Bool { // } From 89fa6c70aa2148debbfc0125e16c1f4e4b5bde6c Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 16:46:02 -0600 Subject: [PATCH 07/13] cleanup --- .../_StringProcessing/Engine/MEBuiltins.swift | 4 ++++ .../_StringProcessing/Engine/MEQuantify.swift | 1 + .../_StringProcessing/Engine/Processor.swift | 18 ------------------ 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 4ec7a71dd..b50d1c213 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -30,6 +30,7 @@ extension Processor { } func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -40,6 +41,7 @@ extension Processor { } func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -50,6 +52,8 @@ extension Processor { } mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { + // TODO: needs benchmark coverage + // Future work: Optimize layout and dispatch switch payload.kind { case .startOfSubject: return currentPosition == subjectBounds.lowerBound diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 09ab71168..81b80b00e 100644 --- 
a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -20,6 +20,7 @@ extension Processor { isStrictASCII: payload.builtinIsStrict, isScalarSemantics: false) case .any: + // TODO: call out to existing code with quick check let matched = currentPosition != input.endIndex && (!input[currentPosition].isNewline || payload.anyMatchesNewline) next = matched ? input.index(after: currentPosition) : nil diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index e8b521693..3ff767207 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -686,22 +686,8 @@ extension Processor { controller.step() } } - - func sleep() { - var i = 0 - for c in input { - if i > 20 { break } - i += 1 - if c == "C" { - blackHole(c) - } - } - } } -@inline(never) -func blackHole(_ t: T) { _ = t } - extension String { func match( @@ -740,10 +726,6 @@ extension String { return cur } - // func consumeScalar(_ n: Distance) -> Bool { - - // } - func matchScalar( _ scalar: Unicode.Scalar, at pos: Index, From 7335fa77c524d6377dd04c32e1f31362354fdd3b Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 12 Apr 2023 17:08:52 -0600 Subject: [PATCH 08/13] cleanup --- Sources/_StringProcessing/Unicode/WordBreaking.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 50da079f6..f1f9573c1 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,7 @@ @_spi(_Unicode) import Swift +// TODO: Sink onto String extension Processor { func atSimpleBoundary( _ usesAsciiWord: Bool, @@ -20,9 +21,11 @@ extension Processor { func matchesWord(at i: Input.Index) -> Bool { switch semanticLevel { case .graphemeCluster: + // TODO: needs benchmark 
coverage let c = input[i] return c.isWordCharacter && (c.isASCII || !usesAsciiWord) case .unicodeScalar: + // TODO: needs benchmark coverage let c = input.unicodeScalars[i] return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) } @@ -51,6 +54,7 @@ extension String { using cache: inout Set?, _ maxIndex: inout String.Index? ) -> Bool { + // TODO: needs benchmark coverage guard i != startIndex, i != endIndex else { return true } From 8694dead049c0ca40c420add40181e428778bcb8 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 13 Apr 2023 06:30:46 -0600 Subject: [PATCH 09/13] Bug fix in newline hot path, and apply hot path to quantified dot --- .../_StringProcessing/Engine/MEBuiltins.swift | 16 +++++++------ .../_StringProcessing/Engine/MEQuantify.swift | 24 ++++++++++++------- Tests/RegexTests/MatchTests.swift | 5 ++++ 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index b50d1c213..d8c8c347b 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -63,10 +63,10 @@ extension Processor { switch payload.semanticLevel { case .graphemeCluster: return input.index(after: currentPosition) == subjectBounds.upperBound - && input[currentPosition].isNewline + && input[currentPosition].isNewline case .unicodeScalar: return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound - && input.unicodeScalars[currentPosition].isNewline + && input.unicodeScalars[currentPosition].isNewline } case .endOfSubject: return currentPosition == subjectBounds.upperBound @@ -121,6 +121,7 @@ extension Processor { // MARK: Matching `.` extension String { + // TODO: Should the below have a `limitedBy` parameter? 
func _matchAnyNonNewline( at currentPosition: String.Index, @@ -155,11 +156,11 @@ extension String { return .unknown } switch asciiValue { - case ._lineFeed, ._carriageReturn: - return .definite(nil) - default: - assert(!isCRLF) - return .definite(next) + case (._lineFeed)...(._carriageReturn): + return .definite(nil) + default: + assert(!isCRLF) + return .definite(next) } } @@ -183,6 +184,7 @@ extension String { // MARK: - Built-in character class matching extension String { + // TODO: Should the below have a `limitedBy` parameter? // Mentioned in ProgrammersManual.md, update docs if redesigned func _matchBuiltinCC( diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 81b80b00e..873627567 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,31 +1,37 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - var next: Input.Index? + // FIXME: is the below updated for scalar semantics? switch payload.type { case .bitset: - next = input.matchBitset( + return input.matchBitset( registers[payload.bitset], at: currentPosition, limitedBy: end) case .asciiChar: - next = input.matchScalar( + return input.matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), at: currentPosition, limitedBy: end, boundaryCheck: true) case .builtin: + // FIXME: bounds check? endIndex or end? + // We only emit .quantify if it consumes a single character - next = input._matchBuiltinCC( + return input._matchBuiltinCC( payload.builtin, at: currentPosition, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, isScalarSemantics: false) case .any: - // TODO: call out to existing code with quick check - let matched = currentPosition != input.endIndex - && (!input[currentPosition].isNewline || payload.anyMatchesNewline) - next = matched ? 
input.index(after: currentPosition) : nil + // FIXME: endIndex or end? + guard currentPosition < input.endIndex else { return nil } + + if payload.anyMatchesNewline { + return input.index(after: currentPosition) + } + + return input._matchAnyNonNewline( + at: currentPosition, isScalarSemantics: false) } - return next } /// Generic quantify instruction interpreter diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e8e41a114..a6c9babbe 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1891,6 +1891,11 @@ extension RegexTests { func testSingleLineMode() { firstMatchTest(#".+"#, input: "a\nb", match: "a") firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") + + // We recognize LF, line tab, FF, and CR as newlines by default + firstMatchTest(#"."#, input: "\u{A}\u{B}\u{C}\u{D}\nb", match: "b") + firstMatchTest(#".+"#, input: "\u{A}\u{B}\u{C}\u{D}\nbb", match: "bb") + } func testMatchNewlines() { From bf48e477de6e8c84628c0503e29c4079052b4ea1 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 13 Apr 2023 14:20:24 -0600 Subject: [PATCH 10/13] Run scalar semantic benchmarks --- Sources/RegexBenchmark/Benchmark.swift | 92 +++++------ Sources/RegexBenchmark/BenchmarkRunner.swift | 143 +++++++++++++++++- .../Suite/CustomCharacterClasses.swift | 72 ++++----- 3 files changed, 213 insertions(+), 94 deletions(-) diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift index 2622639fb..f807a8e55 100644 --- a/Sources/RegexBenchmark/Benchmark.swift +++ b/Sources/RegexBenchmark/Benchmark.swift @@ -71,6 +71,13 @@ struct NSBenchmark: RegexBenchmark { enum NSMatchType { case allMatches case first + + init(_ type: Benchmark.MatchType) { + switch type { + case .whole, .first: self = .first + case .allMatches: self = .allMatches + } + } } func run() { @@ -126,7 +133,7 @@ struct CrossBenchmark { /// The base name of the benchmark var baseName: String - /// The string 
to compile in differnet engines + /// The string to compile in different engines var regex: String /// The text to search @@ -143,57 +150,32 @@ struct CrossBenchmark { /// Whether or not to do firstMatch as well or just allMatches var includeFirst: Bool = false + /// Whether to also run scalar-semantic mode + var alsoRunScalarSemantic: Bool = true + func register(_ runner: inout BenchmarkRunner) { - let swiftRegex = try! Regex(regex) - let nsRegex: NSRegularExpression if isWhole { - nsRegex = try! NSRegularExpression(pattern: "^" + regex + "$") + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .whole, + alsoRunScalarSemantic: alsoRunScalarSemantic) } else { - nsRegex = try! NSRegularExpression(pattern: regex) - } + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .allMatches, + alsoRunScalarSemantic: alsoRunScalarSemantic) - if isWhole { - runner.register( - Benchmark( - name: baseName + "Whole", - regex: swiftRegex, - pattern: regex, - type: .whole, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "Whole" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .first, - target: input)) - } else { - runner.register( - Benchmark( - name: baseName + "All", - regex: swiftRegex, - pattern: regex, - type: .allMatches, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "All" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .allMatches, - target: input)) if includeFirst || runner.includeFirstOverride { - runner.register( - Benchmark( - name: baseName + "First", - regex: swiftRegex, - pattern: regex, - type: .first, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "First" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .first, - target: input)) + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .first, + alsoRunScalarSemantic: alsoRunScalarSemantic) } } } @@ -209,20 +191,16 @@ struct 
CrossInputListBenchmark { /// The list of strings to search var inputs: [String] + + /// Also run in scalar-semantic mode + var alsoRunScalarSemantic: Bool = true func register(_ runner: inout BenchmarkRunner) { - let swiftRegex = try! Regex(regex) - runner.register(InputListBenchmark( + runner.registerCrossBenchmark( name: baseName, - regex: swiftRegex, + inputList: inputs, pattern: regex, - targets: inputs - )) - runner.register(InputListNSBenchmark( - name: baseName + CrossBenchmark.nsSuffix, - regex: regex, - targets: inputs - )) + alsoRunScalarSemantic: alsoRunScalarSemantic) } } diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index 641c03224..316825953 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -4,6 +4,16 @@ import Foundation /// The number of times to re-run the benchmark if results are too varying private var rerunCount: Int { 3 } +extension Benchmark.MatchType { + fileprivate var nameSuffix: String { + switch self { + case .whole: return "_Whole" + case .first: return "_First" + case .allMatches: return "_All" + } + } +} + struct BenchmarkRunner { let suiteName: String var suite: [any RegexBenchmark] = [] @@ -16,12 +26,141 @@ struct BenchmarkRunner { // Forcibly include firstMatch benchmarks for all CrossBenchmarks let includeFirstOverride: Bool + + // Register a cross-benchmark + mutating func registerCrossBenchmark( + nameBase: String, + input: String, + pattern: String, + _ type: Benchmark.MatchType, + alsoRunScalarSemantic: Bool = true + ) { + let swiftRegex = try! Regex(pattern) + let nsRegex: NSRegularExpression + if type == .whole { + nsRegex = try! NSRegularExpression(pattern: "^" + pattern + "$") + } else { + nsRegex = try! 
NSRegularExpression(pattern: pattern) + } + let nameSuffix = type.nameSuffix + + register( + Benchmark( + name: nameBase + nameSuffix, + regex: swiftRegex, + pattern: pattern, + type: type, + target: input)) + register( + NSBenchmark( + name: nameBase + nameSuffix + CrossBenchmark.nsSuffix, + regex: nsRegex, + type: .init(type), + target: input)) + + if alsoRunScalarSemantic { + register( + Benchmark( + name: nameBase + nameSuffix + "_Scalar", + regex: swiftRegex.matchingSemantics(.unicodeScalar), + pattern: pattern, + type: type, + target: input)) + register( + NSBenchmark( + name: nameBase + nameSuffix + "_Scalar" + CrossBenchmark.nsSuffix, + regex: nsRegex, + type: .init(type), + target: input)) + } + } + + // Register a cross-benchmark list + mutating func registerCrossBenchmark( + name: String, + inputList: [String], + pattern: String, + alsoRunScalarSemantic: Bool = true + ) { + let swiftRegex = try! Regex(pattern) + register(InputListBenchmark( + name: name, + regex: swiftRegex, + pattern: pattern, + targets: inputList + )) + register(InputListNSBenchmark( + name: name + CrossBenchmark.nsSuffix, + regex: pattern, + targets: inputList + )) + + if alsoRunScalarSemantic { + register(InputListBenchmark( + name: name, + regex: swiftRegex.matchingSemantics(.unicodeScalar), + pattern: pattern, + targets: inputList + )) + register(InputListNSBenchmark( + name: name + CrossBenchmark.nsSuffix, + regex: pattern, + targets: inputList + )) + } + + } + + // Register a swift-only benchmark + mutating func register( + nameBase: String, + input: String, + pattern: String, + _ swiftRegex: Regex, + _ type: Benchmark.MatchType, + alsoRunScalarSemantic: Bool = true + ) { + let nameSuffix = type.nameSuffix + + register( + Benchmark( + name: nameBase + nameSuffix, + regex: swiftRegex, + pattern: pattern, + type: type, + target: input)) + + if alsoRunScalarSemantic { + register( + Benchmark( + name: nameBase + nameSuffix + "_Scalar", + regex: swiftRegex, + pattern: pattern, + type: 
type, + target: input)) + } + } - mutating func register(_ benchmark: some RegexBenchmark) { + private mutating func register(_ benchmark: NSBenchmark) { suite.append(benchmark) } - mutating func register(_ benchmark: some SwiftRegexBenchmark) { + private mutating func register(_ benchmark: Benchmark) { + var benchmark = benchmark + if enableTracing { + benchmark.enableTracing() + } + if enableMetrics { + benchmark.enableMetrics() + } + suite.append(benchmark) + } + + private mutating func register(_ benchmark: InputListNSBenchmark) { + suite.append(benchmark) + } + + private mutating func register(_ benchmark: InputListBenchmark) { var benchmark = benchmark if enableTracing { benchmark.enableTracing() diff --git a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift index 61d7b197f..27b2b07b4 100644 --- a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift +++ b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift @@ -12,53 +12,55 @@ extension BenchmarkRunner { let input = Inputs.graphemeBreakData - register(Benchmark( - name: "BasicCCC", - regex: try! Regex(basic), + // TODO: Which of these can be cross-benchmarks? + + register( + nameBase: "BasicCCC", + input: input, pattern: basic, - type: .allMatches, - target: input)) + try! Regex(basic), + .allMatches) - register(Benchmark( - name: "BasicRangeCCC", - regex: try! Regex(basicRange), + register( + nameBase: "BasicRangeCCC", + input: input, pattern: basicRange, - type: .allMatches, - target: input)) + try! Regex(basicRange), + .allMatches) - register(Benchmark( - name: "CaseInsensitiveCCC", - regex: try! Regex(caseInsensitive), + register( + nameBase: "CaseInsensitiveCCC", + input: input, pattern: caseInsensitive, - type: .allMatches, - target: input)) + try! Regex(caseInsensitive), + .allMatches) - register(Benchmark( - name: "InvertedCCC", - regex: try! 
Regex(inverted), + register( + nameBase: "InvertedCCC", + input: input, pattern: inverted, - type: .allMatches, - target: input)) + try! Regex(inverted), + .allMatches) - register(Benchmark( - name: "SubtractionCCC", - regex: try! Regex(subtraction), + register( + nameBase: "SubtractionCCC", + input: input, pattern: subtraction, - type: .allMatches, - target: input)) + try! Regex(subtraction), + .allMatches) - register(Benchmark( - name: "IntersectionCCC", - regex: try! Regex(intersection), + register( + nameBase: "IntersectionCCC", + input: input, pattern: intersection, - type: .allMatches, - target: input)) + try! Regex(intersection), + .allMatches) - register(Benchmark( - name: "symDiffCCC", - regex: try! Regex(symmetricDifference), + register( + nameBase: "symDiffCCC", + input: input, pattern: symmetricDifference, - type: .allMatches, - target: input)) + try! Regex(symmetricDifference), + .allMatches) } } From 4dcafd4f8074a4298d27f7cd2d8d4adb7f890980 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 13 Apr 2023 14:47:22 -0600 Subject: [PATCH 11/13] add scalar to the name --- Sources/RegexBenchmark/BenchmarkRunner.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index 316825953..b067b9679 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -97,13 +97,13 @@ struct BenchmarkRunner { if alsoRunScalarSemantic { register(InputListBenchmark( - name: name, + name: name + "_Scalar", regex: swiftRegex.matchingSemantics(.unicodeScalar), pattern: pattern, targets: inputList )) register(InputListNSBenchmark( - name: name + CrossBenchmark.nsSuffix, + name: name + "_Scalar" + CrossBenchmark.nsSuffix, regex: pattern, targets: inputList )) From f0934e389e7e752663baba7acd577c7bef1558d0 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 19 Apr 2023 09:42:26 -0600 Subject: [PATCH 12/13] 
Finish refactoring logic onto String --- Sources/RegexBenchmark/Utils/Stats.swift | 4 +- .../_StringProcessing/Engine/MEQuantify.swift | 14 +- .../_StringProcessing/Engine/Processor.swift | 227 +++++++++--------- .../Utility/AsciiBitset.swift | 4 +- 4 files changed, 121 insertions(+), 128 deletions(-) diff --git a/Sources/RegexBenchmark/Utils/Stats.swift b/Sources/RegexBenchmark/Utils/Stats.swift index 0cc9156a4..175826a0b 100644 --- a/Sources/RegexBenchmark/Utils/Stats.swift +++ b/Sources/RegexBenchmark/Utils/Stats.swift @@ -3,8 +3,8 @@ import Foundation enum Stats {} extension Stats { - // Maximum allowed standard deviation is 5% of the median runtime - static let maxAllowedStdev = 0.05 + // Maximum allowed standard deviation is 7.5% of the median runtime + static let maxAllowedStdev = 0.075 static func tTest(_ a: Measurement, _ b: Measurement) -> Bool { // Student's t-test diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 873627567..cb1cfc177 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,16 +1,22 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - // FIXME: is the below updated for scalar semantics? + // TODO: This optimization is only enabled for grapheme cluster semantics, + // we want these for scalar semantics as well. + switch payload.type { case .bitset: return input.matchBitset( - registers[payload.bitset], at: currentPosition, limitedBy: end) + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + isScalarSemantics: false) case .asciiChar: return input.matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), at: currentPosition, limitedBy: end, - boundaryCheck: true) + boundaryCheck: true, + isCaseInsensitive: false) case .builtin: // FIXME: bounds check? endIndex or end? 
@@ -41,7 +47,7 @@ extension Processor { var trips = 0 var extraTrips = payload.extraTrips var savePoint = startQuantifierSavePoint() - + while true { if trips >= payload.minTrips { if extraTrips == 0 { break } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 3ff767207..1fba9d517 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -35,7 +35,7 @@ struct Processor { /// of the search. `input` can be a "supersequence" of the subject, while /// `input[subjectBounds]` is the logical entity that is being searched. let input: Input - + /// The bounds of the logical subject in `input`. /// /// `subjectBounds` represents the bounds of the string or substring that a @@ -46,7 +46,7 @@ struct Processor { /// `subjectBounds` is always equal to or a subrange of /// `input.startIndex..<input.endIndex`. let subjectBounds: Range<Position> - + /// The bounds within the subject for an individual search. /// /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a @@ -62,7 +62,7 @@ struct Processor { let instructions: InstructionList // MARK: Resettable state - + /// The current search position while processing. /// /// `currentPosition` must always be in the range `subjectBounds` or equal @@ -81,7 +81,7 @@ struct Processor { var wordIndexCache: Set<String.Index>? = nil var wordIndexMaxIndex: String.Index? = nil - + var state: State = .inProgress var failureReason: Error? 
= nil @@ -122,7 +122,7 @@ extension Processor { // Initialize registers with end of search bounds self.registers = Registers(program, searchBounds.upperBound) self.storedCaptures = Array( - repeating: .init(), count: program.registerInfo.captures) + repeating: .init(), count: program.registerInfo.captures) _checkInvariants() } @@ -169,18 +169,12 @@ extension Processor { input[searchBounds] } - // Advance in our input, without any checks or failure signalling - mutating func _uncheckedForcedConsumeOne() { - assert(currentPosition != end) - input.formIndex(after: &currentPosition) - } - // Advance in our input // // Returns whether the advance succeeded. On failure, our // save point was restored mutating func consume(_ n: Distance) -> Bool { - // TODO: need benchmark coverage + // TODO: needs benchmark coverage guard let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -190,10 +184,10 @@ extension Processor { currentPosition = idx return true } - + // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { - // TODO: need benchmark coverage + // TODO: needs benchmark coverage guard let idx = input.unicodeScalars.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -226,11 +220,24 @@ extension Processor { currentPosition < end ? input[currentPosition] : nil } + + // MARK: Match functions + // + // TODO: refactor these such that `cycle()` calls the corresponding String + // method directly, and all the step, signalFailure, and + // currentPosition logic is collected into a single place inside + // cycle(). + // Match against the current input element. Returns whether // it succeeded vs signaling an error. 
- mutating func match(_ e: Element) -> Bool { + mutating func match( + _ e: Element, isCaseInsensitive: Bool + ) -> Bool { guard let next = input.match( - e, at: currentPosition, limitedBy: end + e, + at: currentPosition, + limitedBy: end, + isCaseInsensitive: isCaseInsensitive ) else { signalFailure() return false @@ -239,32 +246,17 @@ extension Processor { return true } - mutating func matchCaseInsensitive(_ e: Element) -> Bool { - // TODO: need benchmark coverage - guard let cur = load(), cur.lowercased() == e.lowercased() else { - signalFailure() - return false - } - _uncheckedForcedConsumeOne() - return true - } - // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. mutating func matchSeq( _ seq: Substring, - isScalarMode: Bool + isScalarSemantics: Bool ) -> Bool { - if isScalarMode { - // TODO: needs benchmark coverage - for s in seq.unicodeScalars { - guard matchScalar(s, boundaryCheck: false) else { return false } - } - return true - } - guard let next = input.matchSeq( - seq, at: currentPosition, limitedBy: end + seq, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics ) else { signalFailure() return false @@ -274,13 +266,17 @@ extension Processor { return true } - func loadScalar() -> Unicode.Scalar? { - currentPosition < end ? 
input.unicodeScalars[currentPosition] : nil - } - - mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { + mutating func matchScalar( + _ s: Unicode.Scalar, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Bool { guard let next = input.matchScalar( - s, at: currentPosition, limitedBy: end, boundaryCheck: boundaryCheck + s, + at: currentPosition, + limitedBy: end, + boundaryCheck: boundaryCheck, + isCaseInsensitive: isCaseInsensitive ) else { signalFailure() return false @@ -289,34 +285,18 @@ extension Processor { return true } - mutating func matchScalarCaseInsensitive( - _ s: Unicode.Scalar, - boundaryCheck: Bool - ) -> Bool { - // TODO: needs benchmark coverage - guard let curScalar = loadScalar(), - s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) - else { - signalFailure() - return false - } - currentPosition = idx - return true - } - // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalarSemantics: Bool ) -> Bool { guard let next = input.matchBitset( - bitset, at: currentPosition, limitedBy: end + bitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics ) else { signalFailure() return false @@ -325,21 +305,6 @@ extension Processor { return true } - // Equivalent of matchBitset but emitted when in unicode scalar semantic mode - mutating func matchBitsetScalar( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset - ) -> Bool { - // TODO: needs benchmark coverage - guard let curScalar = loadScalar(), - bitset.matches(scalar: curScalar), - let idx = 
input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { - signalFailure() - return false - } - currentPosition = idx - return true - } - // Matches the next character/scalar if it is not a newline mutating func matchAnyNonNewline( isScalarSemantics: Bool @@ -425,7 +390,7 @@ extension Processor { // TODO: What should we do here? fatalError("Invalid code: Tried to clear save points when empty") } - + mutating func cycle() { _checkInvariants() assert(state == .inProgress) @@ -519,39 +484,25 @@ extension Processor { } case .match: let (isCaseInsensitive, reg) = payload.elementPayload - if isCaseInsensitive { - if matchCaseInsensitive(registers[reg]) { - controller.step() - } - } else { - if match(registers[reg]) { - controller.step() - } + if match(registers[reg], isCaseInsensitive: isCaseInsensitive) { + controller.step() } case .matchScalar: let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload - if caseInsensitive { - if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } - } else { - if matchScalar(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } + if matchScalar( + scalar, + boundaryCheck: boundaryCheck, + isCaseInsensitive: caseInsensitive + ) { + controller.step() } case .matchBitset: let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if isScalar { - if matchBitsetScalar(bitset) { - controller.step() - } - } else { - if matchBitset(bitset) { - controller.step() - } + if matchBitset(bitset, isScalarSemantics: isScalar) { + controller.step() } case .matchBuiltin: @@ -642,7 +593,7 @@ extension Processor { signalFailure() return } - if matchSeq(input[range], isScalarMode: isScalarMode) { + if matchSeq(input[range], isScalarSemantics: isScalarMode) { controller.step() } @@ -688,18 +639,29 @@ extension Processor { } } +// MARK: String matchers +// +// TODO: Refactor into separate file, formalize patterns + extension String { func match( _ char: 
Character, at pos: Index, - limitedBy end: String.Index + limitedBy end: String.Index, + isCaseInsensitive: Bool ) -> Index? { // TODO: This can be greatly sped up with string internals // TODO: This is also very much quick-check-able assert(end <= endIndex) - guard pos < end, self[pos] == char else { return nil } + guard pos < end else { return nil } + + if isCaseInsensitive { + guard self[pos].lowercased() == char.lowercased() else { return nil } + } else { + guard self[pos] == char else { return nil } + } let idx = index(after: pos) guard idx <= end else { return nil } @@ -710,16 +672,25 @@ extension String { func matchSeq( _ seq: Substring, at pos: Index, - limitedBy end: Index + limitedBy end: Index, + isScalarSemantics: Bool ) -> Index? { // TODO: This can be greatly sped up with string internals // TODO: This is also very much quick-check-able assert(end <= endIndex) var cur = pos - for e in seq { - guard cur < end, self[cur] == e else { return nil } - self.formIndex(after: &cur) + + if isScalarSemantics { + for e in seq.unicodeScalars { + guard cur < end, unicodeScalars[cur] == e else { return nil } + self.unicodeScalars.formIndex(after: &cur) + } + } else { + for e in seq { + guard cur < end, self[cur] == e else { return nil } + self.formIndex(after: &cur) + } } guard cur <= end else { return nil } @@ -730,12 +701,23 @@ extension String { _ scalar: Unicode.Scalar, at pos: Index, limitedBy end: String.Index, - boundaryCheck: Bool + boundaryCheck: Bool, + isCaseInsensitive: Bool ) -> Index? 
{ + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals assert(end <= endIndex) - guard pos < end, unicodeScalars[pos] == scalar else { - return nil + guard pos < end else { return nil } + let curScalar = unicodeScalars[pos] + + if isCaseInsensitive { + guard curScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping + else { + return nil + } + } else { + guard curScalar == scalar else { return nil } } let idx = unicodeScalars.index(after: pos) @@ -751,20 +733,25 @@ extension String { func matchBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, at pos: Index, - limitedBy end: Index + limitedBy end: Index, + isScalarSemantics: Bool ) -> Index? { // TODO: extremely quick-check-able // TODO: can be sped up with string internals - assert(end <= endIndex) - guard pos < end, bitset.matches(char: self[pos]) else { - return nil + guard pos < end else { return nil } + + let idx: String.Index + if isScalarSemantics { + guard bitset.matches(unicodeScalars[pos]) else { return nil } + idx = unicodeScalars.index(after: pos) + } else { + guard bitset.matches(self[pos]) else { return nil } + idx = index(after: pos) } - let idx = index(after: pos) guard idx <= end else { return nil } - return idx } diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift index ad3159820..e063447a0 100644 --- a/Sources/_StringProcessing/Utility/AsciiBitset.swift +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -57,7 +57,7 @@ extension DSLTree.CustomCharacterClass { } } - internal func matches(char: Character) -> Bool { + internal func matches(_ char: Character) -> Bool { let matched: Bool if let val = char._singleScalarAsciiValue { matched = matches(val) @@ -71,7 +71,7 @@ extension DSLTree.CustomCharacterClass { return matched } - internal func matches(scalar: Unicode.Scalar) -> Bool { + internal func matches(_ scalar: Unicode.Scalar) -> Bool { let matched: Bool if 
scalar.isASCII { let val = UInt8(ascii: scalar) From a13c594d660f11a0af1284b1f62d50a16d20d752 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 19 Apr 2023 11:13:47 -0600 Subject: [PATCH 13/13] renaming --- Sources/_StringProcessing/Engine/MEBuiltins.swift | 14 +++++++------- Sources/_StringProcessing/Engine/MEQuantify.swift | 4 ++-- Sources/_StringProcessing/Engine/Processor.swift | 2 +- .../_StringProcessing/_CharacterClassModel.swift | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index d8c8c347b..5e446b472 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -15,7 +15,7 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard let next = input._matchBuiltinCC( + guard let next = input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted, @@ -123,7 +123,7 @@ extension Processor { extension String { // TODO: Should the below have a `limitedBy` parameter? - func _matchAnyNonNewline( + func matchAnyNonNewline( at currentPosition: String.Index, isScalarSemantics: Bool ) -> String.Index? { @@ -145,7 +145,7 @@ extension String { } @inline(__always) - func _quickMatchAnyNonNewline( + private func _quickMatchAnyNonNewline( at currentPosition: String.Index, isScalarSemantics: Bool ) -> QuickResult { @@ -165,7 +165,7 @@ extension String { } @inline(never) - func _thoroughMatchAnyNonNewline( + private func _thoroughMatchAnyNonNewline( at currentPosition: String.Index, isScalarSemantics: Bool ) -> String.Index? { @@ -187,7 +187,7 @@ extension String { // TODO: Should the below have a `limitedBy` parameter? 
// Mentioned in ProgrammersManual.md, update docs if redesigned - func _matchBuiltinCC( + func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -222,7 +222,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) - func _quickMatchBuiltinCC( + private func _quickMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -240,7 +240,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) - func _thoroughMatchBuiltinCC( + private func _thoroughMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index cb1cfc177..7ca3ae84a 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -21,7 +21,7 @@ extension Processor { // FIXME: bounds check? endIndex or end? 
// We only emit .quantify if it consumes a single character - return input._matchBuiltinCC( + return input.matchBuiltinCC( payload.builtin, at: currentPosition, isInverted: payload.builtinIsInverted, @@ -35,7 +35,7 @@ extension Processor { return input.index(after: currentPosition) } - return input._matchAnyNonNewline( + return input.matchAnyNonNewline( at: currentPosition, isScalarSemantics: false) } } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 1fba9d517..2895ce5b3 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -309,7 +309,7 @@ extension Processor { mutating func matchAnyNonNewline( isScalarSemantics: Bool ) -> Bool { - guard let next = input._matchAnyNonNewline( + guard let next = input.matchAnyNonNewline( at: currentPosition, isScalarSemantics: isScalarSemantics ) else { diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index cdee66ddb..c053b31e4 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -79,7 +79,7 @@ struct _CharacterClassModel: Hashable { let isScalarSemantics = matchLevel == .unicodeScalar - return input._matchBuiltinCC( + return input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted,