diff --git a/Sources/RegexBenchmark/Utils/Stats.swift b/Sources/RegexBenchmark/Utils/Stats.swift index 0cc9156a4..175826a0b 100644 --- a/Sources/RegexBenchmark/Utils/Stats.swift +++ b/Sources/RegexBenchmark/Utils/Stats.swift @@ -3,8 +3,8 @@ import Foundation enum Stats {} extension Stats { - // Maximum allowed standard deviation is 5% of the median runtime - static let maxAllowedStdev = 0.05 + // Maximum allowed standard deviation is 7.5% of the median runtime + static let maxAllowedStdev = 0.075 static func tTest(_ a: Measurement, _ b: Measurement) -> Bool { // Student's t-test diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index d8c8c347b..5e446b472 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -15,7 +15,7 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard let next = input._matchBuiltinCC( + guard let next = input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted, @@ -123,7 +123,7 @@ extension Processor { extension String { // TODO: Should the below have a `limitedBy` parameter? - func _matchAnyNonNewline( + func matchAnyNonNewline( at currentPosition: String.Index, isScalarSemantics: Bool ) -> String.Index? { @@ -145,7 +145,7 @@ extension String { } @inline(__always) - func _quickMatchAnyNonNewline( + private func _quickMatchAnyNonNewline( at currentPosition: String.Index, isScalarSemantics: Bool ) -> QuickResult { @@ -165,7 +165,7 @@ extension String { } @inline(never) - func _thoroughMatchAnyNonNewline( + private func _thoroughMatchAnyNonNewline( at currentPosition: String.Index, isScalarSemantics: Bool ) -> String.Index? { @@ -187,7 +187,7 @@ extension String { // TODO: Should the below have a `limitedBy` parameter? 
// Mentioned in ProgrammersManual.md, update docs if redesigned - func _matchBuiltinCC( + func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -222,7 +222,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) - func _quickMatchBuiltinCC( + private func _quickMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -240,7 +240,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) - func _thoroughMatchBuiltinCC( + private func _thoroughMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 873627567..7ca3ae84a 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,21 +1,27 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - // FIXME: is the below updated for scalar semantics? + // TODO: This optimization is only enabled for grapheme cluster semantics, + // we want these for scalar semantics as well. + switch payload.type { case .bitset: return input.matchBitset( - registers[payload.bitset], at: currentPosition, limitedBy: end) + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + isScalarSemantics: false) case .asciiChar: return input.matchScalar( UnicodeScalar.init(_value: UInt32(payload.asciiChar)), at: currentPosition, limitedBy: end, - boundaryCheck: true) + boundaryCheck: true, + isCaseInsensitive: false) case .builtin: // FIXME: bounds check? endIndex or end? 
// We only emit .quantify if it consumes a single character - return input._matchBuiltinCC( + return input.matchBuiltinCC( payload.builtin, at: currentPosition, isInverted: payload.builtinIsInverted, @@ -29,7 +35,7 @@ extension Processor { return input.index(after: currentPosition) } - return input._matchAnyNonNewline( + return input.matchAnyNonNewline( at: currentPosition, isScalarSemantics: false) } } @@ -41,7 +47,7 @@ extension Processor { var trips = 0 var extraTrips = payload.extraTrips var savePoint = startQuantifierSavePoint() - + while true { if trips >= payload.minTrips { if extraTrips == 0 { break } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 3ff767207..0350a37db 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -35,7 +35,7 @@ struct Processor { /// of the search. `input` can be a "supersequence" of the subject, while /// `input[subjectBounds]` is the logical entity that is being searched. let input: Input - + /// The bounds of the logical subject in `input`. /// /// `subjectBounds` represents the bounds of the string or substring that a @@ -46,7 +46,7 @@ struct Processor { /// `subjectBounds` is always equal to or a subrange of /// `input.startIndex..<input.endIndex`. let subjectBounds: Range<Position> - + /// The bounds within the subject for an individual search. /// /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a @@ -62,7 +62,7 @@ struct Processor { let instructions: InstructionList // MARK: Resettable state - + /// The current search position while processing. /// /// `currentPosition` must always be in the range `subjectBounds` or equal @@ -81,7 +81,7 @@ struct Processor { var wordIndexCache: Set<String.Index>? = nil var wordIndexMaxIndex: String.Index? = nil - + var state: State = .inProgress var failureReason: Error?
= nil @@ -122,7 +122,7 @@ extension Processor { // Initialize registers with end of search bounds self.registers = Registers(program, searchBounds.upperBound) self.storedCaptures = Array( - repeating: .init(), count: program.registerInfo.captures) + repeating: .init(), count: program.registerInfo.captures) _checkInvariants() } @@ -169,18 +169,12 @@ extension Processor { input[searchBounds] } - // Advance in our input, without any checks or failure signalling - mutating func _uncheckedForcedConsumeOne() { - assert(currentPosition != end) - input.formIndex(after: &currentPosition) - } - // Advance in our input // // Returns whether the advance succeeded. On failure, our // save point was restored mutating func consume(_ n: Distance) -> Bool { - // TODO: need benchmark coverage + // TODO: needs benchmark coverage guard let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -190,10 +184,10 @@ extension Processor { currentPosition = idx return true } - + // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { - // TODO: need benchmark coverage + // TODO: needs benchmark coverage guard let idx = input.unicodeScalars.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -226,11 +220,23 @@ extension Processor { currentPosition < end ? input[currentPosition] : nil } + // MARK: Match functions + // + // TODO: refactor these such that `cycle()` calls the corresponding String + // method directly, and all the step, signalFailure, and + // currentPosition logic is collected into a single place inside + // cycle(). + // Match against the current input element. Returns whether // it succeeded vs signaling an error.
- mutating func match(_ e: Element) -> Bool { + mutating func match( + _ e: Element, isCaseInsensitive: Bool + ) -> Bool { guard let next = input.match( - e, at: currentPosition, limitedBy: end + e, + at: currentPosition, + limitedBy: end, + isCaseInsensitive: isCaseInsensitive ) else { signalFailure() return false @@ -239,32 +245,17 @@ extension Processor { return true } - mutating func matchCaseInsensitive(_ e: Element) -> Bool { - // TODO: need benchmark coverage - guard let cur = load(), cur.lowercased() == e.lowercased() else { - signalFailure() - return false - } - _uncheckedForcedConsumeOne() - return true - } - // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. mutating func matchSeq( _ seq: Substring, - isScalarMode: Bool + isScalarSemantics: Bool ) -> Bool { - if isScalarMode { - // TODO: needs benchmark coverage - for s in seq.unicodeScalars { - guard matchScalar(s, boundaryCheck: false) else { return false } - } - return true - } - guard let next = input.matchSeq( - seq, at: currentPosition, limitedBy: end + seq, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics ) else { signalFailure() return false @@ -274,13 +265,17 @@ extension Processor { return true } - func loadScalar() -> Unicode.Scalar? { - currentPosition < end ? 
input.unicodeScalars[currentPosition] : nil - } - - mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { + mutating func matchScalar( + _ s: Unicode.Scalar, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Bool { guard let next = input.matchScalar( - s, at: currentPosition, limitedBy: end, boundaryCheck: boundaryCheck + s, + at: currentPosition, + limitedBy: end, + boundaryCheck: boundaryCheck, + isCaseInsensitive: isCaseInsensitive ) else { signalFailure() return false @@ -289,34 +284,18 @@ extension Processor { return true } - mutating func matchScalarCaseInsensitive( - _ s: Unicode.Scalar, - boundaryCheck: Bool - ) -> Bool { - // TODO: needs benchmark coverage - guard let curScalar = loadScalar(), - s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) - else { - signalFailure() - return false - } - currentPosition = idx - return true - } - // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalarSemantics: Bool ) -> Bool { guard let next = input.matchBitset( - bitset, at: currentPosition, limitedBy: end + bitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics ) else { signalFailure() return false @@ -325,26 +304,11 @@ extension Processor { return true } - // Equivalent of matchBitset but emitted when in unicode scalar semantic mode - mutating func matchBitsetScalar( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset - ) -> Bool { - // TODO: needs benchmark coverage - guard let curScalar = loadScalar(), - bitset.matches(scalar: curScalar), - let idx = 
input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { - signalFailure() - return false - } - currentPosition = idx - return true - } - // Matches the next character/scalar if it is not a newline mutating func matchAnyNonNewline( isScalarSemantics: Bool ) -> Bool { - guard let next = input._matchAnyNonNewline( + guard let next = input.matchAnyNonNewline( at: currentPosition, isScalarSemantics: isScalarSemantics ) else { @@ -425,7 +389,7 @@ extension Processor { // TODO: What should we do here? fatalError("Invalid code: Tried to clear save points when empty") } - + mutating func cycle() { _checkInvariants() assert(state == .inProgress) @@ -519,39 +483,25 @@ extension Processor { } case .match: let (isCaseInsensitive, reg) = payload.elementPayload - if isCaseInsensitive { - if matchCaseInsensitive(registers[reg]) { - controller.step() - } - } else { - if match(registers[reg]) { - controller.step() - } + if match(registers[reg], isCaseInsensitive: isCaseInsensitive) { + controller.step() } case .matchScalar: let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload - if caseInsensitive { - if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } - } else { - if matchScalar(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } + if matchScalar( + scalar, + boundaryCheck: boundaryCheck, + isCaseInsensitive: caseInsensitive + ) { + controller.step() } case .matchBitset: let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if isScalar { - if matchBitsetScalar(bitset) { - controller.step() - } - } else { - if matchBitset(bitset) { - controller.step() - } + if matchBitset(bitset, isScalarSemantics: isScalar) { + controller.step() } case .matchBuiltin: @@ -642,7 +592,7 @@ extension Processor { signalFailure() return } - if matchSeq(input[range], isScalarMode: isScalarMode) { + if matchSeq(input[range], isScalarSemantics: isScalarMode) { controller.step() } @@ -688,18 
+638,29 @@ extension Processor { } } +// MARK: String matchers +// +// TODO: Refactor into separate file, formalize patterns + extension String { func match( _ char: Character, at pos: Index, - limitedBy end: String.Index + limitedBy end: String.Index, + isCaseInsensitive: Bool ) -> Index? { // TODO: This can be greatly sped up with string internals // TODO: This is also very much quick-check-able assert(end <= endIndex) - guard pos < end, self[pos] == char else { return nil } + guard pos < end else { return nil } + + if isCaseInsensitive { + guard self[pos].lowercased() == char.lowercased() else { return nil } + } else { + guard self[pos] == char else { return nil } + } let idx = index(after: pos) guard idx <= end else { return nil } @@ -710,16 +671,25 @@ extension String { func matchSeq( _ seq: Substring, at pos: Index, - limitedBy end: Index + limitedBy end: Index, + isScalarSemantics: Bool ) -> Index? { // TODO: This can be greatly sped up with string internals // TODO: This is also very much quick-check-able assert(end <= endIndex) var cur = pos - for e in seq { - guard cur < end, self[cur] == e else { return nil } - self.formIndex(after: &cur) + + if isScalarSemantics { + for e in seq.unicodeScalars { + guard cur < end, unicodeScalars[cur] == e else { return nil } + self.unicodeScalars.formIndex(after: &cur) + } + } else { + for e in seq { + guard cur < end, self[cur] == e else { return nil } + self.formIndex(after: &cur) + } } guard cur <= end else { return nil } @@ -730,12 +700,23 @@ extension String { _ scalar: Unicode.Scalar, at pos: Index, limitedBy end: String.Index, - boundaryCheck: Bool + boundaryCheck: Bool, + isCaseInsensitive: Bool ) -> Index? 
{ + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals assert(end <= endIndex) - guard pos < end, unicodeScalars[pos] == scalar else { - return nil + guard pos < end else { return nil } + let curScalar = unicodeScalars[pos] + + if isCaseInsensitive { + guard curScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping + else { + return nil + } + } else { + guard curScalar == scalar else { return nil } } let idx = unicodeScalars.index(after: pos) @@ -751,20 +732,25 @@ extension String { func matchBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, at pos: Index, - limitedBy end: Index + limitedBy end: Index, + isScalarSemantics: Bool ) -> Index? { // TODO: extremely quick-check-able // TODO: can be sped up with string internals - assert(end <= endIndex) - guard pos < end, bitset.matches(char: self[pos]) else { - return nil + guard pos < end else { return nil } + + let idx: String.Index + if isScalarSemantics { + guard bitset.matches(unicodeScalars[pos]) else { return nil } + idx = unicodeScalars.index(after: pos) + } else { + guard bitset.matches(self[pos]) else { return nil } + idx = index(after: pos) } - let idx = index(after: pos) guard idx <= end else { return nil } - return idx } diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift index ad3159820..e063447a0 100644 --- a/Sources/_StringProcessing/Utility/AsciiBitset.swift +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -57,7 +57,7 @@ extension DSLTree.CustomCharacterClass { } } - internal func matches(char: Character) -> Bool { + internal func matches(_ char: Character) -> Bool { let matched: Bool if let val = char._singleScalarAsciiValue { matched = matches(val) @@ -71,7 +71,7 @@ extension DSLTree.CustomCharacterClass { return matched } - internal func matches(scalar: Unicode.Scalar) -> Bool { + internal func matches(_ scalar: Unicode.Scalar) -> Bool { let matched: Bool if 
scalar.isASCII { let val = UInt8(ascii: scalar) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index cdee66ddb..c053b31e4 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -79,7 +79,7 @@ struct _CharacterClassModel: Hashable { let isScalarSemantics = matchLevel == .unicodeScalar - return input._matchBuiltinCC( + return input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted,