diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index a3d864165..4ec7a71dd 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -115,8 +115,69 @@ extension Processor { } } -// MARK: Built-in character class matching +// MARK: Matching `.` +extension String { + + func _matchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> String.Index? { + guard currentPosition < endIndex else { + return nil + } + if case .definite(let result) = _quickMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics) + } + + @inline(__always) + func _quickMatchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition < endIndex) + guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( + at: currentPosition + ) else { + return .unknown + } + switch asciiValue { + case ._lineFeed, ._carriageReturn: + return .definite(nil) + default: + assert(!isCRLF) + return .definite(next) + } + } + + @inline(never) + func _thoroughMatchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> String.Index? { + assert(currentPosition < endIndex) + if isScalarSemantics { + let scalar = unicodeScalars[currentPosition] + guard !scalar.isNewline else { return nil } + return unicodeScalars.index(after: currentPosition) + } + + let char = self[currentPosition] + guard !char.isNewline else { return nil } + return index(after: currentPosition) + } +} +// MARK: - Built-in character class matching extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 25cad5c8c..98aee7803 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -346,23 +346,18 @@ extension Processor { return true } - // Matches the next character if it is not a newline - mutating func matchAnyNonNewline() -> Bool { - guard let c = load(), !c.isNewline else { - signalFailure() - return false - } - _uncheckedForcedConsumeOne() - return true - } - - // Matches the next scalar if it is not a newline - mutating func matchAnyNonNewlineScalar() -> Bool { - guard let s = loadScalar(), !s.isNewline else { + // Matches the next character/scalar if it is not a newline + mutating func matchAnyNonNewline( + isScalarSemantics: Bool + ) -> Bool { + guard let next = input._matchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics + ) else { signalFailure() return false } - input.unicodeScalars.formIndex(after: ¤tPosition) + currentPosition = next return true } @@ -535,14 +530,8 @@ extension Processor { } } case .matchAnyNonNewline: - if payload.isScalar { - if matchAnyNonNewlineScalar() { - controller.step() - } - } else { - if matchAnyNonNewline() { - controller.step() - } + if matchAnyNonNewline(isScalarSemantics: payload.isScalar) { + controller.step() } case .match: let (isCaseInsensitive, reg) = payload.elementPayload diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 5150e18cc..de13e340a 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -9,26 +9,25 @@ // //===----------------------------------------------------------------------===// -private var _lineFeed: UInt8 { 0x0A } -private var _carriageReturn: UInt8 { 0x0D } -private var _lineTab: UInt8 { 0x0B } -private var _formFeed: UInt8 { 0x0C } -private var _space: UInt8 { 0x20 } -private var _tab: UInt8 { 0x09 } +extension UInt8 { + static var _lineFeed: UInt8 { 0x0A } + static var _carriageReturn: UInt8 { 0x0D } + static var _lineTab: UInt8 { 0x0B } + static var _formFeed: UInt8 { 0x0C } + static var _space: UInt8 { 0x20 } + static var _tab: UInt8 { 0x09 } + + static var _underscore: UInt8 { 0x5F } +} private var _0: UInt8 { 0x30 } private var _9: UInt8 { 0x39 } -private func _isASCIINumber(_ x: UInt8) -> Bool { - return (_0..._9).contains(x) -} private var _a: UInt8 { 0x61 } private var _z: UInt8 { 0x7A } private var _A: UInt8 { 0x41 } private var _Z: UInt8 { 0x5A } -private var _underscore: UInt8 { 0x5F } - extension UInt8 { var _isASCII: Bool { self < 0x80 } @@ -43,14 +42,14 @@ extension UInt8 { /// Assuming we're ASCII, whether we match `\h` var _asciiIsHorizontalWhitespace: Bool { assert(_isASCII) - return self == _space || self == _tab + return self == ._space || self == ._tab } /// Assuming we're ASCII, whether we match `\v` var _asciiIsVerticalWhitespace: Bool { assert(_isASCII) switch self { - case _lineFeed, _carriageReturn, _lineTab, _formFeed: + case ._lineFeed, ._carriageReturn, ._lineTab, ._formFeed: return true default: return false @@ -61,7 +60,7 @@ extension UInt8 { var _asciiIsWhitespace: Bool { assert(_isASCII) switch self { - case _space, _tab, _lineFeed, _lineTab, _formFeed, _carriageReturn: + case ._space, ._tab, ._lineFeed, ._lineTab, ._formFeed, ._carriageReturn: return true default: return false @@ -77,11 +76,13 @@ extension UInt8 { /// Assuming we're ASCII, whether we match `\w` var _asciiIsWord: Bool { assert(_isASCII) - return _asciiIsDigit || _asciiIsLetter || self == _underscore + return _asciiIsDigit || _asciiIsLetter || self == ._underscore } } extension String { + /// TODO: better to take isScalarSemantics parameter, we can return more results + /// and we can give the right `next` index, not requiring the caller to re-adjust it /// TODO: detailed description of nuanced semantics func _quickASCIICharacter( at idx: Index @@ -107,7 +108,7 @@ extension String { guard tail._isSub300StartingByte else { return nil } // Handle CR-LF: - if base == _carriageReturn && tail == _lineFeed { + if base == ._carriageReturn && tail == ._lineFeed { utf8.formIndex(after: &next) guard next == endIndex || utf8[next]._isSub300StartingByte else { return nil @@ -165,5 +166,6 @@ extension String { return (next, asciiValue._asciiIsWord) } } + }