diff --git a/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift b/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift index 2cd26d6ae5708..c8c23482c6740 100644 --- a/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift +++ b/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift @@ -15,56 +15,47 @@ #if _runtime(_ObjC) import Foundation -func parseGraphemeBreakTests( - _ data: String, - into result: inout [(String, Int)] -) { - for line in data.split(separator: "\n") { +public struct GraphemeBreakTest { + public let string: String + public let pieces: [[Unicode.Scalar]] + + init?(line: some StringProtocol) { // Only look at actual tests - guard line.hasPrefix("÷") else { - continue - } + guard line.hasPrefix("÷") else { return nil } let info = line.split(separator: "#") let components = info[0].split(separator: " ") var string = "" - var count = 0 - - for i in components.indices { - guard i != 0 else { - continue - } - - let scalar: Unicode.Scalar - - // If we're an odd index, this is a scalar. - if i & 0x1 == 1 { - scalar = Unicode.Scalar(UInt32(components[i], radix: 16)!)! - + var pieces: [[Unicode.Scalar]] = [] + + var piece: [Unicode.Scalar] = [] + for component in components { + switch component { + case "": + break + case "×": // no grapheme break opportunity + break + case "÷": // grapheme break opportunity + guard !piece.isEmpty else { break } + pieces.append(piece) + piece = [] + case _: // hexadecimal scalar value + guard let value = UInt32(component, radix: 16) else { return nil } + guard let scalar = Unicode.Scalar(value) else { return nil } string.unicodeScalars.append(scalar) - } else { - // Otherwise, it is a grapheme breaking operator. - - // If this is a break, record the +1 count. Otherwise it is × which is - // not a break. - if components[i] == "÷" { - count += 1 - } + piece.append(scalar) } } - - result.append((string, count)) + if !piece.isEmpty { pieces.append(piece) } + self.string = string + self.pieces = pieces } } -public let graphemeBreakTests: [(String, Int)] = { - var result: [(String, Int)] = [] - +public let graphemeBreakTests: [GraphemeBreakTest] = { let testFile = readInputFile("GraphemeBreakTest.txt") - - parseGraphemeBreakTests(testFile, into: &result) - - return result + return testFile.split(separator: "\n") + .compactMap { GraphemeBreakTest(line: $0) } }() #endif diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 3646636a9cd25..b602485b72da4 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -2,7 +2,7 @@ // // This source file is part of the Swift.org open source project // -// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information @@ -436,6 +436,117 @@ internal struct _GraphemeBreakingState { var shouldBreakRI = false } +extension Unicode { + /// A state machine for recognizing character (i.e., extended grapheme + /// cluster) boundaries in an arbitrary series of Unicode scalars. + /// + /// To detect grapheme breaks in a sequence of Unicode scalars, feed each of + /// them to the `hasBreak(before:)` method. The method returns true if the + /// sequence has a grapheme break preceding the given value. + /// + /// The results produced by this state machine are guaranteed to match the way + /// `String` splits its contents into `Character` values. + @available(SwiftStdlib 5.8, *) + public // SPI(Foundation) FIXME: We need API for this + struct _CharacterRecognizer { + internal var _previous: Unicode.Scalar + internal var _state: _GraphemeBreakingState + + /// Returns a non-nil value if it can be determined whether there is a + /// grapheme break between `scalar1` and `scalar2` without knowing anything + /// about the scalars that precede `scalar1`. This can optionally be used as + /// a fast (but incomplete) test before spinning up a full state machine + /// session. + @_effects(releasenone) + public static func quickBreak( + between scalar1: Unicode.Scalar, + and scalar2: Unicode.Scalar + ) -> Bool? { + if scalar1.value == 0xD, scalar2.value == 0xA { + return false + } + if _hasGraphemeBreakBetween(scalar1, scalar2) { + return true + } + return nil + } + + /// Initialize a new character recognizer at the _start of text_ (sot) + /// position. + /// + /// The resulting state machine will report a grapheme break on the + /// first scalar that is fed to it. + public init() { + _state = _GraphemeBreakingState() + // To avoid having to handle the empty case specially, we use NUL as the + // placeholder before the first scalar. NUL is a control character, so per + // rule GB5, it will induce an unconditional grapheme break before the + // first actual scalar, emulating GB1. + _previous = Unicode.Scalar(0 as UInt8) + } + + /// Feeds the next scalar to the state machine, returning a Boolean value + /// indicating whether it starts a new extended grapheme cluster. + /// + /// This method will always report a break the first time it is called + /// on a newly initialized recognizer. + /// + /// The state machine does not carry information across character + /// boundaries. I.e., if this method returns true, then `self` after the + /// call is equivalent to feeding the same scalar to a newly initialized + /// recognizer instance. + @_effects(releasenone) + public mutating func hasBreak( + before next: Unicode.Scalar + ) -> Bool { + let r = _state.shouldBreak(between: _previous, and: next) + if r { + _state = _GraphemeBreakingState() + } + _previous = next + return r + } + + /// Decode the scalars in the given UTF-8 buffer and feed them to the + /// recognizer up to and including the scalar following the first grapheme + /// break. If the buffer contains a grapheme break, then this function + /// returns the index range of the scalar that follows the first one; + /// otherwise it returns `nil`. + /// + /// On return, the state of the recognizer is updated to reflect the scalars + /// up to and including the returned one. You can detect additional grapheme + /// breaks by feeding the recognizer subsequent data. + /// + /// - Parameter buffer: A buffer containing valid UTF-8 data, starting and + /// ending on Unicode scalar boundaries. + /// + /// - Parameter start: A valid index into `buffer`, addressing the first + /// code unit of a UTF-8 scalar in the buffer, or the end. + /// + /// - Returns: The index range of the scalar that follows the first grapheme + /// break in the buffer, if there is one. If the buffer contains no + /// grapheme breaks, then this function returns `nil`. + /// + /// - Warning: This function does not validate that the buffer contains + /// valid UTF-8 data; its behavior is undefined if given invalid input. + @_effects(releasenone) + public mutating func _firstBreak( + inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer, + startingAt start: Int = 0 + ) -> Range? { + var i = start + while i < buffer.endIndex { + let (next, n) = _decodeScalar(buffer, startingAt: i) + if hasBreak(before: next) { + return Range(_uncheckedBounds: (i, i &+ n)) + } + i &+= n + } + return nil + } + } +} + extension _StringGuts { // Returns the stride of the grapheme cluster starting at offset `index`, // assuming it is on a grapheme cluster boundary. @@ -459,7 +570,7 @@ extension _StringGuts { while true { guard let (scalar2, nextIndex) = nextScalar(index) else { break } - if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) { + if state.shouldBreak(between: scalar, and: scalar2) { break } index = nextIndex @@ -505,7 +616,7 @@ extension _StringGuts { } } -extension _StringGuts { +extension _GraphemeBreakingState { // Return true if there is an extended grapheme cluster boundary between two // scalars, based on state information previously collected about preceding // scalars. @@ -517,11 +628,9 @@ extension _StringGuts { // // This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules). - internal func shouldBreak( + internal mutating func shouldBreak( between scalar1: Unicode.Scalar, - and scalar2: Unicode.Scalar, - at index: Int, - with state: inout _GraphemeBreakingState + and scalar2: Unicode.Scalar ) -> Bool { // GB3 if scalar1.value == 0xD, scalar2.value == 0xA { @@ -545,8 +654,8 @@ extension _StringGuts { var enterIndicSequence = false defer { - state.isInEmojiSequence = enterEmojiSequence - state.isInIndicSequence = enterIndicSequence + self.isInEmojiSequence = enterEmojiSequence + self.isInIndicSequence = enterIndicSequence } switch (x, y) { @@ -591,14 +700,14 @@ extension _StringGuts { // continue the grapheme cluster by combining more scalars later. If we're // not currently in an emoji sequence, but our lhs scalar is a pictograph, // then that's a signal that it's the start of an emoji sequence. - if state.isInEmojiSequence || x == .extendedPictographic { + if self.isInEmojiSequence || x == .extendedPictographic { enterEmojiSequence = true } // If we're currently in an indic sequence (or if our lhs is a linking // consonant), then this check and everything underneath ensures that // we continue being in one and may check if this extend is a Virama. - if state.isInIndicSequence || scalar1._isLinkingConsonant { + if self.isInIndicSequence || scalar1._isLinkingConsonant { if y == .extend { let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300) @@ -611,7 +720,7 @@ extension _StringGuts { enterIndicSequence = true if scalar2._isVirama { - state.hasSeenVirama = true + self.hasSeenVirama = true } } @@ -627,32 +736,34 @@ extension _StringGuts { // GB11 case (.zwj, .extendedPictographic): - return !state.isInEmojiSequence + return !self.isInEmojiSequence // GB12 & GB13 case (.regionalIndicator, .regionalIndicator): defer { - state.shouldBreakRI.toggle() + self.shouldBreakRI.toggle() } - return state.shouldBreakRI + return self.shouldBreakRI // GB999 default: // GB9c if - state.isInIndicSequence, - state.hasSeenVirama, + self.isInIndicSequence, + self.hasSeenVirama, scalar2._isLinkingConsonant { - state.hasSeenVirama = false + self.hasSeenVirama = false return false } return true } } +} +extension _StringGuts { // Return true if there is an extended grapheme cluster boundary between two // scalars, with no previous knowledge about preceding scalars. // diff --git a/test/stdlib/CharacterRecognizer.swift b/test/stdlib/CharacterRecognizer.swift new file mode 100644 index 0000000000000..6bacbaaacac54 --- /dev/null +++ b/test/stdlib/CharacterRecognizer.swift @@ -0,0 +1,116 @@ +// RUN: %empty-directory(%t) +// RUN: %target-run-stdlib-swift %S/Inputs/ + +// REQUIRES: executable_test +// REQUIRES: objc_interop +// REQUIRES: optimized_stdlib + +import Swift +import StdlibUnittest +import StdlibUnicodeUnittest + +var suite = TestSuite("CharacterRecognizer") +defer { runAllTests() } + +if #available(SwiftStdlib 5.8, *) { + suite.test("Unicode test data/hasBreak") { + for test in graphemeBreakTests { + var recognizer = Unicode._CharacterRecognizer() + var pieces: [[Unicode.Scalar]] = [] + var piece: [Unicode.Scalar] = [] + for scalar in test.string.unicodeScalars { + if recognizer.hasBreak(before: scalar) { + if !piece.isEmpty { pieces.append(piece) } + piece = [scalar] + } else { + piece.append(scalar) + } + } + if !piece.isEmpty { pieces.append(piece) } + expectEqual(pieces, test.pieces, + "string: \(String(reflecting: test.string))") + } + } +} + +func scalars(in buffer: some Sequence) -> [Unicode.Scalar] { + var result: [Unicode.Scalar] = [] + var it = buffer.makeIterator() + var utf8Decoder = UTF8() + while true { + switch utf8Decoder.decode(&it) { + case .scalarValue(let v): result.append(v) + case .emptyInput: return result + case .error: expectTrue(false, "Invalid scalar") + } + } +} + +if #available(SwiftStdlib 5.8, *) { + suite.test("Unicode test data/_firstBreak") { + for test in graphemeBreakTests { + var recognizer = Unicode._CharacterRecognizer() + var pieces: [[Unicode.Scalar]] = [] + var str = test.string + str.withUTF8 { buffer in + var i = buffer.startIndex + var last = i + while i < buffer.endIndex { + guard let scalar = recognizer._firstBreak( + inUncheckedUnsafeUTF8Buffer: buffer, startingAt: i) + else { break } + + if scalar.lowerBound > last { + pieces.append(scalars(in: buffer[last.. startIndex { + let i = self.index(before: j) + r.append(Array(self[i.. Bool } -func getUTF16Array(from string: String) -> [UInt16] { - var result: [UInt16] = [] - - for cp in string.utf16 { - result.append(cp) - } - - return result -} - if #available(SwiftStdlib 5.6, *) { StringGraphemeBreaking.test("grapheme breaking foreign") { - for graphemeBreakTest in graphemeBreakTests { - let foreignTest = NonContiguousNSString( - getUTF16Array(from: graphemeBreakTest.0) - ) - let test = foreignTest as String + for test in graphemeBreakTests { + let foreign = NonContiguousNSString(Array(test.string.utf16)) + let string = foreign as String - expectTrue(test._guts._isForeign()) + expectTrue(string._guts._isForeign()) expectEqual( - graphemeBreakTest.1, test.count, - "string: \(String(reflecting: graphemeBreakTest.0))") + string.forwardPieces, test.pieces, + "string: \(String(reflecting: test.string)) (forward)") expectEqual( - graphemeBreakTest.1, test.backwardsCount, - "string: \(String(reflecting: graphemeBreakTest.0))") + string.backwardPieces, test.pieces, + "string: \(String(reflecting: test.string)) (backward)") } } } - -runAllTests()