From e562133c0a251bb90c284e795bb59957fc506ad1 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 29 Dec 2022 17:56:45 -0800 Subject: [PATCH 1/5] [test] Improve grapheme breaking tests Instead of just checking the number of breaks in each test case, expose and check the actual positions of those breaks, too. --- .../GraphemeBreaking.swift | 67 +++++++--------- test/stdlib/StringIndex.swift | 4 +- .../stdlib/StringGraphemeBreaking.swift | 78 +++++++++---------- 3 files changed, 69 insertions(+), 80 deletions(-) diff --git a/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift b/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift index 2cd26d6ae5708..c8c23482c6740 100644 --- a/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift +++ b/stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift @@ -15,56 +15,47 @@ #if _runtime(_ObjC) import Foundation -func parseGraphemeBreakTests( - _ data: String, - into result: inout [(String, Int)] -) { - for line in data.split(separator: "\n") { +public struct GraphemeBreakTest { + public let string: String + public let pieces: [[Unicode.Scalar]] + + init?(line: some StringProtocol) { // Only look at actual tests - guard line.hasPrefix("÷") else { - continue - } + guard line.hasPrefix("÷") else { return nil } let info = line.split(separator: "#") let components = info[0].split(separator: " ") var string = "" - var count = 0 - - for i in components.indices { - guard i != 0 else { - continue - } - - let scalar: Unicode.Scalar - - // If we're an odd index, this is a scalar. - if i & 0x1 == 1 { - scalar = Unicode.Scalar(UInt32(components[i], radix: 16)!)! - + var pieces: [[Unicode.Scalar]] = [] + + var piece: [Unicode.Scalar] = [] + for component in components { + switch component { + case "": + break + case "×": // no grapheme break opportunity + break + case "÷": // grapheme break opportunity + guard !piece.isEmpty else { break } + pieces.append(piece) + piece = [] + case _: // hexadecimal scalar value + guard let value = UInt32(component, radix: 16) else { return nil } + guard let scalar = Unicode.Scalar(value) else { return nil } string.unicodeScalars.append(scalar) - } else { - // Otherwise, it is a grapheme breaking operator. - - // If this is a break, record the +1 count. Otherwise it is × which is - // not a break. - if components[i] == "÷" { - count += 1 - } + piece.append(scalar) } } - - result.append((string, count)) + if !piece.isEmpty { pieces.append(piece) } + self.string = string + self.pieces = pieces } } -public let graphemeBreakTests: [(String, Int)] = { - var result: [(String, Int)] = [] - +public let graphemeBreakTests: [GraphemeBreakTest] = { let testFile = readInputFile("GraphemeBreakTest.txt") - - parseGraphemeBreakTests(testFile, into: &result) - - return result + return testFile.split(separator: "\n") + .compactMap { GraphemeBreakTest(line: $0) } }() #endif diff --git a/test/stdlib/StringIndex.swift b/test/stdlib/StringIndex.swift index 00af44c45436c..aee779fffd6f3 100644 --- a/test/stdlib/StringIndex.swift +++ b/test/stdlib/StringIndex.swift @@ -656,8 +656,8 @@ suite.test("Fully exhaustive index interchange") #if _runtime(_ObjC) suite.test("Fully exhaustive index interchange/GraphemeBreakTests") { - for string in graphemeBreakTests.map { $0.0 } { - fullyExhaustiveIndexInterchange(string) + for test in graphemeBreakTests { + fullyExhaustiveIndexInterchange(test.string) } } #endif diff --git a/validation-test/stdlib/StringGraphemeBreaking.swift b/validation-test/stdlib/StringGraphemeBreaking.swift index 3ae6c5147cca4..549918afe154a 100644 --- a/validation-test/stdlib/StringGraphemeBreaking.swift +++ b/validation-test/stdlib/StringGraphemeBreaking.swift @@ -10,47 +10,59 @@ import StdlibUnicodeUnittest import Foundation let StringGraphemeBreaking = TestSuite("StringGraphemeBreaking") +defer { runAllTests() } extension String { - var backwardsCount: Int { - var c = 0 - var index = endIndex - while index != startIndex { - c += 1 - formIndex(before: &index) + var forwardPieces: [[Unicode.Scalar]] { + var i = startIndex + var r: [[Unicode.Scalar]] = [] + while i < endIndex { + let j = self.index(after: i) + r.append(Array(self[i.. startIndex { + let i = self.index(before: j) + r.append(Array(self[i.. Bool } -func getUTF16Array(from string: String) -> [UInt16] { - var result: [UInt16] = [] - - for cp in string.utf16 { - result.append(cp) - } - - return result -} - if #available(SwiftStdlib 5.6, *) { StringGraphemeBreaking.test("grapheme breaking foreign") { - for graphemeBreakTest in graphemeBreakTests { - let foreignTest = NonContiguousNSString( - getUTF16Array(from: graphemeBreakTest.0) - ) - let test = foreignTest as String + for test in graphemeBreakTests { + let foreign = NonContiguousNSString(Array(test.string.utf16)) + let string = foreign as String - expectTrue(test._guts._isForeign()) + expectTrue(string._guts._isForeign()) expectEqual( - graphemeBreakTest.1, test.count, - "string: \(String(reflecting: graphemeBreakTest.0))") + string.forwardPieces, test.pieces, + "string: \(String(reflecting: test.string)) (forward)") expectEqual( - graphemeBreakTest.1, test.backwardsCount, - "string: \(String(reflecting: graphemeBreakTest.0))") + string.backwardPieces, test.pieces, + "string: \(String(reflecting: test.string)) (backward)") } } } - -runAllTests() From 6e5097a9ecdff4c8e646ab0b094f47f56635883a Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 29 Dec 2022 18:04:02 -0800 Subject: [PATCH 2/5] [stdlib] String: Move shouldBreak into _GraphemeBreakingState This turns _GraphemeBreakingState into a more proper state machine, although it is only able to recognize breaks in the forward direction. The backward direction requires arbitrarily long lookback, and it currently remains in _StringGuts. --- .../public/core/StringGraphemeBreaking.swift | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 3646636a9cd25..03ed080741d7a 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -2,7 +2,7 @@ // // This source file is part of the Swift.org open source project // -// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors +// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information @@ -459,7 +459,7 @@ extension _StringGuts { while true { guard let (scalar2, nextIndex) = nextScalar(index) else { break } - if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) { + if state.shouldBreak(between: scalar, and: scalar2) { break } index = nextIndex @@ -505,7 +505,7 @@ extension _StringGuts { } } -extension _StringGuts { +extension _GraphemeBreakingState { // Return true if there is an extended grapheme cluster boundary between two // scalars, based on state information previously collected about preceding // scalars. @@ -517,11 +517,9 @@ extension _StringGuts { // // This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules). - internal func shouldBreak( + internal mutating func shouldBreak( between scalar1: Unicode.Scalar, - and scalar2: Unicode.Scalar, - at index: Int, - with state: inout _GraphemeBreakingState + and scalar2: Unicode.Scalar ) -> Bool { // GB3 if scalar1.value == 0xD, scalar2.value == 0xA { @@ -545,8 +543,8 @@ extension _StringGuts { var enterIndicSequence = false defer { - state.isInEmojiSequence = enterEmojiSequence - state.isInIndicSequence = enterIndicSequence + self.isInEmojiSequence = enterEmojiSequence + self.isInIndicSequence = enterIndicSequence } switch (x, y) { @@ -591,14 +589,14 @@ extension _StringGuts { // continue the grapheme cluster by combining more scalars later. If we're // not currently in an emoji sequence, but our lhs scalar is a pictograph, // then that's a signal that it's the start of an emoji sequence. - if state.isInEmojiSequence || x == .extendedPictographic { + if self.isInEmojiSequence || x == .extendedPictographic { enterEmojiSequence = true } // If we're currently in an indic sequence (or if our lhs is a linking // consonant), then this check and everything underneath ensures that // we continue being in one and may check if this extend is a Virama. - if state.isInIndicSequence || scalar1._isLinkingConsonant { + if self.isInIndicSequence || scalar1._isLinkingConsonant { if y == .extend { let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300) @@ -611,7 +609,7 @@ extension _StringGuts { enterIndicSequence = true if scalar2._isVirama { - state.hasSeenVirama = true + self.hasSeenVirama = true } } @@ -627,32 +625,34 @@ extension _StringGuts { // GB11 case (.zwj, .extendedPictographic): - return !state.isInEmojiSequence + return !self.isInEmojiSequence // GB12 & GB13 case (.regionalIndicator, .regionalIndicator): defer { - state.shouldBreakRI.toggle() + self.shouldBreakRI.toggle() } - return state.shouldBreakRI + return self.shouldBreakRI // GB999 default: // GB9c if - state.isInIndicSequence, - state.hasSeenVirama, + self.isInIndicSequence, + self.hasSeenVirama, scalar2._isLinkingConsonant { - state.hasSeenVirama = false + self.hasSeenVirama = false return false } return true } } +} +extension _StringGuts { // Return true if there is an extended grapheme cluster boundary between two // scalars, with no previous knowledge about preceding scalars. // From 699a3f0ee509de4c457b06c5e7179aea55211175 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Thu, 29 Dec 2022 18:17:58 -0800 Subject: [PATCH 3/5] [stdlib] Add new SPI for grapheme breaking (outside String) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Unicode._CharacterRecognizer` is a newly exported opaque type that exposes the stdlib’s extended grapheme cluster breaking facility, independent of `String`. This essentially makes the underlying simple state machine public, without exposing any of the (unstable) Unicode details. The ability to perform grapheme breaking over, say, the scalars stored in multiple `String` values can be extremely useful while building custom text processing algorithms and data structures. Ideally this would eventually become API, but before proposing this to Swift Evolution, I’d like to prove the shape of the type in actual use (and we’ll also need to find better names for its operations). --- .../public/core/StringGraphemeBreaking.swift | 64 ++++++++++++++++ test/stdlib/CharacterRecognizer.swift | 76 +++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 test/stdlib/CharacterRecognizer.swift diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 03ed080741d7a..c34e217f05212 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -436,6 +436,70 @@ internal struct _GraphemeBreakingState { var shouldBreakRI = false } +extension Unicode { + /// A state machine for recognizing `Character` (i.e., extended grapheme + /// cluster) boundaries in an arbitrary series of Unicode scalars. + /// + /// The recognizer needs to be initialized with the first scalar in the + /// series. Subsequent scalars must then be fed one by one to the + /// `hasCharacterBoundary(before:)` method, which returns a Boolean value + /// indicating whether the given scalar starts a new `Character`. + /// + /// The results produced by this state machine are guaranteed to match the way + /// `String` splits its contents into `Character` values. + @available(SwiftStdlib 5.8, *) + public // SPI(Foundation) FIXME: We need API for this + struct _CharacterRecognizer { + internal var _previous: Unicode.Scalar + internal var _state: _GraphemeBreakingState + + /// Returns a non-nil value if it can be determined whether there is a + /// `Character` break between `scalar1` and `scalar2` without knowing + /// anything about the scalars that precede `scalar1`. This can be used as a + /// fast (but incomplete) test before spinning up a full state machine + /// session. + @_effects(releasenone) + public static func quickBreak( + between scalar1: Unicode.Scalar, + and scalar2: Unicode.Scalar + ) -> Bool? { + if scalar1.value == 0xD, scalar2.value == 0xA { + return false + } + if _hasGraphemeBreakBetween(scalar1, scalar2) { + return true + } + return nil + } + + /// Initialize a new `Character` recognizer, feeding it the given value as + /// the first Unicode scalar in the series. The state machine assumes that + /// `first` is supposed to start a new extended grapheme cluster. + public init(first: Unicode.Scalar) { + _state = _GraphemeBreakingState() + _previous = first + } + + /// Feeds the next scalar to the state machine, returning a Boolean value + /// indicating whether it starts a new `Character`. + /// + /// The state machine does not carry information across `Character` + /// boundaries. I.e., if this method returns true, then `self` after the + /// call is equivalent to `_CharacterRecognizer(first: next)`. + @_effects(releasenone) + public mutating func hasCharacterBoundary( + before next: Unicode.Scalar + ) -> Bool { + let r = _state.shouldBreak(between: _previous, and: next) + if r { + _state = _GraphemeBreakingState() + } + _previous = next + return r + } + } +} + extension _StringGuts { // Returns the stride of the grapheme cluster starting at offset `index`, // assuming it is on a grapheme cluster boundary. diff --git a/test/stdlib/CharacterRecognizer.swift b/test/stdlib/CharacterRecognizer.swift new file mode 100644 index 0000000000000..7cc39c219924a --- /dev/null +++ b/test/stdlib/CharacterRecognizer.swift @@ -0,0 +1,76 @@ +// RUN: %empty-directory(%t) +// RUN: %target-run-stdlib-swift %S/Inputs/ + +// REQUIRES: executable_test +// REQUIRES: objc_interop +// REQUIRES: optimized_stdlib + +import Swift +import StdlibUnittest +import StdlibUnicodeUnittest + +var suite = TestSuite("CharacterRecognizer") +defer { runAllTests() } + +if #available(SwiftStdlib 5.8, *) { + suite.test("Unicode test data") { + for test in graphemeBreakTests { + var it = test.string.unicodeScalars.makeIterator() + guard let first = it.next() else { continue } + var recognizer = Unicode._CharacterRecognizer(first: first) + var pieces: [[Unicode.Scalar]] = [] + var piece: [Unicode.Scalar] = [first] + while let next = it.next() { + if recognizer.hasCharacterBoundary(before: next) { + pieces.append(piece) + piece = [next] + } else { + piece.append(next) + } + } + if !piece.isEmpty { pieces.append(piece) } + expectEqual(pieces, test.pieces, + "string: \(String(reflecting: test.string))") + } + } +} + +if #available(SwiftStdlib 5.8, *) { + suite.test("Consistency with Swift String's behavior") { + let sampleString = #""" + The powerful programming language that is also easy to learn. + 손쉽게 학습할 수 있는 강력한 프로그래밍 언어. + 🪙 A 🥞 short 🍰 piece 🫘 of 🌰 text 👨‍👨‍👧‍👧 with 👨‍👩‍👦 some 🚶🏽 emoji 🇺🇸🇨🇦 characters 🧈 + some🔩times 🛺 placed 🎣 in 🥌 the 🆘 mid🔀dle 🇦🇶or🏁 around 🏳️‍🌈 a 🍇 w🍑o🥒r🥨d + Unicode is such fun! + U̷n̷i̷c̷o̴d̴e̷ ̶i̸s̷ ̸s̵u̵c̸h̷ ̸f̵u̷n̴!̵ + U̴̡̲͋̾n̵̻̳͌ì̶̠̕c̴̭̈͘ǫ̷̯͋̊d̸͖̩̈̈́ḛ̴́ ̴̟͎͐̈i̴̦̓s̴̜̱͘ ̶̲̮̚s̶̙̞͘u̵͕̯̎̽c̵̛͕̜̓h̶̘̍̽ ̸̜̞̿f̵̤̽ṷ̴͇̎͘ń̷͓̒!̷͍̾̚ + U̷̢̢̧̨̼̬̰̪͓̞̠͔̗̼̙͕͕̭̻̗̮̮̥̣͉̫͉̬̲̺͍̺͊̂ͅ\# + n̶̨̢̨̯͓̹̝̲̣̖̞̼̺̬̤̝̊̌́̑̋̋͜͝ͅ\# + ḭ̸̦̺̺͉̳͎́͑\# + c̵̛̘̥̮̙̥̟̘̝͙̤̮͉͔̭̺̺̅̀̽̒̽̏̊̆͒͌̂͌̌̓̈́̐̔̿̂͑͠͝͝ͅ\# + ö̶̱̠̱̤̙͚͖̳̜̰̹̖̣̻͎͉̞̫̬̯͕̝͔̝̟̘͔̙̪̭̲́̆̂͑̌͂̉̀̓́̏̎̋͗͛͆̌̽͌̄̎̚͝͝͝͝ͅ\# + d̶̨̨̡̡͙̟͉̱̗̝͙͍̮͍̘̮͔͑\# + e̶̢͕̦̜͔̘̘̝͈̪̖̺̥̺̹͉͎͈̫̯̯̻͑͑̿̽͂̀̽͋́̎̈́̈̿͆̿̒̈́̽̔̇͐͛̀̓͆̏̾̀̌̈́̆̽̕ͅ + """# + + let expectedBreaks = Array(sampleString.indices) + + let u = sampleString.unicodeScalars + var i = u.startIndex + var actualBreaks = [i] + var recognizer = Unicode._CharacterRecognizer(first: u[i]) + u.formIndex(after: &i) + while i < u.endIndex { + if recognizer.hasCharacterBoundary(before: u[i]) { + actualBreaks.append(i) + } + u.formIndex(after: &i) + } + expectEqual(actualBreaks, expectedBreaks, + """ + actualBreaks: \(actualBreaks.map { $0._description }) + expectedBreaks: \(expectedBreaks.map { $0._description }) + """) + } +} From 9922b00955715c4bc3a48cb1a0bc528986f4ed21 Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 3 Jan 2023 20:59:24 -0800 Subject: [PATCH 4/5] [stdlib] _CharacterRecognizer: Remove initializer argument --- .../public/core/StringGraphemeBreaking.swift | 43 +++++++++++-------- test/stdlib/CharacterRecognizer.swift | 30 ++++++------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index c34e217f05212..1b08295fe169e 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -437,13 +437,12 @@ internal struct _GraphemeBreakingState { } extension Unicode { - /// A state machine for recognizing `Character` (i.e., extended grapheme + /// A state machine for recognizing character (i.e., extended grapheme /// cluster) boundaries in an arbitrary series of Unicode scalars. /// - /// The recognizer needs to be initialized with the first scalar in the - /// series. Subsequent scalars must then be fed one by one to the - /// `hasCharacterBoundary(before:)` method, which returns a Boolean value - /// indicating whether the given scalar starts a new `Character`. + /// To detect grapheme breaks in a sequence of Unicode scalars, feed each of + /// them to the `hasBreak(before:)` method. The method returns true if the + /// sequence has a grapheme break preceding the given value. /// /// The results produced by this state machine are guaranteed to match the way /// `String` splits its contents into `Character` values. @@ -454,9 +453,9 @@ extension Unicode { internal var _state: _GraphemeBreakingState /// Returns a non-nil value if it can be determined whether there is a - /// `Character` break between `scalar1` and `scalar2` without knowing - /// anything about the scalars that precede `scalar1`. This can be used as a - /// fast (but incomplete) test before spinning up a full state machine + /// grapheme break between `scalar1` and `scalar2` without knowing anything + /// about the scalars that precede `scalar1`. This can optionally be used as + /// a fast (but incomplete) test before spinning up a full state machine /// session. @_effects(releasenone) public static func quickBreak( @@ -472,22 +471,32 @@ extension Unicode { return nil } - /// Initialize a new `Character` recognizer, feeding it the given value as - /// the first Unicode scalar in the series. The state machine assumes that - /// `first` is supposed to start a new extended grapheme cluster. - public init(first: Unicode.Scalar) { + /// Initialize a new character recognizer at the _start of text_ (sot) + /// position. + /// + /// The resulting state machine will report a grapheme break on the + /// first scalar that is fed to it. + public init() { _state = _GraphemeBreakingState() - _previous = first + // To avoid having to handle the empty case specially, we use NUL as the + // placeholder before the first scalar. NUL is a control character, so per + // rule GB5, it will induce an unconditional grapheme break before the + // first actual scalar, emulating GB1. + _previous = Unicode.Scalar(0 as UInt8) } /// Feeds the next scalar to the state machine, returning a Boolean value - /// indicating whether it starts a new `Character`. + /// indicating whether it starts a new extended grapheme cluster. + /// + /// This method will always report a break the first time it is called + /// on a newly initialized recognizer. /// - /// The state machine does not carry information across `Character` + /// The state machine does not carry information across character /// boundaries. I.e., if this method returns true, then `self` after the - /// call is equivalent to `_CharacterRecognizer(first: next)`. + /// call is equivalent to feeding the same scalar to a newly initialized + /// recognizer instance. @_effects(releasenone) - public mutating func hasCharacterBoundary( + public mutating func hasBreak( before next: Unicode.Scalar ) -> Bool { let r = _state.shouldBreak(between: _previous, and: next) diff --git a/test/stdlib/CharacterRecognizer.swift b/test/stdlib/CharacterRecognizer.swift index 7cc39c219924a..013715895fcd3 100644 --- a/test/stdlib/CharacterRecognizer.swift +++ b/test/stdlib/CharacterRecognizer.swift @@ -13,19 +13,17 @@ var suite = TestSuite("CharacterRecognizer") defer { runAllTests() } if #available(SwiftStdlib 5.8, *) { - suite.test("Unicode test data") { + suite.test("Unicode test data/hasBreak") { for test in graphemeBreakTests { - var it = test.string.unicodeScalars.makeIterator() - guard let first = it.next() else { continue } - var recognizer = Unicode._CharacterRecognizer(first: first) + var recognizer = Unicode._CharacterRecognizer() var pieces: [[Unicode.Scalar]] = [] - var piece: [Unicode.Scalar] = [first] - while let next = it.next() { - if recognizer.hasCharacterBoundary(before: next) { - pieces.append(piece) - piece = [next] + var piece: [Unicode.Scalar] = [] + for scalar in test.string.unicodeScalars { + if recognizer.hasBreak(before: scalar) { + if !piece.isEmpty { pieces.append(piece) } + piece = [scalar] } else { - piece.append(next) + piece.append(scalar) } } if !piece.isEmpty { pieces.append(piece) } @@ -57,15 +55,13 @@ if #available(SwiftStdlib 5.8, *) { let expectedBreaks = Array(sampleString.indices) let u = sampleString.unicodeScalars - var i = u.startIndex - var actualBreaks = [i] - var recognizer = Unicode._CharacterRecognizer(first: u[i]) - u.formIndex(after: &i) - while i < u.endIndex { - if recognizer.hasCharacterBoundary(before: u[i]) { + + var recognizer = Unicode._CharacterRecognizer() + var actualBreaks: [String.Index] = [] + for i in u.indices { + if recognizer.hasBreak(before: u[i]) { actualBreaks.append(i) } - u.formIndex(after: &i) } expectEqual(actualBreaks, expectedBreaks, """ From 83f983e0b1c2d084e55f84735cc6443536e876cb Mon Sep 17 00:00:00 2001 From: Karoy Lorentey Date: Tue, 3 Jan 2023 21:00:01 -0800 Subject: [PATCH 5/5] [stdlib] _CharacterRecognizer._firstBreak(inUncheckedUnsafeUTF8Buffer:startingAt:) --- .../public/core/StringGraphemeBreaking.swift | 38 +++++++++++++++ test/stdlib/CharacterRecognizer.swift | 48 ++++++++++++++++++- 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 1b08295fe169e..b602485b72da4 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -506,6 +506,44 @@ extension Unicode { _previous = next return r } + + /// Decode the scalars in the given UTF-8 buffer and feed them to the + /// recognizer up to and including the scalar following the first grapheme + /// break. If the buffer contains a grapheme break, then this function + /// returns the index range of the scalar that follows the first one; + /// otherwise it returns `nil`. + /// + /// On return, the state of the recognizer is updated to reflect the scalars + /// up to and including the returned one. You can detect additional grapheme + /// breaks by feeding the recognizer subsequent data. + /// + /// - Parameter buffer: A buffer containing valid UTF-8 data, starting and + /// ending on Unicode scalar boundaries. + /// + /// - Parameter start: A valid index into `buffer`, addressing the first + /// code unit of a UTF-8 scalar in the buffer, or the end. + /// + /// - Returns: The index range of the scalar that follows the first grapheme + /// break in the buffer, if there is one. If the buffer contains no + /// grapheme breaks, then this function returns `nil`. + /// + /// - Warning: This function does not validate that the buffer contains + /// valid UTF-8 data; its behavior is undefined if given invalid input. + @_effects(releasenone) + public mutating func _firstBreak( + inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer, + startingAt start: Int = 0 + ) -> Range? { + var i = start + while i < buffer.endIndex { + let (next, n) = _decodeScalar(buffer, startingAt: i) + if hasBreak(before: next) { + return Range(_uncheckedBounds: (i, i &+ n)) + } + i &+= n + } + return nil + } } } diff --git a/test/stdlib/CharacterRecognizer.swift b/test/stdlib/CharacterRecognizer.swift index 013715895fcd3..6bacbaaacac54 100644 --- a/test/stdlib/CharacterRecognizer.swift +++ b/test/stdlib/CharacterRecognizer.swift @@ -33,9 +33,50 @@ if #available(SwiftStdlib 5.8, *) { } } +func scalars(in buffer: some Sequence) -> [Unicode.Scalar] { + var result: [Unicode.Scalar] = [] + var it = buffer.makeIterator() + var utf8Decoder = UTF8() + while true { + switch utf8Decoder.decode(&it) { + case .scalarValue(let v): result.append(v) + case .emptyInput: return result + case .error: expectTrue(false, "Invalid scalar") + } + } +} + if #available(SwiftStdlib 5.8, *) { - suite.test("Consistency with Swift String's behavior") { - let sampleString = #""" + suite.test("Unicode test data/_firstBreak") { + for test in graphemeBreakTests { + var recognizer = Unicode._CharacterRecognizer() + var pieces: [[Unicode.Scalar]] = [] + var str = test.string + str.withUTF8 { buffer in + var i = buffer.startIndex + var last = i + while i < buffer.endIndex { + guard let scalar = recognizer._firstBreak( + inUncheckedUnsafeUTF8Buffer: buffer, startingAt: i) + else { break } + + if scalar.lowerBound > last { + pieces.append(scalars(in: buffer[last..