Skip to content

[stdlib] Export grapheme breaking facility #62794

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 5, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 29 additions & 38 deletions stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,56 +15,47 @@
#if _runtime(_ObjC)
import Foundation

func parseGraphemeBreakTests(
_ data: String,
into result: inout [(String, Int)]
) {
for line in data.split(separator: "\n") {
public struct GraphemeBreakTest {
public let string: String
public let pieces: [[Unicode.Scalar]]

init?(line: some StringProtocol) {
// Only look at actual tests
guard line.hasPrefix("÷") else {
continue
}
guard line.hasPrefix("÷") else { return nil }

let info = line.split(separator: "#")
let components = info[0].split(separator: " ")

var string = ""
var count = 0

for i in components.indices {
guard i != 0 else {
continue
}

let scalar: Unicode.Scalar

// If we're an odd index, this is a scalar.
if i & 0x1 == 1 {
scalar = Unicode.Scalar(UInt32(components[i], radix: 16)!)!

var pieces: [[Unicode.Scalar]] = []

var piece: [Unicode.Scalar] = []
for component in components {
switch component {
case "":
break
case "×": // no grapheme break opportunity
break
case "÷": // grapheme break opportunity
guard !piece.isEmpty else { break }
pieces.append(piece)
piece = []
case _: // hexadecimal scalar value
guard let value = UInt32(component, radix: 16) else { return nil }
guard let scalar = Unicode.Scalar(value) else { return nil }
string.unicodeScalars.append(scalar)
} else {
// Otherwise, it is a grapheme breaking operator.

// If this is a break, record the +1 count. Otherwise it is × which is
// not a break.
if components[i] == "÷" {
count += 1
}
piece.append(scalar)
}
}

result.append((string, count))
if !piece.isEmpty { pieces.append(piece) }
self.string = string
self.pieces = pieces
}
}

public let graphemeBreakTests: [(String, Int)] = {
var result: [(String, Int)] = []

public let graphemeBreakTests: [GraphemeBreakTest] = {
let testFile = readInputFile("GraphemeBreakTest.txt")

parseGraphemeBreakTests(testFile, into: &result)

return result
return testFile.split(separator: "\n")
.compactMap { GraphemeBreakTest(line: $0) }
}()
#endif
100 changes: 82 additions & 18 deletions stdlib/public/core/StringGraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
Expand Down Expand Up @@ -436,6 +436,70 @@ internal struct _GraphemeBreakingState {
var shouldBreakRI = false
}

extension Unicode {
  /// A state machine that locates `Character` (i.e., extended grapheme
  /// cluster) boundaries in an arbitrary sequence of Unicode scalars.
  ///
  /// Create a recognizer with the first scalar of the sequence, then pass
  /// every following scalar, in order, to `hasCharacterBoundary(before:)`.
  /// Each call reports whether the scalar it was given begins a new
  /// `Character`.
  ///
  /// The boundaries reported by this state machine are guaranteed to agree
  /// with the way `String` divides its contents into `Character` values.
  @available(SwiftStdlib 5.8, *)
  public // SPI(Foundation) FIXME: We need API for this
  struct _CharacterRecognizer {
    /// The scalar most recently fed to the recognizer.
    internal var _previous: Unicode.Scalar

    /// Breaking state collected since the last reported boundary.
    internal var _state: _GraphemeBreakingState

    /// Attempts to decide, from the two scalars alone, whether a `Character`
    /// break separates `scalar1` and `scalar2`.
    ///
    /// Returns `false` for CR followed by LF, `true` when
    /// `_hasGraphemeBreakBetween` reports a break, and `nil` when the answer
    /// cannot be determined without knowing the scalars that precede
    /// `scalar1`. Use it as a fast (but incomplete) pre-check before starting
    /// a full state machine session.
    @_effects(releasenone)
    public static func quickBreak(
      between scalar1: Unicode.Scalar,
      and scalar2: Unicode.Scalar
    ) -> Bool? {
      // CR followed by LF is answered up front: no break.
      let isCRLF = scalar1.value == 0xD && scalar2.value == 0xA
      guard !isCRLF else { return false }

      // Anything the context-free check does not flag as a break is left
      // undecided.
      guard _hasGraphemeBreakBetween(scalar1, scalar2) else { return nil }
      return true
    }

    /// Creates a new `Character` recognizer whose series of Unicode scalars
    /// begins with `first`. The state machine assumes that `first` is
    /// supposed to start a new extended grapheme cluster.
    public init(first: Unicode.Scalar) {
      _previous = first
      _state = _GraphemeBreakingState()
    }

    /// Feeds `next` to the state machine and returns a Boolean value
    /// indicating whether it starts a new `Character`.
    ///
    /// No information is carried across `Character` boundaries: when this
    /// method returns true, `self` after the call is equivalent to
    /// `_CharacterRecognizer(first: next)`.
    @_effects(releasenone)
    public mutating func hasCharacterBoundary(
      before next: Unicode.Scalar
    ) -> Bool {
      // Whatever the outcome, `next` becomes the left-hand scalar of the
      // following query.
      defer { _previous = next }

      guard _state.shouldBreak(between: _previous, and: next) else {
        return false
      }

      // A boundary was found; start the new cluster from a clean state.
      _state = _GraphemeBreakingState()
      return true
    }
  }
}

extension _StringGuts {
// Returns the stride of the grapheme cluster starting at offset `index`,
// assuming it is on a grapheme cluster boundary.
Expand All @@ -459,7 +523,7 @@ extension _StringGuts {

while true {
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) {
if state.shouldBreak(between: scalar, and: scalar2) {
break
}
index = nextIndex
Expand Down Expand Up @@ -505,7 +569,7 @@ extension _StringGuts {
}
}

extension _StringGuts {
extension _GraphemeBreakingState {
// Return true if there is an extended grapheme cluster boundary between two
// scalars, based on state information previously collected about preceding
// scalars.
Expand All @@ -517,11 +581,9 @@ extension _StringGuts {
//
// This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
internal func shouldBreak(
internal mutating func shouldBreak(
between scalar1: Unicode.Scalar,
and scalar2: Unicode.Scalar,
at index: Int,
with state: inout _GraphemeBreakingState
and scalar2: Unicode.Scalar
) -> Bool {
// GB3
if scalar1.value == 0xD, scalar2.value == 0xA {
Expand All @@ -545,8 +607,8 @@ extension _StringGuts {
var enterIndicSequence = false

defer {
state.isInEmojiSequence = enterEmojiSequence
state.isInIndicSequence = enterIndicSequence
self.isInEmojiSequence = enterEmojiSequence
self.isInIndicSequence = enterIndicSequence
}

switch (x, y) {
Expand Down Expand Up @@ -591,14 +653,14 @@ extension _StringGuts {
// continue the grapheme cluster by combining more scalars later. If we're
// not currently in an emoji sequence, but our lhs scalar is a pictograph,
// then that's a signal that it's the start of an emoji sequence.
if state.isInEmojiSequence || x == .extendedPictographic {
if self.isInEmojiSequence || x == .extendedPictographic {
enterEmojiSequence = true
}

// If we're currently in an indic sequence (or if our lhs is a linking
// consonant), then this check and everything underneath ensures that
// we continue being in one and may check if this extend is a Virama.
if state.isInIndicSequence || scalar1._isLinkingConsonant {
if self.isInIndicSequence || scalar1._isLinkingConsonant {
if y == .extend {
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)

Expand All @@ -611,7 +673,7 @@ extension _StringGuts {
enterIndicSequence = true

if scalar2._isVirama {
state.hasSeenVirama = true
self.hasSeenVirama = true
}
}

Expand All @@ -627,32 +689,34 @@ extension _StringGuts {

// GB11
case (.zwj, .extendedPictographic):
return !state.isInEmojiSequence
return !self.isInEmojiSequence

// GB12 & GB13
case (.regionalIndicator, .regionalIndicator):
defer {
state.shouldBreakRI.toggle()
self.shouldBreakRI.toggle()
}

return state.shouldBreakRI
return self.shouldBreakRI

// GB999
default:
// GB9c
if
state.isInIndicSequence,
state.hasSeenVirama,
self.isInIndicSequence,
self.hasSeenVirama,
scalar2._isLinkingConsonant
{
state.hasSeenVirama = false
self.hasSeenVirama = false
return false
}

return true
}
}
}

extension _StringGuts {
// Return true if there is an extended grapheme cluster boundary between two
// scalars, with no previous knowledge about preceding scalars.
//
Expand Down
76 changes: 76 additions & 0 deletions test/stdlib/CharacterRecognizer.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// RUN: %empty-directory(%t)
// RUN: %target-run-stdlib-swift %S/Inputs/

// REQUIRES: executable_test
// REQUIRES: objc_interop
// REQUIRES: optimized_stdlib

import Swift
import StdlibUnittest
import StdlibUnicodeUnittest

// The test suite that the tests below are registered with.
var suite = TestSuite("CharacterRecognizer")
// Deferred so that the tests run when the enclosing top-level scope exits,
// i.e. after the `suite.test` registrations that follow.
defer { runAllTests() }

if #available(SwiftStdlib 5.8, *) {
  // Feeds the scalars of every entry in `graphemeBreakTests` through a
  // `_CharacterRecognizer` and checks that the clusters it delimits equal the
  // entry's expected `pieces`.
  suite.test("Unicode test data") {
    for test in graphemeBreakTests {
      var scalars = test.string.unicodeScalars.makeIterator()
      // A test with an empty string has nothing to segment.
      guard let head = scalars.next() else { continue }

      var recognizer = Unicode._CharacterRecognizer(first: head)
      // The last element is always the cluster currently being built; it
      // starts out holding just the first scalar.
      var clusters: [[Unicode.Scalar]] = [[head]]
      while let scalar = scalars.next() {
        if recognizer.hasCharacterBoundary(before: scalar) {
          clusters.append([scalar])
        } else {
          clusters[clusters.count - 1].append(scalar)
        }
      }

      expectEqual(clusters, test.pieces,
        "string: \(String(reflecting: test.string))")
    }
  }
}

if #available(SwiftStdlib 5.8, *) {
suite.test("Consistency with Swift String's behavior") {
let sampleString = #"""
The powerful programming language that is also easy to learn.
손쉽게 학습할 수 있는 강력한 프로그래밍 언어.
🪙 A 🥞 short 🍰 piece 🫘 of 🌰 text 👨‍👨‍👧‍👧 with 👨‍👩‍👦 some 🚶🏽 emoji 🇺🇸🇨🇦 characters 🧈
some🔩times 🛺 placed 🎣 in 🥌 the 🆘 mid🔀dle 🇦🇶or🏁 around 🏳️‍🌈 a 🍇 w🍑o🥒r🥨d
Unicode is such fun!
U̷n̷i̷c̷o̴d̴e̷ ̶i̸s̷ ̸s̵u̵c̸h̷ ̸f̵u̷n̴!̵
U̴̡̲͋̾n̵̻̳͌ì̶̠̕c̴̭̈͘ǫ̷̯͋̊d̸͖̩̈̈́ḛ̴́ ̴̟͎͐̈i̴̦̓s̴̜̱͘ ̶̲̮̚s̶̙̞͘u̵͕̯̎̽c̵̛͕̜̓h̶̘̍̽ ̸̜̞̿f̵̤̽ṷ̴͇̎͘ń̷͓̒!̷͍̾̚
U̷̢̢̧̨̼̬̰̪͓̞̠͔̗̼̙͕͕̭̻̗̮̮̥̣͉̫͉̬̲̺͍̺͊̂ͅ\#
n̶̨̢̨̯͓̹̝̲̣̖̞̼̺̬̤̝̊̌́̑̋̋͜͝ͅ\#
ḭ̸̦̺̺͉̳͎́͑\#
c̵̛̘̥̮̙̥̟̘̝͙̤̮͉͔̭̺̺̅̀̽̒̽̏̊̆͒͌̂͌̌̓̈́̐̔̿̂͑͠͝͝ͅ\#
ö̶̱̠̱̤̙͚͖̳̜̰̹̖̣̻͎͉̞̫̬̯͕̝͔̝̟̘͔̙̪̭̲́̆̂͑̌͂̉̀̓́̏̎̋͗͛͆̌̽͌̄̎̚͝͝͝͝ͅ\#
d̶̨̨̡̡͙̟͉̱̗̝͙͍̮͍̘̮͔͑\#
e̶̢͕̦̜͔̘̘̝͈̪̖̺̥̺̹͉͎͈̫̯̯̻͑͑̿̽͂̀̽͋́̎̈́̈̿͆̿̒̈́̽̔̇͐͛̀̓͆̏̾̀̌̈́̆̽̕ͅ
"""#

// Reference result: the `Character` boundaries that `String` itself reports.
let expectedBreaks = Array(sampleString.indices)

// Replay the same text one scalar at a time, recording every position the
// recognizer flags as the start of a new `Character`. The first scalar is
// handed to the initializer, so its index is recorded up front.
let scalars = sampleString.unicodeScalars
var actualBreaks = [scalars.startIndex]
var recognizer = Unicode._CharacterRecognizer(
  first: scalars[scalars.startIndex])
for position in scalars.indices.dropFirst() {
  guard recognizer.hasCharacterBoundary(before: scalars[position]) else {
    continue
  }
  actualBreaks.append(position)
}

expectEqual(actualBreaks, expectedBreaks,
  """
  actualBreaks: \(actualBreaks.map { $0._description })
  expectedBreaks: \(expectedBreaks.map { $0._description })
  """)
}
}
4 changes: 2 additions & 2 deletions test/stdlib/StringIndex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -656,8 +656,8 @@ suite.test("Fully exhaustive index interchange")

#if _runtime(_ObjC)
suite.test("Fully exhaustive index interchange/GraphemeBreakTests") {
for string in graphemeBreakTests.map { $0.0 } {
fullyExhaustiveIndexInterchange(string)
for test in graphemeBreakTests {
fullyExhaustiveIndexInterchange(test.string)
}
}
#endif
Expand Down
Loading