swiftlang · lorentey · Jan 5, 2023 · Dec 30, 2022 · Dec 30, 2022 · Dec 30, 2022
@@ -15,56 +15,47 @@
 #if _runtime(_ObjC)
 import Foundation
 
-func parseGraphemeBreakTests(
-  _ data: String,
-  into result: inout [(String, Int)]
-) {
-  for line in data.split(separator: "\n") {
+public struct GraphemeBreakTest { // One parsed test line: its scalars and the pieces it expects.
+  public let string: String // Every scalar listed on the line, appended in order.
+  public let pieces: [[Unicode.Scalar]] // The same scalars, grouped into the runs delimited by ÷.
+
+  init?(line: some StringProtocol) { // nil if `line` lacks the ÷ prefix or a scalar fails to parse
     // Only look at actual tests
-    guard line.hasPrefix("÷") else {
-      continue
-    }
+    guard line.hasPrefix("÷") else { return nil }
 
     let info = line.split(separator: "#")
     let components = info[0].split(separator: " ")
 
     var string = ""
-    var count = 0
-
-    for i in components.indices {
-      guard i != 0 else {
-        continue
-      }
-
-      let scalar: Unicode.Scalar
-
-      // If we're an odd index, this is a scalar.
-      if i & 0x1 == 1 {
-        scalar = Unicode.Scalar(UInt32(components[i], radix: 16)!)!
-
+    var pieces: [[Unicode.Scalar]] = [] // pieces completed so far
+
+    var piece: [Unicode.Scalar] = [] // piece currently being accumulated
+    for component in components {
+      switch component {
+      case "": // nothing to record
+        break
+      case "×": // no grapheme break opportunity: the current piece keeps growing
+        break
+      case "÷": // grapheme break opportunity: close the current piece
+        guard !piece.isEmpty else { break } // no scalars collected yet, so there is nothing to close
+        pieces.append(piece)
+        piece = []
+      case _: // hexadecimal scalar value; a component that does not parse rejects the whole line
+        guard let value = UInt32(component, radix: 16) else { return nil }
+        guard let scalar = Unicode.Scalar(value) else { return nil }
         string.unicodeScalars.append(scalar)
-      } else {
-        // Otherwise, it is a grapheme breaking operator.
-
-        // If this is a break, record the +1 count. Otherwise it is × which is
-        // not a break.
-        if components[i] == "÷" {
-          count += 1
-        }
+        piece.append(scalar)
       }
     }
-
-    result.append((string, count))
+    if !piece.isEmpty { pieces.append(piece) } // flush a piece left open at the end of the line
+    self.string = string
+    self.pieces = pieces
   }
 }
 
-public let graphemeBreakTests: [(String, Int)] = {
-  var result: [(String, Int)] = []
-
+public let graphemeBreakTests: [GraphemeBreakTest] = { // Every test parsed from GraphemeBreakTest.txt.
   let testFile = readInputFile("GraphemeBreakTest.txt")
-
-  parseGraphemeBreakTests(testFile, into: &result)
-
-  return result
+  return testFile.split(separator: "\n") // one candidate test per line
+    .compactMap { GraphemeBreakTest(line: $0) } // lines the initializer rejects (nil) are dropped
 }()
 #endif
@@ -2,7 +2,7 @@
 //
 // This source file is part of the Swift.org open source project
 //
-// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
+// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
 // Licensed under Apache License v2.0 with Runtime Library Exception
 //
 // See https://swift.org/LICENSE.txt for license information
@@ -436,6 +436,70 @@ internal struct _GraphemeBreakingState {
   var shouldBreakRI = false
 }
 
+extension Unicode {
+  /// A state machine for recognizing `Character` (i.e., extended grapheme
+  /// cluster) boundaries in an arbitrary series of Unicode scalars.
+  ///
+  /// The recognizer needs to be initialized with the first scalar in the
+  /// series. Subsequent scalars must then be fed one by one to the
+  /// `hasCharacterBoundary(before:)` method, which returns a Boolean value
+  /// indicating whether the given scalar starts a new `Character`.
+  ///
+  /// The results produced by this state machine are guaranteed to match the way
+  /// `String` splits its contents into `Character` values.
+  @available(SwiftStdlib 5.8, *)
+  public // SPI(Foundation) FIXME: We need API for this
+  struct _CharacterRecognizer {
+    internal var _previous: Unicode.Scalar // the scalar fed in most recently (initially `first`)
+    internal var _state: _GraphemeBreakingState // replaced by a fresh value after each reported boundary
+
+    /// Returns a non-nil value if it can be determined whether there is a
+    /// `Character` break between `scalar1` and `scalar2` without knowing
+    /// anything about the scalars that precede `scalar1`; returns nil when
+    /// that is not possible. This can be used as a fast (but incomplete) test
+    /// before spinning up a full state machine session.
+    @_effects(releasenone)
+    public static func quickBreak(
+      between scalar1: Unicode.Scalar,
+      and scalar2: Unicode.Scalar
+    ) -> Bool? {
+      if scalar1.value == 0xD, scalar2.value == 0xA { // CR followed by LF: reported as no break
+        return false
+      }
+      if _hasGraphemeBreakBetween(scalar1, scalar2) { // helper defined outside this hunk; true means a definite break
+        return true
+      }
+      return nil // undetermined from these two scalars alone
+    }
+
+    /// Creates a new `Character` recognizer, feeding it the given value as
+    /// the first Unicode scalar in the series. The state machine assumes that
+    /// `first` is supposed to start a new extended grapheme cluster.
+    public init(first: Unicode.Scalar) {
+      _state = _GraphemeBreakingState() // default state: nothing recorded about earlier scalars
+      _previous = first
+    }
+
+    /// Feeds the next scalar to the state machine, returning a Boolean value
+    /// indicating whether it starts a new `Character`.
+    ///
+    /// The state machine does not carry information across `Character`
+    /// boundaries. I.e., if this method returns true, then `self` after the
+    /// call is equivalent to `_CharacterRecognizer(first: next)`.
+    @_effects(releasenone)
+    public mutating func hasCharacterBoundary(
+      before next: Unicode.Scalar
+    ) -> Bool {
+      let r = _state.shouldBreak(between: _previous, and: next) // mutating call: may also update _state
+      if r {
+        _state = _GraphemeBreakingState() // a boundary was found: discard everything collected so far
+      }
+      _previous = next // `next` is the left-hand scalar of the following call
+      return r
+    }
+  }
+}
+
 extension _StringGuts {
   // Returns the stride of the grapheme cluster starting at offset `index`,
   // assuming it is on a grapheme cluster boundary.
@@ -459,7 +523,7 @@ extension _StringGuts {
 
     while true {
       guard let (scalar2, nextIndex) = nextScalar(index) else { break }
-      if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) {
+      if state.shouldBreak(between: scalar, and: scalar2) {
         break
       }
       index = nextIndex
@@ -505,7 +569,7 @@ extension _StringGuts {
   }
 }
 
-extension _StringGuts {
+extension _GraphemeBreakingState {
   // Return true if there is an extended grapheme cluster boundary between two
   // scalars, based on state information previously collected about preceding
   // scalars.
@@ -517,11 +581,9 @@ extension _StringGuts {
   //
   // This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
   // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
-  internal func shouldBreak(
+  internal mutating func shouldBreak(
     between scalar1: Unicode.Scalar,
-    and scalar2: Unicode.Scalar,
-    at index: Int,
-    with state: inout _GraphemeBreakingState
+    and scalar2: Unicode.Scalar
   ) -> Bool {
     // GB3
     if scalar1.value == 0xD, scalar2.value == 0xA {
@@ -545,8 +607,8 @@ extension _StringGuts {
     var enterIndicSequence = false
 
     defer {
-      state.isInEmojiSequence = enterEmojiSequence
-      state.isInIndicSequence = enterIndicSequence
+      self.isInEmojiSequence = enterEmojiSequence
+      self.isInIndicSequence = enterIndicSequence
     }
 
     switch (x, y) {
@@ -591,14 +653,14 @@ extension _StringGuts {
       // continue the grapheme cluster by combining more scalars later. If we're
       // not currently in an emoji sequence, but our lhs scalar is a pictograph,
       // then that's a signal that it's the start of an emoji sequence.
-      if state.isInEmojiSequence || x == .extendedPictographic {
+      if self.isInEmojiSequence || x == .extendedPictographic {
         enterEmojiSequence = true
       }
 
       // If we're currently in an indic sequence (or if our lhs is a linking
       // consonant), then this check and everything underneath ensures that
       // we continue being in one and may check if this extend is a Virama.
-      if state.isInIndicSequence || scalar1._isLinkingConsonant {
+      if self.isInIndicSequence || scalar1._isLinkingConsonant {
         if y == .extend {
           let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
 
@@ -611,7 +673,7 @@ extension _StringGuts {
         enterIndicSequence = true
 
         if scalar2._isVirama {
-          state.hasSeenVirama = true
+          self.hasSeenVirama = true
         }
       }
 
@@ -627,32 +689,34 @@ extension _StringGuts {
 
     // GB11
     case (.zwj, .extendedPictographic):
-      return !state.isInEmojiSequence
+      return !self.isInEmojiSequence
 
     // GB12 & GB13
     case (.regionalIndicator, .regionalIndicator):
       defer {
-        state.shouldBreakRI.toggle()
+        self.shouldBreakRI.toggle()
       }
 
-      return state.shouldBreakRI
+      return self.shouldBreakRI
 
     // GB999
     default:
       // GB9c
       if
-        state.isInIndicSequence,
-        state.hasSeenVirama,
+        self.isInIndicSequence,
+        self.hasSeenVirama,
         scalar2._isLinkingConsonant
       {
-        state.hasSeenVirama = false
+        self.hasSeenVirama = false
         return false
       }
 
       return true
     }
   }
+}
 
+extension _StringGuts {
   // Return true if there is an extended grapheme cluster boundary between two
   // scalars, with no previous knowledge about preceding scalars.
   //

@@ -0,0 +1,76 @@
+// RUN: %empty-directory(%t)
+// RUN: %target-run-stdlib-swift %S/Inputs/
+
+// REQUIRES: executable_test
+// REQUIRES: objc_interop
+// REQUIRES: optimized_stdlib
+
+import Swift
+import StdlibUnittest
+import StdlibUnicodeUnittest
+
+var suite = TestSuite("CharacterRecognizer") // the suite.test calls below register into this suite
+defer { runAllTests() } // deferred so it executes after the registrations below
+
+if #available(SwiftStdlib 5.8, *) { // matches the availability used for the recognizer under test
+  suite.test("Unicode test data") { // segment each test string and compare with its expected pieces
+    for test in graphemeBreakTests {
+      var it = test.string.unicodeScalars.makeIterator()
+      guard let first = it.next() else { continue } // a string without scalars has nothing to segment
+      var recognizer = Unicode._CharacterRecognizer(first: first)
+      var pieces: [[Unicode.Scalar]] = [] // pieces completed so far
+      var piece: [Unicode.Scalar] = [first] // piece currently being accumulated
+      while let next = it.next() {
+        if recognizer.hasCharacterBoundary(before: next) { // `next` starts a new piece
+          pieces.append(piece)
+          piece = [next]
+        } else {
+          piece.append(next)
+        }
+      }
+      if !piece.isEmpty { pieces.append(piece) } // flush the final piece
+      expectEqual(pieces, test.pieces,
+        "string: \(String(reflecting: test.string))")
+    }
+  }
+}
+
+if #available(SwiftStdlib 5.8, *) { // matches the availability used for the recognizer under test
+  suite.test("Consistency with Swift String's behavior") { // recognizer breaks must equal the indices of the String
+    let sampleString = #"""
+    The powerful programming language that is also easy to learn.
+    손쉽게 학습할 수 있는 강력한 프로그래밍 언어.
+    🪙 A 🥞 short 🍰 piece 🫘 of 🌰 text 👨‍👨‍👧‍👧 with 👨‍👩‍👦 some 🚶🏽 emoji 🇺🇸🇨🇦 characters 🧈
+    some🔩times 🛺 placed 🎣 in 🥌 the 🆘 mid🔀dle 🇦🇶or🏁 around 🏳️‍🌈 a 🍇 w🍑o🥒r🥨d
+    Unicode is such fun!
+    U̷n̷i̷c̷o̴d̴e̷ ̶i̸s̷ ̸s̵u̵c̸h̷ ̸f̵u̷n̴!̵
+    U̴̡̲͋̾n̵̻̳͌ì̶̠̕c̴̭̈͘ǫ̷̯͋̊d̸͖̩̈̈́ḛ̴́ ̴̟͎͐̈i̴̦̓s̴̜̱͘ ̶̲̮̚s̶̙̞͘u̵͕̯̎̽c̵̛͕̜̓h̶̘̍̽ ̸̜̞̿f̵̤̽ṷ̴͇̎͘ń̷͓̒!̷͍̾̚
+    U̷̢̢̧̨̼̬̰̪͓̞̠͔̗̼̙͕͕̭̻̗̮̮̥̣͉̫͉̬̲̺͍̺͊̂ͅ\#
+    n̶̨̢̨̯͓̹̝̲̣̖̞̼̺̬̤̝̊̌́̑̋̋͜͝ͅ\#
+    ḭ̸̦̺̺͉̳͎́͑\#
+    c̵̛̘̥̮̙̥̟̘̝͙̤̮͉͔̭̺̺̅̀̽̒̽̏̊̆͒͌̂͌̌̓̈́̐̔̿̂͑͠͝͝ͅ\#
+    ö̶̱̠̱̤̙͚͖̳̜̰̹̖̣̻͎͉̞̫̬̯͕̝͔̝̟̘͔̙̪̭̲́̆̂͑̌͂̉̀̓́̏̎̋͗͛͆̌̽͌̄̎̚͝͝͝͝ͅ\#
+    d̶̨̨̡̡͙̟͉̱̗̝͙͍̮͍̘̮͔͑\#
+    e̶̢͕̦̜͔̘̘̝͈̪̖̺̥̺̹͉͎͈̫̯̯̻͑͑̿̽͂̀̽͋́̎̈́̈̿͆̿̒̈́̽̔̇͐͛̀̓͆̏̾̀̌̈́̆̽̕ͅ
+    """#
+
+    let expectedBreaks = Array(sampleString.indices) // reference: the indices String itself reports
+
+    let u = sampleString.unicodeScalars
+    var i = u.startIndex
+    var actualBreaks = [i] // the start index is always recorded as a break
+    var recognizer = Unicode._CharacterRecognizer(first: u[i])
+    u.formIndex(after: &i) // the first scalar was already handed to the initializer
+    while i < u.endIndex {
+      if recognizer.hasCharacterBoundary(before: u[i]) { // u[i] starts a new Character
+        actualBreaks.append(i)
+      }
+      u.formIndex(after: &i)
+    }
+    expectEqual(actualBreaks, expectedBreaks,
+      """
+      actualBreaks: \(actualBreaks.map { $0._description })
+      expectedBreaks: \(expectedBreaks.map { $0._description })
+      """)
+  }
+}
@@ -656,8 +656,8 @@ suite.test("Fully exhaustive index interchange")
 
 #if _runtime(_ObjC)
 suite.test("Fully exhaustive index interchange/GraphemeBreakTests") {
-  for string in graphemeBreakTests.map { $0.0 } {
-    fullyExhaustiveIndexInterchange(string)
+  for test in graphemeBreakTests { // elements are GraphemeBreakTest values
+    fullyExhaustiveIndexInterchange(test.string) // only the string is used here; test.pieces is not consulted
   }
 }
 #endif