diff --git a/stdlib/public/SwiftShims/UnicodeData.h b/stdlib/public/SwiftShims/UnicodeData.h index 6256c36a22367..87c6bb7a92e79 100644 --- a/stdlib/public/SwiftShims/UnicodeData.h +++ b/stdlib/public/SwiftShims/UnicodeData.h @@ -62,6 +62,9 @@ __swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x, SWIFT_RUNTIME_STDLIB_INTERNAL __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar); +SWIFT_RUNTIME_STDLIB_INTERNAL +__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar); + //===----------------------------------------------------------------------===// // Unicode.Scalar.Properties //===----------------------------------------------------------------------===// diff --git a/stdlib/public/core/StringGraphemeBreaking.swift b/stdlib/public/core/StringGraphemeBreaking.swift index 04ee06a3781c6..b85d5b7aaefe2 100644 --- a/stdlib/public/core/StringGraphemeBreaking.swift +++ b/stdlib/public/core/StringGraphemeBreaking.swift @@ -10,6 +10,8 @@ // //===----------------------------------------------------------------------===// +import SwiftShims + /// CR and LF are common special cases in grapheme breaking logic private var _CR: UInt8 { return 0x0d } private var _LF: UInt8 { return 0x0a } @@ -175,13 +177,56 @@ extension _StringGuts { } } +extension Unicode.Scalar { + fileprivate var _isLinkingConsonant: Bool { + _swift_stdlib_isLinkingConsonant(value) + } + + fileprivate var _isVirama: Bool { + switch value { + // Devanagari + case 0x94D: + return true + // Bengali + case 0x9CD: + return true + // Gujarati + case 0xACD: + return true + // Oriya + case 0xB4D: + return true + // Telugu + case 0xC4D: + return true + // Malayalam + case 0xD4D: + return true + + default: + return false + } + } +} + internal struct _GraphemeBreakingState { + // When we're looking through an indic sequence, one of the requirements is + // that there is at LEAST 1 Virama present between two linking consonants. + // This value helps ensure that when we ultimately need to decide whether or + // not to break that we've at least seen 1 when walking. + var hasSeenVirama = false + // When walking forwards in a string, we need to know whether or not we've // entered an emoji sequence to be able to eventually break after all of the // emoji's various extenders and zero width joiners. This bit allows us to // keep track of whether or not we're still in an emoji sequence when deciding // to break. - var isInEmojiSequence: Bool = false + var isInEmojiSequence = false + + // Similar to emoji sequences, we need to know not to break an Indic grapheme + // sequence. This sequence is (potentially) composed of many scalars and isn't + // as trivial as comparing two grapheme properties. + var isInIndicSequence = false // When walking forward in a string, we need to not break on emoji flag // sequences. Emoji flag sequences are composed of 2 regional indicators, so @@ -190,7 +235,7 @@ internal struct _GraphemeBreakingState { // is another regional indicator, we reach the same decision rule, but in this // case we actually need to break there's a boundary between emoji flag // sequences. - var shouldBreakRI: Bool = false + var shouldBreakRI = false } extension _StringGuts { @@ -288,8 +333,12 @@ extension _StringGuts { // continue treating the current grapheme cluster as an emoji sequence. var enterEmojiSequence = false + // Very similar to emoji sequences, but for Indic grapheme sequences. + var enterIndicSequence = false + defer { state.isInEmojiSequence = enterEmojiSequence + state.isInIndicSequence = enterIndicSequence } switch (x, y) { @@ -338,6 +387,26 @@ extension _StringGuts { enterEmojiSequence = true } + // If we're currently in an indic sequence (or if our lhs is a linking + // consonant), then this check and everything underneath ensures that + // we continue being in one and may check if this extend is a Virama. + if state.isInIndicSequence || scalar1._isLinkingConsonant { + if y == .extend { + let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300) + + // If our extend's CCC is 0, then this rule does not apply. + guard extendNormData.ccc != 0 else { + return false + } + } + + enterIndicSequence = true + + if scalar2._isVirama { + state.hasSeenVirama = true + } + } + return false // GB9a @@ -370,6 +439,32 @@ extension _StringGuts { // GB999 default: + // GB9c + if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant { + state.hasSeenVirama = false + return false + } + + // Handle GB9c when walking backwards. + if isBackwards { + switch (x, scalar2._isLinkingConsonant) { + case (.extend, true): + let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300) + + guard extendNormData.ccc != 0 else { + return true + } + + return !checkIfInIndicSequence(index) + + case (.zwj, true): + return !checkIfInIndicSequence(index) + + default: + return true + } + } + return true } } @@ -417,9 +512,7 @@ extension _StringGuts { // | = We found our starting .extendedPictographic letting us // know that we are in an emoji sequence so our initial // break question is answered as NO. - internal func checkIfInEmojiSequence( - _ index: Int - ) -> Bool { + internal func checkIfInEmojiSequence(_ index: Int) -> Bool { var emojiIdx = String.Index(_encodedOffset: index) guard emojiIdx != startIndex else { @@ -447,7 +540,91 @@ extension _StringGuts { return false } - + + // When walking backwards, it's impossible to know whether we break when we + // see our first ((.extend|.zwj), .linkingConsonant) without walking + // further backwards. This walks the string backwards enough until we figure + // out whether or not to break this indic sequence. For example: + // + // Scalar view #1: + // + // [.virama, .extend, .linkingConsonant] + // ^ + // | = To be able to know whether or not to break these + // two, we need to walk backwards to determine if + // this is a legitimate indic sequence. + // ^ + // | = The scalar sequence ends without a starting linking consonant, + // so this is in fact not an indic sequence, so we can break the two. + // + // Scalar view #2: + // + // [.linkingConsonant, .virama, .extend, .linkingConsonant] + // ^ + // | = Same as above + // ^ + // | = This is a virama, so we at least have seen + // 1 to be able to return true if we see a + // linking consonant later. + // ^ + // | = Is a linking consonant and we've seen a virama, so this is a + // legitimate indic sequence, so do NOT break the initial question. + internal func checkIfInIndicSequence(_ index: Int) -> Bool { + var indicIdx = String.Index(_encodedOffset: index) + + guard indicIdx != startIndex else { + return false + } + + let scalars = String.UnicodeScalarView(self) + scalars.formIndex(before: &indicIdx) + + var hasSeenVirama = false + + // Check if the first extend was the Virama. + let scalar = scalars[indicIdx] + + if scalar._isVirama { + hasSeenVirama = true + } + + while indicIdx != startIndex { + scalars.formIndex(before: &indicIdx) + let scalar = scalars[indicIdx] + + let gbp = Unicode._GraphemeBreakProperty(from: scalar) + + switch (gbp, scalar._isLinkingConsonant) { + case (.extend, false): + let extendNormData = Unicode._NormData(scalar, fastUpperbound: 0x300) + + guard extendNormData.ccc != 0 else { + return false + } + + if scalar._isVirama { + hasSeenVirama = true + } + + case (.zwj, false): + continue + + // LinkingConsonant + case (_, true): + guard hasSeenVirama else { + return false + } + + return true + + default: + return false + } + } + + return false + } + // When walking backwards, it's impossible to know whether we break when we // see our first (.regionalIndicator, .regionalIndicator) without walking // further backwards. This walks the string backwards enough until we figure diff --git a/stdlib/public/stubs/Unicode/Common/GraphemeData.h b/stdlib/public/stubs/Unicode/Common/GraphemeData.h index 5c7b3fa81ac51..06af914d99550 100644 --- a/stdlib/public/stubs/Unicode/Common/GraphemeData.h +++ b/stdlib/public/stubs/Unicode/Common/GraphemeData.h @@ -20,7 +20,7 @@ #define GRAPHEME_BREAK_DATA_COUNT 621 -static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = { +static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = { 0x3E00000, 0x400007F, 0x800000A9, 0xAD, 0x800000AE, 0x2DE00300, 0x20C00483, 0x25800591, 0x200005BF, 0x202005C1, 0x202005C4, 0x200005C7, 0x40A00600, 0x21400610, 0x61C, 0x2280064B, 0x20000670, 0x20C006D6, 0x400006DD, 0x20A006DF, 0x202006E7, 0x206006EA, 0x4000070F, 0x20000711, @@ -101,4 +101,41 @@ static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = { 0xB701F947, 0x3EE0000, 0x2BEE0020, 0xFEE0080, 0x3DEE0100, }; +static const __swift_uint16_t _swift_stdlib_linkingConsonant_ranks[165] = { + 0x0, 0xE, 0xE, 0x12, 0x13, 0x0, 0x0, 0x0, 0x25, 0x35, 0x0, 0x20, 0x25, 0x46, 0x4B, 0x0, 0x17, + 0x23, 0x3D, 0x44, 0x0, 0xA, 0x24, 0x31, 0x4B, 0x0, 0x1, 0x27, 0x27, 0x49, 0x0, 0xF, 0x2E, 0x3A, + 0x57, 0x0, 0x0, 0x1F, 0x2C, 0x2C, 0x0, 0x21, 0x2D, 0x3C, 0x3C, 0x0, 0x0, 0x0, 0xD, 0x2C, 0x0, + 0x2D, 0x30, 0x30, 0x30, 0x0, 0x0, 0x0, 0x1E, 0x31, 0x0, 0x2C, 0x2C, 0x63, 0x72, 0x0, 0x0, 0x0, + 0x29, 0x2F, 0x0, 0x26, 0x4A, 0x51, 0x51, 0x0, 0x18, 0x39, 0x54, 0x68, 0x0, 0x18, 0x2F, 0x53, 0x64, + 0x0, 0x23, 0x39, 0x69, 0x72, 0x0, 0x0, 0x0, 0xE, 0x18, 0x0, 0x0, 0xF, 0x25, 0x25, 0x0, 0x25, 0x26, + 0x49, 0x49, 0x0, 0x19, 0x37, 0x59, 0x61, 0x0, 0xC, 0x24, 0x52, 0x5D, 0x0, 0x8, 0x8, 0x8, 0x2A, + 0x0, 0x0, 0x21, 0x21, 0x21, 0x0, 0x2, 0x21, 0x23, 0x43, 0x0, 0x16, 0x22, 0x3D, 0x44, 0x0, 0x0, + 0x0, 0x22, 0x22, 0x0, 0x0, 0x0, 0x22, 0x22, 0x0, 0x22, 0x28, 0x4B, 0x73, 0x0, 0x0, 0x21, 0x21, + 0x3F, 0x0, 0x0, 0x25, 0x39, 0x43, 0x0, 0x12, 0x12, 0x12, 0x12, +}; + +static const __swift_uint64_t _swift_stdlib_linkingConsonant[166] = { + 0x5, 0x7E0FF00, 0x0, 0x3C0000000, 0x400000000000000, 0x5BFF, 0x0, 0x0, 0x3FFFFFFFFE00000, + 0xFF000000FF000000, 0x0, 0x3C5FDFFFFE0, 0x30000B000, 0x36DFDFFFFE0, 0x5E00, 0xFFE0, 0x3EDFDFF, + 0xFFE0000002000000, 0xB000000003EDFDFF, 0xD620000000020000, 0xC718, 0x3FF, 0xFDFFFFE000000000, + 0x700000003FF, 0xFDFFFFE000000000, 0x3EF, 0x40000000, 0x7FFFFFFFFE00000, 0x0, 0x2FFBFFFFFC000000, + 0x7F, 0xFFFE000000000000, 0x7FFFFFFF, 0xF7D6000000000000, 0x7FAFFFFF, 0xF000, 0x0, + 0xFFFFFEFF00000000, 0x1FFF, 0x0, 0x0, 0x1FFFFFFFF0000, 0xC0623C0300008000, 0x4003FFE1, 0x0, 0x0, + 0x0, 0x0, 0xFFF8000000000000, 0xFFF80003FFF88003, 0x3, 0xFFFFFFFF0001DFF8, 0x7, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x7FFFFFFE0000, 0x7FFFF00000000, 0x0, 0xFFFFFFFFFFF, 0x0, 0xFFFFFFFF007FFFFF, 0x181FFF, + 0x0, 0x0, 0x0, 0x1FE0000FFFFFFFF8, 0xFC00000000000000, 0xFFFF, 0xFFFFFFFF3800C001, + 0xFFFFFFFF0000000F, 0xE0000000000F, 0x0, 0x0, 0xFFFFF78000000000, 0x3FFFFFFF00000007, + 0xFFFC00000005FE3C, 0xFFFFF, 0x0, 0x3FFFFFC000000, 0x7FFFFF, 0xFFFFFFFF8E000000, + 0xFF9F000000000007, 0x7C00, 0x1FFFFFFFFC0, 0xC40EFFFF00000000, 0xFFFFFFFFFFFF, 0x7FC00000000, 0x0, + 0x0, 0x0, 0x3FFF000000000000, 0x7FD, 0x0, 0x0, 0xFEEF000100000000, 0x3FFFFF, 0x0, 0x0, + 0xFFFFFFFFF80000, 0x20000000000000, 0xFFFFFFFFE000, 0x0, 0xFF80, 0x900000007FFFFF, 0x7FFFFFFE0, + 0x7FFFFFFFE, 0xFF00000000000000, 0xFFFB, 0xFFF, 0xBFFFBD7000000000, 0x7FFFFFFFFC0001FF, + 0xFFE0000000000000, 0xFDFF, 0x3ED, 0x0, 0x0, 0xFFFFFFFFC0000000, 0x1F, 0x0, 0xFFFFFFFF8000, 0x0, + 0x0, 0x0, 0xC000000000000000, 0x7FFFFFFF, 0xC000000000000000, 0xFFFFFFFF, 0x0, 0xFFFFFC0000000000, + 0x10007FF, 0x7FFFFFF00000000, 0x7F00000000, 0x0, 0x0, 0x0, 0xFFFFFFFFC000000, 0x0, 0x0, 0x0, 0x0, + 0xFFFFFF6FF000, 0x0, 0x0, 0xFFFFFFFFC0000000, 0xF800000000000001, 0x7FFFFFFFF, 0xFFFFFFFFFF000, + 0x0, 0x0, 0x7FFFFFFFC0000000, 0x0, 0xFFFFFFFC, 0x0, 0x0, 0x1FFFFFFFFF000, 0xFFFFF00000000000, + 0x3FF, 0x0, 0x3FFFF, 0x0, 0x0, 0x0, 0x0, +}; + #endif // #ifndef GRAPHEME_DATA_H diff --git a/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp b/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp index 6916df342205e..be34b868f8e93 100644 --- a/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp +++ b/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp @@ -12,6 +12,7 @@ #include "Common/GraphemeData.h" #include "../SwiftShims/UnicodeData.h" +#include SWIFT_RUNTIME_STDLIB_INTERNAL __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) { @@ -57,3 +58,16 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) // property). Return the max value here to indicate .any. return 0xFF; } + +SWIFT_RUNTIME_STDLIB_INTERNAL +__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar) { + auto idx = _swift_stdlib_getScalarBitArrayIdx(scalar, + _swift_stdlib_linkingConsonant, + _swift_stdlib_linkingConsonant_ranks); + + if (idx == std::numeric_limits<__swift_intptr_t>::max()) { + return false; + } + + return true; +} diff --git a/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/IndicRules.swift b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/IndicRules.swift new file mode 100644 index 0000000000000..daf4f7aef26f1 --- /dev/null +++ b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/IndicRules.swift @@ -0,0 +1,179 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + +import GenUtils + +func getLinkingConsonant( + from data: String +) -> [ClosedRange] { + var unflattened: [(ClosedRange, String)] = [] + + for line in data.split(separator: "\n") { + // Skip comments + guard !line.hasPrefix("#") else { + continue + } + + let components = line.split(separator: ";") + + // Get the property first because it may be one we don't care about. + let splitProperty = components[1].split(separator: "#") + let filteredProperty = splitProperty[0].filter { !$0.isWhitespace } + + // We only care about Linking Consonant who is defined as 'Consonant'. + guard filteredProperty == "Consonant" else { + continue + } + + // This rule only applies to the following scripts, so ensure that these + // scalars are from such scripts. + for script in ["Bengali", "Devanagari", "Gujarati", "Oriya", "Telugu", "Malayalam"] { + guard line.contains(script.uppercased()) else { + continue + } + + break + } + + let scalars: ClosedRange + + let filteredScalars = components[0].filter { !$0.isWhitespace } + + // If we have . appear, it means we have a legitimate range. Otherwise, + // it's a singular scalar. + if filteredScalars.contains(".") { + let range = filteredScalars.split(separator: ".") + + scalars = UInt32(range[0], radix: 16)! ... UInt32(range[1], radix: 16)! + } else { + let scalar = UInt32(filteredScalars, radix: 16)! + + scalars = scalar ... scalar + } + + unflattened.append((scalars, "Consonant")) + } + + return flatten(unflattened).map { $0.0 } +} + +func emitLinkingConsonant( + _ data: [ClosedRange], + into result: inout String +) { + // 64 bit arrays * 8 bytes = .512 KB + var bitArrays: [BitArray] = .init(repeating: .init(size: 64), count: 64) + + let chunkSize = 0x110000 / 64 / 64 + + var chunks: [Int] = [] + + for i in 0 ..< 64 * 64 { + let lower = i * chunkSize + let upper = lower + chunkSize - 1 + + let idx = i / 64 + let bit = i % 64 + + for scalar in lower ... upper { + if data.contains(where: { $0.contains(UInt32(scalar)) }) { + chunks.append(i) + + bitArrays[idx][bit] = true + break + } + } + } + + // Remove the trailing 0s. Currently this reduces quick look size down to + // 96 bytes from 512 bytes. + var reducedBA = Array(bitArrays.reversed()) + reducedBA = Array(reducedBA.drop { + $0.words == [0x0] + }) + + bitArrays = reducedBA.reversed() + + // Keep a record of every rank for all the bitarrays. + var ranks: [UInt16] = [] + + // Record our quick look ranks. + var lastRank: UInt16 = 0 + for (i, _) in bitArrays.enumerated() { + guard i != 0 else { + ranks.append(0) + continue + } + + var rank = UInt16(bitArrays[i - 1].words[0].nonzeroBitCount) + rank += lastRank + + ranks.append(rank) + + lastRank = rank + } + + // Insert our quick look size at the beginning. + var size = BitArray(size: 64) + size.words = [UInt64(bitArrays.count)] + bitArrays.insert(size, at: 0) + + for chunk in chunks { + var chunkBA = BitArray(size: chunkSize) + + let lower = chunk * chunkSize + let upper = lower + chunkSize + + for scalar in lower ..< upper { + if data.contains(where: { $0.contains(UInt32(scalar)) }) { + chunkBA[scalar % chunkSize] = true + } + } + + // Append our chunk bit array's rank. + var lastRank: UInt16 = 0 + for (i, _) in chunkBA.words.enumerated() { + guard i != 0 else { + ranks.append(0) + continue + } + + var rank = UInt16(chunkBA.words[i - 1].nonzeroBitCount) + rank += lastRank + + ranks.append(rank) + lastRank = rank + } + + bitArrays += chunkBA.words.map { + var ba = BitArray(size: 64) + ba.words = [$0] + return ba + } + } + + emitCollection( + ranks, + name: "_swift_stdlib_linkingConsonant_ranks", + into: &result + ) + + emitCollection( + bitArrays, + name: "_swift_stdlib_linkingConsonant", + type: "__swift_uint64_t", + into: &result + ) { + assert($0.words.count == 1) + return "0x\(String($0.words[0], radix: 16, uppercase: true))" + } +} diff --git a/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift index 2d042158c621e..d8b2aae407ee2 100644 --- a/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift +++ b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift @@ -48,37 +48,6 @@ extension Unicode { } } -// Takes an unflattened array of scalar ranges and grapheme break properties and -// attempts to merge ranges who share the same break property. E.g: -// -// 0x0 ... 0xA = .control -// 0xB ... 0xB = .control -// 0xC ... 0x1F = .control -// -// into: -// -// 0x0 ... 0x1F = .control -func flatten( - _ unflattened: [(ClosedRange, Unicode.GraphemeBreakProperty)] -) -> [(ClosedRange, Unicode.GraphemeBreakProperty)] { - var result: [(ClosedRange, Unicode.GraphemeBreakProperty)] = [] - - for elt in unflattened.sorted(by: { $0.0.lowerBound < $1.0.lowerBound }) { - guard !result.isEmpty, result.last!.1 == elt.1 else { - result.append(elt) - continue - } - - if elt.0.lowerBound == result.last!.0.upperBound + 1 { - result[result.count - 1].0 = result.last!.0.lowerBound ... elt.0.upperBound - } else { - result.append(elt) - } - } - - return result -} - // Given a path to one of the Unicode data files, reads it and returns the // unflattened list of scalar & grapheme break property. // @@ -150,7 +119,9 @@ func emit( into result: inout String ) { result += """ - static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[\(data.count)] = { + #define GRAPHEME_BREAK_DATA_COUNT \(data.count) + + static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[\(data.count)] = { """ @@ -181,69 +152,20 @@ func emit( value |= 1 << 31 } - return "0x\(String(value, radix: 16))" + return "0x\(String(value, radix: 16, uppercase: true))" } - result += "\n};\n\n" -} - -// Writes the stdlib internal routine for binary searching the grapheme array. -func emitAccessor( - _ dataCount: Int, - into result: inout String -) { result += """ - SWIFT_RUNTIME_STDLIB_INTERNAL - __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) { - auto low = 0; - auto high = \(dataCount) - 1; - - while (high >= low) { - auto idx = low + (high - low) / 2; - - auto entry = _swift_stdlib_graphemeBreakProperties[idx]; - - // Shift the enum and range count out of the value. - auto lower = (entry << 11) >> 11; - - // Shift the enum out first, then shift out the scalar value. - auto upper = lower + ((entry << 3) >> 24); - - // Shift everything out. - auto enumValue = (__swift_uint8_t)(entry >> 29); - - // Special case: extendedPictographic who used an extra bit for the range. - if (enumValue == 5) { - upper = lower + ((entry << 2) >> 23); - } - - if (scalar >= lower && scalar <= upper) { - return enumValue; - } - - if (scalar > upper) { - low = idx + 1; - continue; - } - - if (scalar < lower) { - high = idx - 1; - continue; - } - } - - // If we made it out here, then our scalar was not found in the grapheme - // array (this occurs when a scalar doesn't map to any grapheme break - // property). Return the max value here to indicate .any. - return 0xFF; - } - + + }; + + """ } // Main entry point into the grapheme break property generator. func generateGraphemeBreakProperty() { - var result = readFile("Input/UnicodeGrapheme.cpp") + var result = readFile("Input/GraphemeData.h") let baseData = getGraphemeBreakPropertyData( for: "Data/GraphemeBreakProperty.txt" @@ -267,10 +189,21 @@ func generateGraphemeBreakProperty() { } emit(data, into: &result) - - emitAccessor(data.count, into: &result) - - write(result, to: "Output/UnicodeGrapheme.cpp") + + // Handle the CLDR grapheme breaking rules: + + let indicSyllabicCategory = readFile("Data/IndicSyllabicCategory.txt") + + let consonants = getLinkingConsonant(from: indicSyllabicCategory) + + emitLinkingConsonant(consonants, into: &result) + + result += """ + #endif // #ifndef GRAPHEME_DATA_H + + """ + + write(result, to: "Output/Common/GraphemeData.h") } generateGraphemeBreakProperty() diff --git a/validation-test/stdlib/String.swift b/validation-test/stdlib/String.swift index 453c52d53a85b..bda49c26b321d 100644 --- a/validation-test/stdlib/String.swift +++ b/validation-test/stdlib/String.swift @@ -2280,4 +2280,58 @@ StringTests.test("NormalizationCheck/Opaque") #endif } +func expectBidirectionalCount(_ count: Int, _ string: String) { + var i = 0 + var index = string.endIndex + + while index != string.startIndex { + i += 1 + string.formIndex(before: &index) + } + + expectEqual(i, count) +} + +StringTests.test("GraphemeBreaking.Indic Sequences") { + let test1 = "\u{0915}\u{0924}" // 2 + expectEqual(2, test1.count) + expectBidirectionalCount(2, test1) + + let test2 = "\u{0915}\u{094D}\u{0924}" // 1 + expectEqual(1, test2.count) + expectBidirectionalCount(1, test2) + + let test3 = "\u{0915}\u{094D}\u{094D}\u{0924}" // 1 + expectEqual(1, test3.count) + expectBidirectionalCount(1, test3) + + let test4 = "\u{0915}\u{094D}\u{200D}\u{0924}" // 1 + expectEqual(1, test4.count) + expectBidirectionalCount(1, test4) + + let test5 = "\u{0915}\u{093C}\u{200D}\u{094D}\u{0924}" // 1 + expectEqual(1, test5.count) + expectBidirectionalCount(1, test5) + + let test6 = "\u{0915}\u{093C}\u{094D}\u{200D}\u{0924}" // 1 + expectEqual(1, test6.count) + expectBidirectionalCount(1, test6) + + let test7 = "\u{0915}\u{094D}\u{0924}\u{094D}\u{092F}" // 1 + expectEqual(1, test7.count) + expectBidirectionalCount(1, test7) + + let test8 = "\u{0915}\u{094D}\u{0061}" // 2 + expectEqual(2, test8.count) + expectBidirectionalCount(2, test8) + + let test9 = "\u{0061}\u{094D}\u{0924}" // 2 + expectEqual(2, test9.count) + expectBidirectionalCount(2, test9) + + let test10 = "\u{003F}\u{094D}\u{0924}" // 2 + expectEqual(2, test10.count) + expectBidirectionalCount(2, test10) +} + runAllTests()