Skip to content
11 changes: 5 additions & 6 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -391,9 +391,8 @@ extension DSLTree.CustomCharacterClass.Member {

return { input, bounds in
let curIdx = bounds.lowerBound
let nextIndex = isCharacterSemantic
? input.index(after: curIdx)
: input.unicodeScalars.index(after: curIdx)
let nextIndex = input.index(
after: curIdx, isScalarSemantics: !isCharacterSemantic)

// Under grapheme semantics, we compare based on single NFC scalars. If
// such a character is not single scalar under NFC, the match fails. In
Expand Down Expand Up @@ -603,9 +602,9 @@ extension AST.Atom.CharacterProperty {
if p(input, bounds) != nil { return nil }

// TODO: bounds check
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the story here — does the bounds check always happen before the consume function is called? Or is there a post-advancement check to make sure we haven't gone beyond the bounds of the search range?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't change the behavior. Let's open an issue for that.

return opts.semanticLevel == .graphemeCluster
? input.index(after: bounds.lowerBound)
: input.unicodeScalars.index(after: bounds.lowerBound)
return input.index(
after: bounds.lowerBound,
isScalarSemantics: opts.semanticLevel == .unicodeScalar)
}
}

Expand Down
14 changes: 11 additions & 3 deletions Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ struct QuantifyPayload: RawRepresentable {
case asciiBitset = 0
case asciiChar = 1
case any = 2
case builtin = 4
case builtinCC = 4
}

// TODO: figure out how to better organize this...
Expand All @@ -408,6 +408,14 @@ struct QuantifyPayload: RawRepresentable {
var typeMask: UInt64 { 7 }
var payloadMask: UInt64 { 0xFF_FF }

/// The largest total number of trips this quantification allows.
///
/// Evaluates to `minTrips + maxExtraTrips` when `maxExtraTrips` is non-nil,
/// and to `UInt64.max` when it is nil (i.e. the quantification is unbounded).
var maxTrips: UInt64 {
  if let extraTrips = maxExtraTrips {
    return minTrips + extraTrips
  }
  return UInt64.max
}

static func packInfoValues(
_ kind: AST.Quantification.Kind,
_ minTrips: Int,
Expand Down Expand Up @@ -485,7 +493,7 @@ struct QuantifyPayload: RawRepresentable {
+ (model.isInverted ? 1 << 9 : 0)
+ (model.isStrictASCII ? 1 << 10 : 0)
self.rawValue = packedModel
+ QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .builtin, isScalarSemantics: isScalarSemantics)
+ QuantifyPayload.packInfoValues(kind, minTrips, maxExtraTrips, .builtinCC, isScalarSemantics: isScalarSemantics)
}

var type: PayloadType {
Expand Down Expand Up @@ -531,7 +539,7 @@ struct QuantifyPayload: RawRepresentable {
(self.rawValue & 1) == 1
}

var builtin: _CharacterClassModel.Representation {
/// The built-in character class representation decoded from the low eight
/// bits of `rawValue`.
///
/// The result is force-unwrapped, so this traps if those bits do not form a
/// valid `_CharacterClassModel.Representation` raw value.
var builtinCC: _CharacterClassModel.Representation {
  let representationBits = self.rawValue & 0xFF
  return _CharacterClassModel.Representation(rawValue: representationBits)!
}
var builtinIsInverted: Bool {
Expand Down
19 changes: 19 additions & 0 deletions Sources/_StringProcessing/Engine/MEBuiltins.swift
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,25 @@ extension String {
else { return nil }
return next
}

/// Attempts to match a single "dot" element at `currentPosition`.
///
/// - Parameters:
///   - currentPosition: The index at which to attempt the match.
///   - end: The upper limit; no match is attempted unless
///     `currentPosition < end`.
///   - anyMatchesNewline: When `true`, the position is advanced
///     unconditionally via `index(after:isScalarSemantics:)`; when `false`,
///     the work is delegated to `matchAnyNonNewline`.
///   - isScalarSemantics: Forwarded unchanged to whichever helper is used.
/// - Returns: The index following the matched element, or `nil` when
///   `currentPosition` is not before `end` or when the delegated
///   `matchAnyNonNewline` call returns `nil`.
internal func matchRegexDot(
at currentPosition: Index,
limitedBy end: Index,
anyMatchesNewline: Bool,
isScalarSemantics: Bool
) -> Index? {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we still making distinctions between the . literal syntax and other "any" designations at this point in the engine? For example, CharacterClass.any and CharacterClass.anyNonNewline are unaffected by the (?s) option, and always do what they say they should. I thought that we had already handled those distinctions during code gen (e.g. by calling emitAny vs emitAnyNonNewline).

If this method is just for the . literal, does it need to account for the NSRE-compatible-dot mode? In that case, a full \r\n pair needs to be consumed even when using scalar semantics.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the leaf function which provides the implementation. The anyMatchesNewline parameter dictates what the behavior is, and is at least part of the quantification payload.

// Nothing can be matched at or beyond the limit.
guard currentPosition < end else { return nil }

// Newlines are acceptable, so no inspection of the element is needed: just
// advance. NOTE(review): `end` is not passed to this call, so the returned
// index is not re-checked against the limit here — confirm callers handle it.
if anyMatchesNewline {
return index(
after: currentPosition, isScalarSemantics: isScalarSemantics)
}

// Otherwise defer to the non-newline matcher, which receives the limit.
return matchAnyNonNewline(
at: currentPosition,
limitedBy: end,
isScalarSemantics: isScalarSemantics)
}
}

// MARK: - Built-in character class matching
Expand Down
Loading