-
Notifications
You must be signed in to change notification settings - Fork 2k
[Spark] Hybrid Delta connector combining V1(default) and V2 connectors #5726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
eab1903
52f52e9
6c4f3e0
8c51832
5658848
a37fae7
3138883
c4e27cb
1c7ba51
1268d0b
c3027d4
20bfc30
eff8439
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| /* | ||
| * Copyright (2025) The Delta Lake Project Authors. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package io.delta.sql | ||
|
|
||
| import scala.jdk.CollectionConverters._ | ||
|
|
||
| import io.delta.kernel.spark.catalog.SparkTable | ||
| import io.delta.kernel.spark.utils.{CatalogTableUtils, ScalaUtils} | ||
| import org.apache.spark.sql.delta.sources.{DeltaSQLConfV2, DeltaSourceUtils} | ||
| import org.apache.spark.sql.delta.sources.DeltaV2StreamingUtils | ||
|
|
||
| import org.apache.spark.sql.SparkSession | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
| import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 | ||
| import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes | ||
| import org.apache.spark.sql.connector.catalog.Identifier | ||
| import org.apache.spark.sql.execution.streaming.StreamingRelation | ||
| import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
|
|
||
/**
 * Rule for applying the V2 streaming path by rewriting a V1 [[StreamingRelation]]
 * backed by the Delta data source into a [[StreamingRelationV2]] backed by the
 * kernel-spark [[SparkTable]].
 *
 * This rule handles the case where Spark's FindDataSourceTable rule has converted
 * a StreamingRelationV2 (with DeltaTableV2) back to a StreamingRelation because
 * DeltaTableV2 doesn't advertise STREAMING_READ capability. We convert it back to
 * StreamingRelationV2 with SparkTable (from kernel-spark) which does support streaming.
 *
 * Behavior based on spark.databricks.delta.v2.enableMode:
 * - AUTO (default): Only applies to Unity Catalog managed tables
 * - STRICT: Applies to all Delta tables (for testing V2 streaming)
 * - NONE (or any unrecognized value): Rule is disabled, no conversion happens
 *
 * @param session The Spark session used to read the enable-mode configuration
 */
class ApplyV2Streaming(
    @transient private val session: SparkSession)
  extends Rule[LogicalPlan] {

  /**
   * Returns true when `s` is a Delta streaming relation that carries catalog metadata.
   * A relation qualifies only when it has a catalog table AND any of the following holds:
   *  1. the source name is a Delta data source name (e.g. "delta" from .format("delta")),
   *  2. the catalog table's provider is a Delta data source name (e.g. from Unity Catalog),
   *  3. the table is a Unity Catalog managed table.
   */
  private def isDeltaStreamingRelation(s: StreamingRelation): Boolean = {
    // `.exists` returns false for None, so relations without a catalog table never match.
    s.dataSource.catalogTable.exists { catalogTable =>
      DeltaSourceUtils.isDeltaDataSourceName(s.sourceName) ||
        catalogTable.provider.exists(DeltaSourceUtils.isDeltaDataSourceName) ||
        CatalogTableUtils.isUnityCatalogManagedTable(catalogTable)
    }
  }

  /**
   * Decides, based on the V2 enable mode, whether `s` should be rewritten to the
   * V2 streaming path. Non-Delta relations are always rejected.
   */
  private def shouldApplyV2Streaming(s: StreamingRelation): Boolean = {
    // Short-circuit: only consult the enable mode for Delta streaming relations.
    isDeltaStreamingRelation(s) && {
      val mode = session.conf.get(
        DeltaSQLConfV2.V2_ENABLE_MODE.key,
        DeltaSQLConfV2.V2_ENABLE_MODE.defaultValueString)

      // scalastyle:off caselocale
      mode.toUpperCase match {
        // scalastyle:on caselocale
        case "STRICT" =>
          // Always apply V2 streaming for all Delta tables.
          true
        case "AUTO" =>
          // Only apply for Unity Catalog managed tables. catalogTable is guaranteed
          // to be defined here because isDeltaStreamingRelation already checked it.
          s.dataSource.catalogTable.exists(CatalogTableUtils.isUnityCatalogManagedTable)
        case _ =>
          // "NONE" or any unrecognized mode: V2 streaming disabled.
          // (The explicit "NONE" alternative was redundant — `_` already covers it.)
          false
      }
    }
  }

  override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators {
    case s: StreamingRelation if shouldApplyV2Streaming(s) =>
      // catalogTable is guaranteed to be defined because shouldApplyV2Streaming checks it
      // via isDeltaStreamingRelation, but we use pattern matching for safety.
      s.dataSource.catalogTable match {
        case Some(catalogTable) =>
          val ident =
            Identifier.of(catalogTable.identifier.database.toArray, catalogTable.identifier.table)
          val table =
            new SparkTable(
              ident,
              catalogTable,
              ScalaUtils.toJavaMap(catalogTable.properties))

          // Add a marker to indicate this schema comes from V2 streaming (SparkTable).
          // This allows DeltaDataSource.sourceSchema to distinguish between:
          // 1. Schema from SparkTable (validated by Kernel) - can be used directly
          // 2. User-provided schema - must go through DeltaLog validation
          val optionsWithMarker = new java.util.HashMap[String, String](
            s.dataSource.options.size + 1)
          s.dataSource.options.asJava.forEach((k, v) => optionsWithMarker.put(k, v))
          optionsWithMarker.put(
            DeltaV2StreamingUtils.V2_STREAMING_SCHEMA_SOURCE_KEY,
            DeltaV2StreamingUtils.V2_STREAMING_SCHEMA_SOURCE_SPARK_TABLE)

          StreamingRelationV2(
            source = None,
            sourceName = "delta",
            table = table,
            extraOptions = new CaseInsensitiveStringMap(optionsWithMarker),
            output = toAttributes(table.schema),
            catalog = None,
            identifier = Some(ident),
            // Keep the V1 relation so Spark can fall back to it if needed.
            v1Relation = Some(s))

        case None =>
          // Unreachable given shouldApplyV2Streaming, but be defensive: leave plan unchanged.
          s
      }
  }
}
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| /* | ||
| * Copyright (2025) The Delta Lake Project Authors. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.delta.sources | ||
|
|
||
/**
 * Utilities/constants for Delta's V2 streaming path integration.
 *
 * This is defined in sparkV1 so it can be referenced from both:
 * - spark (DeltaDataSource / V1 streaming APIs)
 * - spark-unified (ApplyV2Streaming rule)
 */
object DeltaV2StreamingUtils {
  /**
   * Marker option key injected by ApplyV2Streaming into streaming options.
   * The double-underscore prefix marks it as an internal option that is not
   * expected to be set by end users — NOTE(review): confirm readers strip or
   * ignore this key before options are surfaced anywhere user-visible.
   */
  final val V2_STREAMING_SCHEMA_SOURCE_KEY: String = "__v2StreamingSchemaSource"

  /** Marker option value indicating the schema originated from kernel SparkTable. */
  final val V2_STREAMING_SCHEMA_SOURCE_SPARK_TABLE: String = "SparkTable"
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shall we skip checking `CatalogTableUtils.isUnityCatalogManagedTable(catalogTable)` here?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the scope of this change limited to handling only Unity Catalog (UC) owned tables?