Commit 142c574
feat(spark): add LogicalRDD support (#451)
Support conversion of DataFrames that are created using the Spark createDataFrame() method. This produces a LogicalRDD in the query plan, which can be converted to a Substrait VirtualTableScan. Introduces an overridable `rddLimit` to guard against serialising very large datasets.

Signed-off-by: Andrew Coleman <[email protected]>
1 parent 7cf1ccf · commit 142c574
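For context, a minimal sketch of what this commit enables, assuming a running `SparkSession` named `spark` (the schema and rows are purely illustrative):

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import io.substrait.spark.logical.ToSubstraitRel

// A DataFrame built with createDataFrame() over an RDD: its optimized
// plan contains a LogicalRDD rather than a LocalRelation.
val schema = StructType(
  List(StructField("id", IntegerType), StructField("value", StringType)))
val df = spark.createDataFrame(
  spark.sparkContext.parallelize(Seq(Row(1, "one"), Row(2, "two"))),
  schema)

// With this change, the LogicalRDD node is converted to a
// Substrait VirtualTableScan.
val toSubstrait = new ToSubstraitRel
val rel = toSubstrait.visit(df.queryExecution.optimizedPlan)
```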

3 files changed (+91 / -10 lines)

spark/src/main/scala/io/substrait/spark/logical/ToSubstraitRel.scala

Lines changed: 22 additions & 7 deletions
```diff
@@ -21,11 +21,13 @@ import io.substrait.spark.expression._
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Average, Sum}
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.execution.LogicalRDD
 import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand
 import org.apache.spark.sql.execution.datasources.{FileFormat => DSFileFormat, HadoopFsRelation, InsertIntoHadoopFsRelationCommand, LogicalRelation, V1WriteCommand, WriteFiles}
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
@@ -52,7 +54,7 @@ import io.substrait.utils.Util
 import java.util
 import java.util.{Collections, Optional}
 
-import scala.collection.JavaConverters.{asJavaIterableConverter, seqAsJavaList}
+import scala.collection.JavaConverters.asJavaIterableConverter
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
@@ -64,6 +66,10 @@ class ToSubstraitRel extends AbstractLogicalPlanVisitor with Logging {
 
   private val existenceJoins = scala.collection.mutable.Map[Long, SExpression.InPredicate]()
 
+  private var _rddLimit = 100
+  def rddLimit: Int = _rddLimit
+  def rddLimit_=(rddLimit: Int): Unit = _rddLimit = rddLimit
+
   def getExistenceJoin(id: Long): Option[SExpression.InPredicate] = existenceJoins.get(id)
 
   override def default(p: LogicalPlan): relation.Rel = p match {
@@ -439,23 +445,25 @@ class ToSubstraitRel extends AbstractLogicalPlanVisitor with Logging {
       .build
     namedScan
   }
-  private def buildVirtualTableScan(localRelation: LocalRelation): relation.AbstractReadRel = {
-    val namedStruct = ToSubstraitType.toNamedStruct(localRelation.schema)
+  private def buildVirtualTableScan(
+      schema: StructType,
+      data: Seq[InternalRow]): relation.AbstractReadRel = {
+    val namedStruct = ToSubstraitType.toNamedStruct(schema)
 
-    if (localRelation.data.isEmpty) {
+    if (data.isEmpty) {
       relation.EmptyScan.builder().initialSchema(namedStruct).build()
     } else {
       relation.VirtualTableScan
         .builder()
         .initialSchema(namedStruct)
         .addAllRows(
-          localRelation.data
+          data
            .map(
              row => {
                var idx = 0
                val buf = new ArrayBuffer[SExpression.Literal](row.numFields)
                while (idx < row.numFields) {
-                  val dt = localRelation.schema(idx).dataType
+                  val dt = schema(idx).dataType
                   val l = Literal.apply(row.get(idx, dt), dt)
                   buf += ToSubstraitLiteral.apply(l)
                   idx += 1
@@ -528,7 +536,14 @@ class ToSubstraitRel extends AbstractLogicalPlanVisitor with Logging {
      case hiveTableRelation: HiveTableRelation =>
        tableNames = hiveTableRelation.tableMeta.identifier.unquotedString.split("\\.").toList
        buildNamedScan(hiveTableRelation.schema, tableNames)
-      case localRelation: LocalRelation => buildVirtualTableScan(localRelation)
+      case localRelation: LocalRelation =>
+        buildVirtualTableScan(localRelation.schema, localRelation.data)
+      case rdd: LogicalRDD =>
+        if (rdd.rdd.count() > _rddLimit) {
+          logWarning(
+            s"LogicalRDD relation contains ${rdd.rdd.count()} rows. Truncating to ${_rddLimit}. This limit can be changed by setting the `rddLimit` property on this ToSubstraitRel instance.")
+        }
+        buildVirtualTableScan(rdd.schema, rdd.rdd.take(_rddLimit))
      case logicalRelation: LogicalRelation =>
        logicalRelation.relation match {
          case fsRelation: HadoopFsRelation =>
```
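A usage note on the new guard (a sketch, not part of the diff): the `LogicalRDD` branch above runs `rdd.rdd.count()`, a Spark action, before conversion, and then serialises at most `rddLimit` rows via `take`. The limit defaults to 100 rows and can be raised per converter instance, reusing the `df` from the earlier sketch:

```scala
val toSubstrait = new ToSubstraitRel
toSubstrait.rddLimit = 1000 // accept up to 1000 rows before truncating
// RDDs larger than the limit are truncated, with a logged warning.
val rel = toSubstrait.visit(df.queryExecution.optimizedPlan)
```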

spark/src/test/scala/io/substrait/spark/RelationsSuite.scala

Lines changed: 61 additions & 0 deletions
```diff
@@ -1,7 +1,9 @@
 package io.substrait.spark
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
 
 class RelationsSuite extends SparkFunSuite with SharedSparkSession with SubstraitPlanTestBase {
 
@@ -38,4 +40,63 @@ class RelationsSuite extends SparkFunSuite with SharedSparkSession with Substrai
      "select * from (values (1, cast(struct(1, 'a') as struct<f1: int, f2: string>)) as table(int_col, col))"
    )
  }
+
+  test("create_dataset - LocalRelation") {
+    val spark = this.spark
+    import spark.implicits._
+
+    val df = Seq(
+      (1, "one"),
+      (2, "two"),
+      (3, "three")
+    ).toDF("id", "value")
+
+    assertSparkSubstraitRelRoundTrip(df.queryExecution.optimizedPlan)
+  }
+
+  test("createdataframe - LogicalRDD") {
+    val data = Seq(
+      Row(1, "one"),
+      Row(2, "two"),
+      Row(3, "three")
+    )
+
+    val schema = StructType(
+      List(
+        StructField("id", IntegerType, true),
+        StructField("value", StringType, true)
+      ))
+
+    val df = spark.createDataFrame(
+      spark.sparkContext.parallelize(data),
+      schema
+    )
+
+    assertSparkSubstraitRelRoundTrip(df.queryExecution.optimizedPlan)
+  }
+
+  test("Limit RDD size") {
+    val data = Seq(
+      Row(1, "one"),
+      Row(2, "two"),
+      Row(3, "three"),
+      Row(4, "four")
+    )
+
+    val schema = StructType(
+      List(
+        StructField("id", IntegerType, true),
+        StructField("value", StringType, true)
+      ))
+
+    val df = spark.createDataFrame(
+      spark.sparkContext.parallelize(data),
+      schema
+    )
+
+    assertResult(4)(df.count())
+
+    val plan = assertSparkSubstraitRelRoundTrip(df.queryExecution.optimizedPlan, 2)
+    assertResult(2)(plan.count())
+  }
 }
```

spark/src/test/scala/io/substrait/spark/SubstraitPlanTestBase.scala

Lines changed: 8 additions & 3 deletions
```diff
@@ -57,9 +57,14 @@ trait SubstraitPlanTestBase { self: SharedSparkSession =>
 
   def assertSqlSubstraitRelRoundTrip(query: String): LogicalPlan = {
     val sparkPlan = plan(query)
+    assertSparkSubstraitRelRoundTrip(sparkPlan)
+  }
 
+  def assertSparkSubstraitRelRoundTrip(sparkPlan: LogicalPlan, rddLimit: Int = 10): LogicalPlan = {
     // convert spark logical plan to substrait
-    val substraitRel = new ToSubstraitRel().visit(sparkPlan)
+    val toSubstrait = new ToSubstraitRel
+    toSubstrait.rddLimit = rddLimit
+    val substraitRel = toSubstrait.visit(sparkPlan)
 
     // Serialize to protobuf byte array
     val extensionCollector = new ExtensionCollector
@@ -77,15 +82,15 @@ trait SubstraitPlanTestBase { self: SharedSparkSession =>
     require(sparkPlan2.resolved)
 
     // and back to substrait again
-    val substraitRel3 = new ToSubstraitRel().visit(sparkPlan2)
+    val substraitRel3 = toSubstrait.visit(sparkPlan2)
 
     // compare with original substrait plan to ensure it round-tripped (via proto bytes) correctly
     substraitRel3.shouldEqualPlainly(substraitRel)
 
     // Do one more roundtrip, this time with Substrait Plan object which contains also names,
     // to test that the Spark schemas match. This in some cases adds an extra Project
     // to rename fields, which then would break the round trip test we do above.
-    val substraitPlan = new ToSubstraitRel().convert(sparkPlan)
+    val substraitPlan = toSubstrait.convert(sparkPlan)
     val sparkPlan3 = toLogicalPlan.convert(substraitPlan);
     require(sparkPlan3.resolved);
 
```