
Commit c459d71

jackywang-db authored and cloud-fan committed
[SPARK-53421][SPARK-53377][SDP] Propagate Logical Plan ID in SDP Analysis
### What changes were proposed in this pull request?

Propagate `LogicalPlan.PLAN_ID_TAG` to the resolved logical plan during SDP analysis so that when the whole plan is sent to Spark for analysis, it contains the correct plan id.

### Why are the changes needed?

Spark Connect attaches a plan id to each logical plan. In SDP, we take part of the logical plan and analyze it independently to resolve table references correctly. When this happens, the logical plan id is lost, which causes resolution errors when the plan is sent to Spark for complete analysis. For example, groupBy and rollup operations would fail with:

`sql.AnalysisException: [CANNOT_RESOLVE_DATAFRAME_COLUMN] Cannot resolve dataframe column "id". It's probably because of illegal references like df1.select(df2.col("a"))`

```python3
from pyspark.sql.functions import col, sum, count

@dp.materialized_view
def groupby_result():
    return spark.read.table("src").groupBy("id").count()
```

This happens because we take the unresolved logical plan below:

```
'Aggregate ['id], ['id, 'count(1) AS count#7]
+- 'UnresolvedRelation [src], [], false
```

and perform independent analysis on the `UnresolvedRelation` part to identify the table. During this analysis the plan id is lost:

```
'Aggregate ['id], ['id, 'count(1) AS count#7]
+- SubqueryAlias spark_catalog.default.src
   +- Relation spark_catalog.default.src[id#9L] parquet
```

So when this partially resolved logical plan is sent to Spark for analysis, Spark tries to resolve the `id` attribute in the aggregate operation against the `SubqueryAlias` subtree, and fails because that subtree no longer carries the same plan id.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Tests

### Was this patch authored or co-authored using generative AI tooling?

Closes #52121 from JiaqiWang18/SPARK-53377-sdp-groupBy-rollup-tests.

Authored-by: Jacky Wang <jacky.wang@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 871fe3d commit c459d71
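
Before the diffs, a minimal, self-contained sketch of the tag-loss problem described above and how the new `mergeTagsFrom` addresses it. This is illustrative only: the tag below is a stand-in for the Spark Connect plan id tag (`LogicalPlan.PLAN_ID_TAG`, which is internal to Spark), and `LocalRelation` stands in for whatever the independent analysis resolves the table reference to.

```scala
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.trees.TreeNodeTag
import org.apache.spark.sql.types.LongType

// Illustrative stand-in for the Spark Connect plan id tag.
val planIdTag = TreeNodeTag[Long]("plan_id")

// What SDP receives: an unresolved table reference carrying a plan id.
val unresolved = UnresolvedRelation(Seq("src"))
unresolved.setTagValue(planIdTag, 7L)

// What independent analysis produces: a resolved subtree with no tags, so a
// parent operator such as the Aggregate above can no longer match its plan id.
val resolved: LogicalPlan = LocalRelation(AttributeReference("id", LongType)())
assert(resolved.getTagValue(planIdTag).isEmpty)

// The fix: carry the original node's tags onto the resolved subtree.
resolved.mergeTagsFrom(unresolved)
assert(resolved.getTagValue(planIdTag).contains(7L))
```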

File tree

4 files changed: +117 / -5 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala

Lines changed: 11 additions & 0 deletions

```diff
@@ -181,6 +181,17 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]]
     }
   }
 
+  def mergeTagsFrom(other: BaseType): Unit = {
+    if (!other.isTagsEmpty) {
+      // Merge all tags from the other node into this node.
+      // Unlike copyTagsFrom which only copies when this node has no tags,
+      // mergeTagsFrom will always merge tags regardless of existing state.
+      // If both nodes have the same tag with different values, the value
+      // from the other node will overwrite the existing value in this node.
+      tags ++= other.tags
+    }
+  }
+
   def setTagValue[T](tag: TreeNodeTag[T], value: T): Unit = {
     tags(tag) = value
   }
```
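
A small usage sketch of the new method, contrasting it with the existing `copyTagsFrom` (which, as the comment above notes, only copies when the destination node has no tags of its own). Tag names here are illustrative; the tag that motivated this change is the Spark Connect plan id.

```scala
import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
import org.apache.spark.sql.catalyst.trees.TreeNodeTag

// Illustrative tags; the commit itself cares about LogicalPlan.PLAN_ID_TAG.
val planIdTag = TreeNodeTag[Long]("plan_id")
val hintTag = TreeNodeTag[String]("hint")

val source = OneRowRelation()
source.setTagValue(planIdTag, 42L)

val target = OneRowRelation()
target.setTagValue(hintTag, "already-tagged")

// copyTagsFrom is a no-op here because `target` already has a tag.
target.copyTagsFrom(source)
assert(target.getTagValue(planIdTag).isEmpty)

// mergeTagsFrom merges regardless, keeping existing tags and adding new ones;
// on a key collision the value from `source` would win.
target.mergeTagsFrom(source)
assert(target.getTagValue(planIdTag).contains(42L))
assert(target.getTagValue(hintTag).contains("already-tagged"))
```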

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/PythonPipelineSuite.scala

Lines changed: 61 additions & 1 deletion

```diff
@@ -30,7 +30,7 @@ import org.apache.spark.api.python.PythonUtils
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.connect.service.SparkConnectService
-import org.apache.spark.sql.pipelines.graph.DataflowGraph
+import org.apache.spark.sql.pipelines.graph.{DataflowGraph, PipelineUpdateContextImpl}
 import org.apache.spark.sql.pipelines.utils.{EventVerificationTestHelpers, TestPipelineUpdateContextMixin}
 
 /**
@@ -434,6 +434,66 @@ class PythonPipelineSuite
       .map(_.identifier) == Seq(graphIdentifier("a"), graphIdentifier("something")))
   }
 
+  test("groupby and rollup works with internal datasets, referencing with (col, str)") {
+    val graph = buildGraph("""
+      from pyspark.sql.functions import col, sum, count
+
+      @dp.materialized_view
+      def src():
+          return spark.range(3)
+
+      @dp.materialized_view
+      def groupby_with_col_result():
+          return spark.read.table("src").groupBy(col("id")).agg(
+              sum("id").alias("sum_id"),
+              count("*").alias("cnt")
+          )
+
+      @dp.materialized_view
+      def groupby_with_str_result():
+          return spark.read.table("src").groupBy("id").agg(
+              sum("id").alias("sum_id"),
+              count("*").alias("cnt")
+          )
+
+      @dp.materialized_view
+      def rollup_with_col_result():
+          return spark.read.table("src").rollup(col("id")).agg(
+              sum("id").alias("sum_id"),
+              count("*").alias("cnt")
+          )
+
+      @dp.materialized_view
+      def rollup_with_str_result():
+          return spark.read.table("src").rollup("id").agg(
+              sum("id").alias("sum_id"),
+              count("*").alias("cnt")
+          )
+      """)
+
+    val updateContext = new PipelineUpdateContextImpl(graph, _ => ())
+    updateContext.pipelineExecution.runPipeline()
+    updateContext.pipelineExecution.awaitCompletion()
+
+    val groupbyDfs =
+      Seq(spark.table("groupby_with_col_result"), spark.table("groupby_with_str_result"))
+
+    val rollupDfs =
+      Seq(spark.table("rollup_with_col_result"), spark.table("rollup_with_str_result"))
+
+    // groupBy: each variant should have exactly one row per id [0,1,2]
+    groupbyDfs.foreach { df =>
+      assert(df.select("id").collect().map(_.getLong(0)).toSet == Set(0L, 1L, 2L))
+    }
+
+    // rollup: each variant should have groupBy rows + one total row
+    rollupDfs.foreach { df =>
+      assert(df.count() == 3 + 1) // 3 ids + 1 total
+      val totalRow = df.filter("id IS NULL").collect().head
+      assert(totalRow.getLong(1) == 3L && totalRow.getLong(2) == 3L)
+    }
+  }
+
   test("create pipeline without table will throw RUN_EMPTY_PIPELINE exception") {
     checkError(
       exception = intercept[AnalysisException] {
```

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowAnalysis.scala

Lines changed: 10 additions & 4 deletions

```diff
@@ -112,23 +112,29 @@ object FlowAnalysis {
       // - SELECT ... FROM STREAM(t1)
       // - SELECT ... FROM STREAM t1
       case u: UnresolvedRelation if u.isStreaming =>
-        readStreamInput(
+        val resolved = readStreamInput(
           context,
           name = IdentifierHelper.toQuotedString(u.multipartIdentifier),
           spark.readStream,
           streamingReadOptions = StreamingReadOptions()
         ).queryExecution.analyzed
-
+        // Spark Connect requires the PLAN_ID_TAG to be propagated to the resolved plan
+        // to allow correct analysis of the parent plan that contains this subquery
+        resolved.mergeTagsFrom(u)
+        resolved
       // Batch read on another dataset in the pipeline
       case u: UnresolvedRelation =>
-        readBatchInput(
+        val resolved = readBatchInput(
           context,
           name = IdentifierHelper.toQuotedString(u.multipartIdentifier),
           batchReadOptions = BatchReadOptions()
         ).queryExecution.analyzed
+        // Spark Connect requires the PLAN_ID_TAG to be propagated to the resolved plan
+        // to allow correct analysis of the parent plan that contains this subquery
+        resolved.mergeTagsFrom(u)
+        resolved
     }
     Dataset.ofRows(spark, resolvedPlan)
-
   }
 
   /**
```
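
Both branches above follow the same resolve-then-merge shape. A hypothetical helper (not part of this commit) that names the pattern:

```scala
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

// Hypothetical helper, not in this commit: after a relation has been analyzed
// independently, carry the original node's tags (including the Spark Connect
// plan id) onto the resolved subtree so the surrounding plan still resolves.
def resolvedWithTagsFrom(original: UnresolvedRelation, resolved: LogicalPlan): LogicalPlan = {
  resolved.mergeTagsFrom(original)
  resolved
}
```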

sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/SqlPipelineSuite.scala

Lines changed: 35 additions & 0 deletions

```diff
@@ -743,6 +743,41 @@ class SqlPipelineSuite extends PipelineTest with SharedSparkSession {
     )
   }
 
+  test("groupby and rollup works with internal datasets") {
+    val unresolvedDataflowGraph = unresolvedDataflowGraphFromSql(
+      sqlText = s"""
+        |CREATE MATERIALIZED VIEW src AS
+        |  SELECT id
+        |  FROM range(3);
+        |
+        |CREATE MATERIALIZED VIEW groupby_result AS
+        |  SELECT id, SUM(id) AS sum_id, COUNT(*) AS cnt
+        |  FROM src
+        |  GROUP BY id;
+        |
+        |CREATE MATERIALIZED VIEW rollup_result AS
+        |  SELECT id, SUM(id) AS sum_id, COUNT(*) AS cnt
+        |  FROM src
+        |  GROUP BY ROLLUP(id);
+        |""".stripMargin
+    )
+
+    startPipelineAndWaitForCompletion(unresolvedDataflowGraph)
+
+    val groupbyDf = spark.table(fullyQualifiedIdentifier("groupby_result"))
+    val rollupDf = spark.table(fullyQualifiedIdentifier("rollup_result"))
+
+    // groupBy should have exactly one row per id [0,1,2]
+    assert(groupbyDf.select("id").collect().map(_.getLong(0)).toSet == Set(0L, 1L, 2L))
+
+    // rollup should have all groupBy rows + one extra (the total row)
+    assert(rollupDf.count() == groupbyDf.count() + 1)
+
+    // verify the rollup total row: id IS NULL, sum_id=3, cnt=3
+    val totalRow = rollupDf.filter("id IS NULL").collect().head
+    assert(totalRow.getLong(1) == 3L && totalRow.getLong(2) == 3L)
+  }
+
   test("Empty streaming table definition is disallowed") {
     val unresolvedDataflowGraph = unresolvedDataflowGraphFromSql(
       sqlText = "CREATE STREAMING TABLE st;"
```

0 commit comments
