[SPARK-53275][SQL] Handle stateful expressions when ordering in interpreted mode

bersprockets · peter-toth · commit bc36a7db43f2 · 2025-08-25T12:03:27.000+02:00
### What changes were proposed in this pull request?

This PR updates `InterpretedOrdering` to use a different copy of stateful expressions when evaluating the two input rows.

### Why are the changes needed?

Consider these spark-shell commands:
```
# for this particular example, the bug is exercised when there are 2 executors
bin/spark-shell --master "local[2]"

import org.apache.spark.sql.functions.udf

spark.udf.register("udf", (s: String) => s)
Seq((0, "2"), (0, "1")).toDF("a", "b").createOrReplaceTempView("v1")

// return a correct result: Array([0,1], [0,2])
sql("select a, udf(b) from v1 order by a, udf(b) asc").collect

// run in interpreted mode
sql("set spark.sql.codegen.factoryMode=NO_CODEGEN")

// return an incorrect result: Array([0,2], [0,1])
sql("select a, udf(b) from v1 order by a, udf(b) asc").collect
```
This is because the `ScalaUDF` expression indirectly holds an `UnsafeRow` as a buffer (via a serializer, which holds an `UnsafeProjection`, which holds the `UnsafeRow` buffer). When the udf is evaluated for the first row, the resulting `UTF8String` uses the `UnsafeRow`'s base object as its own base object. When the udf is evaluated for the second row, that same base object is updated such that both `UTF8String` objects contain the same string value.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit test.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52028 from bersprockets/ordering_issue.

Authored-by: Bruce Robbins <bersprockets@gmail.com>
Signed-off-by: Peter Toth <peter.toth@gmail.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ordering.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ordering.scala
@@ -38,6 +38,8 @@ class BaseOrdering extends Ordering[InternalRow] {
  * An interpreted row ordering comparator.
  */
 class InterpretedOrdering(ordering: Seq[SortOrder]) extends BaseOrdering {
+  private val leftEvaluators = ordering.map(_.child)
+  private val rightEvaluators = leftEvaluators.map(_.freshCopyIfContainsStatefulExpression())
   private lazy val physicalDataTypes = ordering.map { order =>
     val dt = order.dataType match {
       case udt: UserDefinedType[_] => udt.sqlType
@@ -54,8 +56,8 @@ class InterpretedOrdering(ordering: Seq[SortOrder]) extends BaseOrdering {
     val size = ordering.size
     while (i < size) {
       val order = ordering(i)
-      val left = order.child.eval(a)
-      val right = order.child.eval(b)
+      val left = leftEvaluators(i).eval(a)
+      val right = rightEvaluators(i).eval(b)
 
       if (left == null && right == null) {
         // Both null, continue looking.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/OrderingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/OrderingSuite.scala
@@ -24,7 +24,9 @@ import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.sql.{RandomDataGenerator, Row}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, GenerateOrdering, LazilyGeneratedOrdering}
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.ArrayImplicits._
 
@@ -166,4 +168,24 @@ class OrderingSuite extends SparkFunSuite with ExpressionEvalHelper {
     GenerateOrdering.genComparisons(ctx, schema)
     assert(ctx.INPUT_ROW == null)
   }
+
+  test("SPARK-53275: ordering by stateful expressions in interpreted mode") {
+    // An InterpretedOrdering is constructed directly below, but CODEGEN_FACTORY_MODE must
+    // still be set to NO_CODEGEN: the ScalaUDF expression indirectly creates an
+    // UnsafeProjection, and the test needs that projection to be an
+    // InterpretedUnsafeProjection.
+    withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString) {
+      val identityFn = (s: String) => s
+      val udfExpr = ScalaUDF(identityFn, StringType, Seq(BoundReference(0, StringType, true)),
+        Seq(Option(ExpressionEncoder[String]().resolveAndBind())),
+        Some(ExpressionEncoder[String]().resolveAndBind()))
+      val ascendingByUdf = Seq(SortOrder(udfExpr, Ascending))
+      val interpreted = new InterpretedOrdering(ascendingByUdf)
+      val schema = StructType(Seq(StructField("col1", StringType, nullable = true)))
+      val asInternal = CatalystTypeConverters.createToCatalystConverter(schema)
+      val greaterRow = asInternal(Row("B")).asInstanceOf[InternalRow]
+      val lesserRow = asInternal(Row("A")).asInstanceOf[InternalRow]
+      assert(interpreted.compare(greaterRow, lesserRow) > 0)
+    }
+  }
 }