 package org.apache.kyuubi.spark.connector.hive
 
 import java.lang.{Boolean => JBoolean, Long => JLong}
+import java.net.URI
 
 import scala.util.Try
 
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.hive.ql.plan.{FileSinkDesc, TableDesc}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTablePartition}
 import org.apache.spark.sql.connector.catalog.TableChange
 import org.apache.spark.sql.connector.catalog.TableChange._
 import org.apache.spark.sql.execution.command.CommandUtils
-import org.apache.spark.sql.execution.command.CommandUtils.{calculateMultipleLocationSizes, calculateSingleLocationSize}
 import org.apache.spark.sql.execution.datasources.{PartitionDirectory, PartitionedFile}
 import org.apache.spark.sql.hive.execution.HiveFileFormat
 import org.apache.spark.sql.internal.SQLConf
@@ -82,7 +82,28 @@ object HiveConnectorUtils extends Logging {
       isSplitable: JBoolean,
       maxSplitBytes: JLong,
       partitionValues: InternalRow): Seq[PartitionedFile] =
-    Try { // SPARK-42821: 4.0.0-preview2
+    Try { // SPARK-42821, SPARK-51185: Spark 4.0
+      val fileStatusWithMetadataClz = DynClasses.builder()
+        .impl("org.apache.spark.sql.execution.datasources.FileStatusWithMetadata")
+        .buildChecked()
+      DynMethods
+        .builder("splitFiles")
+        .impl(
+          "org.apache.spark.sql.execution.PartitionedFileUtil",
+          fileStatusWithMetadataClz,
+          classOf[Path],
+          classOf[Boolean],
+          classOf[Long],
+          classOf[InternalRow])
+        .buildChecked()
+        .invokeChecked[Seq[PartitionedFile]](
+          null,
+          file,
+          filePath,
+          isSplitable,
+          maxSplitBytes,
+          partitionValues)
+    }.recover { case _: Exception => // SPARK-42821: 4.0.0-preview2
       val fileStatusWithMetadataClz = DynClasses.builder()
         .impl("org.apache.spark.sql.execution.datasources.FileStatusWithMetadata")
         .buildChecked()
@@ -192,19 +213,41 @@ object HiveConnectorUtils extends Logging {
       file.asInstanceOf[FileStatus].getPath
     }.get
 
+  private def calculateMultipleLocationSizes(
+      sparkSession: SparkSession,
+      tid: TableIdentifier,
+      paths: Seq[Option[URI]]): Seq[Long] = {
+
+    val sparkSessionClz = DynClasses.builder()
+      .impl("org.apache.spark.sql.classic.SparkSession") // SPARK-49700 (4.0.0)
+      .impl("org.apache.spark.sql.SparkSession")
+      .build()
+
+    val calculateMultipleLocationSizesMethod =
+      DynMethods.builder("calculateMultipleLocationSizes")
+        .impl(
+          CommandUtils.getClass,
+          sparkSessionClz,
+          classOf[TableIdentifier],
+          classOf[Seq[Option[URI]]])
+        .buildChecked(CommandUtils)
+
+    calculateMultipleLocationSizesMethod
+      .invokeChecked[Seq[Long]](sparkSession, tid, paths)
+  }
+
   def calculateTotalSize(
       spark: SparkSession,
       catalogTable: CatalogTable,
       hiveTableCatalog: HiveTableCatalog): (BigInt, Seq[CatalogTablePartition]) = {
     val sessionState = spark.sessionState
     val startTime = System.nanoTime()
     val (totalSize, newPartitions) = if (catalogTable.partitionColumnNames.isEmpty) {
-      (
-        calculateSingleLocationSize(
-          sessionState,
-          catalogTable.identifier,
-          catalogTable.storage.locationUri),
-        Seq())
+      val tableSize = CommandUtils.calculateSingleLocationSize(
+        sessionState,
+        catalogTable.identifier,
+        catalogTable.storage.locationUri)
+      (tableSize, Seq())
     } else {
       // Calculate table size as a sum of the visible partitions. See SPARK-21079
       val partitions = hiveTableCatalog.listPartitions(catalogTable.identifier)
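
Editorial note, not part of the patch: the new private helper keeps the same name as the
`CommandUtils.calculateMultipleLocationSizes` import it replaces, presumably so that existing
call sites further down in `calculateTotalSize` resolve to the version-tolerant wrapper
unchanged. An illustrative call site under that assumption (variable names are hypothetical):

// Sum the sizes of all visible partition locations via the reflective wrapper defined above.
val paths = partitions.map(_.storage.locationUri)
val partitionSizes = calculateMultipleLocationSizes(spark, catalogTable.identifier, paths)
val partitionedTotalSize = BigInt(partitionSizes.sum)
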
@@ -402,7 +445,13 @@ object HiveConnectorUtils extends Logging {
     new StructType(newFields)
   }
 
-  def withSQLConf[T](pairs: (String, String)*)(f: => T): T = {
+  // This is a fork of Spark's withSQLConf, renamed to avoid linkage issues in
+  // cross-version cases.
+  // For example, SPARK-46227 (4.0.0) moves `withSQLConf` from SQLHelper to SQLConfHelper,
+  // so classes that extend SQLConfHelper link against the super class's method when
+  // compiled with Spark 4.0, and a linkage error then occurs when the jar runs on lower
+  // Spark versions.
+  def withSparkSQLConf[T](pairs: (String, String)*)(f: => T): T = {
     val conf = SQLConf.get
     val (keys, values) = pairs.unzip
     val currentValues = keys.map { key =>
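
Editorial note, not part of the patch: a usage sketch of the renamed helper; the configuration
keys and the block body are illustrative. Mirroring Spark's withSQLConf, the captured current
values are restored after the block runs, so the overrides are scoped to the block.

// Evaluate a block with temporary SQL conf overrides on the active session's SQLConf.
val rows = HiveConnectorUtils.withSparkSQLConf(
  "spark.sql.caseSensitive" -> "true",
  "spark.sql.hive.convertMetastoreParquet" -> "false") {
  spark.sql("SELECT 1 AS id").collect()
}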