2

複合型を返す型付き UDAF を実装しようとしています。どういうわけか、Spark は結果列の型を推測できずbinary、シリアル化されたデータをそこに配置します。問題を再現する最小限の例を次に示します

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{SparkSession, Encoder, Encoders}

case class Data(key: Int)

class NoopAgg[I] extends Aggregator[I, Map[String, Int], Map[String, Int]] {
    override def zero: Map[String, Int] = Map.empty[String, Int]

    override def reduce(b: Map[String, Int], a: I): Map[String, Int] = b

    override def merge(b1: Map[String, Int], b2: Map[String, Int]): Map[String, Int] = b1

    override def finish(reduction: Map[String, Int]): Map[String, Int] = reduction

    override def bufferEncoder: Encoder[Map[String, Int]] = Encoders.kryo[Map[String, Int]]

    override def outputEncoder: Encoder[Map[String, Int]] = Encoders.kryo[Map[String, Int]]
}

object Question {
  def main(args: Array[String]): Unit = {
      val spark = SparkSession.builder().master("local").getOrCreate()

      val sc = spark.sparkContext

      import spark.implicits._

      val ds = sc.parallelize((1 to 10).map(i => Data(i))).toDS()

      val noop = new NoopAgg[Data]().toColumn

      val result = ds.groupByKey(_.key).agg(noop.as("my_sum").as[Map[String, Int]])

      result.printSchema()
  }
}

印刷します

root
 |-- value: integer (nullable = false)
 |-- my_sum: binary (nullable = true)
4

1 に答える 1