package ca.training.bigdata.kafka.streaming

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructType}

/**
 * Spark Structured Streaming driver that subscribes to a Kafka topic, parses each
 * message value as a JSON event using a fixed schema, and appends the flattened
 * events to a Parquet directory.
 *
 * The session is created with Hive support enabled. Note that, despite the
 * application name mentioning Hive and Elastic Search, the only sink configured
 * in this object is the Parquet file sink.
 */
object SparkStructuredStreamingApp {

  // Kafka source settings.
  private val KafkaBootstrapServers = "sandbox-hdp.hortonworks.com:6667"
  private val KafkaTopic = "good-topic"

  // Parquet sink settings.
  private val OutputPath = "/user/root/kafka/"
  private val CheckpointLocation = "/checkpoint_path"

  /** Schema of the nested `customer` object; every field is read as a string. */
  private val CustomerSchema: StructType = new StructType()
    .add("id", StringType)
    .add("name", StringType)
    .add("ipAddress", StringType)
    .add("country", StringType)
    .add("city", StringType)

  /** Schema of the nested `currency` object; `price` is kept as a string, not a number. */
  private val CurrencySchema: StructType = new StructType()
    .add("name", StringType)
    .add("price", StringType)

  /** Schema of the top-level JSON event; `timestamp` is kept as a string. */
  private val EventSchema: StructType = new StructType()
    .add("event", StringType)
    .add("customer", CustomerSchema)
    .add("currency", CurrencySchema)
    .add("timestamp", StringType)

  /**
   * Builds the streaming pipeline, starts it and blocks until the query terminates.
   *
   * @param args command-line arguments; not used by this job
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("Move data from Kafka to Hive and Elastic Search using Spark Structured Streaming")
      .enableHiveSupport()
      .getOrCreate()

    val kafkaRecords = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", KafkaBootstrapServers)
      .option("subscribe", KafkaTopic)
      .load()

    // Only the message value is used: cast it to a string, parse it with the event
    // schema, then expand the parsed struct into top-level columns.
    // NOTE(review): per Spark's from_json documentation an unparseable string yields
    // null, so such messages would presumably land as rows with every column null —
    // confirm whether they should be filtered out before writing.
    val events = kafkaRecords
      .selectExpr("CAST(value AS STRING) as json")
      .select(from_json(col("json"), EventSchema).as("data"))
      .select("data.*")

    val query = events.writeStream
      .format("parquet")
      .option("path", OutputPath)
      .option("checkpointLocation", CheckpointLocation)
      .outputMode("append")
      .start()

    query.awaitTermination()
  }

}
