package ca.training.bigdata.spark.streaming.bidding

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

/**
  * Created by BigDataTraining on 2018-03-16.
  *
  * Structured Streaming driver for the real-time bidding lab.
  *
  * The job reads tab-separated bid records from a directory as a stream,
  * joins each record with a static city lookup table on `cityID`, and appends
  * the columns `bidid`, `bidprice`, `slotprice` and `cityName` to a Parquet
  * sink. After a fixed delay it prints the id, run id and status of every
  * active streaming query once, then blocks until the query terminates.
  */
object MonitoringStreaming {

  /** Directory watched for incoming tab-separated bid files. */
  private val BidInputPath = "file:///root/TrainingOnHDP/dataset/spark/bid"

  /** Location of the static, tab-separated city lookup data. */
  private val CityInputPath = "file:///root/TrainingOnHDP/dataset/spark/city"

  // NOTE(review): the two sink paths carry no URI scheme (unlike the input
  // paths), so they are resolved against the cluster's default file system —
  // confirm that this is the intended destination.
  private val ParquetOutputPath = "/root/labs/spark/bidding"
  private val CheckpointPath = "/root/labs/spark/bidding_checkpoint"

  /** How long to let the query run before printing its status once. */
  private val StatusDelayMs: Long = 120 * 1000L

  /** Column layout of one incoming bid record (the files have no header row). */
  private val BidSchema: StructType = new StructType()
    .add("bidid", StringType)
    .add("timestamp", StringType)
    .add("ipinyouid", StringType)
    .add("useragent", StringType)
    .add("IP", StringType)
    .add("region", IntegerType)
    .add("cityID", IntegerType)
    .add("adexchange", StringType)
    .add("domain", StringType)
    .add("turl", StringType)
    .add("urlid", StringType)
    .add("slotid", StringType)
    .add("slotwidth", StringType)
    .add("slotheight", StringType)
    .add("slotvisibility", StringType)
    .add("slotformat", StringType)
    .add("slotprice", StringType)
    .add("creative", StringType)
    .add("bidprice", StringType)

  /**
    * Column layout of the city lookup data.
    *
    * `cityID` is declared as an integer so that it has the same type as
    * `cityID` in [[BidSchema]]; the join then compares like with like instead
    * of relying on an implicit string-to-integer cast.
    */
  private val CitySchema: StructType = new StructType()
    .add("cityID", IntegerType)
    .add("cityName", StringType)

  /**
    * Starts the streaming job and blocks until the Parquet query terminates.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Real time Bidding - Monitoring Streaming using Spark Streaming").getOrCreate()
    import spark.implicits._

    // Incoming bid stream. The schema is supplied explicitly, so no schema
    // inference option is set. maxFilesPerTrigger = 1 limits each trigger to
    // one new input file.
    val streamingInputDF = spark
      .readStream
      .format("csv")
      .schema(BidSchema)
      .option("header", false)
      .option("sep", "\t")
      .option("maxFilesPerTrigger", 1)
      .load(BidInputPath)

    // Print out the schema of the streaming input
    streamingInputDF.printSchema()

    // Static city lookup table (cityID -> cityName)
    val staticDF = spark
      .read
      .format("csv")
      .schema(CitySchema)
      .option("header", false)
      .option("sep", "\t")
      .load(CityInputPath)

    // Event time parsed from the raw "timestamp" string. unix_timestamp yields
    // whole seconds, so the millisecond part of the pattern does not survive
    // the conversion. The column is carried through the join but is not part
    // of the columns written to the sink below.
    val ts = unix_timestamp($"timestamp", "yyyyMMddHHmmssSSS").cast("timestamp")

    // Stream-static join: enrich every bid with the name of its city
    val streamingCityNameBidsTimeDF = streamingInputDF
      .withColumn("ts", ts)
      .select($"ts", $"bidid", $"cityID", $"bidprice", $"slotprice")
      .join(staticDF, "cityID")

    // Continuously append the enriched bids to the Parquet sink
    val cityBidsParquet = streamingCityNameBidsTimeDF
      .select($"bidid", $"bidprice", $"slotprice", $"cityName")
      .writeStream
      .outputMode("append")
      .format("parquet")
      .option("path", ParquetOutputPath)
      .option("checkpointLocation", CheckpointPath)
      .start()

    // Give the query time to process some input before reporting on it
    Thread.sleep(StatusDelayMs)

    spark.streams.active.foreach { query =>
      println(s"ID:${query.id}  Run ID:${query.runId}    Status: ${query.status}")
    }

    cityBidsParquet.awaitTermination()

  }

}
