package ca.training.bigdata.spark.sql.dm

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
  * Created by BigDataTraining on 2018-03-10.
  *
  * Spark driver that reads a semicolon-delimited household power consumption
  * text file, converts each row to a `HouseholdEPC` record, derives calendar
  * columns from the `date` string (format `dd/MM/yyyy`), and prints samples of:
  *
  *  - the raw readings,
  *  - the readings enriched with day-of-week / day / month / year,
  *  - per-date aggregates (sums, and the average of `voltage`) rounded to 2 decimals,
  *  - the number of readings per (year, month).
  */
object EnrichmentDataset {

  /**
    * Entry point of the job.
    *
    * @param args command-line arguments (not used; the input path is hard-coded)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Enrichment dataset for Data Munging Using Spark").getOrCreate()
    import spark.implicits._

    try {
      val sc = spark.sparkContext

      val rdd = sc.textFile("file:///root/household_power_consumption.txt")

      // The first line is treated as the header; rows containing "?" are dropped
      // before the numeric fields are parsed with toDouble.
      val header = rdd.first()
      val data = rdd
        .filter(row => row != header)
        .filter(row => !row.contains("?"))

      val hhEPCClassRdd = data
        .map(_.split(";"))
        .map { p =>
          HouseholdEPC(
            p(0).trim,
            p(1).trim,
            p(2).toDouble,
            p(3).toDouble,
            p(4).toDouble,
            p(5).toDouble,
            p(6).toDouble,
            p(7).toDouble,
            p(8).toDouble
          )
        }
      val hhEPCDF = hhEPCClassRdd.toDF()
      hhEPCDF.show(5)

      // Single definition of "the `date` string parsed as a date"; the column is
      // resolved by name, so it can be reused on every DataFrame that has `date`.
      val parsedDate = to_date(unix_timestamp($"date", "dd/MM/yyyy").cast("timestamp"))

      val hhEPCDatesDf = hhEPCDF
        // "EEEE" is the full day-of-week name; five or more letters are rejected
        // by the Spark 3 datetime formatter.
        .withColumn("dow", from_unixtime(unix_timestamp($"date", "dd/MM/yyyy"), "EEEE"))
        .withColumn("day", dayofmonth(parsedDate))
        .withColumn("month", month(parsedDate))
        .withColumn("year", year(parsedDate))

      hhEPCDatesDf.show(5)

      val delTmDF = hhEPCDF.drop("time")

      // Per-date aggregates: A..G are intermediate names, rounded and renamed below.
      val finalDayDf1 = delTmDF
        .groupBy($"date")
        .agg(
          sum($"gap").name("A"),
          sum($"grp").name("B"),
          avg($"voltage").name("C"),
          sum($"gi").name("D"),
          sum($"sm_1").name("E"),
          sum($"sm_2").name("F"),
          sum($"sm_3").name("G")
        )
        .select(
          $"date",
          round($"A", 2).name("dgap"),
          round($"B", 2).name("dgrp"),
          round($"C", 2).name("dvoltage"),
          // Must read D (sum of gi); reading C here would duplicate dvoltage.
          round($"D", 2).name("dgi"),
          round($"E", 2).name("dsm_1"),
          round($"F", 2).name("dsm_2"),
          round($"G", 2).name("dsm_3")
        )
        .withColumn("day", dayofmonth(parsedDate))
        .withColumn("month", month(parsedDate))
        .withColumn("year", year(parsedDate))

      finalDayDf1.show(5)

      val readingsByMonthDf = hhEPCDatesDf
        .groupBy($"year", $"month")
        .count()
        .orderBy($"year", $"month")

      readingsByMonthDf.show(5)
    } finally {
      // Release the session's resources even if one of the steps above fails.
      spark.stop()
    }

  }

}
