package ca.training.bigdata.spark.sql.dm

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import ca.training.bigdata.spark.sql.dm.AnalyzeMissingData.processRdd

/**
  * Spark job that combines daily household electric power consumption with daily
  * weather observations and reports how the two relate.
  *
  * Steps performed by `main`:
  *  1. Read the semicolon-separated household readings, dropping the header line and
  *     every line that contains "?".
  *  2. Aggregate the readings per `date` (sums, plus the average voltage).
  *  3. Read the weather file and pass its lines through `AnalyzeMissingData.processRdd`.
  *  4. Join both datasets on the calendar date, print Pearson correlations for a fixed
  *     set of column pairs, and show a per-(year, month) roll-up of the joined data.
  *
  * Created by BigDataTraining on 2018-03-10.
  */
object CombineDataset {

  /**
    * Job entry point.
    *
    * @param args command-line arguments; not used (input locations are hard-coded)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Combine the Dataset using joins for Data Munging Using Spark").getOrCreate()
    import spark.implicits._

    try {
      val sc = spark.sparkContext

      // The household `date` column holds "dd/MM/yyyy" strings. This expression turns it
      // into a proper date and is reused for every day/month/year extraction below.
      val parsedDate = to_date(unix_timestamp($"date", "dd/MM/yyyy").cast("timestamp"))

      // ---- Household power consumption -------------------------------------------
      val rawReadings = sc.textFile("file:///root/household_power_consumption.txt")
      val header = rawReadings.first()
      // Skip the header line and any line containing "?".
      val readings = rawReadings.filter(row => row != header && !row.contains("?"))

      val hhEPCDF = readings
        .map(_.split(";"))
        .map(p => HouseholdEPC(
          p(0).trim, p(1).trim,
          p(2).toDouble, p(3).toDouble, p(4).toDouble, p(5).toDouble,
          p(6).toDouble, p(7).toDouble, p(8).toDouble))
        .toDF()
      hhEPCDF.show(5)

      // "EEEE" is the full day-of-week name. The previous "EEEEE" produced the same text
      // with the legacy formatter but is rejected by Spark 3's datetime patterns.
      val hhEPCDatesDf = hhEPCDF
        .withColumn("dow", from_unixtime(unix_timestamp($"date", "dd/MM/yyyy"), "EEEE"))
        .withColumn("day", dayofmonth(parsedDate))
        .withColumn("month", month(parsedDate))
        .withColumn("year", year(parsedDate))

      hhEPCDatesDf.show(5)

      // ---- Daily roll-up ---------------------------------------------------------
      // A..G are temporary aggregate names that are rounded and renamed right after.
      val finalDayDf1 = hhEPCDF
        .drop("time")
        .groupBy($"date")
        .agg(
          sum($"gap").name("A"),
          sum($"grp").name("B"),
          avg($"voltage").name("C"),
          sum($"gi").name("D"),
          sum($"sm_1").name("E"),
          sum($"sm_2").name("F"),
          sum($"sm_3").name("G"))
        .select(
          $"date",
          round($"A", 2).name("dgap"),
          round($"B", 2).name("dgrp"),
          round($"C", 2).name("dvoltage"),
          // Fixed: dgi must come from D (sum of gi); it used to repeat C (avg voltage).
          round($"D", 2).name("dgi"),
          round($"E", 2).name("dsm_1"),
          round($"F", 2).name("dsm_2"),
          round($"G", 2).name("dsm_3"))
        .withColumn("day", dayofmonth(parsedDate))
        .withColumn("month", month(parsedDate))
        .withColumn("year", year(parsedDate))

      val ds1 = finalDayDf1.as[HouseholdEPCDTmDay]

      // ---- Weather ---------------------------------------------------------------
      val rawWeather = sc.textFile("file:////root/TrainingOnHDP/dataset/spark/weather_201701.txt")
      val header2 = rawWeather.first()
      val weatherLines = rawWeather.filter(row => row != header2)
      val ds2 = processRdd(weatherLines)
        .toDF()
        // In the CET and Events columns, replace the value "0" with "NA".
        .na.replace(Seq("CET", "Events"), Map("0" -> "NA"))
        .as[DayWeather]

      // ---- Join on calendar date -------------------------------------------------
      // The two sides format dates differently, so both are converted to epoch seconds.
      // Passing the condition to `join` makes this an explicit inner equi-join instead
      // of an unconditioned join that is filtered afterwards.
      val joined_ds = ds1.join(
        ds2,
        unix_timestamp(ds1("date"), "dd/MM/yyyy") === unix_timestamp(ds2("CET"), "yyyy-MM-dd"))

      // Cache and materialise once; the joined data is scanned by every statistic below.
      joined_ds.cache()
      joined_ds.count()

      // ---- Correlations ----------------------------------------------------------
      Seq(
        "Mean_TemperatureC" -> "dgap",
        "Mean_TemperatureC" -> "dgrp",
        "Mean_Humidity" -> "dgap",
        "Mean_Humidity" -> "dgrp",
        "Max_TemperatureC" -> "dsm_1",
        "Max_TemperatureC" -> "dsm_2",
        "Max_TemperatureC" -> "dsm_3"
      ).foreach { case (weatherCol, powerCol) =>
        val corr = joined_ds.stat.corr(weatherCol, powerCol)
        println("%s to %s : Correlation = %.4f".format(weatherCol, powerCol, corr))
      }

      // ---- Monthly roll-up -------------------------------------------------------
      val joinedMonthly_ds = joined_ds
        .groupBy("year", "month")
        .agg(
          sum($"dgap").name("A"),
          sum($"dgrp").name("B"),
          avg($"dvoltage").name("C"),
          sum($"dgi").name("D"),
          sum($"dsm_1").name("E"),
          sum($"dsm_2").name("F"),
          sum($"dsm_3").name("G"))
        .select(
          $"year",
          $"month",
          round($"A", 2).name("mgap"),
          round($"B", 2).name("mgrp"),
          round($"C", 2).name("mvoltage"),
          // Fixed: mgi must come from D (sum of dgi); it used to repeat C (avg voltage).
          round($"D", 2).name("mgi"),
          round($"E", 2).name("msm_1"),
          round($"F", 2).name("msm_2"),
          round($"G", 2).name("msm_3"))
        .orderBy("year", "month")

      joinedMonthly_ds.show()
    } finally {
      // Release the session's resources whether the job succeeded or failed.
      spark.stop()
    }
  }

}
