package ca.training.bigdata.spark.sql.dm

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
  * Created by BigDataTraining on 2018-03-10.
  */
object ComputingBasicStatistics {

  /** Input file used when no path is passed on the command line. */
  private val DefaultInputPath = "file:///root/household_power_consumption.txt"

  /** Columns of the summary table that are rounded before being displayed. */
  private val RoundedColumns = Seq("gap", "grp", "voltage", "gi", "sm_1", "sm_2", "sm_3")

  /**
    * Loads a semicolon-separated text file, converts it to a DataFrame of
    * `HouseholdEPC` rows and prints basic statistics about it:
    *
    *  - the number of lines read from the file,
    *  - the first five parsed rows,
    *  - the `describe()` summary table, raw and rounded to 4 decimals,
    *  - the number of distinct values in the `date` column.
    *
    * @param args optional; `args(0)` overrides the input path. When absent the
    *             original hard-coded path is used.
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Compute basic statistics and aggregations for Data Munging Using Spark").getOrCreate()
    import spark.implicits._

    // Always release the session, even when one of the jobs below fails.
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)
      val lines = spark.sparkContext.textFile(inputPath)
      // The count was previously computed and thrown away; report it instead.
      println(s"Lines read: ${lines.count()}")

      // The first line is treated as the header; every line equal to it is removed.
      val header = lines.first()
      // Lines containing '?' anywhere are discarded before parsing
      // (presumably the file's missing-value marker -- confirm against the data set).
      val data = lines.filter(_ != header).filter(!_.contains("?"))

      val hhEPCDF = data
        .map(_.split(";"))
        .map { p =>
          HouseholdEPC(
            p(0).trim, p(1).trim,
            p(2).toDouble, p(3).toDouble, p(4).toDouble, p(5).toDouble,
            p(6).toDouble, p(7).toDouble, p(8).toDouble)
        }
        .toDF()
      hhEPCDF.show(5)

      // Build the summary once and reuse it for both the raw and the rounded view.
      val summary = hhEPCDF.describe()
      summary.show()

      // Select the rounded columns directly rather than selecting the raw ones
      // and dropping them again; the resulting columns are identical.
      val rounded = RoundedColumns.map(name => round(col(name), 4).name(s"r$name"))
      summary.select((col("summary") +: rounded): _*).show()

      // Same number the former groupBy("date") + countDistinct + count() produced
      // (one output row per distinct date), without the redundant aggregation.
      val numDates = hhEPCDF.select("date").distinct().count()
      println(s"Distinct dates: $numDates")
    } finally {
      spark.stop()
    }
  }

}
