package ca.training.bigdata.spark.sql.eda

import org.apache.spark.sql.{DataFrameWriter, Row, SparkSession}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.functions._

/**
  * Created by BigDataTraining on 2018-03-10.
  *
  * Spark SQL job that reads a semicolon-separated bank marketing CSV, prints several
  * pivot tables to stdout, writes a deposit-by-month pivot as CSV and finally reads that
  * CSV back to print per-quarter totals for each deposit value.
  */
object CreatingPivotTables {

  /**
    * Program entry point. Obtains (or creates) the SparkSession, runs the analysis and
    * always stops the session afterwards, even when a step fails.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Using Spark for creating tables").getOrCreate()
    try {
      run(spark)
    } finally {
      // Release the session's resources regardless of how the analysis ended.
      spark.stop()
    }
  }

  /**
    * Runs the whole exploratory analysis on the given session.
    *
    * @param spark active session used for reading, writing and RDD access
    */
  private def run(spark: SparkSession): Unit = {
    import spark.implicits._
    val sc = spark.sparkContext

    // Explicit schema: the 21 columns are bound by position, so the names used here
    // replace whatever the file's header line calls them.
    val schema = StructType(Array(
      StructField("age", DataTypes.IntegerType),
      StructField("job", DataTypes.StringType),
      StructField("marital", DataTypes.StringType),
      StructField("edu", DataTypes.StringType),
      StructField("credit_default", DataTypes.StringType),
      StructField("housing", DataTypes.StringType),
      StructField("loan", DataTypes.StringType),
      StructField("contact", DataTypes.StringType),
      StructField("month", DataTypes.StringType),
      StructField("day", DataTypes.StringType),
      StructField("dur", DataTypes.DoubleType),
      StructField("campaign", DataTypes.DoubleType),
      StructField("pdays", DataTypes.DoubleType),
      StructField("prev", DataTypes.DoubleType),
      StructField("pout", DataTypes.StringType),
      StructField("emp_var_rate", DataTypes.DoubleType),
      StructField("cons_price_idx", DataTypes.DoubleType),
      StructField("cons_conf_idx", DataTypes.DoubleType),
      StructField("euribor3m", DataTypes.DoubleType),
      StructField("nr_employed", DataTypes.DoubleType),
      StructField("deposit", DataTypes.StringType)
    ))

    val df = spark.read
      .schema(schema)
      .option("sep", ";")
      .option("header", true)
      .csv("file:///root/bank-additional-full.csv")

    val select_df = df.select(
      $"job", $"marital", $"edu", $"housing", $"loan", $"contact", $"month",
      $"day", $"dur", $"campaign", $"pdays", $"prev", $"pout", $"deposit")

    // Pivot values are listed explicitly so the output column order is fixed and matches
    // the header names applied with toDF below.
    val allMarital = Seq("unknown", "divorced", "married", "single")
    val divorcedMarried = Seq("divorced", "married")
    // One (Tot, Avg) pair per marital status, in the same order as allMarital.
    val totAvgHeaders =
      Seq("Job", "U-Tot", "U-Avg", "D-Tot", "D-Avg", "M-Tot", "M-Avg", "S-Tot", "S-Avg")

    // Number of non-null housing values per marital status / housing value.
    select_df.groupBy("marital").pivot("housing").agg(count("housing")).sort("marital").show()

    // Total and average of "campaign" per job, split by marital status.
    select_df.groupBy("job").pivot("marital", allMarital)
      .agg(round(sum("campaign"), 2), round(avg("campaign"), 2))
      .sort("job")
      .toDF(totAvgHeaders: _*)
      .show()

    // Total and average of "dur" per job, split by marital status.
    select_df.groupBy("job").pivot("marital", allMarital)
      .agg(round(sum("dur"), 2), round(avg("dur"), 2))
      .sort("job")
      .toDF(totAvgHeaders: _*)
      .show()

    // Average of "dur" per job, then per (job, housing), for divorced and married only.
    select_df.groupBy("job").pivot("marital", divorcedMarried)
      .agg(round(avg("dur"), 2)).sort("job").show()
    select_df.groupBy("job", "housing").pivot("marital", divorcedMarried)
      .agg(round(avg("dur"), 2)).sort("job").show()

    val months = Seq("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec")
    val pivotDir = "/tmp/eda/pivot"

    // na.fill(0) matters for the read-back below: a pivot cell with no matching rows is
    // null, which would be written as an empty CSV field and then fail toDouble (or be
    // dropped by split when trailing).
    val saved_df = select_df
      .groupBy("deposit").pivot("month", months)
      .agg(count("deposit"))
      .sort("deposit")
      .na.fill(0)

    // Written without a header line, so every line read back below is a data row.
    saved_df.write.format("csv").mode("overwrite").save(pivotDir)

    val rdd = sc.textFile(s"$pivotDir/*.csv").map(_.split(","))

    // Column layout of the saved rows: the deposit label followed by the twelve months.
    val labels = "deposit" +: months
    val quarters = Seq(
      "Q1" -> Seq("jan", "feb", "mar"),
      "Q2" -> Seq("apr", "may", "jun"),
      "Q3" -> Seq("jul", "aug", "sep"),
      "Q4" -> Seq("oct", "nov", "dec"))
    val quarterNames = quarters.map(_._1)
    val quarterIndexes = quarters.map { case (_, names) => names.map(n => labels.indexOf(n)) }

    // Single pass over the saved rows: all four quarter totals are computed together, so
    // no caching is needed, and the result is kept instead of being thrown away.
    val quarterlyTotals = rdd
      .map(cols => (cols(0), quarterIndexes.map(idxs => idxs.map(i => cols(i).toDouble).sum)))
      .collect()

    quarterlyTotals.foreach { case (depositValue, totals) =>
      val cells = quarterNames.zip(totals).map { case (q, total) => s"$q=$total" }
      println(s"deposit=$depositValue ${cells.mkString(", ")}")
    }
  }

}
