package ca.training.bigdata.spark.sql.dm

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
  * Spark driver that inspects missing values in a tab-separated daily weather file
  * and converts the raw rows into [[DayWeather]] records.
  *
  * Created by BigDataTraining on 2018-03-10.
  */
object AnalyzeMissingData {

  /** Input file used when no path is supplied as the first command-line argument. */
  private val DefaultInputPath = "file:////root/TrainingOnHDP/dataset/spark/weather_201701.txt"

  /** Text substituted for a missing field so that numeric columns still parse. */
  private val MissingPlaceholder = "0"

  /**
    * Maps a missing-value marker to [[MissingPlaceholder]].
    *
    * A field counts as missing when, after trimming, it is empty or equal to "-".
    * Any other field is returned untouched (surrounding whitespace included).
    */
  private def normalizeMissing(field: String): String = {
    val trimmed = field.trim
    if (trimmed.isEmpty || trimmed == "-") MissingPlaceholder else field
  }

  /**
    * Parses tab-separated weather lines into [[DayWeather]] records.
    *
    * Column 0 is trimmed and passed as text, columns 1 to 19 are parsed as doubles
    * and column 20 is passed as text. Missing markers (blank or "-") are replaced
    * with "0" in every column before parsing.
    *
    * @param data lines without the header row, each holding at least 21 tab-separated columns
    * @return one [[DayWeather]] per input line
    * @throws NumberFormatException (at action time) if a numeric column holds non-numeric text
    * @throws ArrayIndexOutOfBoundsException (at action time) if a line has fewer than 21 columns
    */
  def processRdd(data: RDD[String]): RDD[DayWeather] =
    data
      // A limit of -1 keeps trailing empty columns; the one-argument split drops them,
      // which made p(20) fail on rows whose last column is empty.
      .map(_.split("\t", -1).map(normalizeMissing))
      .map { p =>
        DayWeather(
          p(0).trim,
          p(1).toDouble,
          p(2).toDouble,
          p(3).toDouble,
          p(4).toDouble,
          p(5).toDouble,
          p(6).toDouble,
          p(7).toDouble,
          p(8).toDouble,
          p(9).toDouble,
          p(10).toDouble,
          p(11).toDouble,
          p(12).toDouble,
          p(13).toDouble,
          p(14).toDouble,
          p(15).toDouble,
          p(16).toDouble,
          p(17).toDouble,
          p(18).toDouble,
          p(19).toDouble,
          p(20))
      }

  /**
    * Entry point: prints several views of the weather file that highlight missing data.
    *
    * @param args optional; `args(0)` overrides the input path, otherwise
    *             [[DefaultInputPath]] is read
    */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val spark = SparkSession.builder().appName("Analyze Missing Data for Data Munging Using Spark").getOrCreate()

    // Release the session even when one of the actions below fails.
    try {
      import spark.implicits._
      val sc = spark.sparkContext

      val lines = sc.textFile(inputPath)

      // The first line is the header; drop every line equal to it.
      val header = lines.first()
      val data = lines.filter(row => row != header)

      // Keeps rows in which no column is exactly a single space.
      // NOTE(review): the original name (empty_rdd) hints that the opposite filter
      // may have been intended - confirm before relying on this output.
      val rowsWithoutBlank = data.map(_.split("\t")).filter(!_.contains(" "))
      rowsWithoutBlank.collect().foreach(x => println(x.mkString(",")))

      val df = spark.read.format("csv")
        .option("delimiter", "\t")
        .option("header", true)
        .option("inferSchema", true)
        .load(inputPath)

      df.as("a").select($"a.*").show()

      // Rows whose "ev" column is a single space or whose "p_high" column is "-".
      val rowsWithMissing = df.filter($"ev" === " " || $"p_high" === "-")
      rowsWithMissing.show(100)

      val weatherRdd = processRdd(data)

      weatherRdd.take(5).foreach(println)

      // Swap the "0" placeholder for "NA" in the two named columns.
      // NOTE(review): assumes DayWeather exposes fields named CET and Events - confirm.
      val ds = weatherRdd.toDF().na.replace(Seq("CET", "Events"), Map("0" -> "NA")).as[DayWeather]
      ds.show()
    } finally {
      spark.stop()
    }
  }

}
