package ca.training.bigdata.spark.sql.dm

import org.apache.spark.sql.SparkSession

/**
  * Created by BigDataTraining on 2018-03-10.
  *
  * Spark driver that reads a semicolon-delimited text file, drops the header
  * line and every row containing a "?" character, converts the remaining rows
  * into `HouseholdEPC` records and prints the first five rows of the resulting
  * DataFrame.
  *
  * NOTE(review): "?" is presumably the missing-value marker of the household
  * power consumption data set -- confirm against the data description.
  */
object RetrievingData {

  /** Input location used when no path is passed on the command line. */
  private val DefaultInputPath = "file:///root/household_power_consumption.txt"

  /**
    * Program entry point.
    *
    * @param args optional command-line arguments; when present, `args(0)` is
    *             used as the input path instead of the built-in default
    */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)

    val spark = SparkSession
      .builder()
      .appName("Retrieving Data for Data Munging Using Spark")
      .enableHiveSupport()
      .getOrCreate()
    // Brings the RDD-to-DataFrame conversion (`toDF`) into scope.
    import spark.implicits._

    try {
      val lines = spark.sparkContext.textFile(inputPath)

      // take(1) rather than first(): first() throws on an empty RDD, whereas
      // this lets an empty input file be reported cleanly.
      lines.take(1).headOption match {
        case None =>
          println(s"No data found in $inputPath")

        case Some(header) =>
          // Single pass: skip the header line and any row containing "?".
          val data = lines.filter(line => line != header && !line.contains("?"))

          // Two text columns followed by seven numeric columns; a row with
          // fewer than nine fields or a non-numeric value fails the job here.
          val hhEPCClassRdd = data.map(_.split(";")).map { p =>
            HouseholdEPC(
              p(0).trim,
              p(1).trim,
              p(2).toDouble,
              p(3).toDouble,
              p(4).toDouble,
              p(5).toDouble,
              p(6).toDouble,
              p(7).toDouble,
              p(8).toDouble
            )
          }

          val hhEPCDF = hhEPCClassRdd.toDF()
          hhEPCDF.show(5)
      }
    } finally {
      // Release the session's resources whether the job succeeded or failed.
      spark.stop()
    }
  }

}
