package ca.training.bigdata.spark.sql.eda

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

/**
  * Stratified-sampling demo: loads a semicolon-separated bank CSV, keys every
  * record by its `marital` column and compares `sampleByKey` with
  * `sampleByKeyExact` by printing per-key counts.
  *
  * Created by BigDataTraining on 2018-03-10.
  */
object SamplingwithRDD {

  /** Input read when no path is supplied as the first program argument. */
  private val DefaultInputPath = "file:///root/bank-additional-full.csv"

  /**
    * Runs the demo and prints, in order: two sample records, the per-key counts
    * of the full data set, of the `sampleByKey` sample and of the
    * `sampleByKeyExact` sample.
    *
    * @param args optional; when present, `args(0)` replaces the default CSV location
    */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val spark = SparkSession.builder().appName("Sampling with RDD section").getOrCreate()

    try {
      // Columns are listed in file order; the schema is applied to the CSV as given.
      val schema = StructType(Seq(
        "age" -> DataTypes.IntegerType,
        "job" -> DataTypes.StringType,
        "marital" -> DataTypes.StringType,
        "edu" -> DataTypes.StringType,
        "credit_default" -> DataTypes.StringType,
        "housing" -> DataTypes.StringType,
        "loan" -> DataTypes.StringType,
        "contact" -> DataTypes.StringType,
        "month" -> DataTypes.StringType,
        "day" -> DataTypes.StringType,
        "dur" -> DataTypes.DoubleType,
        "campaign" -> DataTypes.DoubleType,
        "pdays" -> DataTypes.DoubleType,
        "prev" -> DataTypes.DoubleType,
        "pout" -> DataTypes.StringType,
        "emp_var_rate" -> DataTypes.DoubleType,
        "cons_price_idx" -> DataTypes.DoubleType,
        "cons_conf_idx" -> DataTypes.DoubleType,
        "euribor3m" -> DataTypes.DoubleType,
        "nr_employed" -> DataTypes.DoubleType,
        "deposit" -> DataTypes.StringType
      ).map { case (name, dataType) => StructField(name, dataType) })

      val df = spark.read
        .schema(schema)
        .option("sep", ";")
        .option("header", true)
        .csv(inputPath)

      // Resolve the stratification column by name instead of a magic index.
      val keyIdx = schema.fieldIndex("marital")

      // Row.toSeq copies the values as-is, so a null cell does not raise the
      // NullPointerException that getInt/getDouble would. The RDD is cached
      // because every action below would otherwise re-read and re-parse the CSV.
      val rdd = df.rdd
        .map(row => (row.getString(keyIdx), row.toSeq.toList))
        .cache()

      rdd.take(2).foreach(println)

      val fractions = Map("unknown" -> .10, "divorced" -> .15, "married" -> 0.5, "single" -> .25)

      // Both sampling methods look the key up in `fractions`; a record whose key
      // has no entry (e.g. a null from an unparsable line) would fail the job,
      // so only keys with a configured fraction are sampled.
      val sampleable = rdd.filter { case (key, _) => fractions.contains(key) }

      val rddSample =
        sampleable.sampleByKey(withReplacement = true, fractions = fractions, seed = 1L)
      val rddSampleExact =
        sampleable.sampleByKeyExact(withReplacement = true, fractions = fractions, seed = 1L)

      println(rdd.countByKey())
      println(rddSample.countByKey())
      println(rddSampleExact.countByKey())
    } finally {
      // Release the session's resources even when a stage fails.
      spark.stop()
    }
  }

}
