package ca.training.bigdata.spark.sql.etl

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
  * Created by BigDataTraining on 2018-03-10.
  *
  * Batch ETL example: reads the tab-separated file `/tmp/online_retail.csv`,
  * converts the textual invoice date into a timestamp column, and writes the
  * result twice: to the `online_retail` table of a MySQL database over JDBC
  * and as Parquet under `/tmp/online_retail`.
  */
object UsingSparkwithRDBMS {

  /** One data row of the input file; fields are listed in file column order. */
  case class Transaction(invoice_no: String,
                         stock_code: String,
                         description: String,
                         quantity: Int,
                         invoice_date: String,
                         unit_price: Double,
                         customer_id: String,
                         country: String)

  /**
    * Runs the load -> transform -> write pipeline.
    *
    * A row whose quantity or unit price is not numeric, or that has fewer
    * than eight fields, fails the job with the underlying parsing exception.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Using Spark with RDBMS").getOrCreate()
    import spark.implicits._

    try {
      // A limit of -1 keeps trailing empty fields. The single-argument split
      // drops them, which shortens rows whose last columns are blank and makes
      // the positional access below fail.
      val rows = spark.sparkContext
        .textFile("/tmp/online_retail.csv")
        .map(line => line.split("\t", -1).map(_.trim))

      // The first line is the header; drop every row whose first cell matches
      // the header's first cell. Only that cell is captured by the closure.
      val headerFirstCell = rows.first()(0)
      val dataRows = rows.filter(fields => fields(0) != headerFirstCell)

      val parsed = dataRows.map { f =>
        Transaction(f(0), f(1), f(2), f(3).toInt, f(4), f(5).toDouble, f(6), f(7))
      }.toDF()

      // Replace the string invoice_date with a timestamp column of the same
      // name: add the parsed column, select everything except the original
      // string column, then rename.
      val transactions = parsed
        .withColumn(
          "invoice_timestamp",
          unix_timestamp(col("invoice_date"), "dd/MM/yyyy HH:mm").cast("timestamp"))
        .select("invoice_no", "stock_code", "description", "quantity",
          "unit_price", "customer_id", "country", "invoice_timestamp")
        .withColumnRenamed("invoice_timestamp", "invoice_date")
        .cache() // reused by both writes below

      // NOTE(review): credentials are hard-coded; consider supplying them via
      // configuration or the environment instead of source code.
      val jdbcOptions = Map(
        "url" -> "jdbc:mysql://localhost:3306/spark",
        "driver" -> "com.mysql.jdbc.Driver",
        "numPartitions" -> "1",
        "dbtable" -> "online_retail",
        "user" -> "root",
        "password" -> "hadoop")

      try {
        transactions.write.mode("overwrite").format("jdbc").options(jdbcOptions).save()
        transactions.write.mode("overwrite").format("parquet").save("/tmp/online_retail")
      } finally {
        // Release the cached blocks whether or not the writes succeeded.
        transactions.unpersist()
      }
    } finally {
      spark.stop()
    }
  }

}
