package ca.training.bigdata.falcon.churn

import org.apache.spark.{SparkConf, SparkContext}
//import org.apache.spark.sql.SparkSession

/**
 * Spark job that reads a directory of raw e-mail text files, pulls a fixed set
 * of header fields out of each file and writes the result as Parquet.
 *
 * Program arguments:
 *  - `args(0)`: input path handed to `SparkContext.wholeTextFiles`
 *  - `args(1)`: output path for the Parquet data set (overwritten if present)
 */
object CustomerEmailTransformer {

  /** Header prefixes (including the trailing ": ") that are extracted from every file. */
  val keys: Array[String] = Array("Message-ID: ",
    "Date: ",
    "From: ",
    "To: ",
    "Subject: ",
    "Cc: ",
    "Mime-Version: ",
    "Content-Type: ",
    "Content-Transfer-Encoding: ",
    "Bcc: ",
    "X-From: ",
    "X-To: ",
    "X-cc: ",
    "X-bcc: ",
    "X-Folder: ",
    "X-Origin: ",
    "X-FileName: ")

  /** One output row; each field holds the trimmed text found for the matching entry of [[keys]]. */
  case class Email(message_id: String,
                   edate: String,
                   efrom: String,
                   eto: String,
                   subject: String,
                   cc: String,
                   mime_type: String,
                   content_type: String,
                   content_transfer_encoding: String,
                   bcc: String,
                   x_from: String,
                   x_to: String,
                   x_cc: String,
                   x_bcc: String,
                   x_folder: String,
                   x_origin: String,
                   x_filename: String)

  /**
   * Index of the first occurrence of `key` that begins a line of `content`
   * (offset 0 or directly after a '\n'), or -1 when there is none.
   *
   * Anchoring to the start of a line keeps "To: " from matching inside
   * "X-To: " and keeps keys that merely appear inside another header's value
   * from being mistaken for a header.
   */
  private def headerStart(key: String, content: String): Int = {
    @scala.annotation.tailrec
    def search(from: Int): Int = {
      val pos = content.indexOf(key, from)
      // pos == -1: not found; pos == 0: start of text; otherwise require a preceding newline.
      if (pos <= 0 || content.charAt(pos - 1) == '\n') pos
      else search(pos + 1)
    }
    search(0)
  }

  /**
   * Extracts the value that follows the header `key` in `content`.
   *
   * The value starts right after the first line-initial occurrence of `key`
   * and ends at the nearest other entry of [[keys]] whose own first
   * line-initial occurrence lies after it. When no such entry exists the value
   * runs to the end of `content`, so the last header present also carries
   * whatever text follows it.
   *
   * @param key     header prefix to look for, e.g. "Subject: "
   * @param content full text of one e-mail file
   * @return the trimmed value, or "" when `key` does not start any line
   */
  def extractByKey(key: String, content: String): String = {
    val start = headerStart(key, content)
    if (start < 0) ""
    else {
      val end = keys
        .filterNot(_.equalsIgnoreCase(key))
        .map(headerStart(_, content))
        .filter(_ > start)
        .reduceOption(_ min _)
        .getOrElse(content.length)
      content.substring(start + key.length, end).trim
    }
  }

  /** Builds an [[Email]] row from the full text of one file. */
  private def parseEmail(content: String): Email = {
    def field(key: String): String = extractByKey(key, content)

    Email(field("Message-ID: "),
      field("Date: "),
      field("From: "),
      field("To: "),
      field("Subject: "),
      field("Cc: "),
      field("Mime-Version: "),
      field("Content-Type: "),
      field("Content-Transfer-Encoding: "),
      field("Bcc: "),
      field("X-From: "),
      field("X-To: "),
      field("X-cc: "),
      field("X-bcc: "),
      field("X-Folder: "),
      field("X-Origin: "),
      field("X-FileName: "))
  }

  /**
   * Entry point: parses every file under `args(0)` and saves the rows as
   * Parquet at `args(1)`, replacing existing output.
   *
   * @param args input path followed by output path
   * @throws IllegalArgumentException if fewer than two arguments are given
   */
  def main(args: Array[String]): Unit = {
    require(args.length >= 2,
      "Usage: CustomerEmailTransformer <input path> <output path>")

    /*
    val spark = SparkSession
      .builder()
      .appName("Customer Email Parsing and Transformation")
      .enableHiveSupport()
      .getOrCreate()
    */

    val conf = new SparkConf().setAppName("Customer Email Parsing and Transformation")
    val sc = new SparkContext(conf)

    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      // val files = spark.sparkContext.wholeTextFiles(args(0))
      // Each element is (file path, whole file content); only the content is used.
      val files = sc.wholeTextFiles(args(0))

      import sqlContext.implicits._

      val df = files.map { case (_, content) => parseEmail(content) }.toDF()

      df.printSchema()

      df.write.mode("overwrite").format("parquet").save(args(1))
    } finally {
      // Release the context even when reading, parsing or writing fails.
      sc.stop()
    }
  }

}
