package ca.training.bigdata.spark.ml

import org.apache.spark.sql.SparkSession
import ml.dmlc.xgboost4j.scala.spark.{DataUtils, XGBoost}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.evaluation.RegressionEvaluator
import ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
/**
  * Spark application that fits XGBoost regression models on the power plant data set.
  *
  * The job performs two exercises on the same tab-separated input file:
  *  1. builds a `VectorAssembler -> XGBoostEstimator` pipeline, fits it, shows its predictions,
  *     saves the fitted pipeline and reads it back;
  *  2. tunes the estimator with 4-fold cross-validation over `maxDepth` and `eta` on an 80%
  *     training split, prints the RMSE measured on the remaining 20% and saves the best model.
  *
  * Created by BigDataTraining on 2018-03-30.
  */
object PowerPlantPredictionPipeline {

  /** Location of the tab-separated input file (with a header row starting with "AT"). */
  private val InputPath = "file:///root/TrainingOnHDP/dataset/spark/powerplant.txt"

  /** Where the fitted assembler + XGBoost pipeline is persisted. */
  private val PipelineModelPath = "/tmp/xgPipeline"

  /** Where the best cross-validated XGBoost model is persisted. */
  private val TunedModelPath = "/tmp/xgboostTunedModel"

  /**
    * One record of the input file. `AT`, `V`, `AP` and `RH` are used as features and `PE` is
    * the regression target (it is renamed to "label" before training).
    */
  case class PowerPlantTable(AT: Double,
                             V : Double,
                             AP : Double,
                             RH : Double,
                             PE : Double)

  /**
    * Entry point: creates the Spark session, runs the training job and always stops the
    * session afterwards, even when the job fails.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Power Plant Prediction Using XGBoost and Spark ML Pipeline")
      .getOrCreate()

    try {
      run(spark)
    } finally {
      // Release the session (and with it the cached splits) whether or not the job succeeded.
      spark.stop()
    }
  }

  /**
    * Runs the pipeline exercise followed by the cross-validation exercise.
    *
    * @param spark the active session used to read the input and to create DataFrames
    */
  private def run(spark: SparkSession): Unit = {
    import spark.implicits._

    val df = spark.sparkContext.textFile(InputPath)
      // Blank lines (e.g. a trailing newline) cannot be parsed into a record; skip them.
      .filter(_.trim.nonEmpty)
      .map(_.split("\t"))
      // Drop the header row.
      .filter(fields => fields(0) != "AT")
      .map(fields => PowerPlantTable(
        fields(0).toDouble,
        fields(1).toDouble,
        fields(2).toDouble,
        fields(3).toDouble,
        fields(4).toDouble))
      .toDF()

    df.printSchema()
    df.show(10)

    val assembler = new VectorAssembler()
      .setInputCols(Array("AT", "V", "AP", "RH"))
      .setOutputCol("features")

    // NOTE(review): early_stopping_rounds (10) is larger than num_round (5), so early stopping
    // presumably never triggers with these settings - confirm the intent.
    val xgboostEstimator = new XGBoostEstimator(
      Map[String, Any](
        "num_round" -> 5,
        "nworkers" -> 1,
        "objective" -> "reg:linear",
        "eta" -> 0.3,
        "max_depth" -> 6,
        "early_stopping_rounds" -> 10))

    // ---- Part 1: end-to-end pipeline (assemble features, then train XGBoost) ----
    val pipeline = new Pipeline()
      .setStages(Array(assembler, xgboostEstimator))

    val pipelineData = df.withColumnRenamed("PE", "label")

    val pipelineModel = pipeline.fit(pipelineData)

    // Predictions are produced for the same rows the pipeline was fitted on.
    val predictions = pipelineModel.transform(pipelineData)
    predictions.show(10)

    // Persist the fitted pipeline and read it back to check that it can be reloaded.
    pipelineModel.write.overwrite().save(PipelineModelPath)
    val loadedPipeline = PipelineModel.load(PipelineModelPath)
    println(s"Reloaded pipeline stages: ${loadedPipeline.stages.map(_.uid).mkString(", ")}")

    // ---- Part 2: hyper-parameter tuning with cross-validation ----
    // The estimator is tuned on its own, so the feature vector is assembled up front and the
    // raw input columns are dropped.
    val vected = assembler.transform(df)
      .withColumnRenamed("PE", "label")
      .drop("AT", "V", "AP", "RH")

    // Fixed seed keeps the 20/80 split reproducible between runs.
    val Array(split20, split80) = vected.randomSplit(Array(0.20, 0.80), 1800009193L)
    val testSet = split20.cache()
    val trainingSet = split80.cache()

    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("rmse")

    // 2 x 2 grid: these values replace the estimator's maxDepth / eta for each candidate.
    val paramGrid = new ParamGridBuilder()
      .addGrid(xgboostEstimator.maxDepth, Array(4, 7))
      .addGrid(xgboostEstimator.eta, Array(0.1, 0.6))
      .build()

    val cv = new CrossValidator()
      .setEstimator(xgboostEstimator)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(4)

    val cvModel = cv.fit(trainingSet)

    // Report the outcome instead of discarding it: outside a REPL a bare expression prints nothing.
    println(s"Best model parameters:\n${cvModel.bestModel.extractParamMap()}")

    val results = cvModel.transform(testSet)
    val rmse = evaluator.evaluate(results)
    println(s"RMSE on the held-out test set: $rmse")

    cvModel.bestModel match {
      case tuned: ml.dmlc.xgboost4j.scala.spark.XGBoostModel =>
        // Overwrite like the pipeline save above, so that re-running the job does not fail
        // because the target path already exists.
        tuned.write.overwrite().save(TunedModelPath)
      case other =>
        throw new IllegalStateException(
          s"Expected the best model to be an XGBoostModel but got ${other.getClass.getName}")
    }
  }

}
