package ca.training.bigdata.spark.ml

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.GeneralizedLinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{SparkSession, _}

/**
  * Created by BigDataTraining on 2018-04-01.
  *
  * Spark driver that fits a generalized linear regression (gaussian family, identity link)
  * on the bike-sharing hourly CSV and prints the root mean squared error measured on a
  * held-out 20% split.
  *
  * Usage: the first command-line argument, when present, is the CSV location; otherwise
  * the default training path is read.
  */
object BikeSharingPrediction {

  /** CSV location used when no path is passed on the command line. */
  private val DefaultInputPath = "file:///root/TrainingOnHDP/dataset/spark/hour.csv"

  /** Columns removed from the raw data before any feature is built. */
  private val DroppedCols = Seq("instant", "dteday", "casual", "registered")

  /** Feature columns that arrive as strings from the CSV reader and are cast to double. */
  private val NumericFeatureCols = Seq(
    "season", "yr", "mnth", "hr", "holiday", "weekday",
    "workingday", "weathersit", "temp", "atemp", "hum", "windspeed")

  /** Name the regression stage expects for the target column. */
  private val LabelCol = "label"

  /** Target column name in the stock UCI `hour.csv` header. */
  private val RawTargetCol = "cnt"

  /**
    * Fits `vectorAssembler -> vectorIndexer -> GeneralizedLinearRegression` on 80% of
    * `dataFrame` and prints the RMSE obtained on the remaining 20%.
    *
    * @param vectorAssembler stage that builds the raw feature vector
    * @param vectorIndexer   stage that must output a column named "features"
    * @param dataFrame       input rows; must contain a double column named "label"
    */
  def genLinearRegressionWithVectorFormat(
      vectorAssembler: VectorAssembler,
      vectorIndexer: VectorIndexer,
      dataFrame: DataFrame): Unit = {
    val lr = new GeneralizedLinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    // Fixed seed keeps the train/test split reproducible between runs.
    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)

    val model = pipeline.fit(training)

    // Read prediction and label from the same row in one pass. Zipping two separately
    // derived RDDs only works while both keep identical partitioning and element counts.
    val predictionAndLabel = model.transform(test)
      .select("prediction", "label")
      .rdd
      .map(row => (row.getDouble(0), row.getDouble(1)))

    val rmse = new RegressionMetrics(predictionAndLabel).rootMeanSquaredError

    println(s"  Root mean squared error (RMSE): $rmse")
  }

  /**
    * Entry point: loads the CSV, shows a preview, prepares the numeric columns and runs
    * the regression.
    *
    * @param args optional; `args(0)` overrides the default CSV location
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Bike Sharing Prediction using Generalized Linear Regression")
      .getOrCreate()

    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // Cached because the frame is counted, queried through SQL and then used for training.
      val df = spark.read.format("csv").option("header", "true").load(inputPath).cache()

      // registerTempTable is deprecated since Spark 2.0.
      df.createOrReplaceTempView("BikeSharing")

      println(df.count())

      spark.sql("SELECT * FROM BikeSharing").show()

      val trimmed = df.drop(DroppedCols: _*)

      // The stock UCI header names the target `cnt`, while the pipeline expects `label`.
      // Accept either so both a pre-renamed and an untouched CSV work.
      // NOTE(review): confirm which header the training copy of hour.csv actually carries.
      val labelled =
        if (trimmed.columns.contains(LabelCol)) {
          trimmed
        } else {
          require(
            trimmed.columns.contains(RawTargetCol),
            s"Input must contain a '$LabelCol' or '$RawTargetCol' column; found: " +
              trimmed.columns.mkString(", "))
          trimmed.withColumnRenamed(RawTargetCol, LabelCol)
        }

      // The CSV reader yields strings (no schema inference), so cast every modelled column.
      val typed = (NumericFeatureCols :+ LabelCol).foldLeft(labelled) { (acc, name) =>
        acc.withColumn(name, acc(name).cast("double"))
      }

      typed.printSchema()

      // Every remaining column except the label is fed to the assembler.
      val featureCols = typed.columns.filterNot(_ == LabelCol)

      val vectorAssembler = new VectorAssembler()
        .setInputCols(featureCols)
        .setOutputCol("rawFeatures")

      val vectorIndexer = new VectorIndexer()
        .setInputCol("rawFeatures")
        .setOutputCol("features")
        .setMaxCategories(2)

      genLinearRegressionWithVectorFormat(vectorAssembler, vectorIndexer, typed)
    } finally {
      // Release the session (and with it the cached frame) even when the job fails.
      spark.stop()
    }
  }
}
