package ca.training.bigdata.spark.ml

import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, RegexTokenizer, StopWordsRemover}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import scala.collection.mutable

/**
  * Topic modeling over an XML corpus using Latent Dirichlet Allocation (LDA).
  *
  * The job reads `sentences` rows from XML files, joins the sentences of each row into one
  * string, tokenizes it, removes stop words, builds term-count vectors over a bounded
  * vocabulary, fits an LDA model and finally prints model diagnostics together with the
  * top-weighted terms of every topic.
  *
  * Created by BigDataTraining on 2018-03-31.
  */
object TopicModelingLDA {

  /** Corpus location used when no path is supplied on the command line. */
  private val DefaultCorpusPath = "file:///root/corpus/fulltext/*.xml"

  /**
    * Job entry point.
    *
    * @param args optional arguments; `args(0)`, when present, overrides the default corpus path
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Topic modeling using LDA").getOrCreate()
    // Always release the session, even when the job fails half-way through.
    try {
      run(spark, args.headOption.getOrElse(DefaultCorpusPath))
    } finally {
      spark.stop()
    }
  }

  /**
    * Runs the whole pipeline: load, featurize, fit LDA, report.
    *
    * @param spark      active session used for reading the corpus
    * @param corpusPath path (glob allowed) of the XML files to load
    */
  private def run(spark: SparkSession, corpusPath: String): Unit = {
    import spark.implicits._

    val numTopics: Int = 10
    val maxIterations: Int = 3
    val vocabSize: Int = 100
    val termsPerTopic: Int = 3

    val df = spark.read.format("com.databricks.spark.xml")
      .option("rowTag", "sentences")
      .option("mode", "PERMISSIVE")
      .load(corpusPath)

    // One row per document: a generated id plus all sentence texts joined into one string.
    val docDF = df.select("sentence._VALUE")
      .withColumn("docId", monotonically_increasing_id())
      .withColumn("sentences", concat_ws(",", $"_VALUE"))
      .drop("_VALUE")

    // gaps = false: the pattern describes the tokens themselves (runs of letters),
    // not the separators between them.
    val tokens = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\p{L}+")
      .setInputCol("sentences")
      .setOutputCol("words")
      .transform(docDF)

    val filteredTokens = new StopWordsRemover()
      .setCaseSensitive(false)
      .setInputCol("words")
      .setOutputCol("filtered")
      .transform(tokens)

    val cvModel = new CountVectorizer()
      .setInputCol("filtered")
      .setOutputCol("features")
      .setVocabSize(vocabSize)
      .fit(filteredTokens)

    // Cached because it feeds four separate computations below (fit, logLikelihood,
    // logPerplexity, transform); without it each one re-reads and re-tokenizes the corpus.
    val termVectors = cvModel
      .transform(filteredTokens)
      .select("docId", "features")
      .cache()

    val lda = new LDA()
      .setK(numTopics)
      .setMaxIter(maxIterations)

    val ldaModel = lda.fit(termVectors)

    println("Model was fit using parameters: " + ldaModel.parent.extractParamMap)

    val ll = ldaModel.logLikelihood(termVectors)
    val lp = ldaModel.logPerplexity(termVectors)

    println(s"The lower bound on the log likelihood of the entire corpus: $ll")
    println(s"The upper bound on perplexity: $lp")

    val topicsDF = ldaModel.describeTopics(termsPerTopic)
    println("The topics described by their top-weighted terms:")
    topicsDF.show(false)

    val transformed = ldaModel.transform(termVectors)
    transformed.select("docId", "topicDistribution").take(3).foreach(println)

    val vocab = cvModel.vocabulary

    // describeTopics yields one row per topic, so collecting is cheap. Printing from the
    // driver (instead of Dataset.foreach, which runs on executors) keeps the summary in the
    // driver's console when the job runs on a cluster.
    topicsDF.collect().foreach { row =>
      val topic = row.getAs[Int]("topic")
      // getSeq is the public accessor for array columns; it avoids casting to a concrete
      // collection class that Spark does not guarantee.
      val termIndices = row.getSeq[Int](row.fieldIndex("termIndices"))
      println(formatTopic(topic, termIndices, vocab))
    }
  }

  /**
    * Renders one topic as `Topic: <n>|[<term indices>]|[<terms>]`.
    *
    * Every element is followed by `", "` (including the last one), which keeps the output
    * identical to the format this job has always printed.
    *
    * @param topic       topic number
    * @param termIndices vocabulary indices of the topic's top-weighted terms
    * @param vocab       vocabulary used to translate indices into terms
    * @return the formatted line
    */
  private def formatTopic(topic: Int, termIndices: Seq[Int], vocab: Array[String]): String = {
    val indices = termIndices.map(idx => s"$idx, ").mkString
    val terms = termIndices.map(idx => s"${vocab(idx)}, ").mkString
    s"Topic: $topic|[$indices]|[$terms]"
  }

}
