package ca.training.bigdata.spark.ml

import org.apache.spark.sql.SparkSession
import scala.util.matching.Regex
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.ml.feature.{HashingTF, IDF, RegexTokenizer, Tokenizer, NGram, StopWordsRemover, CountVectorizer}
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, IndexToString}
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier, LogisticRegression, NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.evaluation.{RegressionEvaluator, MulticlassClassificationEvaluator}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.clustering.{LDA}
import org.apache.spark.sql.functions._

/**
  * Created by BigDataTraining on 2018-03-30.
  */
object TextualAnalysis {

  /**
    * Blanks out dotted upper-case abbreviations.
    *
    * A match is one capital letter followed by a period, followed by one or
    * more further "capital letter + period" pairs (e.g. "U.S." or "U.S.A.").
    * Each match is replaced by a single space. A lone pair such as "A." does
    * not match and is left untouched.
    *
    * @param instr text to scan
    * @return `instr` with every matched abbreviation replaced by one space
    */
  def deleteAbbrev(instr: String): String =
    "[A-Z]\\.([A-Z]\\.)+".r.replaceAllIn(instr, " ")

  /**
    * Removes whole embedded documents whose `<TYPE>` marks them as non-text
    * or XBRL content.
    *
    * A match starts at `<TYPE>` immediately followed by one of the listed
    * type names and runs (across line breaks) to the nearest following
    * `</TEXT>`. Each match is replaced by a single space.
    *
    * Removed types: GRAPHIC, EXCEL, PDF, ZIP, COVER, CORRESP, and every
    * combination of the prefixes `EX-100`, `EX-101`, `EX-99.SDR K`,
    * `EX-99.SDR L` with the suffixes `.INS`, `.SCH`, `.CAL`, `.DEF`, `.LAB`,
    * `.PRE`.
    *
    * Compared with the earlier hand-expanded alternation this version
    *  - includes `EX-99.SDR [KL].DEF`, which was missing (`.LAB` was listed
    *    twice instead),
    *  - escapes the literal dots so they no longer match arbitrary characters,
    *  - drops the duplicated `.PRE` alternatives.
    *
    * @param instr raw filing text
    * @return `instr` with every matching document replaced by one space
    */
  def deleteDocTypes(instr: String): String = {
    val binaryTypes = "GRAPHIC|EXCEL|PDF|ZIP|COVER|CORRESP"
    val xbrlPrefixes = "EX-10[01]|EX-99\\.SDR [KL]"
    val xbrlSuffixes = "INS|SCH|CAL|DEF|LAB|PRE"
    // (?s): '.' also matches line terminators, so a document may span lines.
    val pattern = new Regex(
      "(?s)<TYPE>(?:" + binaryTypes + "|(?:" + xbrlPrefixes + ")\\.(?:" + xbrlSuffixes + ")).*?</TEXT>")
    pattern.replaceAllIn(instr, " ")
  }

  /**
    * Strips HTML head sections and the per-document SGML header tags.
    *
    * Two replacements are applied in order, each substituting one space:
    *  1. everything from `<HEAD>` to the nearest `</HEAD>`;
    *  2. everything from `<TYPE>` through the nearest subsequent
    *     `<SEQUENCE>`, `<FILENAME>` and `<DESCRIPTION>` tags. The match ends
    *     right after the `<DESCRIPTION>` tag itself; the text following that
    *     tag is kept.
    *
    * The head pattern now uses DOTALL and is case-insensitive, matching the
    * conventions of the sibling table pattern. Previously a head section
    * containing a line break, or written as `<head>`, was silently left in
    * place.
    *
    * @param instr filing text
    * @return `instr` with head sections and document header tags blanked out
    */
  def deleteMetaData(instr: String): String = {
    val headSection = new Regex("(?is)<HEAD>.*?</HEAD>")
    // A trailing lazy ".*?" always matches the empty string, so it is omitted.
    val documentHeader = new Regex("(?s)<TYPE>.*?<SEQUENCE>.*?<FILENAME>.*?<DESCRIPTION>")
    val withoutHead = headSection.replaceAllIn(instr, " ")
    documentHeader.replaceAllIn(withoutHead, " ")
  }

  /**
    * Drops tables wholesale, then every remaining markup tag.
    *
    * First, each span from `<Table` (any letter case) to the nearest
    * `</Table>` is replaced by one space, contents included. Afterwards every
    * remaining `<...>` tag is replaced by one space while the text between
    * tags is kept.
    *
    * @param instr markup text
    * @return `instr` without tables and without tags
    */
  def deleteTablesNHTMLElem(instr: String): String = {
    val withoutTables = instr.replaceAll("(?s)(?i)<Table.*?</Table>", " ")
    withoutTables.replaceAll("(?s)<[^>]*>", " ")
  }

  /**
    * Replaces a fixed set of HTML character entities with plain ASCII.
    *
    * The substitutions run one after another in the order listed below, each
    * over the output of the previous one. In particular `&amp;` is decoded
    * after the space/quote entities, so text such as `&amp;nbsp;` ends up as
    * the literal `&nbsp;` rather than a space.
    *
    * @param instr text containing entity references
    * @return `instr` with the listed entities replaced
    */
  def deleteExtCharset(instr: String): String = {
    // (entity pattern -> replacement), applied strictly top to bottom.
    val substitutions = Seq(
      "(?s)(&#32;|&nbsp;|&#x(A|a)0;)" -> " ",       // spaces / non-breaking spaces
      "(&#146;|&#x2019;)" -> "'",                    // right single quote
      "&#120;" -> "x",
      "(&#168;|&#167;|&reg;|&#153;|&copy;)" -> " ",  // assorted symbols, dropped
      "(&#147;|&#148;|&#x201C;|&#x201D;)" -> "\"",   // curly double quotes
      "&amp;" -> "&",
      "(&#150;|&#151;|&#x2013;)" -> "-",             // dashes
      "&#8260;" -> "/"                               // fraction slash
    )
    substitutions.foldLeft(instr) { case (text, (entity, plain)) =>
      new Regex(entity).replaceAllIn(text, plain)
    }
  }

  /**
    * Collapses every run of whitespace into a single space.
    *
    * Line feeds, carriage returns, tabs and spaces are all matched by `\s`,
    * so one pass is enough. The earlier implementation first normalised
    * newline runs to "\n" and tab runs to " " and only then applied this same
    * `\s+` replacement; those two extra passes could not change the final
    * result and have been removed.
    *
    * @param instr text with arbitrary whitespace
    * @return `instr` with each whitespace run replaced by exactly one space
    */
  def deleteExcessLFCRWS(instr: String): String =
    instr.replaceAll("\\s+", " ")

  /**
    * Replaces a few fixed boilerplate strings with a single space each.
    *
    * The strings removed are, verbatim:
    *  - `IDEA: XBRL DOCUMENT`
    *  - the comment marker `/* Do Not Remove This Comment */`
    *  - `v2.4.0.8`
    *
    * Matching is literal. Previously the strings were compiled as regular
    * expressions, so the dots in `v2.4.0.8` matched any character (for
    * example `v2x4y0z8` was removed as well).
    *
    * @param str text to clean
    * @return `str` with every occurrence of the strings above replaced by " "
    */
  def deleteStrings(str: String): String = {
    val boilerplate = Seq("IDEA: XBRL DOCUMENT", "/* Do Not Remove This Comment */", "v2.4.0.8")
    // String.replace works on literal character sequences, not regexes.
    boilerplate.foldLeft(str)((text, literal) => text.replace(literal, " "))
  }

  /**
    * Removes URLs and file names, then blanks everything that is neither an
    * ASCII letter nor a period.
    *
    * Steps, in order:
    *  1. `http`, `https`, `ftp` and `file` URLs are deleted (replaced by the
    *     empty string).
    *  2. File names ending in `.txt`, `.sgml`, `.xml`, `.xsd`, `.htm` or
    *     `.html` are replaced by one space.
    *  3. Every remaining character other than `a-z`, `A-Z` and `.` is
    *     replaced by one space (one space per character, runs are not
    *     collapsed).
    *
    * Fixes relative to the earlier patterns:
    *  - the dot before the extension is escaped, so text such as "10 htm" is
    *    no longer mistaken for a file name;
    *  - `html?` replaces `htm|html`, which matched `htm` first and left a
    *    stray "l" behind for every `.html` file;
    *  - the final character class no longer contains `|` and `^`. Inside
    *    `[...]` they are literals, so pipes and carets used to survive.
    *
    * @param instr text to clean
    * @return text consisting only of ASCII letters, periods and spaces
    */
  def deleteAllURLsFileNamesDigitsPunctuationExceptPeriod(instr: String): String = {
    val urls = new Regex("\\b(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]")
    val fileNames = new Regex("[_a-zA-Z0-9\\-\\.]+\\.(txt|sgml|xml|xsd|html?)")
    val notLetterOrPeriod = new Regex("[^a-zA-Z.]")

    val withoutUrls = urls.replaceAllIn(instr, "")
    val withoutFileNames = fileNames.replaceAllIn(withoutUrls, " ")
    notLetterOrPeriod.replaceAllIn(withoutFileNames, " ")
  }

  /**
    * Reduces the text to ASCII letters separated by single spaces.
    *
    * Every maximal run of characters outside `a-z` / `A-Z` is replaced by
    * exactly one space. The previous character class was `[^a-zA-Z|]`, in
    * which `|` is a literal, so pipe characters were kept and ended up inside
    * the word tokens.
    *
    * Leading or trailing non-letters become a single leading or trailing
    * space; the result is not trimmed.
    *
    * @param instr text to reduce
    * @return letters only, with one space between words
    */
  def keepOnlyAlphas(instr: String): String =
    instr.replaceAll("[^a-zA-Z]+", " ")

  /**
    * Sums the UTF-8 encoded length, in bytes, of every string in the RDD.
    *
    * Only the bytes of the elements themselves are counted. When the RDD
    * holds lines produced by `textFile`, the line terminators are presumably
    * not part of the elements, so the result can be slightly smaller than the
    * on-disk file size.
    *
    * Uses `fold` with a zero element so that an empty RDD yields 0 instead of
    * the exception `reduce` raises on an empty collection.
    *
    * @param rdd strings to measure
    * @return total number of UTF-8 bytes, 0 for an empty RDD
    */
  def calcFileSize(rdd: RDD[String]): Long =
    rdd.map(_.getBytes("UTF-8").length.toLong).fold(0L)(_ + _)

  /**
    * Entry point: cleans one filing, then prints readability and word-list
    * statistics for it.
    *
    * Pipeline:
    *  1. Read the filing, pull all lines to the driver and join them with a
    *     space. Joining without a separator used to fuse the last word of a
    *     line with the first word of the next one.
    *  2. Run the cleaning helpers in a fixed order, count the remaining
    *     periods (used as the sentence count) and reduce the text to words.
    *  3. Join the words, case-insensitively, with the dictionary CSV on its
    *     `Word` column; only words found in the dictionary are counted.
    *  4. Print the fog index, the natural log of the file size in MB, the
    *     share of words flagged `negative`, and a count of words per `modal`
    *     value greater than 0.
    *
    * All ratios use floating-point division. With integer division the
    * complex-word share was truncated to 0 and a text without periods raised
    * an `ArithmeticException`. A zero denominator now yields Infinity or NaN.
    *
    * @param args optional overrides: `args(0)` is the filing path, `args(1)`
    *             the dictionary CSV path. The original hard-coded locations
    *             are used for whichever argument is absent.
    */
  def main(args: Array[String]): Unit = {
    val filingPath = args.lift(0).getOrElse("file:///root/0001193125-14-383437.txt")
    val dictionaryPath = args.lift(1).getOrElse("file:///root/LoughranMcDonald_MasterDictionary_2014.csv")

    val spark = SparkSession.builder().appName("Machine learning application for textual analysis").getOrCreate()
    try {
      import spark.implicits._
      val sc = spark.sparkContext

      val inputLines = sc.textFile(filingPath)
      // Keep a separator between lines so words on adjacent lines stay apart.
      val linesToString = inputLines.toLocalIterator.mkString(" ")

      val lineRemAbbrev = deleteAbbrev(linesToString)
      val lineRemDocTypes = deleteDocTypes(lineRemAbbrev)
      val lineRemMetaData = deleteMetaData(lineRemDocTypes)
      val lineRemTabNHTML = deleteTablesNHTMLElem(lineRemMetaData)
      val lineRemExtChrst = deleteExtCharset(lineRemTabNHTML)
      val lineRemExcessLFCRWS = deleteExcessLFCRWS(lineRemExtChrst)
      val lineRemStrings = deleteStrings(lineRemExcessLFCRWS)
      val lineRemAllUrlsFileNamesDigitsPuncXPeriod = deleteAllURLsFileNamesDigitsPunctuationExceptPeriod(lineRemStrings)

      //Code for Computing readability section
      val countPeriods = lineRemAllUrlsFileNamesDigitsPuncXPeriod.count(_ == '.')
      val lineWords = keepOnlyAlphas(lineRemAllUrlsFileNamesDigitsPuncXPeriod)

      val wordsStringDF = sc.parallelize(List(lineWords)).toDF()
      val wordsDF = wordsStringDF.withColumn("words10k", explode(split($"value", "[\\s]"))).drop("value")

      val dictDF = spark.read.format("csv").option("header", "true")
        .load(dictionaryPath)

      // The joined frame feeds several actions below; cache it so the join is
      // not recomputed for each of them.
      val joinWordsDict = wordsDF
        .join(dictDF, lower(wordsDF("words10k")) === lower(dictDF("Word")))
        .cache()

      val numWords = joinWordsDict.count()
      val avgWordsPerSentence = numWords.toDouble / countPeriods
      val polySCount = joinWordsDict.where(joinWordsDict("Syllables") > 2).count()
      // Fog index: 0.4 * (words per sentence + percentage of words with 3+ syllables).
      val fogIndex = 0.4 * (avgWordsPerSentence + (polySCount.toDouble / numWords) * 100)
      println(s"Fog index: $fogIndex")

      val fileSize = calcFileSize(inputLines)/1000000.0
      println(math.log(fileSize))

      //Code for Using word lists section
      val negWordCount = joinWordsDict
        .where(joinWordsDict("negative") > 0)
        .count()

      val sentiment = negWordCount / (numWords.toDouble)
      println(s"Negative word proportion: $sentiment")

      val modalWordCount = joinWordsDict
        .where(joinWordsDict("modal") > 0)
        .groupBy("modal").count()

      modalWordCount.show()
    } finally {
      // Release the session even when a step above fails.
      spark.stop()
    }
  }
}
