package ca.training.bigdata.spark.core.munging.textual

import org.apache.spark.sql.SparkSession

import ca.training.bigdata.spark.core.munging.textual.ProcessingMultipleInputData.Process

/**
  * Spark job that removes stop words from the text of the newsgroup files found under
  * `/user/root/20_newsgroups/comp.graphics` and compares the remaining words with a
  * stop-word list read from `file:///root/stopwords.txt`.
  *
  * Created by BigDataTraining on 2018-03-10.
  */
object RemovingStopWords {

  /** Pre-compiled once so the token pattern is not recompiled for every token. */
  private val AlphabeticToken = java.util.regex.Pattern.compile("[A-Za-z]+")

  /**
    * Splits a line on whitespace and keeps the lower-cased, purely alphabetic tokens that
    * are not stop words.
    *
    * @param s         line of text to tokenise
    * @param stopWords words to discard; they are compared against the lower-cased tokens
    * @return the surviving tokens, in the order they appear in `s`
    */
  def processLine(s: String, stopWords: Set[String]): List[String] =
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale, which
    // matters because the token pattern below only accepts ASCII letters.
    s.toLowerCase(java.util.Locale.ROOT)
      .split("\\s+")
      .iterator
      .filter(token => AlphabeticToken.matcher(token).matches() && !stopWords.contains(token))
      .toList

  /**
    * Entry point: loads the input files, removes stop words, and prints sample rows and
    * the row counts of the intermediate and final data sets.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Remove stop words for Munging textual data using Spark SQL").getOrCreate()
    try {
      import spark.implicits._
      val sc = spark.sparkContext

      // wholeTextFiles yields (path, content) pairs; only the paths are needed on the driver.
      val fileNames = sc.wholeTextFiles("/user/root/20_newsgroups/comp.graphics")
        .map { case (fileName, _) => fileName }
        .collect()

      // One RDD per input file, produced by the project-local Process helper.
      // NOTE(review): the meaning of the three tuple fields is defined by Process — confirm there.
      val perFile: List[org.apache.spark.rdd.RDD[(String, String, Int)]] =
        fileNames.toList.map(fileName => Process(fileName, spark))

      val output = sc.union(perFile)
      output.cache()
      output.take(5).foreach(println)

      val inlineStopWords = sc.broadcast(Set("as", "able", "about", "above", "according", "accordingly", "across", "actually", "..."))

      // Keep the tokens as a list: joining them without a separator would fuse a whole line
      // into one pseudo-word that can never equal an entry of the stop-word list below.
      val tokenised = output
        .map { case (x, text, z) => (x, processLine(text.trim, inlineStopWords.value), z) }
        .filter { case (_, tokens, _) => tokens.nonEmpty }

      tokenised.take(5).foreach { case (x, tokens, z) => println((x, tokens.mkString(" "), z)) }

      // One row per word, so the set difference against the stop words is word-level.
      val words = tokenised.flatMap { case (_, tokens, _) => tokens }
      val wordsDF = words.toDF

      val punctuation = "[,.:;'\"\\?\\-!\\(\\)]".r
      val stopWordsDF = sc.textFile("file:///root/stopwords.txt")
        .flatMap(line => line.split("[\\s]"))
        .map(word => punctuation.replaceAllIn(word.trim.toLowerCase(java.util.Locale.ROOT), ""))
        .filter(word => !word.isEmpty)
        .toDF()

      // except returns the distinct rows of wordsDF that do not occur in stopWordsDF.
      val cleanWords = wordsDF.except(stopWordsDF)

      // Report the counts instead of discarding the results of the three actions.
      println(s"stop words: ${stopWordsDF.count()}")
      println(s"words: ${words.count()}")
      println(s"clean words: ${cleanWords.count()}")
    } finally {
      // Release the session's resources even when the job fails part-way through.
      spark.stop()
    }
  }

}
