package ca.training.bigdata.spark.core.munging.textual

import org.apache.spark.sql.SparkSession

/**
  * Created by BigDataTraining on 2018-03-10.
  *
  * Spark job that counts word occurrences separately for every file found in an
  * input directory and prints a small sample of `(file, word, count)` records.
  */
object ProcessingMultipleInputData {

  import org.apache.spark.rdd.RDD

  /** Directory read by [[main]] when no path is supplied on the command line. */
  private val DefaultInputDir = "/user/root/20_newsgroups/comp.graphics"

  /**
    * Counts the words of a single text file.
    *
    * Each line is split on single space characters. Empty tokens (produced by
    * blank lines or runs of consecutive spaces) are discarded so that the empty
    * string is never reported as a word.
    *
    * @param filename path of the file to read; its last `/`-separated segment is
    *                 used to label every output record
    * @param spark    session whose `SparkContext` is used to read the file
    * @return one `(shortFileName, word, occurrences)` tuple per distinct word
    */
  def Process(filename: String, spark: SparkSession): RDD[(String, String, Int)] = {
    // `split` can yield no segments at all (e.g. a path made only of separators),
    // in which case `.last` would throw; fall back to the full name instead.
    val shortName = filename.split('/').lastOption.getOrElse(filename)

    spark.sparkContext
      .textFile(filename)
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .map { case (word, count) => (shortName, word, count) }
  }

  /**
    * Entry point: runs [[Process]] on every file of the input directory, unions
    * the per-file results and prints the first five records.
    *
    * @param args optional; `args(0)` overrides the default input directory
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Processing multiple inut files for Munging textual data using Spark SQL")
      .getOrCreate()

    try {
      val sc = spark.sparkContext
      val inputDir = args.headOption.getOrElse(DefaultInputDir)

      // wholeTextFiles yields (path, content) pairs; only the paths are needed
      // here because each file is re-read line by line inside Process.
      val fileNames = sc.wholeTextFiles(inputDir).keys.collect().toList

      // One RDD per file, built on the driver, then merged into a single RDD.
      val perFileCounts = fileNames.map(Process(_, spark))
      val output = sc.union(perFileCounts)

      output.take(5).foreach(println)
    } finally {
      // Always release the session, even when the job fails part-way through.
      spark.stop()
    }
  }

}
