Live Twitter data mining using Spark Streaming and Scala.

Want to learn and work with live streaming data processing? The easiest way is to create a Twitter developer app and follow the code below to ingest the data and store it in your AWS S3 bucket for further analysis and processing with tools like Amazon EMR, or for use in machine learning projects.

/**
 * @author Gyanendra
 * @Date : 08/12/19
 */

import org.apache.spark.SparkConf
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

/**
 * Spark Streaming entry point that opens a Twitter stream through
 * `TwitterUtils.createStream` and keeps only the tweets whose language
 * code is "en".
 *
 * NOTE(review): this listing appears to be truncated. The
 * `OAuthAuthorization(` call is never closed, the `foreachRDD` lambda has
 * no visible body, and the closing braces of `main` and of this object are
 * missing, so the code does not compile as shown.
 */
object TweeterStreamReaderApp {
  /**
   * Builds the streaming context, authenticates against Twitter and wires
   * up the English-only tweet stream.
   *
   * @param args command-line arguments; everything after the first four
   *             entries is passed to the stream as filters
   */
  def main(args: Array[String]) {

    // The four Twitter OAuth values, in the order they are destructured
    // below: consumer key, consumer secret, access token, access token secret.
    // NOTE(review): secrets are hard-coded in source; they should presumably
    // come from configuration or the environment instead — confirm and rotate.
    var twitterCredentials = new Array[String](4);
    twitterCredentials(0) = "gA7xFE3S1QfVTN55Uuzb";
    twitterCredentials(1) = "2te2Z1yFvynXcp06rc2j3zg38tNAa1zY29rOT3d5BFI";
    twitterCredentials(2) = "1063309360480-61DChczOivazJZTWodLfuRRW8gDNfJ";
    twitterCredentials(3) = "bFYPmpiWhFgOtdJGe95YyhOntxOQAmx0xEYtF";

    // NOTE(review): appName is not referenced anywhere in the visible code;
    // the SparkConf below is created without an app name or master, so both
    // are presumably expected from spark-submit — confirm.
    val appName = "TweeterStreamReader"
    val conf = new SparkConf()
    // Streaming context created with a Seconds(5) batch duration.
    val ssc = new StreamingContext(conf, Seconds(5))
    // Give each credential a readable name.
    val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = twitterCredentials.take(4)
    // Filters are the arguments that follow the first four.
    // NOTE(review): the first four args are skipped even though the
    // credentials are hard-coded above; this looks like a leftover from a
    // version that read the credentials from args — confirm the intent.
    val filters = args.takeRight(args.length - 4)
    // NOTE(review): the builder is created but never configured in the
    // visible code; the calls that set the OAuth values on it, and the
    // argument of OAuthAuthorization below, appear to have been lost.
    val cb = new ConfigurationBuilder
    val auth = new OAuthAuthorization(
    // Open the tweet stream with the authorization and filters above.
    val tweets = TwitterUtils.createStream(ssc, Some(auth), filters)
    // Keep only tweets whose language is reported as "en".
    val englishTweets = tweets.filter(_.getLang() == "en")

    // Print every RDD of the stream; the same hook could later be used to
    // store the data in S3.
    // NOTE(review): the body of this lambda is missing from the listing.
    englishTweets.foreachRDD { (rdd, time) =>

    // Helper that prints each element of the given RDD via rdd.foreach(println).
    // NOTE(review): RDD.foreach presumably runs on the executors, so in
    // cluster mode the output may not appear on the driver — confirm.
    def p(rdd: org.apache.spark.rdd.RDD[_]) = rdd.foreach(println)

Download this code from my repo