Zeppling 推特流媒体示例,无法
Zeppling twitter streaming example, unable to
当按照 zeppelin 教程流式传输推文并使用 Spark SQL 查询它们时,我 运行 遇到错误,其中找不到 'tweets' temp table。使用的确切代码和链接如下
参考:https://zeppelin.apache.org/docs/0.6.2/quickstart/tutorial.html
import scala.collection.mutable.HashMap
import org.apache.spark.streaming._
import org.apache.spark.streaming.twitter._
import org.apache.spark.storage.StorageLevel
import scala.io.Source
import scala.collection.mutable.HashMap
import java.io.File
import org.apache.log4j.Logger
import org.apache.log4j.Level
import sys.process.stringSeqToProcess
/** Configures the Oauth Credentials for accessing Twitter */
def configureTwitterCredentials(apiKey: String, apiSecret: String, accessToken: String, accessTokenSecret: String) {
val configs = new HashMap[String, String] ++= Seq(
"apiKey" -> apiKey, "apiSecret" -> apiSecret, "accessToken" -> accessToken, "accessTokenSecret" -> accessTokenSecret)
println("Configuring Twitter OAuth")
configs.foreach{ case(key, value) =>
if (value.trim.isEmpty) {
throw new Exception("Error setting authentication - value for " + key + " not set")
}
val fullKey = "twitter4j.oauth." + key.replace("api", "consumer")
System.setProperty(fullKey, value.trim)
println("\tProperty " + fullKey + " set as [" + value.trim + "]")
}
println()
}
// Configure Twitter credentials
val apiKey = "xxx"
val apiSecret = "xxx"
val accessToken = "xx-xxx"
val accessTokenSecret = "xxx"
configureTwitterCredentials(apiKey, apiSecret, accessToken, accessTokenSecret)
import org.apache.spark.streaming._
import org.apache.spark.streaming.twitter._
@transient val ssc = new StreamingContext(sc, Seconds(2))
@transient val tweets = TwitterUtils.createStream(ssc, None)
@transient val twt = tweets.window(Seconds(60), Seconds(2))
val sqlContext= new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Tweet(createdAt:Long, text:String)
twt.map(status=>
Tweet(status.getCreatedAt().getTime()/1000, status.getText())).foreachRDD(rdd=>
// Below line works only in spark 1.3.0.
// For spark 1.1.x and spark 1.2.x,
// use rdd.registerTempTable("tweets") instead.
rdd.toDF().registerTempTable("tweets")
)
ssc.start()
在下一段中,我有 SQL select 语句
%sql select createdAt, count(1) from tweets group by createdAt order by createdAt
抛出以下异常
org.apache.spark.sql.AnalysisException: Table not found: tweets;
at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.getTable(Analyzer.scala:305)
能够通过进行以下编辑获得上述示例 运行。我不确定,如果由于 Spark (v1.6.3) 的版本升级或其他一些底层架构的细微差别而需要进行此更改,我可能会遗漏,但无论如何
REF:
在第二段'而不是直接调用 as SQL 语法,尝试使用 sqlContext 如下
val my_df = sqlContext.sql("SELECT * from sweets LIMIT 5")
my_df.collect().foreach(println)
当按照 zeppelin 教程流式传输推文并使用 Spark SQL 查询它们时,我 运行 遇到错误,其中找不到 'tweets' temp table。使用的确切代码和链接如下
参考:https://zeppelin.apache.org/docs/0.6.2/quickstart/tutorial.html
import scala.collection.mutable.HashMap
import org.apache.spark.streaming._
import org.apache.spark.streaming.twitter._
import org.apache.spark.storage.StorageLevel
import scala.io.Source
import scala.collection.mutable.HashMap
import java.io.File
import org.apache.log4j.Logger
import org.apache.log4j.Level
import sys.process.stringSeqToProcess
/** Configures the Oauth Credentials for accessing Twitter */
def configureTwitterCredentials(apiKey: String, apiSecret: String, accessToken: String, accessTokenSecret: String) {
val configs = new HashMap[String, String] ++= Seq(
"apiKey" -> apiKey, "apiSecret" -> apiSecret, "accessToken" -> accessToken, "accessTokenSecret" -> accessTokenSecret)
println("Configuring Twitter OAuth")
configs.foreach{ case(key, value) =>
if (value.trim.isEmpty) {
throw new Exception("Error setting authentication - value for " + key + " not set")
}
val fullKey = "twitter4j.oauth." + key.replace("api", "consumer")
System.setProperty(fullKey, value.trim)
println("\tProperty " + fullKey + " set as [" + value.trim + "]")
}
println()
}
// Configure Twitter credentials
val apiKey = "xxx"
val apiSecret = "xxx"
val accessToken = "xx-xxx"
val accessTokenSecret = "xxx"
configureTwitterCredentials(apiKey, apiSecret, accessToken, accessTokenSecret)
import org.apache.spark.streaming._
import org.apache.spark.streaming.twitter._
@transient val ssc = new StreamingContext(sc, Seconds(2))
@transient val tweets = TwitterUtils.createStream(ssc, None)
@transient val twt = tweets.window(Seconds(60), Seconds(2))
val sqlContext= new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Tweet(createdAt:Long, text:String)
twt.map(status=>
Tweet(status.getCreatedAt().getTime()/1000, status.getText())).foreachRDD(rdd=>
// Below line works only in spark 1.3.0.
// For spark 1.1.x and spark 1.2.x,
// use rdd.registerTempTable("tweets") instead.
rdd.toDF().registerTempTable("tweets")
)
ssc.start()
在下一段中,我有 SQL select 语句
%sql select createdAt, count(1) from tweets group by createdAt order by createdAt
抛出以下异常
org.apache.spark.sql.AnalysisException: Table not found: tweets;
at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations$.getTable(Analyzer.scala:305)
能够通过进行以下编辑获得上述示例 运行。我不确定,如果由于 Spark (v1.6.3) 的版本升级或其他一些底层架构的细微差别而需要进行此更改,我可能会遗漏,但无论如何
REF:
在第二段'而不是直接调用 as SQL 语法,尝试使用 sqlContext 如下
val my_df = sqlContext.sql("SELECT * from sweets LIMIT 5")
my_df.collect().foreach(println)