无法在 bluemix 使用 ipython 连接到 cloudant 数据库
Cannot connect to cloudant database using ipython at bluemix
我正在按照 IBM Bluemix 的教程,尝试从我刚在 Cloudant 上创建的 crimes 数据库中提取数据。我使用的是 IPython 笔记本。
我已经按照教程在 Cloudant 上创建了相应的数据库。
但是在尝试连接到该数据库时,出现了如下错误:
<pre>
root
|-- _id: string (nullable = true)
|-- _rev: string (nullable = true)
|-- geometry: struct (nullable = true)
| |-- coordinates: array (nullable = true)
| | |-- element: double (containsNull = true)
| |-- type: string (nullable = true)
|-- properties: struct (nullable = true)
| |-- compnos: string (nullable = true)
| |-- domestic: boolean (nullable = true)
| |-- fromdate: long (nullable = true)
| |-- main_crimecode: string (nullable = true)
| |-- naturecode: string (nullable = true)
| |-- reptdistrict: string (nullable = true)
| |-- shooting: boolean (nullable = true)
| |-- source: string (nullable = true)
|-- type: string (nullable = true)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-1-7370c766a2d0> in <module>()
9
10
---> 11 cloudantdata.select("properties.naturecode").show()
12
13
/usr/local/src/bluemix_ipythonspark_141/spark/python/pyspark/sql/dataframe.py in show(self, n)
256 +---+-----+
257 """
--> 258 print(self._jdf.showString(n))
259
260 def __repr__(self):
/usr/local/src/bluemix_ipythonspark_141/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/src/bluemix_ipythonspark_141/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o40.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 10 times, most recent failure: Lost task 0.9 in stage 7.0 (TID 29, yp-spark-dal09-env5-0046): java.lang.ArrayIndexOutOfBoundsException: Array index out of range: 8
at org.apache.spark.sql.catalyst.CatalystTypeConverters$.convertRowWithConverters(CatalystTypeConverters.scala:348)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter.apply(CatalystTypeConverters.scala:180)
at org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$$anonfun$apply.apply(ExistingRDD.scala:62)
at org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$$anonfun$apply.apply(ExistingRDD.scala:59)
at scala.collection.Iterator$$anon.next(Iterator.scala:328)
at scala.collection.Iterator$$anon.next(Iterator.scala:328)
at scala.collection.Iterator$$anon.next(Iterator.scala:328)
at scala.collection.Iterator$$anon.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:1775)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:1775)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1157)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:627)
at java.lang.Thread.run(Thread.java:801)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon.run(EventLoop.scala:48)
</pre>
"Procedure 2: Create a Python notebook to analyze the Cloudant data"
# Procedure 2: read the Cloudant "crimes" database into a Spark DataFrame,
# inspect it, filter for disturbance reports, and write the result back
# to a new Cloudant database.
sqlContext = SQLContext(sc)

# Load the "crimes" database through the spark-cloudant connector.
cloudantdata = (
    sqlContext.read.format("com.cloudant.spark")
    .option("cloudant.host", "borischow.cloudant.com")
    .option("cloudant.username", "borischow")
    .option("cloudant.password", "xxxxxx")
    .load("crimes")
)

# Inspect the inferred schema and row count, then preview the nature codes.
cloudantdata.printSchema()
cloudantdata.count()
cloudantdata.select("properties.naturecode").show()

# Keep only the disturbance ("DISTRB") reports and preview them.
disturbDF = cloudantdata.filter("properties.naturecode = 'DISTRB'")
disturbDF.show()

# Persist the filtered properties back to Cloudant as "crimes_filtered".
(
    disturbDF.select("properties")
    .write.format("com.cloudant.spark")
    .option("cloudant.host", "borischow.cloudant.com")
    .option("cloudant.username", "borischow")
    .option("cloudant.password", "xxxxxx")
    .save("crimes_filtered")
)
您似乎遇到了此工单中报告的已知问题:https://github.com/cloudant-labs/spark-cloudant/issues/24。
我正在按照 IBM Bluemix 的教程,尝试从我刚在 Cloudant 上创建的 crimes 数据库中提取数据。我使用的是 IPython 笔记本。
我已经按照教程在 Cloudant 上创建了相应的数据库。
但是在尝试连接到该数据库时,出现了如下错误:
<pre>
root
|-- _id: string (nullable = true)
|-- _rev: string (nullable = true)
|-- geometry: struct (nullable = true)
| |-- coordinates: array (nullable = true)
| | |-- element: double (containsNull = true)
| |-- type: string (nullable = true)
|-- properties: struct (nullable = true)
| |-- compnos: string (nullable = true)
| |-- domestic: boolean (nullable = true)
| |-- fromdate: long (nullable = true)
| |-- main_crimecode: string (nullable = true)
| |-- naturecode: string (nullable = true)
| |-- reptdistrict: string (nullable = true)
| |-- shooting: boolean (nullable = true)
| |-- source: string (nullable = true)
|-- type: string (nullable = true)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-1-7370c766a2d0> in <module>()
9
10
---> 11 cloudantdata.select("properties.naturecode").show()
12
13
/usr/local/src/bluemix_ipythonspark_141/spark/python/pyspark/sql/dataframe.py in show(self, n)
256 +---+-----+
257 """
--> 258 print(self._jdf.showString(n))
259
260 def __repr__(self):
/usr/local/src/bluemix_ipythonspark_141/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/src/bluemix_ipythonspark_141/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o40.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 7.0 failed 10 times, most recent failure: Lost task 0.9 in stage 7.0 (TID 29, yp-spark-dal09-env5-0046): java.lang.ArrayIndexOutOfBoundsException: Array index out of range: 8
at org.apache.spark.sql.catalyst.CatalystTypeConverters$.convertRowWithConverters(CatalystTypeConverters.scala:348)
at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter.apply(CatalystTypeConverters.scala:180)
at org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$$anonfun$apply.apply(ExistingRDD.scala:62)
at org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$$anonfun$apply.apply(ExistingRDD.scala:59)
at scala.collection.Iterator$$anon.next(Iterator.scala:328)
at scala.collection.Iterator$$anon.next(Iterator.scala:328)
at scala.collection.Iterator$$anon.next(Iterator.scala:328)
at scala.collection.Iterator$$anon.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:1775)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:1775)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1157)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:627)
at java.lang.Thread.run(Thread.java:801)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage.apply(DAGScheduler.scala:1264)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage.apply(DAGScheduler.scala:1263)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
at org.apache.spark.util.EventLoop$$anon.run(EventLoop.scala:48)
</pre>
"Procedure 2: Create a Python notebook to analyze the Cloudant data"
# Procedure 2: read the Cloudant "crimes" database into a Spark DataFrame,
# inspect it, filter for disturbance reports, and write the result back
# to a new Cloudant database.
sqlContext = SQLContext(sc)

# Load the "crimes" database through the spark-cloudant connector.
cloudantdata = (
    sqlContext.read.format("com.cloudant.spark")
    .option("cloudant.host", "borischow.cloudant.com")
    .option("cloudant.username", "borischow")
    .option("cloudant.password", "xxxxxx")
    .load("crimes")
)

# Inspect the inferred schema and row count, then preview the nature codes.
cloudantdata.printSchema()
cloudantdata.count()
cloudantdata.select("properties.naturecode").show()

# Keep only the disturbance ("DISTRB") reports and preview them.
disturbDF = cloudantdata.filter("properties.naturecode = 'DISTRB'")
disturbDF.show()

# Persist the filtered properties back to Cloudant as "crimes_filtered".
(
    disturbDF.select("properties")
    .write.format("com.cloudant.spark")
    .option("cloudant.host", "borischow.cloudant.com")
    .option("cloudant.username", "borischow")
    .option("cloudant.password", "xxxxxx")
    .save("crimes_filtered")
)
您似乎遇到了此工单中报告的已知问题:https://github.com/cloudant-labs/spark-cloudant/issues/24。