Apache Spark 无法查看输出
Apache Spark Not able to view output
我刚刚开始学习 Apache Spark。我正在尝试打印链接的输出,但由于某种原因它没有显示。我也试过 links.collect() , display(links) 但其中 none 有效。任何帮助将不胜感激。
第二张图片的完整堆栈跟踪:
Py4JJavaError Traceback (most recent call last)
<ipython-input-34-01e857cfa45e> in <module>()
----> 1 for link in links.collect():
2 print("%s" %(link))
/databricks/spark/python/pyspark/rdd.py in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
/databricks/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 24.0 failed 1 times, most recent failure: Lost task 4.0 in stage 24.0 (TID 76, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 111, in main
process()
File "/databricks/spark/python/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/databricks/spark/python/pyspark/rdd.py", line 2346, in pipeline_func
return func(split, prev_func(split, iterator))
File "/databricks/spark/python/pyspark/rdd.py", line 2346, in pipeline_func
return func(split, prev_func(split, iterator))
File "/databricks/spark/python/pyspark/rdd.py", line 317, in func
return f(iterator)
File "/databricks/spark/python/pyspark/rdd.py", line 1776, in combineLocally
merger.mergeValues(iterator)
File "/databricks/spark/python/pyspark/shuffle.py", line 236, in mergeValues
for k, v in iterator:
File "<ipython-input-31-4b09041aa30b>", line 1, in <lambda>
File "<ipython-input-28-f43debc22073>", line 3, in parseNeighbors
File "/databricks/python/lib/python2.7/re.py", line 171, in split
return _compile(pattern, flags).split(string, maxsplit)
TypeError: expected string or buffer
at org.apache.spark.api.python.PythonRunner$$anon.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:46)
at org.apache.spark.scheduler.Task.run(Task.scala:96)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:222)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
你的映射函数错误错误类似于
re.split(r'\s+',[('a b')])
尝试将其替换为
parts=re.split(r'\s+',urls[0])
这将发送
re.split(r'\s+',('a b'))
整行都发送到您的地图函数,因此您需要通过调用它们来访问单元格,例如 .map(lambda row: (row[0], row[1])
我刚刚开始学习 Apache Spark。我正在尝试打印链接的输出,但由于某种原因它没有显示。我也试过 links.collect() , display(links) 但其中 none 有效。任何帮助将不胜感激。
第二张图片的完整堆栈跟踪:
Py4JJavaError Traceback (most recent call last)
<ipython-input-34-01e857cfa45e> in <module>()
----> 1 for link in links.collect():
2 print("%s" %(link))
/databricks/spark/python/pyspark/rdd.py in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
/databricks/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 24.0 failed 1 times, most recent failure: Lost task 4.0 in stage 24.0 (TID 76, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 111, in main
process()
File "/databricks/spark/python/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/databricks/spark/python/pyspark/rdd.py", line 2346, in pipeline_func
return func(split, prev_func(split, iterator))
File "/databricks/spark/python/pyspark/rdd.py", line 2346, in pipeline_func
return func(split, prev_func(split, iterator))
File "/databricks/spark/python/pyspark/rdd.py", line 317, in func
return f(iterator)
File "/databricks/spark/python/pyspark/rdd.py", line 1776, in combineLocally
merger.mergeValues(iterator)
File "/databricks/spark/python/pyspark/shuffle.py", line 236, in mergeValues
for k, v in iterator:
File "<ipython-input-31-4b09041aa30b>", line 1, in <lambda>
File "<ipython-input-28-f43debc22073>", line 3, in parseNeighbors
File "/databricks/python/lib/python2.7/re.py", line 171, in split
return _compile(pattern, flags).split(string, maxsplit)
TypeError: expected string or buffer
at org.apache.spark.api.python.PythonRunner$$anon.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:46)
at org.apache.spark.scheduler.Task.run(Task.scala:96)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:222)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
你的映射函数错误错误类似于
re.split(r'\s+',[('a b')])
尝试将其替换为
parts=re.split(r'\s+',urls[0])
这将发送
re.split(r'\s+',('a b'))
整行都发送到您的地图函数,因此您需要通过调用它们来访问单元格,例如 .map(lambda row: (row[0], row[1])