Use Spark for text mining with nltk
I have a problem with Spark and text mining. Please help me. I have attached the full error so it is easier to identify. I cannot find anything that helps me debug it, and I don't know why Python does not respond when I run words.collect().
I downloaded Spark for Apache Hadoop 2.6 and unpacked it. I am trying to convert this code, but it throws an error.
This is my code:
from pyspark import SparkConf
from pyspark import SparkContext
conf = SparkConf()
conf.setAppName('spark-NLTK')
sc = SparkContext.getOrCreate();
import nltk
data = sc.textFile('c:/Users/Ramin/Desktop/Nixon.txt')
#word tokenization
def word_tokenize(x):
    lowerW = x.lower()
    return nltk.word_tokenize(x)
words = data.flatMap(word_tokenize)
words.collect()
I get this error:
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 364, in main
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 69, in read_command
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 173, in _read_with_length
return self.loads(obj)
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 587, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\cloudpickle.py", line 875, in subimport
__import__(name)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\__init__.py", line 143, in <module>
from nltk.chunk import *
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\chunk\__init__.py", line 157, in <module>
from nltk.chunk.api import ChunkParserI
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\chunk\api.py", line 13, in <module>
from nltk.parse import ParserI
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\parse\__init__.py", line 100, in <module>
from nltk.parse.transitionparser import TransitionParser
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\parse\transitionparser.py", line 22, in <module>
from sklearn.datasets import load_svmlight_file
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\datasets\__init__.py", line 22, in <module>
from .twenty_newsgroups import fetch_20newsgroups
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\datasets\twenty_newsgroups.py", line 44, in <module>
from ..feature_extraction.text import CountVectorizer
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\feature_extraction\__init__.py", line 10, in <module>
from . import text
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 28, in <module>
from ..preprocessing import normalize
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\preprocessing\__init__.py", line 6, in <module>
from ._function_transformer import FunctionTransformer
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 5, in <module>
from ..utils.testing import assert_allclose_dense_sparse
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\utils\testing.py", line 718, in <module>
import pytest
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pytest.py", line 6, in <module>
from _pytest.assertion import register_assert_rewrite
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\__init__.py", line 7, in <module>
from _pytest.assertion import rewrite
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\rewrite.py", line 26, in <module>
from _pytest.assertion import util
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\util.py", line 8, in <module>
import _pytest._code
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\_code\__init__.py", line 2, in <module>
from .code import Code # noqa
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\_code\code.py", line 23, in <module>
import pluggy
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pluggy\__init__.py", line 16, in <module>
from .manager import PluginManager, PluginValidationError
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pluggy\manager.py", line 11, in <module>
import importlib_metadata
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 547, in <module>
__version__ = version(__name__)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 509, in version
return distribution(distribution_name).version
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 183, in from_name
dist = next(dists, None)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 425, in <genexpr>
for path in map(cls._switch_path, paths)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 449, in _search_path
if not root.is_dir():
File "C:\Users\Ramin\Anaconda3\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\Users\Ramin\Anaconda3\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\jars\spark-core_2.11-2.4.6.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.api.python.PythonRunner$$anon.read(PythonRunner.scala:592)
at org.apache.spark.api.python.PythonRunner$$anon.read(PythonRunner.scala:575)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$$anonfun.apply(RDD.scala:990)
at org.apache.spark.rdd.RDD$$anonfun$collect$$anonfun.apply(RDD.scala:990)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage.apply(DAGScheduler.scala:1879)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage.apply(DAGScheduler.scala:1878)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed.apply(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed.apply(DAGScheduler.scala:927)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
at org.apache.spark.util.EventLoop$$anon.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 364, in main
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 69, in read_command
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 173, in _read_with_length
return self.loads(obj)
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 587, in loads
return pickle.loads(obj, encoding=encoding)
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\cloudpickle.py", line 875, in subimport
__import__(name)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\__init__.py", line 143, in <module>
from nltk.chunk import *
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\chunk\__init__.py", line 157, in <module>
from nltk.chunk.api import ChunkParserI
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\chunk\api.py", line 13, in <module>
from nltk.parse import ParserI
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\parse\__init__.py", line 100, in <module>
from nltk.parse.transitionparser import TransitionParser
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\parse\transitionparser.py", line 22, in <module>
from sklearn.datasets import load_svmlight_file
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\datasets\__init__.py", line 22, in <module>
from .twenty_newsgroups import fetch_20newsgroups
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\datasets\twenty_newsgroups.py", line 44, in <module>
from ..feature_extraction.text import CountVectorizer
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\feature_extraction\__init__.py", line 10, in <module>
from . import text
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 28, in <module>
from ..preprocessing import normalize
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\preprocessing\__init__.py", line 6, in <module>
from ._function_transformer import FunctionTransformer
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 5, in <module>
from ..utils.testing import assert_allclose_dense_sparse
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\utils\testing.py", line 718, in <module>
import pytest
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pytest.py", line 6, in <module>
from _pytest.assertion import register_assert_rewrite
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\__init__.py", line 7, in <module>
from _pytest.assertion import rewrite
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\rewrite.py", line 26, in <module>
from _pytest.assertion import util
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\util.py", line 8, in <module>
import _pytest._code
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\_code\__init__.py", line 2, in <module>
from .code import Code # noqa
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\_code\code.py", line 23, in <module>
import pluggy
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pluggy\__init__.py", line 16, in <module>
from .manager import PluginManager, PluginValidationError
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pluggy\manager.py", line 11, in <module>
import importlib_metadata
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 547, in <module>
__version__ = version(__name__)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 509, in version
return distribution(distribution_name).version
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 482, in distribution
return Distribution.from_name(distribution_name)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 183, in from_name
dist = next(dists, None)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 425, in <genexpr>
for path in map(cls._switch_path, paths)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py", line 449, in _search_path
if not root.is_dir():
File "C:\Users\Ramin\Anaconda3\lib\pathlib.py", line 1358, in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\Users\Ramin\Anaconda3\lib\pathlib.py", line 1168, in stat
return self._accessor.stat(self)
OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'C:\C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\jars\spark-core_2.11-2.4.6.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.api.python.PythonRunner$$anon.read(PythonRunner.scala:592)
at org.apache.spark.api.python.PythonRunner$$anon.read(PythonRunner.scala:575)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$$anonfun.apply(RDD.scala:990)
at org.apache.spark.rdd.RDD$$anonfun$collect$$anonfun.apply(RDD.scala:990)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
... 1 more
Some part of your Spark configuration is incorrect, because it thinks it needs to add the C drive twice to the path of your Spark libraries:
C:\C:\Bigdata\SPARK
So, before running code from elsewhere, I would first try running the example code that ships with Spark, such as SparkPi.
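As a starting point for tracking that down, here is a minimal sketch (the paths, the 'config-check' app name, and the assumption that SPARK_HOME and PYSPARK_PYTHON are the variables to verify are mine, not from your setup; adjust them to your machine) that prints the effective Spark configuration so you can spot any value containing the doubled C:\C: prefix:
import os
from pyspark import SparkConf, SparkContext

# Environment variables a Windows Spark install typically relies on (values shown are guesses).
print(os.environ.get('SPARK_HOME'))      # expect something like C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7
print(os.environ.get('PYSPARK_PYTHON'))  # the Python interpreter the worker processes should use

conf = SparkConf().setAppName('config-check').setMaster('local[*]')
sc = SparkContext.getOrCreate(conf)

# Print every effective setting and flag anything with a doubled drive letter.
for key, value in sorted(sc.getConf().getAll()):
    marker = '  <-- doubled drive letter' if 'C:\\C:' in value else ''
    print(key, '=', value, marker)

# Running the bundled SparkPi example, as suggested above, can be done from a command
# prompt; the exact jar name may differ for your download:
#   %SPARK_HOME%\bin\spark-submit --class org.apache.spark.examples.SparkPi ^
#       %SPARK_HOME%\examples\jars\spark-examples_2.11-2.4.6.jar 10
If SparkPi runs cleanly but the PySpark job still fails, the doubled drive letter most likely comes from one of the path-related settings printed above (for example a classpath or spark.*.extraClassPath entry), and fixing that value should let the worker import nltk again.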