在 pyspark 和 sparksubmit 中读取文本文件
Read text file in pyspark and sparksubmit
假设我运行一个 Python 脚本 (file1.py),并以一个文本文件作为参数。我按如下方式运行它:
python file1.py textfile1.txt
file1.py 中包含以下代码:
from pyspark import SparkContext
....
#I can read the file using the following command
sc = SparkContext()
inputfile= sc.textFile(sys.argv[1])
我需要做哪些修改才能让 file1.py 顺利运行?
但是 pyspark 对我不起作用,我通常使用的是 spark-submit!当我在本地模式下使用 spark-submit 运行时,它会报出以下错误:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
from apport.fileutils import likely_packaged, get_recent_crashes
File "/usr/lib/python3/dist-packages/apport/__init__.py", line 5, in <module>
from apport.report import Report
File "/usr/lib/python3/dist-packages/apport/report.py", line 21, in <module>
from urllib.request import urlopen
File "/usr/lib/python3.6/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.6/http/client.py", line 71, in <module>
import email.parser
File "/usr/lib/python3.6/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.6/email/feedparser.py", line 27, in <module>
from email._policybase import compat32
File "/usr/lib/python3.6/email/_policybase.py", line 9, in <module>
from email.utils import _has_surrogates
File "/usr/lib/python3.6/email/utils.py", line 31, in <module>
import urllib.parse
File "/usr/lib/python3.6/urllib/parse.py", line 227, in <module>
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Original exception was:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
hduser@noorhadoop-virtual-machine:/usr/local/spark$ ./bin/spark-submit --master local[3] /home/noorhadoop/Desktop/folder1/file1.py /home/noorhadoop/Desktop/folder1/simple1.txt
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
from apport.fileutils import likely_packaged, get_recent_crashes
File "/usr/lib/python3/dist-packages/apport/__init__.py", line 5, in <module>
from apport.report import Report
File "/usr/lib/python3/dist-packages/apport/report.py", line 21, in <module>
from urllib.request import urlopen
File "/usr/lib/python3.6/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.6/http/client.py", line 71, in <module>
import email.parser
File "/usr/lib/python3.6/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.6/email/feedparser.py", line 27, in <module>
from email._policybase import compat32
File "/usr/lib/python3.6/email/_policybase.py", line 9, in <module>
from email.utils import _has_surrogates
File "/usr/lib/python3.6/email/utils.py", line 31, in <module>
import urllib.parse
File "/usr/lib/python3.6/urllib/parse.py", line 227, in <module>
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Original exception was:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
谢谢,
您没有贴出错误消息,因此很难确定具体原因,但 sc.textFile
需要 HDFS 或本地文件系统上文件的完整路径。
例如,如果您在本地模式下运行 Spark,
则必须按如下方式通过 spark-submit 传递参数:
spark-submit \
--master local[*] \
/path/to/file1.py \
"file:///path/to/textfile1.txt"
或者,如果您在集群上运行,请提供完整的 HDFS 路径作为参数:
spark-submit \
--master spark://localhost:7077 \
/path/to/file1.py \
"hdfs://localhost:9000/path/to/textfile1.txt"
假设我运行一个 Python 脚本 (file1.py),并以一个文本文件作为参数。我按如下方式运行它:
python file1.py textfile1.txt
file1.py 中包含以下代码:
from pyspark import SparkContext
....
#I can read the file using the following command
sc = SparkContext()
inputfile= sc.textFile(sys.argv[1])
我需要做哪些修改才能让 file1.py 顺利运行?
但是 pyspark 对我不起作用,我通常使用的是 spark-submit!当我在本地模式下使用 spark-submit 运行时,它会报出以下错误:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
from apport.fileutils import likely_packaged, get_recent_crashes
File "/usr/lib/python3/dist-packages/apport/__init__.py", line 5, in <module>
from apport.report import Report
File "/usr/lib/python3/dist-packages/apport/report.py", line 21, in <module>
from urllib.request import urlopen
File "/usr/lib/python3.6/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.6/http/client.py", line 71, in <module>
import email.parser
File "/usr/lib/python3.6/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.6/email/feedparser.py", line 27, in <module>
from email._policybase import compat32
File "/usr/lib/python3.6/email/_policybase.py", line 9, in <module>
from email.utils import _has_surrogates
File "/usr/lib/python3.6/email/utils.py", line 31, in <module>
import urllib.parse
File "/usr/lib/python3.6/urllib/parse.py", line 227, in <module>
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Original exception was:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
hduser@noorhadoop-virtual-machine:/usr/local/spark$ ./bin/spark-submit --master local[3] /home/noorhadoop/Desktop/folder1/file1.py /home/noorhadoop/Desktop/folder1/simple1.txt
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
from apport.fileutils import likely_packaged, get_recent_crashes
File "/usr/lib/python3/dist-packages/apport/__init__.py", line 5, in <module>
from apport.report import Report
File "/usr/lib/python3/dist-packages/apport/report.py", line 21, in <module>
from urllib.request import urlopen
File "/usr/lib/python3.6/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.6/http/client.py", line 71, in <module>
import email.parser
File "/usr/lib/python3.6/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.6/email/feedparser.py", line 27, in <module>
from email._policybase import compat32
File "/usr/lib/python3.6/email/_policybase.py", line 9, in <module>
from email.utils import _has_surrogates
File "/usr/lib/python3.6/email/utils.py", line 31, in <module>
import urllib.parse
File "/usr/lib/python3.6/urllib/parse.py", line 227, in <module>
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Original exception was:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
谢谢,
您没有贴出错误消息,因此很难确定具体原因,但 sc.textFile
需要 HDFS 或本地文件系统上文件的完整路径。
例如,如果您在本地模式下运行 Spark,则必须按如下方式通过 spark-submit 传递参数:
spark-submit \
--master local[*] \
/path/to/file1.py \
"file:///path/to/textfile1.txt"
或者,如果您在集群上运行,请提供完整的 HDFS 路径作为参数:
spark-submit \
--master spark://localhost:7077 \
/path/to/file1.py \
"hdfs://localhost:9000/path/to/textfile1.txt"