Reading an Azure Data Lake Gen2 file from PySpark running locally
I am trying to read a file stored in Azure Data Lake Gen2 from a local Spark installation (spark-3.0.1-bin-hadoop3.2) with a PySpark script. The script is as follows:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

file = "abfs://myaccount.dfs.core.windows.net/test-file-system/dummy.txt"
key = "key"
appName = "DataExtract"
master = "local[*]"

sparkConf = SparkConf() \
    .setAppName(appName) \
    .setMaster(master) \
    .set("fs.azure.myaccount.key.myaccount.dfs.core.windows.net", key)

spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

df = spark.read.text(file)
df.show()
The error I get is:
Traceback (most recent call last):
  File "test.py", line 18, in <module>
    df = spark.read.text(file)
  File "C:\apps\spark-3.0.1-bin-hadoop3.2\python\pyspark\sql\readwriter.py", line 389, in text
    return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths)))
  File "C:\apps\spark-3.0.1-bin-hadoop3.2\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1305, in __call__
  File "C:\apps\spark-3.0.1-bin-hadoop3.2\python\pyspark\sql\utils.py", line 128, in deco
    return f(*a, **kw)
  File "C:\apps\spark-3.0.1-bin-hadoop3.2\python\lib\py4j-0.10.9-src.zip\py4j\protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o34.text.
: abfs://myaccount.dfs.core.windows.net/test-file-system/dummy.txt has invalid authority.
    at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.authorityParts(AzureBlobFileSystemStore.java:159)
    at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore.<init>(AzureBlobFileSystemStore.java:122)
    at org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem.initialize(AzureBlobFileSystem.java:108)
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
    at org.apache.hadoop.fs.FileSystem.access0(FileSystem.java:124)
    at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3352)
    at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3320)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:479)
    at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
    at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:46)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366)
    at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:297)
    at org.apache.spark.sql.DataFrameReader.$anonfun$load(DataFrameReader.scala:286)
    at scala.Option.getOrElse(Option.scala:189)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:286)
    at org.apache.spark.sql.DataFrameReader.text(DataFrameReader.scala:843)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:566)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.base/java.lang.Thread.run(Thread.java:834)
Has anyone seen a similar error? I can't tell whether the problem is in my code, a missing configuration on my machine, or some configuration on the Azure side of the Data Lake account. Thanks in advance for any answers!
I found the solution. The file path must have the form
file="abfss://<<container>>@<<storageaccount>>.dfs.core.windows.net/<<topfolder>>/<<subfolder>>/file"
I followed the steps described at https://deep.data.blog/2019/07/12/diy-apache-spark-and-adls-gen-2-support/ and checked that all the required .jar files were installed.
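When running Spark locally, the ABFS driver lives in the hadoop-azure module, which is not bundled with the plain Spark download, so it (and its transitive dependencies) has to be on the classpath. One way to pull it in is via --packages when launching; as a sketch, with the version as an assumption that should match the Hadoop version of your Spark build (3.2.0 for spark-3.0.1-bin-hadoop3.2):

pyspark --packages org.apache.hadoop:hadoop-azure:3.2.0

Alternatively, the corresponding hadoop-azure jar and its dependencies can be placed in Spark's jars/ directory, as the linked blog post describes.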