从 pyspark 连接到 SQL 服务器时出错
Error while connecting to SQL Server from pyspark
from pyspark.sql import SQLContext
from pyspark import SparkContext
from impala.dbapi import connect
import sys
sc = SparkContext("local", "first app")
sqlContext = SQLContext(sc)
with open(sys.argv[1]) as file:
config = yaml.safe_load(file)
print(config)
table_name = 'loadout'
df = sqlContext.read \
.format("jdbc") \
.option("url", config['db_config']['url']) \
.option("dbtable", "(SELECT TOP 100 * FROM dbo.{0})".format(table_name)) \
.option("user", config['db_config']['username']) \
.option("password", config['db_config']['password']) \
.option("driver", config['db_config']['driver']) \
.load()
df.show()
抛出错误:
> Traceback (most recent call last): File
> "/home/rkumbar/ddl_generation/test.py", line 34, in <module>
> .option("driver", config['db_config']['driver']) \ File "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 139, in load File
> "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__ File
> "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py",
> line 45, in deco File
> "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py",
> line 308, in get_return_value py4j.protocol.Py4JJavaError: An error
> occurred while calling o56.load. :
> com.microsoft.sqlserver.jdbc.SQLServerException: Incorrect syntax near
> the keyword 'WHERE'.
我的 spark 提交命令:
spark-submit --driver-class-path /home/xyz/ddl_generation/mssql-jdbc-6.1.0.jre7.jar test.py config.yml
当我在其他变量中更改 'dbtable' 查询并传递到该选项时,它起作用了。
mssqlquery = """ (
SELECT TOP 100 *
FROM dbo.xyz
) t """
...
dbtable= mssqlquery,
...
from pyspark.sql import SQLContext
from pyspark import SparkContext
from impala.dbapi import connect
import sys
sc = SparkContext("local", "first app")
sqlContext = SQLContext(sc)
with open(sys.argv[1]) as file:
config = yaml.safe_load(file)
print(config)
table_name = 'loadout'
df = sqlContext.read \
.format("jdbc") \
.option("url", config['db_config']['url']) \
.option("dbtable", "(SELECT TOP 100 * FROM dbo.{0})".format(table_name)) \
.option("user", config['db_config']['username']) \
.option("password", config['db_config']['password']) \
.option("driver", config['db_config']['driver']) \
.load()
df.show()
抛出错误:
> Traceback (most recent call last): File
> "/home/rkumbar/ddl_generation/test.py", line 34, in <module>
> .option("driver", config['db_config']['driver']) \ File "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 139, in load File
> "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__ File
> "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py",
> line 45, in deco File
> "/opt/cloudera/parcels/CDH-5.12.1-1.cdh5.12.1.p0.3/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py",
> line 308, in get_return_value py4j.protocol.Py4JJavaError: An error
> occurred while calling o56.load. :
> com.microsoft.sqlserver.jdbc.SQLServerException: Incorrect syntax near
> the keyword 'WHERE'.
我的 spark 提交命令:
spark-submit --driver-class-path /home/xyz/ddl_generation/mssql-jdbc-6.1.0.jre7.jar test.py config.yml
当我在其他变量中更改 'dbtable' 查询并传递到该选项时,它起作用了。
mssqlquery = """ (
SELECT TOP 100 *
FROM dbo.xyz
) t """
...
dbtable= mssqlquery,
...