Pyspark 指定变量的对象类型
Pyspark specify object type of variable
我在 PySpark 中从 JSON 文件读取了以下数据:
{"positionmessage":{"callsign": "PPH1", "name": "testschip-10", "mmsi": 100,"timestamplast": "2019-08-01T00:00:08Z"}}
{"positionmessage":{"callsign": "PPH2", "name": "testschip-11", "mmsi": 200,"timestamplast": "2019-08-01T00:00:01Z"}}
代码如下所示:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType, DateType, FloatType, TimestampType
appName = "PySpark Example - JSON file to Spark Data Frame"
master = "local"

# Create Spark session
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

# Explicit schema for the nested "positionmessage" record, so the column
# types are declared up front instead of being inferred from the data.
schema = StructType([
    StructField("positionmessage", StructType([
        StructField("callsign", StringType(), True),
        StructField("name", StringType(), True),
        StructField("timestamplast", TimestampType(), True),
        StructField("mmsi", IntegerType(), True),
    ])),
])

file_name = "data.json"

# Hand the schema to the reader; without it the schema object is unused and
# Spark infers the column types on its own. The select() then flattens the
# nested struct into top-level columns.
df = spark.read.schema(schema).json(file_name).select("positionmessage.*")
现在我想从 "name" 中删除 "testschip-"。我是这样做的:
import pyspark.sql.functions as f

# Split "name" on the hyphen and keep the part after it, which strips the
# string "testschip-" (e.g. "testschip-10" -> "10").
df = df.withColumn("name", f.split(df["name"], r"\-")[1])

# show() only prints the rows and returns None, so call it on its own line
# instead of assigning its result back to df.
df.show()
现在如何使 "name" 成为整数?
只需将其转换为 int 即可。
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.sql.functions as f  # alias required by f.split below

# Run this on the DataFrame whose "name" still carries the "testschip-"
# prefix: split on the hyphen, keep the numeric part and cast it to an
# integer column (e.g. "testschip-10" -> 10).
df = df.withColumn("name", f.split(df["name"], r"\-")[1].cast("int"))

# show() only prints the rows and returns None, so call it on its own line
# instead of assigning its result back to df.
df.show()
我在 PySpark 中从 JSON 文件读取了以下数据:
{"positionmessage":{"callsign": "PPH1", "name": "testschip-10", "mmsi": 100,"timestamplast": "2019-08-01T00:00:08Z"}}
{"positionmessage":{"callsign": "PPH2", "name": "testschip-11", "mmsi": 200,"timestamplast": "2019-08-01T00:00:01Z"}}
代码如下所示:
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType, DateType, FloatType, TimestampType
appName = "PySpark Example - JSON file to Spark Data Frame"
master = "local"

# Create Spark session
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

# Explicit schema for the nested "positionmessage" record, so the column
# types are declared up front instead of being inferred from the data.
schema = StructType([
    StructField("positionmessage", StructType([
        StructField("callsign", StringType(), True),
        StructField("name", StringType(), True),
        StructField("timestamplast", TimestampType(), True),
        StructField("mmsi", IntegerType(), True),
    ])),
])

file_name = "data.json"

# Hand the schema to the reader; without it the schema object is unused and
# Spark infers the column types on its own. The select() then flattens the
# nested struct into top-level columns.
df = spark.read.schema(schema).json(file_name).select("positionmessage.*")
现在我想从 "name" 中删除 "testschip-"。我是这样做的:
import pyspark.sql.functions as f

# Split "name" on the hyphen and keep the part after it, which strips the
# string "testschip-" (e.g. "testschip-10" -> "10").
df = df.withColumn("name", f.split(df["name"], r"\-")[1])

# show() only prints the rows and returns None, so call it on its own line
# instead of assigning its result back to df.
df.show()
现在如何使 "name" 成为整数?
只需将其转换为 int 即可。
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.sql.functions as f  # alias required by f.split below

# Run this on the DataFrame whose "name" still carries the "testschip-"
# prefix: split on the hyphen, keep the numeric part and cast it to an
# integer column (e.g. "testschip-10" -> 10).
df = df.withColumn("name", f.split(df["name"], r"\-")[1].cast("int"))

# show() only prints the rows and returns None, so call it on its own line
# instead of assigning its result back to df.
df.show()