在 pyspark 中使用 flatMapValues 时得到了错误的键
wrong key in pyspark while using flatMapValues
我想把 flatMapValues 展开得到的值作为 'key' 添加到每条记录中,但得到的结果总是不对
这里是rdd.collect()
[{'a': 1, 'b': 2, 'c': [1, 2, 3, 4]},
{'a': 11, 'b': 22, 'c': [5, 6, 7, 8]},
{'a': 11, 'b': 23, 'c': [5, 6, 7, 8]}]
并且操作是
def add_key(x):
    """Store the pair's value under ``'key'`` in the pair's dict.

    Args:
        x: An indexable pair where ``x[0]`` is a mutable mapping and
            ``x[1]`` is the value to record.

    Returns:
        The same object ``x`` that was passed in (not a copy).

    Note:
        ``x[0]`` is modified in place, so every other reference to that
        same dict object observes the new ``'key'`` value.
        NOTE(review): presumably this shared mutation is what produces the
        wrong keys after ``flatMapValues`` -- confirm against PySpark's
        batching/serialization behavior.
    """
    x[0]['key'] = x[1]
    return x
# Pair each record with its 'c' list, emit one (record, value) pair per list
# element, tag the record through add_key, then swap to (value, record).
rdd.map(lambda x: (x, x['c'])).flatMapValues(lambda x: x).map(add_key).map(lambda x: (x[1], x[0]))
但我得到了这样的结果
[(1, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 1}),
(2, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(3, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(4, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 4}),
(5, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(7, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 8}),
(5, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(7, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 8})]
其中部分 'key' 的值是错误的,预期的结果应该是
[(1, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 1}),
(2, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 2}),
(3, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(4, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 4}),
(5, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 8}),
(5, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 8})]
这是一个有趣的现象(也许是 bug?),但我也不清楚具体原因。不过,下面这种替代写法应该能给出预期的结果:它为每个展开后的值构建一个新的字典,而不是就地修改原来的字典。
# Emit (value, record) pairs where each record is a freshly built dict that
# carries the flattened value under 'key'; the source dict is not modified.
rdd2 = (
    rdd
    .map(lambda rec: (rec, rec['c']))
    .flatMapValues(lambda values: values)
    .map(lambda pair: (pair[1], {**pair[0], 'key': pair[1]}))
)
rdd2.collect()
[(1, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 1}),
(2, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 2}),
(3, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(4, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 4}),
(5, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 8}),
(5, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 8})]
我想把 flatMapValues 展开得到的值作为 'key' 添加到每条记录中,但得到的结果总是不对
这里是rdd.collect()
[{'a': 1, 'b': 2, 'c': [1, 2, 3, 4]},
{'a': 11, 'b': 22, 'c': [5, 6, 7, 8]},
{'a': 11, 'b': 23, 'c': [5, 6, 7, 8]}]
并且操作是
def add_key(x):
    """Store the pair's value under ``'key'`` in the pair's dict.

    Args:
        x: An indexable pair where ``x[0]`` is a mutable mapping and
            ``x[1]`` is the value to record.

    Returns:
        The same object ``x`` that was passed in (not a copy).

    Note:
        ``x[0]`` is modified in place, so every other reference to that
        same dict object observes the new ``'key'`` value.
        NOTE(review): presumably this shared mutation is what produces the
        wrong keys after ``flatMapValues`` -- confirm against PySpark's
        batching/serialization behavior.
    """
    x[0]['key'] = x[1]
    return x
# Pair each record with its 'c' list, emit one (record, value) pair per list
# element, tag the record through add_key, then swap to (value, record).
rdd.map(lambda x: (x, x['c'])).flatMapValues(lambda x: x).map(add_key).map(lambda x: (x[1], x[0]))
但我得到了这样的结果
[(1, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 1}),
(2, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(3, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(4, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 4}),
(5, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(7, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 8}),
(5, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(7, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 8})]
其中部分 'key' 的值是错误的,预期的结果应该是
[(1, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 1}),
(2, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 2}),
(3, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(4, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 4}),
(5, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 8}),
(5, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 8})]
这是一个有趣的现象(也许是 bug?),但我也不清楚具体原因。不过,下面这种替代写法应该能给出预期的结果:它为每个展开后的值构建一个新的字典,而不是就地修改原来的字典。
# Emit (value, record) pairs where each record is a freshly built dict that
# carries the flattened value under 'key'; the source dict is not modified.
rdd2 = (
    rdd
    .map(lambda rec: (rec, rec['c']))
    .flatMapValues(lambda values: values)
    .map(lambda pair: (pair[1], {**pair[0], 'key': pair[1]}))
)
rdd2.collect()
[(1, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 1}),
(2, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 2}),
(3, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 3}),
(4, {'a': 1, 'b': 2, 'c': [1, 2, 3, 4], 'key': 4}),
(5, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 22, 'c': [5, 6, 7, 8], 'key': 8}),
(5, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 5}),
(6, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 6}),
(7, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 7}),
(8, {'a': 11, 'b': 23, 'c': [5, 6, 7, 8], 'key': 8})]