从包含字典的 Pandas 列中提取信息以创建新的 Pandas 列

Extracting info from a Pandas column containing a dict to create a new Pandas column

数据框df2中的一列如下:

from pandas import DataFrame, Series
# Build a frame from `data` (defined outside this snippet).
df = DataFrame(data)
# Keep every row and the four columns 17, 18, 19 and 13, in that order.
# NOTE(review): `.ix` is the legacy mixed label/position indexer and was
# removed in pandas 1.0; whether these integers act as labels or positions
# depends on the column index of `df` -- confirm before porting to
# `.loc`/`.iloc`.
df2 = df.ix[0:, [17, 18, 19, 13]]
# Give the four columns placeholder names, then map them to the final names.
df2.columns = ['a', 'b', 'c', 'd']
df2 = df2.rename(columns={'a':'name', 'b':'stuff', 'c':'scanner', 'd': 'geo'})
# Python 2 print statement: displays the 'geo' column.
print df2['geo']

{u'type': u'Point', u'coordinates': [40.70477818, -74.18183193]}
{u'type': u'Point', u'coordinates': [25.78915569, -80.1845325]}
{u'type': u'Point', u'coordinates': [35.20042459, -101.91661173]}
{u'type': u'Point', u'coordinates': [32.51591725, -92.15884093]}
{u'type': u'Point', u'coordinates': [43.35457272, -79.78553736]}
{u'type': u'Point', u'coordinates': [43.35460763, -79.78536878]}
{u'type': u'Point', u'coordinates': [47.80446395, 16.16058828]}

我想提取第一个数字作为纬度(新列),第二个数字作为经度(另一个新列)。我已经尝试了很久,但始终找不到正确的语法。下面是我无法运行的代码:

# Fails with KeyError: df2['geo'] is a Series, so ['coordinates'] is looked up
# as a row label of that Series rather than as a key inside each stored dict.
df2['latitude'] = df2['geo']['coordinates'][0]

KeyError: 'coordinates'

编辑:下面是我可行的解决方案,先把 unicode 字符串转换为 dict;不过我仍然想知道,有没有更快的单行 Python 代码可以做同样的事情……

def uni_to_dict_lat(row):
    """Return the latitude (first coordinate) stored in ``row['geo']``.

    ``row['geo']`` may be the textual form of a dict, for example
    ``"{u'type': u'Point', u'coordinates': [40.7, -74.1]}"``, or a dict that
    has already been parsed.

    Args:
        row: Mapping-like object (such as a DataFrame row) with a ``'geo'``
            entry.

    Returns:
        Element 0 of the ``'coordinates'`` list.

    Raises:
        ValueError, SyntaxError: If a textual value is not a valid Python
            literal (raised by ``ast.literal_eval``).
        KeyError: If ``'geo'`` or ``'coordinates'`` is missing.
    """
    geo = row['geo']
    # Parse only when the value is not a dict yet: ast.literal_eval raises
    # ValueError when it is handed an already-built dict.
    if not isinstance(geo, dict):
        geo = ast.literal_eval(geo)
    return geo['coordinates'][0]

def uni_to_dict_lon(row):
    """Return the longitude (second coordinate) stored in ``row['geo']``.

    ``row['geo']`` may be the textual form of a dict, for example
    ``"{u'type': u'Point', u'coordinates': [40.7, -74.1]}"``, or a dict that
    has already been parsed.

    Args:
        row: Mapping-like object (such as a DataFrame row) with a ``'geo'``
            entry.

    Returns:
        Element 1 of the ``'coordinates'`` list.

    Raises:
        ValueError, SyntaxError: If a textual value is not a valid Python
            literal (raised by ``ast.literal_eval``).
        KeyError: If ``'geo'`` or ``'coordinates'`` is missing.
    """
    geo = row['geo']
    # Parse only when the value is not a dict yet: ast.literal_eval raises
    # ValueError when it is handed an already-built dict.
    if not isinstance(geo, dict):
        geo = ast.literal_eval(geo)
    return geo['coordinates'][1]

# axis=1 hands each row of df2 to the helper; the values returned for the
# rows become the new 'lat' and 'lon' columns.
df2['lat'] = df2.apply(uni_to_dict_lat, axis=1)
df2['lon'] = df2.apply(uni_to_dict_lon, axis=1)

您可以使用

# Run the lambda on every value of the 'geo' column and keep element 0 of
# that value's 'coordinates' entry as the new 'latitude' column.
df2['latitude'] = df2['geo'].apply(lambda x: x['coordinates'][0])

你能试试这个吗?

import pandas as pd
import numpy as np

# Build a stand-in for the frame described in the question
# ===================================================================
# Three independent dummy integer columns, seven rows each.
a, b, c = (np.arange(7) for _ in range(3))

# One dict per row, each holding a two-element 'coordinates' list.
d = [
    {u'type': u'Point', u'coordinates': [first, second]}
    for first, second in (
        (40.70477818, -74.18183193),
        (25.78915569, -80.1845325),
        (35.20042459, -101.91661173),
        (32.51591725, -92.15884093),
        (43.35457272, -79.78553736),
        (43.35460763, -79.78536878),
        (47.80446395, 16.16058828),
    )
]

df2 = pd.DataFrame(dict(a=a, b=b, c=c, d=d))
# Map the placeholder column names onto the names used in the question.
df2 = df2.rename(
    columns=dict(zip(['a', 'b', 'c', 'd'],
                     ['name', 'stuff', 'scanner', 'geo']))
)

Out[54]: 
   name  stuff  scanner                                                                geo
0     0      0        0   {u'type': u'Point', u'coordinates': [40.70477818, -74.18183193]}
1     1      1        1    {u'type': u'Point', u'coordinates': [25.78915569, -80.1845325]}
2     2      2        2  {u'type': u'Point', u'coordinates': [35.20042459, -101.91661173]}
3     3      3        3   {u'type': u'Point', u'coordinates': [32.51591725, -92.15884093]}
4     4      4        4   {u'type': u'Point', u'coordinates': [43.35457272, -79.78553736]}
5     5      5        5   {u'type': u'Point', u'coordinates': [43.35460763, -79.78536878]}
6     6      6        6    {u'type': u'Point', u'coordinates': [47.80446395, 16.16058828]}


# do list comprehension
# ===================================================================
# Element 0 of each 'coordinates' list becomes 'latitude', element 1 becomes
# 'longitude'; the lists are built from the raw values of the 'geo' column.
df2['latitude'] = [x['coordinates'][0] for x in df2['geo'].values]
df2['longitude'] = [x['coordinates'][1] for x in df2['geo'].values]
# The result of drop() is not assigned back, so df2 itself still contains
# 'geo'; the frame without that column is only the value of this expression.
df2.drop('geo', axis=1)

Out[57]: 
   name  stuff  scanner  latitude  longitude
0     0      0        0   40.7048   -74.1818
1     1      1        1   25.7892   -80.1845
2     2      2        2   35.2004  -101.9166
3     3      3        3   32.5159   -92.1588
4     4      4        4   43.3546   -79.7855
5     5      5        5   43.3546   -79.7854
6     6      6        6   47.8045    16.1606