使用循环在 Python 中创建面板数据
Create panel data in Python using loop
我正在尝试在 Python 中创建面板数据框,例如5 个国家(A、B、C、D、E),每个国家都有 3 年的数据(2000、2001、2002)。
import numpy as np
import pandas as pd
# Hand-written example of the desired 15-row panel: ids 1-5 and countries A-E
# repeat three times, while the years cycle through 2000-2002.
panel_columns = {
    'id': [1, 2, 3, 4, 5] * 3,
    'country': ['A', 'B', 'C', 'D', 'E'] * 3,
    'year': [2000, 2001, 2002] * 5,
}
df = pd.DataFrame(panel_columns)
df
为了将其扩展到更大的数据集,我尝试使用以下代码循环以获得上述结果,但它没有给我所需的数据框。
# Attempt from the question: pre-allocate a 15x3 frame, then fill it one row at a time.
n_country = 5 # number of countries
n_year = 3 # number of years of data for each country
columns = ("id", "country", "year")
n_rows = n_country*n_year
# The frame is created from np.empty, so its initial cell values are arbitrary;
# presumably every row is meant to be overwritten by the loop below.
data = pd.DataFrame(np.empty(shape = (n_rows, 3)), columns = columns)
# set country numbers which will identify each country, create country id ranging from 1 to 5
country_id = range(1, 1 + n_country)
list(country_id)
# create year from 2000 to 2002
year = range(2000, 2000 + n_year)
list(year)
# create dictionary that maps from country id to country name
country_name = dict(zip(country_id, ['A', 'B', 'C', 'D', 'E']))
country_name
# loop starts here
# NOTE(review): the snippet's indentation was lost, so the intended nesting of
# the three loops below cannot be confirmed from this text.
i = 0
for id in country_id:
# NOTE(review): the loop variable `country` is never read; the name written to
# the frame comes from country_name[id] instead.
for country in ["A", "B", "C", "D", "E"]:
# NOTE(review): this loop variable rebinds the `year` range defined above.
for year in [2000, 2001, 2002]:
data.loc[i, "id"] = id
data.loc[i, "year"] = year
data.loc[i, "country"] = country_name[id]
# NOTE(review): `i = +1` assigns positive one to i; it does not increment it
# (an increment would be `i += 1`).
i = +1
生成的数据框不是预期的。
如有网友指出上述循环中的错误,将不胜感激。
谢谢!
我会用 itertools.product 对年份和国家做笛卡尔积(让每个年份与每个国家配对),然后用 cat.codes 为国家生成编号。
from itertools import product
import pandas as pd
# Pair every year with every country, then derive a 1-based id per country.
start_year = 2000
end_year = 2003
countries = ['A','B','C','D','E']
years = range(start_year, end_year + 1)  # end_year itself is included
year_country_pairs = list(product(years, countries))
df = pd.DataFrame(year_country_pairs, columns=['year','country'])
# cat.codes are integer category codes starting at 0; adding 1 makes the ids start at 1.
country_codes = df['country'].astype('category').cat.codes
df['id'] = country_codes + 1
print(df)
输出
year country id
0 2000 A 1
1 2000 B 2
2 2000 C 3
3 2000 D 4
4 2000 E 5
5 2001 A 1
6 2001 B 2
7 2001 C 3
8 2001 D 4
9 2001 E 5
10 2002 A 1
11 2002 B 2
12 2002 C 3
13 2002 D 4
14 2002 E 5
15 2003 A 1
16 2003 B 2
17 2003 C 3
18 2003 D 4
19 2003 E 5
至于您当前的循环:需要用 zip 把 id 和 country 配对后一起遍历(而不是把它们分别写成两层嵌套循环),这样每个 (id, country) 组合就会在各个年份中重复使用;另外,计数器应写成 i += 1,而不是 i = +1(后者只是把 i 赋值为正 1)。
# Corrected row-by-row fill: one row per (country, year) pair.
n_country = 5  # number of countries
n_year = 3  # number of years of data for each country
columns = ("id", "country", "year")
n_rows = n_country * n_year
# Pre-allocate object-dtype columns instead of a float array from np.empty:
# writing country names into float64 columns relies on silent dtype upcasting,
# which recent pandas versions warn about or reject.
data = pd.DataFrame(index=range(n_rows), columns=list(columns), dtype=object)
# country ids 1..n_country identify each country
country_id = range(1, 1 + n_country)
list(country_id)
# years 2000..2002; named `years` so the loop variable below does not rebind it
years = range(2000, 2000 + n_year)
list(years)
countries = ["A", "B", "C", "D", "E"]
# dictionary that maps from country id to country name
country_name = dict(zip(country_id, countries))
country_name
# loop starts here
i = 0  # index of the next row to fill
# zip pairs each id with its own country, so the pair is reused for every year
for c_id, country in zip(country_id, countries):
    print(c_id, country)
    for year in years:
        data.loc[i, "id"] = c_id
        data.loc[i, "year"] = year
        data.loc[i, "country"] = country
        i += 1  # advance to the next row (`i = +1` would only set i to 1)
# id and year were written into object columns; store them as integers
data = data.astype({"id": "int64", "year": "int64"})
我正在尝试在 Python 中创建面板数据框,例如5 个国家(A、B、C、D、E),每个国家都有 3 年的数据(2000、2001、2002)。
import numpy as np
import pandas as pd
# Hand-written example of the desired 15-row panel: ids 1-5 and countries A-E
# repeat three times, while the years cycle through 2000-2002.
panel_columns = {
    'id': [1, 2, 3, 4, 5] * 3,
    'country': ['A', 'B', 'C', 'D', 'E'] * 3,
    'year': [2000, 2001, 2002] * 5,
}
df = pd.DataFrame(panel_columns)
df
为了将其扩展到更大的数据集,我尝试使用以下代码循环以获得上述结果,但它没有给我所需的数据框。
# Attempt from the question: pre-allocate a 15x3 frame, then fill it one row at a time.
n_country = 5 # number of countries
n_year = 3 # number of years of data for each country
columns = ("id", "country", "year")
n_rows = n_country*n_year
# The frame is created from np.empty, so its initial cell values are arbitrary;
# presumably every row is meant to be overwritten by the loop below.
data = pd.DataFrame(np.empty(shape = (n_rows, 3)), columns = columns)
# set country numbers which will identify each country, create country id ranging from 1 to 5
country_id = range(1, 1 + n_country)
list(country_id)
# create year from 2000 to 2002
year = range(2000, 2000 + n_year)
list(year)
# create dictionary that maps from country id to country name
country_name = dict(zip(country_id, ['A', 'B', 'C', 'D', 'E']))
country_name
# loop starts here
# NOTE(review): the snippet's indentation was lost, so the intended nesting of
# the three loops below cannot be confirmed from this text.
i = 0
for id in country_id:
# NOTE(review): the loop variable `country` is never read; the name written to
# the frame comes from country_name[id] instead.
for country in ["A", "B", "C", "D", "E"]:
# NOTE(review): this loop variable rebinds the `year` range defined above.
for year in [2000, 2001, 2002]:
data.loc[i, "id"] = id
data.loc[i, "year"] = year
data.loc[i, "country"] = country_name[id]
# NOTE(review): `i = +1` assigns positive one to i; it does not increment it
# (an increment would be `i += 1`).
i = +1
生成的数据框不是预期的。
如有网友指出上述循环中的错误,将不胜感激。
谢谢!
我会用 itertools.product 对年份和国家做笛卡尔积(让每个年份与每个国家配对),然后用 cat.codes 为国家生成编号。
from itertools import product
import pandas as pd
# Pair every year with every country, then derive a 1-based id per country.
start_year = 2000
end_year = 2003
countries = ['A','B','C','D','E']
years = range(start_year, end_year + 1)  # end_year itself is included
year_country_pairs = list(product(years, countries))
df = pd.DataFrame(year_country_pairs, columns=['year','country'])
# cat.codes are integer category codes starting at 0; adding 1 makes the ids start at 1.
country_codes = df['country'].astype('category').cat.codes
df['id'] = country_codes + 1
print(df)
输出
year country id
0 2000 A 1
1 2000 B 2
2 2000 C 3
3 2000 D 4
4 2000 E 5
5 2001 A 1
6 2001 B 2
7 2001 C 3
8 2001 D 4
9 2001 E 5
10 2002 A 1
11 2002 B 2
12 2002 C 3
13 2002 D 4
14 2002 E 5
15 2003 A 1
16 2003 B 2
17 2003 C 3
18 2003 D 4
19 2003 E 5
至于您当前的循环:需要用 zip 把 id 和 country 配对后一起遍历(而不是把它们分别写成两层嵌套循环),这样每个 (id, country) 组合就会在各个年份中重复使用;另外,计数器应写成 i += 1,而不是 i = +1(后者只是把 i 赋值为正 1)。
# Corrected row-by-row fill: one row per (country, year) pair.
n_country = 5  # number of countries
n_year = 3  # number of years of data for each country
columns = ("id", "country", "year")
n_rows = n_country * n_year
# Pre-allocate object-dtype columns instead of a float array from np.empty:
# writing country names into float64 columns relies on silent dtype upcasting,
# which recent pandas versions warn about or reject.
data = pd.DataFrame(index=range(n_rows), columns=list(columns), dtype=object)
# country ids 1..n_country identify each country
country_id = range(1, 1 + n_country)
list(country_id)
# years 2000..2002; named `years` so the loop variable below does not rebind it
years = range(2000, 2000 + n_year)
list(years)
countries = ["A", "B", "C", "D", "E"]
# dictionary that maps from country id to country name
country_name = dict(zip(country_id, countries))
country_name
# loop starts here
i = 0  # index of the next row to fill
# zip pairs each id with its own country, so the pair is reused for every year
for c_id, country in zip(country_id, countries):
    print(c_id, country)
    for year in years:
        data.loc[i, "id"] = c_id
        data.loc[i, "year"] = year
        data.loc[i, "country"] = country
        i += 1  # advance to the next row (`i = +1` would only set i to 1)
# id and year were written into object columns; store them as integers
data = data.astype({"id": "int64", "year": "int64"})