使用依赖于 Python 数据帧中的前一行的众多条件优化函数
Optimise a function with numerous conditions that depends on the previous row in a Python dataframe
我有以下数据框:
country_ID
ID
direction
date
ESP_1
0
IN
2021-02-28
ENG
0
IN
2021-03-03
ENG
0
OUT
2021-03-04
ESP_2
0
IN
2021-03-05
FRA
1
OUT
2021-03-07
ENG
1
OUT
2021-03-09
ENG
1
OUT
2021-03-10
ENG
2
IN
2021-03-13
我实现了以下功能:
def create_columns_analysis(df):
    """Add the per-country visit counters to ``df`` and classify its rows.

    The three ``visit_*`` columns are created (or reset) with the value 0,
    then every row whose ``country_ID`` is ``'ENG'`` is passed to
    ``country_ID_ENG``.  Handlers for the other countries are not part of
    this snippet, so those rows are left untouched here.

    Rows are addressed by label (``df.loc[i, ...]`` for
    ``i in range(len(df))``), so a default 0..n-1 index is assumed.

    Args:
        df: Frame with at least a ``country_ID`` column.

    Returns:
        The same ``df`` object, modified in place.
    """
    for column in ('visit_ESP', 'visit_ENG', 'visit_FRA'):
        df[column] = 0
    list_ids = []  # handed to every row handler call
    for i in range(len(df)):
        if df.loc[i, 'country_ID'] == 'ENG':
            country_ID_ENG(df, i, list_ids)
        else:
            # case country_ID = {FRA, ESP_1, ESP_2}
            # other methods not specified
            pass
    return df
对于具有特定 country_ID 的每一行,应用类似结构的函数。
我想优化或简化 country_ID_ENG 函数的代码。 country_ID_ENG函数定义如下:
# Key of the fallback entry used when the previous row's country has no
# explicit rule.
_ANY_OTHER = object()
# Placeholder value: replaced by ``date`` of this row minus ``date`` of the
# previous row when the rule is applied.
_SINCE_PREVIOUS = object()

# Column writes for an ENG row whose ID has been seen before.
# Outer key: (current direction == 'IN', previous direction == 'IN').
# Inner key: country_ID of the previous row (or _ANY_OTHER).
# Each rule lists its writes in the order in which they are applied.
_ENG_RULES = {
    # current 'IN', previous 'IN'
    (True, True): {
        'FRA': {'visit_ENG': 1, 'error': 'ERROR:0'},
        'ESP_1': {'visit_ENG': 1, 'error': 'ERROR:2', 'visit_FRA': 1},
        'ESP_2': {'visit_ENG': 1, 'error': 'ERROR:2', 'visit_FRA': 1},
        _ANY_OTHER: {'visit_ENG': 1, 'error': 'ERROR:3'},
    },
    # current 'IN', previous not 'IN'
    (True, False): {
        'ENG': {'visit_ENG': 1, 'error': 'ERROR:0'},
        'FRA': {'visit_ENG': 1, 'error': 'ERROR:4', 'visit_FRA': 1},
        'ESP_2': {'visit_ENG': 1, 'error': 'ERROR:4', 'visit_FRA': 1},
        _ANY_OTHER: {
            'visit_ENG': 1,
            'error': 'ERROR:5',
            'visit_ESP': 1,
            'visit_FRA': 1,
        },
    },
    # current not 'IN', previous 'IN'
    (False, True): {
        # An exit right after an entry into the same country: record the
        # time spent instead of a visit.
        'ENG': {'mean_time': _SINCE_PREVIOUS, 'error': 'ERROR:0'},
        'ESP_1': {'error': 'ERROR:6', 'visit_FRA': 1, 'visit_ENG': 1},
        'ESP_2': {'error': 'ERROR:6', 'visit_FRA': 1, 'visit_ENG': 1},
        _ANY_OTHER: {'error': 'ERROR:7', 'visit_ENG': 1},
    },
    # current not 'IN', previous not 'IN'
    (False, False): {
        'ENG': {'visit_ENG': 1, 'error': 'ERROR:8'},
        'FRA': {'visit_ENG': 1, 'error': 'ERROR:9', 'visit_FRA': 1},
        'ESP_2': {'visit_ENG': 1, 'error': 'ERROR:9', 'visit_FRA': 1},
        _ANY_OTHER: {
            'visit_ENG': 1,
            'error': 'ERROR:10',
            'visit_ESP': 1,
            'visit_FRA': 1,
        },
    },
}


def _apply_row_rules(df, i, rules):
    """Apply to row ``i`` the column writes that ``rules`` selects for it."""
    current_in = bool(df.loc[i, 'direction'] == 'IN')
    previous_in = bool(df.loc[i - 1, 'direction'] == 'IN')
    by_country = rules[current_in, previous_in]
    previous_country = df.loc[i - 1, 'country_ID']
    writes = by_country.get(previous_country, by_country[_ANY_OTHER])
    for column, value in writes.items():
        if value is _SINCE_PREVIOUS:
            value = df.loc[i, 'date'] - df.loc[i - 1, 'date']
        df.loc[i, column] = value


def country_ID_ENG(df, i, list_ids):
    """Fill the visit, ``mean_time`` and ``error`` cells of ENG row ``i``.

    If the row's ``ID`` is not yet in ``list_ids`` the row counts as one
    ENG visit, the ID is appended to ``list_ids`` and the error is
    ``'ERROR:1'``.  Otherwise the outcome is looked up in ``_ENG_RULES``
    from the direction of this row and the direction and ``country_ID`` of
    the row labelled ``i - 1``.

    Args:
        df: Frame with ``ID``, ``direction``, ``country_ID`` and ``date``
            columns; modified in place.
        i: Label of the row to process.  Row ``i - 1`` must exist whenever
            the ID has already been seen.
        list_ids: List of IDs seen so far; appended to in place.

    Returns:
        None.
    """
    row_id = df.loc[i, 'ID']
    if row_id not in list_ids:
        # First sighting of this ID: one visit regardless of direction.
        df.loc[i, 'visit_ENG'] = 1
        list_ids.append(row_id)
        df.loc[i, 'error'] = 'ERROR:1'
        return
    _apply_row_rules(df, i, _ENG_RULES)
以上函数使用当前行和上一行(如果有的话)的信息,为 visit_ENG、visit_ESP、visit_FRA、mean_time 和 error 创建新列。
对于示例数据框,将函数 country_ID_ENG 应用于 country_ID 等于 ENG 的行,应返回以下结果:
country_ID
ID
direction
date
visit_ENG
visit_FRA
visit_ESP
mean_time
error
ESP_1
0
IN
2021-02-28
-
-
-
-
-
ENG
0
IN
2021-03-03
0
1
0
NaN
ERROR:2
ENG
0
OUT
2021-03-04
0
0
0
1 days
ERROR:0
ESP_2
0
IN
2021-03-05
-
-
-
-
-
FRA
1
OUT
2021-03-07
-
-
-
-
-
ENG
1
OUT
2021-03-09
1
1
0
NaN
ERROR:9
ENG
1
OUT
2021-03-10
1
0
0
NaN
ERROR:8
ENG
2
IN
2021-03-13
1
0
0
NaN
ERROR:1
函数很长,country_ID行等于ESP或FRA的其他函数也有同样的复杂度。我希望您能帮助我简化或优化此函数的代码,以便在定义 country_ID_ESP 和 country_ID_FRA 函数时也将其考虑在内。感谢您的帮助。
我最近不得不完成类似的事情。我的解决方案是创建一个自定义的可迭代 class,把一部分逻辑从循环中移出并放进这个 class。它不是一个完整的解决方案,但足以作为起点使用。
main.py
import pandas as pd
# Sample input for the demo: one list per column, all of the same length.
DATA = {
    'country_id': [
        'ESP_1',
        'FRA',
        'ENG',
        'FRA',
    ],
    'ID': [
        0,
        1,
        2,
        0,
    ],
    'direction': [
        'IN',
        'IN',
        'OUT',
        'OUT',
    ],
    'date': [
        '2021-02-28',
        '2021-02-28',
        '2021-02-28',
        '2021-02-28',
    ],
}
class CountryIDs:
    """Iterate over the rows of a frame while tracking recorded IDs.

    Iterating yields ``(row, ids)`` pairs, where ``row`` is the named tuple
    produced by ``DataFrame.itertuples`` and ``ids`` is the list of IDs
    recorded so far through ``update_list_ids``.
    """

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and start with no recorded IDs or countries."""
        self._list_ids = []
        self._country_ids = []
        self._df = df

    def __iter__(self):
        """Yield each row together with the current list of recorded IDs."""
        for tup in self._df.itertuples():
            # The attribute is read on every step, so each pair carries the
            # list as it stands after any update made by the caller.
            yield tup, self._list_ids

    def update_list_ids(self, new_value):
        """Record ``new_value.ID`` and ``new_value.country_id``.

        New lists are built rather than appending in place, so lists handed
        out earlier by iteration or ``get_list`` are not modified.
        """
        self._list_ids = [*self._list_ids, new_value.ID]
        self._country_ids = [*self._country_ids, new_value.country_id]

    def get_list(self):
        """Return ``[recorded IDs, recorded country ids]``."""
        return [self._list_ids, self._country_ids]
def start():
    """Record the first occurrence of every ID in ``DATA`` and print the result."""
    country_data = CountryIDs(pd.DataFrame(DATA))
    for named_tuple, list_ids in country_data:
        if named_tuple.ID not in list_ids:
            country_data.update_list_ids(named_tuple)
    # The tracker is bound to ``country_data``; no name ``ids`` exists here.
    print(country_data.get_list())


if __name__ == '__main__':
    start()
结果
[[0, 1, 2], ['ESP_1', 'FRA', 'ENG']]
我有以下数据框:
country_ID | ID | direction | date |
---|---|---|---|
ESP_1 | 0 | IN | 2021-02-28 |
ENG | 0 | IN | 2021-03-03 |
ENG | 0 | OUT | 2021-03-04 |
ESP_2 | 0 | IN | 2021-03-05 |
FRA | 1 | OUT | 2021-03-07 |
ENG | 1 | OUT | 2021-03-09 |
ENG | 1 | OUT | 2021-03-10 |
ENG | 2 | IN | 2021-03-13 |
我实现了以下功能:
def create_columns_analysis(df):
    """Add the per-country visit counters to ``df`` and classify its rows.

    The three ``visit_*`` columns are created (or reset) with the value 0,
    then every row whose ``country_ID`` is ``'ENG'`` is passed to
    ``country_ID_ENG``.  Handlers for the other countries are not part of
    this snippet, so those rows are left untouched here.

    Rows are addressed by label (``df.loc[i, ...]`` for
    ``i in range(len(df))``), so a default 0..n-1 index is assumed.

    Args:
        df: Frame with at least a ``country_ID`` column.

    Returns:
        The same ``df`` object, modified in place.
    """
    for column in ('visit_ESP', 'visit_ENG', 'visit_FRA'):
        df[column] = 0
    list_ids = []  # handed to every row handler call
    for i in range(len(df)):
        if df.loc[i, 'country_ID'] == 'ENG':
            country_ID_ENG(df, i, list_ids)
        else:
            # case country_ID = {FRA, ESP_1, ESP_2}
            # other methods not specified
            pass
    return df
对于具有特定 country_ID 的每一行,应用类似结构的函数。
我想优化或简化 country_ID_ENG 函数的代码。 country_ID_ENG函数定义如下:
# Key of the fallback entry used when the previous row's country has no
# explicit rule.
_ANY_OTHER = object()
# Placeholder value: replaced by ``date`` of this row minus ``date`` of the
# previous row when the rule is applied.
_SINCE_PREVIOUS = object()

# Column writes for an ENG row whose ID has been seen before.
# Outer key: (current direction == 'IN', previous direction == 'IN').
# Inner key: country_ID of the previous row (or _ANY_OTHER).
# Each rule lists its writes in the order in which they are applied.
_ENG_RULES = {
    # current 'IN', previous 'IN'
    (True, True): {
        'FRA': {'visit_ENG': 1, 'error': 'ERROR:0'},
        'ESP_1': {'visit_ENG': 1, 'error': 'ERROR:2', 'visit_FRA': 1},
        'ESP_2': {'visit_ENG': 1, 'error': 'ERROR:2', 'visit_FRA': 1},
        _ANY_OTHER: {'visit_ENG': 1, 'error': 'ERROR:3'},
    },
    # current 'IN', previous not 'IN'
    (True, False): {
        'ENG': {'visit_ENG': 1, 'error': 'ERROR:0'},
        'FRA': {'visit_ENG': 1, 'error': 'ERROR:4', 'visit_FRA': 1},
        'ESP_2': {'visit_ENG': 1, 'error': 'ERROR:4', 'visit_FRA': 1},
        _ANY_OTHER: {
            'visit_ENG': 1,
            'error': 'ERROR:5',
            'visit_ESP': 1,
            'visit_FRA': 1,
        },
    },
    # current not 'IN', previous 'IN'
    (False, True): {
        # An exit right after an entry into the same country: record the
        # time spent instead of a visit.
        'ENG': {'mean_time': _SINCE_PREVIOUS, 'error': 'ERROR:0'},
        'ESP_1': {'error': 'ERROR:6', 'visit_FRA': 1, 'visit_ENG': 1},
        'ESP_2': {'error': 'ERROR:6', 'visit_FRA': 1, 'visit_ENG': 1},
        _ANY_OTHER: {'error': 'ERROR:7', 'visit_ENG': 1},
    },
    # current not 'IN', previous not 'IN'
    (False, False): {
        'ENG': {'visit_ENG': 1, 'error': 'ERROR:8'},
        'FRA': {'visit_ENG': 1, 'error': 'ERROR:9', 'visit_FRA': 1},
        'ESP_2': {'visit_ENG': 1, 'error': 'ERROR:9', 'visit_FRA': 1},
        _ANY_OTHER: {
            'visit_ENG': 1,
            'error': 'ERROR:10',
            'visit_ESP': 1,
            'visit_FRA': 1,
        },
    },
}


def _apply_row_rules(df, i, rules):
    """Apply to row ``i`` the column writes that ``rules`` selects for it."""
    current_in = bool(df.loc[i, 'direction'] == 'IN')
    previous_in = bool(df.loc[i - 1, 'direction'] == 'IN')
    by_country = rules[current_in, previous_in]
    previous_country = df.loc[i - 1, 'country_ID']
    writes = by_country.get(previous_country, by_country[_ANY_OTHER])
    for column, value in writes.items():
        if value is _SINCE_PREVIOUS:
            value = df.loc[i, 'date'] - df.loc[i - 1, 'date']
        df.loc[i, column] = value


def country_ID_ENG(df, i, list_ids):
    """Fill the visit, ``mean_time`` and ``error`` cells of ENG row ``i``.

    If the row's ``ID`` is not yet in ``list_ids`` the row counts as one
    ENG visit, the ID is appended to ``list_ids`` and the error is
    ``'ERROR:1'``.  Otherwise the outcome is looked up in ``_ENG_RULES``
    from the direction of this row and the direction and ``country_ID`` of
    the row labelled ``i - 1``.

    Args:
        df: Frame with ``ID``, ``direction``, ``country_ID`` and ``date``
            columns; modified in place.
        i: Label of the row to process.  Row ``i - 1`` must exist whenever
            the ID has already been seen.
        list_ids: List of IDs seen so far; appended to in place.

    Returns:
        None.
    """
    row_id = df.loc[i, 'ID']
    if row_id not in list_ids:
        # First sighting of this ID: one visit regardless of direction.
        df.loc[i, 'visit_ENG'] = 1
        list_ids.append(row_id)
        df.loc[i, 'error'] = 'ERROR:1'
        return
    _apply_row_rules(df, i, _ENG_RULES)
以上函数使用当前行和上一行(如果有的话)的信息,为 visit_ENG、visit_ESP、visit_FRA、mean_time 和 error 创建新列。
对于示例数据框,将函数 country_ID_ENG 应用于 country_ID 等于 ENG 的行,应返回以下结果:
country_ID | ID | direction | date | visit_ENG | visit_FRA | visit_ESP | mean_time | error |
---|---|---|---|---|---|---|---|---|
ESP_1 | 0 | IN | 2021-02-28 | - | - | - | - | - |
ENG | 0 | IN | 2021-03-03 | 0 | 1 | 0 | NaN | ERROR:2 |
ENG | 0 | OUT | 2021-03-04 | 0 | 0 | 0 | 1 days | ERROR:0 |
ESP_2 | 0 | IN | 2021-03-05 | - | - | - | - | - |
FRA | 1 | OUT | 2021-03-07 | - | - | - | - | - |
ENG | 1 | OUT | 2021-03-09 | 1 | 1 | 0 | NaN | ERROR:9 |
ENG | 1 | OUT | 2021-03-10 | 1 | 0 | 0 | NaN | ERROR:8 |
ENG | 2 | IN | 2021-03-13 | 1 | 0 | 0 | NaN | ERROR:1 |
函数很长,country_ID行等于ESP或FRA的其他函数也有同样的复杂度。我希望您能帮助我简化或优化此函数的代码,以便在定义 country_ID_ESP 和 country_ID_FRA 函数时也将其考虑在内。感谢您的帮助。
我最近不得不完成类似的事情。我的解决方案是创建一个自定义的可迭代 class,把一部分逻辑从循环中移出并放进这个 class。它不是一个完整的解决方案,但足以作为起点使用。
main.py
import pandas as pd
# Sample input for the demo: one list per column, all of the same length.
DATA = {
    'country_id': [
        'ESP_1',
        'FRA',
        'ENG',
        'FRA',
    ],
    'ID': [
        0,
        1,
        2,
        0,
    ],
    'direction': [
        'IN',
        'IN',
        'OUT',
        'OUT',
    ],
    'date': [
        '2021-02-28',
        '2021-02-28',
        '2021-02-28',
        '2021-02-28',
    ],
}
class CountryIDs:
    """Iterate over the rows of a frame while tracking recorded IDs.

    Iterating yields ``(row, ids)`` pairs, where ``row`` is the named tuple
    produced by ``DataFrame.itertuples`` and ``ids`` is the list of IDs
    recorded so far through ``update_list_ids``.
    """

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and start with no recorded IDs or countries."""
        self._list_ids = []
        self._country_ids = []
        self._df = df

    def __iter__(self):
        """Yield each row together with the current list of recorded IDs."""
        for tup in self._df.itertuples():
            # The attribute is read on every step, so each pair carries the
            # list as it stands after any update made by the caller.
            yield tup, self._list_ids

    def update_list_ids(self, new_value):
        """Record ``new_value.ID`` and ``new_value.country_id``.

        New lists are built rather than appending in place, so lists handed
        out earlier by iteration or ``get_list`` are not modified.
        """
        self._list_ids = [*self._list_ids, new_value.ID]
        self._country_ids = [*self._country_ids, new_value.country_id]

    def get_list(self):
        """Return ``[recorded IDs, recorded country ids]``."""
        return [self._list_ids, self._country_ids]
def start():
    """Record the first occurrence of every ID in ``DATA`` and print the result."""
    country_data = CountryIDs(pd.DataFrame(DATA))
    for named_tuple, list_ids in country_data:
        if named_tuple.ID not in list_ids:
            country_data.update_list_ids(named_tuple)
    # The tracker is bound to ``country_data``; no name ``ids`` exists here.
    print(country_data.get_list())


if __name__ == '__main__':
    start()
结果
[[0, 1, 2], ['ESP_1', 'FRA', 'ENG']]