pandas 根据条件合并行
pandas combine rows based on conditions
大家好,我正在处理包含以下示例的数据集:
数据包含start_time、end_time、id和url。对于一个 id 和 url 组,我有不同的输入和输出值问题是输入值和输出值在不同的行中,我想填充缺失的 end_time/start_time 值。为此,我必须使用以下逻辑:
- 如果我在 start_time 中有值并且 end_time 为空,那么我必须用最接近的、满足 end_time >= start_time 的 end_time 值来填充 end_time,并删除 used/matched 行
- 在所有 start_time 行被填充并删除 used/matched 行之后,如果仍有 start_time 为空的行,那么我必须用该行的 end_time 值填充 start_time(两者取相同的值)。
- 如果没有为给定的 start_time 找到匹配的 end_time 值,那么我必须用相同的 start_time 值填充 end_time 值。
考虑到以上几点,预期结果应该与以下类似,我分两个阶段给出输出,以便于理解
用 start_time 填充匹配的 end_time 并删除 used/matched 行:
最终输出填充剩余的start_time/end_time值:
目前我正在使用以下方式来实现这一点,但我觉得它没有优化:
def _match_group(group, start_col, end_col):
    """Pair each start-only row in *group* with the closest end-only row
    whose end_time >= start_time, consuming the matched end row.

    Start rows with no usable end candidate get end_time = start_time.
    Rows that already have both values pass through untouched.
    Assumes the two time columns hold mutually comparable values
    (datetime64 or ISO-8601 strings — both sort chronologically).
    """
    has_start = group[start_col].notnull()
    has_end = group[end_col].notnull()
    complete = group[has_start & has_end]
    starts = group[has_start & ~has_end].sort_values(start_col).copy()
    ends = group[~has_start & has_end].sort_values(end_col)

    # Candidates still available for matching, in ascending end_time order.
    available = list(ends[end_col].items())
    for idx, start_val in starts[start_col].items():
        pos = next(
            (i for i, (_, end_val) in enumerate(available) if end_val >= start_val),
            None,
        )
        if pos is None:
            # No end_time at/after this start_time: mirror the start value.
            starts.at[idx, end_col] = start_val
        else:
            starts.at[idx, end_col] = available.pop(pos)[1]

    # Keep only the end-only rows that were never consumed by a match.
    leftovers = ends.loc[[idx for idx, _ in available]]
    return pd.concat([starts, leftovers, complete])


def process(self, param, context):
    """Fill missing start_time/end_time values per key-column group.

    For every group defined by ``param['keys_cols']``:
      1. each row having only a start_time is matched to the nearest row
         having only an end_time with end_time >= start_time, and the
         matched end-only row is dropped;
      2. any row still missing one of the two values gets it mirrored
         from the other column.

    Replaces the previous implementation that built a boolean-filter
    expression as a string and ran it through ``eval`` for every group
    (a full-frame scan per group plus an injection hazard if key names
    are ever untrusted); ``DataFrame.groupby`` partitions the data in a
    single pass instead, and values are compared directly rather than
    via chained ``str.replace`` + ``int`` round-trips.

    Parameters
    ----------
    param : dict
        Must contain 'keys_cols' (list of group-key column names),
        'start_time_col' and 'end_time_col'.
    context : dict
        Carries the input DataFrame under 'data'.

    Returns
    -------
    dict
        The same ``context`` with 'data' replaced by the filled frame
        and 'continue' set to True.
    """
    df = context['data']
    key_cols = param['keys_cols']
    start_col = param['start_time_col']
    end_col = param['end_time_col']

    pieces = []
    for _, group in df.groupby(key_cols, sort=False):
        group = group.copy()
        # Matching is only possible when the group has values on both sides.
        if group[start_col].notnull().any() and group[end_col].notnull().any():
            group = _match_group(group, start_col, end_col)
        # Mirror the known value into any remaining one-sided rows.
        group[start_col] = group[start_col].fillna(group[end_col])
        group[end_col] = group[end_col].fillna(group[start_col])
        pieces.append(group)

    context['data'] = pd.concat(pieces) if pieces else df
    context['continue'] = True
    return context
其中参数如下:
param = {"keys_cols":['id', 'url'], "start_time_col":"start_time","end_time_col":"end_time"}
“df”是数据。
请帮助查看并建议如何使其更优化,我有超过 70000 行数据,一个文件中有超过 12000 对 id 和 urls
期待你们。
谢谢
数据:
>>> import pandas as pd
>>> df = pd.DataFrame(
{"id" : ["o6FlbuA_5565423"]*8,
"url" : ["https://vaa.66new"]*8,
"type" : ["out"]*3 + ["in"]*2 + ["out"]*3,
"start_time" : ["NULL"]*3 + ['2021-08-25 15:23:37', '2021-08-25 15:23:56'] +["NULL"]*3,
"end_time" : ['2021-08-25 15:23:28', '2021-08-25 15:27:34', '2021-08-25 15:23:52', 'NULL', 'NULL', '2021-08-25 15:10:29', '2021-08-25 15:25:00', '2021-08-25 15:15:49']}
)
>>> df[['start_time', 'end_time']] = df[['start_time', 'end_time']].apply(pd.to_datetime, errors='coerce')
>>> df
id url type start_time end_time
0 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:23:28
1 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:27:34
2 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:23:52
3 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:37 NaT
4 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:56 NaT
5 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:10:29
6 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:25:00
7 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:15:49
解决方案:
# Get epoch time for both 'start_time' and 'end_time' columns
>>> df['start_time_epoch'] = df.start_time.apply(lambda x: x.timestamp() if not pd.isna(x) else None).astype('Int64')
>>> df['end_time_epoch'] = df.end_time.apply(lambda x: x.timestamp() if not pd.isna(x) else None).astype('Int64')
# Get closest value
>>> to_remove = []
>>> def fun(x):
...     for i in df.sort_values("end_time_epoch").end_time_epoch:
...         if i >= x.start_time_epoch:
...             to_remove.append(i)
...             return pd.to_datetime(i, unit='s')
...     else:
...         return pd.to_datetime(x.start_time_epoch, unit='s')
>>> r = df[df.start_time.notna() & df.end_time.isna()].apply(fun, axis=1).to_list()
# Fill with gotten values
>>> df.loc[df.start_time.notna() & df.end_time.isna(), 'end_time'] = r
# Remove rows from where we filled missed values.
>>> df = df[~df.end_time_epoch.isin(to_remove)]
# Fill 'start_time' with 'end_time'
>>> df.loc[df.start_time.isna(), 'start_time'] = df.loc[df.start_time.isna(), 'end_time'].to_list()
# Drop helping variables.
>>> df.drop(["start_time_epoch", "end_time_epoch"], axis=1, inplace=True)
>>> df
id url type start_time end_time
0 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:23:28 2021-08-25 15:23:28
1 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:27:34 2021-08-25 15:27:34
3 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:37 2021-08-25 15:23:52
4 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:56 2021-08-25 15:25:00
5 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:10:29 2021-08-25 15:10:29
7 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:15:49 2021-08-25 15:15:49
如果我正确理解要求,我们可以在 pandas
内完成所有这些。这里基本上有两个步骤:
- 使用
pandas.merge_asof
填写最近的end_date
- 使用
drop_duplicates
删除我们在步骤 1 中使用的 out
条记录
from io import StringIO

import pandas as pd

# Sample data; trailing `#` comments are stripped by read_csv(comment="#").
text = StringIO(
    """
id url type start_time end_time
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:23:28
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:27:34
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:23:52
o6FlbuA_5565423 https://vaa.66new in 2021-08-25T15:23:37 NaT
o6FlbuA_5565423 https://vaa.66new in 2021-08-25T15:43:56 NaT # note: no record with `end_time` after this records `start_time`
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:10:29
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:25:00
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:15:49
o6FlbuA_5565423 https://vaa.66new in 2021-08-25T15:33:37 2021-08-25T15:34:37 # additional already complete record
"""
)
# `delim_whitespace=True` is deprecated since pandas 2.2; `sep=r"\s+"` is the
# documented replacement.
df = pd.read_csv(text, sep=r"\s+", parse_dates=["start_time", "end_time"], comment="#")
# separate out unmatched `in` records and unmatched `out` records;
# both sides must be sorted on their merge key for merge_asof
df_in_unmatched = (
    df[(df.type == "in") & ~df.start_time.isna() & df.end_time.isna()]
    .drop(columns=["end_time"])
    .sort_values("start_time")
)
df_out_unmatched = (
    df[(df.type == "out") & df.start_time.isna() & ~df.end_time.isna()]
    .drop(columns=["type", "start_time"])
    .sort_values("end_time")
)
# match `in` records to closest `out` record with `out.end_time` >= `in.start_time`
df_in_matched = pd.merge_asof(
    df_in_unmatched,
    df_out_unmatched,
    by=["id", "url"],
    left_on="start_time",
    right_on="end_time",
    direction="forward",
    allow_exact_matches=True,
)
# fill in missing `end_time` for records with only `start_time`
df_in_matched["end_time"] = df_in_matched["end_time"].combine_first(
    df_in_matched["start_time"]
)
# combine matched records with remaining unmatched and deduplicate
# in order to remove "used" records
df_matched = (
    pd.concat([df_in_matched, df_out_unmatched], ignore_index=True)
    .drop_duplicates(subset=["id", "url", "end_time"], keep="first")
    .dropna(subset=["end_time"])
    .fillna({"type": "out"})
)
# fill in missing `start_time` for records with only `end_time`
df_matched["start_time"] = df_matched["start_time"].combine_first(
    df_matched["end_time"]
)
# combine matched records with unprocessed records: i.e. records
# that had both `start_time` and `end_time` (if extant)
df_final = pd.concat(
    [df_matched, df.dropna(subset=["start_time", "end_time"])], ignore_index=True
)
结果:
id url type start_time end_time
0 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:37 2021-08-25 15:23:52
1 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:43:56 2021-08-25 15:43:56
2 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:10:29 2021-08-25 15:10:29
3 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:15:49 2021-08-25 15:15:49
4 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:23:28 2021-08-25 15:23:28
5 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:25:00 2021-08-25 15:25:00
6 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:27:34 2021-08-25 15:27:34
7 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:33:37 2021-08-25 15:34:37
大家好,我正在处理包含以下示例的数据集:
数据包含start_time、end_time、id和url。对于一个 id 和 url 组,我有不同的输入和输出值问题是输入值和输出值在不同的行中,我想填充缺失的 end_time/start_time 值。为此,我必须使用以下逻辑:
- 如果我在 start_time 中有值并且 end_time 为空,那么我必须用最接近的、满足 end_time >= start_time 的 end_time 值来填充 end_time,并删除 used/matched 行
- 在所有 start_time 行被填充并删除 used/matched 行之后,如果仍有 start_time 为空的行,那么我必须用该行的 end_time 值填充 start_time(两者取相同的值)。
- 如果没有为给定的 start_time 找到匹配的 end_time 值,那么我必须用相同的 start_time 值填充 end_time 值。
考虑到以上几点,预期结果应该与以下类似,我分两个阶段给出输出,以便于理解
用 start_time 填充匹配的 end_time 并删除 used/matched 行:
最终输出填充剩余的start_time/end_time值:
目前我正在使用以下方式来实现这一点,但我觉得它没有优化:
def _match_group(group, start_col, end_col):
    """Pair each start-only row in *group* with the closest end-only row
    whose end_time >= start_time, consuming the matched end row.

    Start rows with no usable end candidate get end_time = start_time.
    Rows that already have both values pass through untouched.
    Assumes the two time columns hold mutually comparable values
    (datetime64 or ISO-8601 strings — both sort chronologically).
    """
    has_start = group[start_col].notnull()
    has_end = group[end_col].notnull()
    complete = group[has_start & has_end]
    starts = group[has_start & ~has_end].sort_values(start_col).copy()
    ends = group[~has_start & has_end].sort_values(end_col)

    # Candidates still available for matching, in ascending end_time order.
    available = list(ends[end_col].items())
    for idx, start_val in starts[start_col].items():
        pos = next(
            (i for i, (_, end_val) in enumerate(available) if end_val >= start_val),
            None,
        )
        if pos is None:
            # No end_time at/after this start_time: mirror the start value.
            starts.at[idx, end_col] = start_val
        else:
            starts.at[idx, end_col] = available.pop(pos)[1]

    # Keep only the end-only rows that were never consumed by a match.
    leftovers = ends.loc[[idx for idx, _ in available]]
    return pd.concat([starts, leftovers, complete])


def process(self, param, context):
    """Fill missing start_time/end_time values per key-column group.

    For every group defined by ``param['keys_cols']``:
      1. each row having only a start_time is matched to the nearest row
         having only an end_time with end_time >= start_time, and the
         matched end-only row is dropped;
      2. any row still missing one of the two values gets it mirrored
         from the other column.

    Replaces the previous implementation that built a boolean-filter
    expression as a string and ran it through ``eval`` for every group
    (a full-frame scan per group plus an injection hazard if key names
    are ever untrusted); ``DataFrame.groupby`` partitions the data in a
    single pass instead, and values are compared directly rather than
    via chained ``str.replace`` + ``int`` round-trips.

    Parameters
    ----------
    param : dict
        Must contain 'keys_cols' (list of group-key column names),
        'start_time_col' and 'end_time_col'.
    context : dict
        Carries the input DataFrame under 'data'.

    Returns
    -------
    dict
        The same ``context`` with 'data' replaced by the filled frame
        and 'continue' set to True.
    """
    df = context['data']
    key_cols = param['keys_cols']
    start_col = param['start_time_col']
    end_col = param['end_time_col']

    pieces = []
    for _, group in df.groupby(key_cols, sort=False):
        group = group.copy()
        # Matching is only possible when the group has values on both sides.
        if group[start_col].notnull().any() and group[end_col].notnull().any():
            group = _match_group(group, start_col, end_col)
        # Mirror the known value into any remaining one-sided rows.
        group[start_col] = group[start_col].fillna(group[end_col])
        group[end_col] = group[end_col].fillna(group[start_col])
        pieces.append(group)

    context['data'] = pd.concat(pieces) if pieces else df
    context['continue'] = True
    return context
其中参数如下:
param = {"keys_cols":['id', 'url'], "start_time_col":"start_time","end_time_col":"end_time"}
“df”是数据。
请帮助查看并建议如何使其更优化,我有超过 70000 行数据,一个文件中有超过 12000 对 id 和 urls
期待你们。
谢谢
数据:
>>> import pandas as pd
>>> df = pd.DataFrame(
{"id" : ["o6FlbuA_5565423"]*8,
"url" : ["https://vaa.66new"]*8,
"type" : ["out"]*3 + ["in"]*2 + ["out"]*3,
"start_time" : ["NULL"]*3 + ['2021-08-25 15:23:37', '2021-08-25 15:23:56'] +["NULL"]*3,
"end_time" : ['2021-08-25 15:23:28', '2021-08-25 15:27:34', '2021-08-25 15:23:52', 'NULL', 'NULL', '2021-08-25 15:10:29', '2021-08-25 15:25:00', '2021-08-25 15:15:49']}
)
>>> df[['start_time', 'end_time']] = df[['start_time', 'end_time']].apply(pd.to_datetime, errors='coerce')
>>> df
id url type start_time end_time
0 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:23:28
1 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:27:34
2 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:23:52
3 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:37 NaT
4 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:56 NaT
5 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:10:29
6 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:25:00
7 o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25 15:15:49
解决方案:
# Get epoch time for both 'start_time' and 'end_time' columns
>>> df['start_time_epoch'] = df.start_time.apply(lambda x: x.timestamp() if not pd.isna(x) else None).astype('Int64')
>>> df['end_time_epoch'] = df.end_time.apply(lambda x: x.timestamp() if not pd.isna(x) else None).astype('Int64')
# Get closest value
>>> to_remove = []
>>> def fun(x):
...     for i in df.sort_values("end_time_epoch").end_time_epoch:
...         if i >= x.start_time_epoch:
...             to_remove.append(i)
...             return pd.to_datetime(i, unit='s')
...     else:
...         return pd.to_datetime(x.start_time_epoch, unit='s')
>>> r = df[df.start_time.notna() & df.end_time.isna()].apply(fun, axis=1).to_list()
# Fill with gotten values
>>> df.loc[df.start_time.notna() & df.end_time.isna(), 'end_time'] = r
# Remove rows from where we filled missed values.
>>> df = df[~df.end_time_epoch.isin(to_remove)]
# Fill 'start_time' with 'end_time'
>>> df.loc[df.start_time.isna(), 'start_time'] = df.loc[df.start_time.isna(), 'end_time'].to_list()
# Drop helping variables.
>>> df.drop(["start_time_epoch", "end_time_epoch"], axis=1, inplace=True)
>>> df
id url type start_time end_time
0 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:23:28 2021-08-25 15:23:28
1 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:27:34 2021-08-25 15:27:34
3 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:37 2021-08-25 15:23:52
4 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:56 2021-08-25 15:25:00
5 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:10:29 2021-08-25 15:10:29
7 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:15:49 2021-08-25 15:15:49
如果我正确理解要求,我们可以在 pandas
内完成所有这些。这里基本上有两个步骤:
- 使用
pandas.merge_asof
填写最近的end_date
- 使用
drop_duplicates
删除我们在步骤 1 中使用的out
条记录
from io import StringIO

import pandas as pd

# Sample data; trailing `#` comments are stripped by read_csv(comment="#").
text = StringIO(
    """
id url type start_time end_time
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:23:28
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:27:34
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:23:52
o6FlbuA_5565423 https://vaa.66new in 2021-08-25T15:23:37 NaT
o6FlbuA_5565423 https://vaa.66new in 2021-08-25T15:43:56 NaT # note: no record with `end_time` after this records `start_time`
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:10:29
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:25:00
o6FlbuA_5565423 https://vaa.66new out NaT 2021-08-25T15:15:49
o6FlbuA_5565423 https://vaa.66new in 2021-08-25T15:33:37 2021-08-25T15:34:37 # additional already complete record
"""
)
# `delim_whitespace=True` is deprecated since pandas 2.2; `sep=r"\s+"` is the
# documented replacement.
df = pd.read_csv(text, sep=r"\s+", parse_dates=["start_time", "end_time"], comment="#")
# separate out unmatched `in` records and unmatched `out` records;
# both sides must be sorted on their merge key for merge_asof
df_in_unmatched = (
    df[(df.type == "in") & ~df.start_time.isna() & df.end_time.isna()]
    .drop(columns=["end_time"])
    .sort_values("start_time")
)
df_out_unmatched = (
    df[(df.type == "out") & df.start_time.isna() & ~df.end_time.isna()]
    .drop(columns=["type", "start_time"])
    .sort_values("end_time")
)
# match `in` records to closest `out` record with `out.end_time` >= `in.start_time`
df_in_matched = pd.merge_asof(
    df_in_unmatched,
    df_out_unmatched,
    by=["id", "url"],
    left_on="start_time",
    right_on="end_time",
    direction="forward",
    allow_exact_matches=True,
)
# fill in missing `end_time` for records with only `start_time`
df_in_matched["end_time"] = df_in_matched["end_time"].combine_first(
    df_in_matched["start_time"]
)
# combine matched records with remaining unmatched and deduplicate
# in order to remove "used" records
df_matched = (
    pd.concat([df_in_matched, df_out_unmatched], ignore_index=True)
    .drop_duplicates(subset=["id", "url", "end_time"], keep="first")
    .dropna(subset=["end_time"])
    .fillna({"type": "out"})
)
# fill in missing `start_time` for records with only `end_time`
df_matched["start_time"] = df_matched["start_time"].combine_first(
    df_matched["end_time"]
)
# combine matched records with unprocessed records: i.e. records
# that had both `start_time` and `end_time` (if extant)
df_final = pd.concat(
    [df_matched, df.dropna(subset=["start_time", "end_time"])], ignore_index=True
)
结果:
id url type start_time end_time
0 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:23:37 2021-08-25 15:23:52
1 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:43:56 2021-08-25 15:43:56
2 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:10:29 2021-08-25 15:10:29
3 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:15:49 2021-08-25 15:15:49
4 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:23:28 2021-08-25 15:23:28
5 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:25:00 2021-08-25 15:25:00
6 o6FlbuA_5565423 https://vaa.66new out 2021-08-25 15:27:34 2021-08-25 15:27:34
7 o6FlbuA_5565423 https://vaa.66new in 2021-08-25 15:33:37 2021-08-25 15:34:37