(Python) 将日志数据处理成时间序列
(Python) Processing log data into time series
我想将原始数据:[to][from][time] 转换为时间序列,例如:[time][Loc1][Loc2][Loc3]。
我写的代码似乎效率低下,而且可能是错误的。如有任何帮助或提示,我将不胜感激!
import numpy as np
import pandas as pd

df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time': ['01/01/2008 17:56', '02/01/2008 16:25', '02/01/2008 22:38',
                            '03/01/2008 13:59', '02/01/2008 23:44']})

# List of all locations that appear as either source or destination.
column_values = df[["From", "To"]].values
uniq_wards = np.unique(column_values)

# Empty result frame: a "Time" column followed by one counter column per location.
new_df = pd.DataFrame(columns=uniq_wards)
new_df.insert(0, "Time", 0)


def transform(row):
    """Append one row to the global ``new_df`` describing the state after a move.

    The new row is a copy of the last row of ``new_df`` (or all zeros when
    ``new_df`` has no rows yet) with the counter of ``row['From']`` decreased
    by one, the counter of ``row['To']`` increased by one, and ``Time`` set to
    ``row['Time']``.

    Args:
        row: A mapping/Series with the keys ``'From'``, ``'To'`` and ``'Time'``.
    """
    if new_df.empty:
        # First log entry: start every counter at zero.
        entry = pd.Series(0, index=new_df.columns, dtype=object)
    else:
        # Copy so that the previous row is not modified through a view.
        entry = new_df.iloc[-1].copy()

    entry[row['From']] -= 1
    entry[row['To']] += 1
    entry['Time'] = row['Time']

    # Label-based enlargement replaces DataFrame.append (removed in pandas 2.0)
    # and the chained assignments, which may silently write to a temporary.
    new_df.loc[len(new_df)] = entry


# transform() is called only for its side effect on new_df, so iterate
# explicitly instead of using DataFrame.apply.
for _, log_row in df.iterrows():
    transform(log_row)
下面是一种只使用 DataFrame 和 pandas 方法、无需循环的做法。
import pandas as pd


def location_time_series(moves):
    """Return the running net count per location after each logged move.

    Args:
        moves: DataFrame with the columns ``'From'``, ``'To'`` and ``'Time'``;
            each row is one move. Location names must contain a number
            (e.g. ``'Loc9'``), which is used to order the output columns.

    Returns:
        DataFrame with a ``'Time'`` column followed by one column per
        location, ordered by the number in the location name. Each cell is
        the cumulative arrivals minus departures up to that row; cells before
        a location's first non-zero value hold ``''``.
    """
    by_time = moves.set_index('Time')

    # One indicator column per location; cumsum turns the per-row events
    # into running totals (departures count negatively).
    departures = -by_time['From'].str.get_dummies().cumsum()
    arrivals = by_time['To'].str.get_dummies().cumsum()

    # Union of the locations from both frames, sorted by the number in the name.
    cols = departures.columns.union(arrivals.columns)
    loc_numbers = cols.str.extract(r'(\d+)', expand=False).astype(int)
    cols = cols[loc_numbers.argsort()]

    # fill_value=0 keeps the counts as integers instead of introducing NaN.
    net = (departures.reindex(columns=cols, fill_value=0)
           + arrivals.reindex(columns=cols, fill_value=0))

    # Blank only the leading zeros: a cell stays visible once its column has
    # held any non-zero value, even if the running total returns to zero.
    seen = net.ne(0).cumsum().gt(0)
    return net.astype(object).where(seen.to_numpy(), '').reset_index()


df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time': ['01/01/2008 17:56', '02/01/2008 16:25',
                            '02/01/2008 22:38', '03/01/2008 13:59',
                            '02/01/2008 23:44']})

df_out = location_time_series(df)
print(df_out)
输出:
               Time  Loc9  Loc16  Loc60  Loc66  Loc77  Loc83  Loc84
0  01/01/2008 17:56                                 1            -1
1  02/01/2008 16:25                   1             0            -1
2  02/01/2008 22:38     1             1             0     -1     -1
3  03/01/2008 13:59     1      1      1             0     -1     -2
4  02/01/2008 23:44     2      1      1     -1      0     -1     -2
更新:修正时间戳的处理(先将 Time 转换为 datetime,再按时间排序):
import pandas as pd


def location_time_series(moves):
    """Return the running net count per location after each logged move.

    Args:
        moves: DataFrame with the columns ``'From'``, ``'To'`` and ``'Time'``;
            each row is one move, processed in the order given. Location
            names must contain a number (e.g. ``'Loc9'``), which is used to
            order the output columns.

    Returns:
        DataFrame with a ``'Time'`` column followed by one column per
        location, ordered by the number in the location name. Each cell is
        the cumulative arrivals minus departures up to that row; cells before
        a location's first non-zero value hold ``''``.
    """
    by_time = moves.set_index('Time')

    # One indicator column per location; cumsum turns the per-row events
    # into running totals (departures count negatively).
    departures = -by_time['From'].str.get_dummies().cumsum()
    arrivals = by_time['To'].str.get_dummies().cumsum()

    # Union of the locations from both frames, sorted by the number in the name.
    cols = departures.columns.union(arrivals.columns)
    loc_numbers = cols.str.extract(r'(\d+)', expand=False).astype(int)
    cols = cols[loc_numbers.argsort()]

    # fill_value=0 keeps the counts as integers instead of introducing NaN.
    net = (departures.reindex(columns=cols, fill_value=0)
           + arrivals.reindex(columns=cols, fill_value=0))

    # Blank only the leading zeros: a cell stays visible once its column has
    # held any non-zero value, even if the running total returns to zero.
    seen = net.ne(0).cumsum().gt(0)
    return net.astype(object).where(seen.to_numpy(), '').reset_index()


df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time': ['01/01/2008 17:56', '02/01/2008 16:25',
                            '02/01/2008 22:38', '03/01/2008 13:59',
                            '02/01/2008 23:44']})

# Parse the timestamps so that sorting is chronological, not lexicographic.
# NOTE(review): with the default dayfirst=False, '02/01/2008' is read as
# 1 February; pass dayfirst=True if the log is day-first - confirm with the
# data source.
df['Time'] = pd.to_datetime(df['Time'])
# A stable sort keeps the original log order for rows with equal timestamps.
df = df.sort_values('Time', kind='stable')

df_out = location_time_series(df)
print(df_out)
输出:
                  Time  Loc9  Loc16  Loc60  Loc66  Loc77  Loc83  Loc84
0  2008-01-01 17:56:00                                 1            -1
1  2008-02-01 16:25:00                   1             0            -1
2  2008-02-01 22:38:00     1             1             0     -1     -1
3  2008-02-01 23:44:00     2             1     -1      0     -1     -1
4  2008-03-01 13:59:00     2      1      1     -1      0     -1     -2
我想将原始数据:[to][from][time] 转换为时间序列,例如:[time][Loc1][Loc2][Loc3]。
我写的代码似乎效率低下,而且可能是错误的。如有任何帮助或提示,我将不胜感激!
import numpy as np
import pandas as pd

df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time': ['01/01/2008 17:56', '02/01/2008 16:25', '02/01/2008 22:38',
                            '03/01/2008 13:59', '02/01/2008 23:44']})

# List of all locations that appear as either source or destination.
column_values = df[["From", "To"]].values
uniq_wards = np.unique(column_values)

# Empty result frame: a "Time" column followed by one counter column per location.
new_df = pd.DataFrame(columns=uniq_wards)
new_df.insert(0, "Time", 0)


def transform(row):
    """Append one row to the global ``new_df`` describing the state after a move.

    The new row is a copy of the last row of ``new_df`` (or all zeros when
    ``new_df`` has no rows yet) with the counter of ``row['From']`` decreased
    by one, the counter of ``row['To']`` increased by one, and ``Time`` set to
    ``row['Time']``.

    Args:
        row: A mapping/Series with the keys ``'From'``, ``'To'`` and ``'Time'``.
    """
    if new_df.empty:
        # First log entry: start every counter at zero.
        entry = pd.Series(0, index=new_df.columns, dtype=object)
    else:
        # Copy so that the previous row is not modified through a view.
        entry = new_df.iloc[-1].copy()

    entry[row['From']] -= 1
    entry[row['To']] += 1
    entry['Time'] = row['Time']

    # Label-based enlargement replaces DataFrame.append (removed in pandas 2.0)
    # and the chained assignments, which may silently write to a temporary.
    new_df.loc[len(new_df)] = entry


# transform() is called only for its side effect on new_df, so iterate
# explicitly instead of using DataFrame.apply.
for _, log_row in df.iterrows():
    transform(log_row)
下面是一种只使用 DataFrame 和 pandas 方法、无需循环的做法。
import pandas as pd


def location_time_series(moves):
    """Return the running net count per location after each logged move.

    Args:
        moves: DataFrame with the columns ``'From'``, ``'To'`` and ``'Time'``;
            each row is one move. Location names must contain a number
            (e.g. ``'Loc9'``), which is used to order the output columns.

    Returns:
        DataFrame with a ``'Time'`` column followed by one column per
        location, ordered by the number in the location name. Each cell is
        the cumulative arrivals minus departures up to that row; cells before
        a location's first non-zero value hold ``''``.
    """
    by_time = moves.set_index('Time')

    # One indicator column per location; cumsum turns the per-row events
    # into running totals (departures count negatively).
    departures = -by_time['From'].str.get_dummies().cumsum()
    arrivals = by_time['To'].str.get_dummies().cumsum()

    # Union of the locations from both frames, sorted by the number in the name.
    cols = departures.columns.union(arrivals.columns)
    loc_numbers = cols.str.extract(r'(\d+)', expand=False).astype(int)
    cols = cols[loc_numbers.argsort()]

    # fill_value=0 keeps the counts as integers instead of introducing NaN.
    net = (departures.reindex(columns=cols, fill_value=0)
           + arrivals.reindex(columns=cols, fill_value=0))

    # Blank only the leading zeros: a cell stays visible once its column has
    # held any non-zero value, even if the running total returns to zero.
    seen = net.ne(0).cumsum().gt(0)
    return net.astype(object).where(seen.to_numpy(), '').reset_index()


df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time': ['01/01/2008 17:56', '02/01/2008 16:25',
                            '02/01/2008 22:38', '03/01/2008 13:59',
                            '02/01/2008 23:44']})

df_out = location_time_series(df)
print(df_out)
输出:
               Time  Loc9  Loc16  Loc60  Loc66  Loc77  Loc83  Loc84
0  01/01/2008 17:56                                 1            -1
1  02/01/2008 16:25                   1             0            -1
2  02/01/2008 22:38     1             1             0     -1     -1
3  03/01/2008 13:59     1      1      1             0     -1     -2
4  02/01/2008 23:44     2      1      1     -1      0     -1     -2
更新:修正时间戳的处理(先将 Time 转换为 datetime,再按时间排序):
import pandas as pd


def location_time_series(moves):
    """Return the running net count per location after each logged move.

    Args:
        moves: DataFrame with the columns ``'From'``, ``'To'`` and ``'Time'``;
            each row is one move, processed in the order given. Location
            names must contain a number (e.g. ``'Loc9'``), which is used to
            order the output columns.

    Returns:
        DataFrame with a ``'Time'`` column followed by one column per
        location, ordered by the number in the location name. Each cell is
        the cumulative arrivals minus departures up to that row; cells before
        a location's first non-zero value hold ``''``.
    """
    by_time = moves.set_index('Time')

    # One indicator column per location; cumsum turns the per-row events
    # into running totals (departures count negatively).
    departures = -by_time['From'].str.get_dummies().cumsum()
    arrivals = by_time['To'].str.get_dummies().cumsum()

    # Union of the locations from both frames, sorted by the number in the name.
    cols = departures.columns.union(arrivals.columns)
    loc_numbers = cols.str.extract(r'(\d+)', expand=False).astype(int)
    cols = cols[loc_numbers.argsort()]

    # fill_value=0 keeps the counts as integers instead of introducing NaN.
    net = (departures.reindex(columns=cols, fill_value=0)
           + arrivals.reindex(columns=cols, fill_value=0))

    # Blank only the leading zeros: a cell stays visible once its column has
    # held any non-zero value, even if the running total returns to zero.
    seen = net.ne(0).cumsum().gt(0)
    return net.astype(object).where(seen.to_numpy(), '').reset_index()


df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time': ['01/01/2008 17:56', '02/01/2008 16:25',
                            '02/01/2008 22:38', '03/01/2008 13:59',
                            '02/01/2008 23:44']})

# Parse the timestamps so that sorting is chronological, not lexicographic.
# NOTE(review): with the default dayfirst=False, '02/01/2008' is read as
# 1 February; pass dayfirst=True if the log is day-first - confirm with the
# data source.
df['Time'] = pd.to_datetime(df['Time'])
# A stable sort keeps the original log order for rows with equal timestamps.
df = df.sort_values('Time', kind='stable')

df_out = location_time_series(df)
print(df_out)
输出:
                  Time  Loc9  Loc16  Loc60  Loc66  Loc77  Loc83  Loc84
0  2008-01-01 17:56:00                                 1            -1
1  2008-02-01 16:25:00                   1             0            -1
2  2008-02-01 22:38:00     1             1             0     -1     -1
3  2008-02-01 23:44:00     2             1     -1      0     -1     -1
4  2008-03-01 13:59:00     2      1      1     -1      0     -1     -2