(Python) 将日志数据处理成时间序列

(Python) Processing log data into time series

我想将原始日志数据(每行为 [to][from][time])转换为时间序列,即每行为 [time][Loc1][Loc2][Loc3]……,每个位置一列。

我写的代码似乎效率低下,而且可能有错误。如能提供任何帮助或提示,我将不胜感激!

# Sample movement log: each row records one transfer From -> To at Time.
df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time': ['01/01/2008 17:56', '02/01/2008 16:25',
                            '02/01/2008 22:38', '03/01/2008 13:59',
                            '02/01/2008 23:44']})

# Sorted array of every location that appears as a source or a destination.
column_values = df[["From", "To"]].values
uniq_wards = np.unique(column_values)

# Empty result frame: a leading "Time" column followed by one column per
# location.
new_df = pd.DataFrame(columns=uniq_wards)
new_df.insert(0, "Time", 0)


def transform(row):
    """Append the running per-location totals after one movement to ``new_df``.

    The new row starts from the previous row of the module-level ``new_df``
    (or from all zeros when ``new_df`` is still empty), subtracts 1 from the
    ``row['From']`` column, adds 1 to the ``row['To']`` column and stores
    ``row['Time']`` in the ``Time`` column.

    Args:
        row: A mapping/Series with the keys ``'From'``, ``'To'`` and
            ``'Time'``; both locations must be columns of ``new_df``.

    Returns:
        None. ``new_df`` is extended in place by exactly one row whose index
        label is its position (0, 1, 2, ...).
    """
    if new_df.empty:
        counts = pd.Series(0, index=uniq_wards)
    else:
        # ``drop`` returns a new Series, so the previous row is not modified.
        counts = new_df.iloc[-1].drop("Time")

    counts[row['From']] -= 1
    counts[row['To']] += 1

    # Enlarging with ``.loc`` writes into ``new_df`` itself. The earlier
    # chained indexing (``new_df[col][0] = ...``) could write to a temporary
    # copy, and ``DataFrame.append`` no longer exists in pandas >= 2.0.
    new_df.loc[len(new_df)] = [row['Time'], *counts]


# ``transform`` is called only for its side effect, so iterate explicitly
# instead of going through ``DataFrame.apply``.
for _, log_row in df.iterrows():
    transform(log_row)

下面是一种只使用 DataFrame 和 pandas 内置方法、无需显式循环的做法。

import pandas as pd

# Sample movement log: each row records one transfer From -> To at Time.
df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time':['01/01/2008 17:56','02/01/2008 16:25',
                           '02/01/2008 22:38','03/01/2008 13:59',
                           '02/01/2008 23:44']})

# Use get_dummies and cumsum to calculate the running volume leaving (-1 per
# row) and entering (+1 per row) each location, in the row order of df.
df_from = df.set_index('Time')['From'].str.get_dummies().replace(1, -1).cumsum()
df_to = df.set_index('Time')['To'].str.get_dummies().cumsum()

# Union the locations from both dataframes and order them by the number
# embedded in the name (Loc9 before Loc16) instead of alphabetically.
# The pattern is a raw string so that "\d" is not an invalid string escape.
cols = df_from.columns.union(df_to.columns)
idx = cols.str.extract(r'(\d+)', expand=False).astype(int).argsort()
cols = cols[idx]

# Give both dataframes the same columns and add them. fill_value=0 keeps the
# counts as integers instead of introducing NaN (and therefore floats) for
# locations that only appear on one side.
df_out = df_from.reindex(columns=cols, fill_value=0).add(
    df_to.reindex(columns=cols, fill_value=0))

# Blank out leading zeroes only: a cell is masked while its column has not
# yet held a non-zero value. (Testing df_out.cumsum() == 0 would also hide
# genuine values whenever the running sum of the counts returns to zero.)
df_out = df_out.mask(df_out.ne(0).cumsum().eq(0), '').reset_index()
print(df_out)

输出:

               Time Loc9 Loc16 Loc60 Loc66  Loc77 Loc83  Loc84
0  01/01/2008 17:56                             1           -1
1  02/01/2008 16:25                1            0           -1
2  02/01/2008 22:38    1           1            0    -1     -1
3  03/01/2008 13:59    1     1     1            0    -1     -2
4  02/01/2008 23:44    2     1     1    -1      0    -1     -2

更新:修正时间戳的处理——先将 Time 列转换为 datetime 类型并按时间排序,再进行累计:

import pandas as pd

# Sample movement log: each row records one transfer From -> To at Time.
df = pd.DataFrame({'From': ['Loc84', 'Loc77', 'Loc83', 'Loc84', 'Loc66'],
                   'To': ['Loc77', 'Loc60', 'Loc9', 'Loc16', 'Loc9'],
                   'Time':['01/01/2008 17:56','02/01/2008 16:25',
                           '02/01/2008 22:38','03/01/2008 13:59',
                           '02/01/2008 23:44']})


# Parse the timestamps with an explicit month-first format so the result does
# not depend on pandas' format inference, then put the log in chronological
# order so the running totals accumulate in time order.
# NOTE(review): if the log is actually day-first, use '%d/%m/%Y %H:%M' - confirm.
df['Time'] = pd.to_datetime(df['Time'], format='%m/%d/%Y %H:%M')
# A stable sort keeps the original log order for rows with equal timestamps.
df = df.sort_values('Time', kind='mergesort')

# Running volume leaving (-1 per row) and entering (+1 per row) each location.
df_from = df.set_index('Time')['From'].str.get_dummies().replace(1, -1).cumsum()
df_to = df.set_index('Time')['To'].str.get_dummies().cumsum()

# Union the locations from both dataframes and order them by the number
# embedded in the name (Loc9 before Loc16) instead of alphabetically.
# The pattern is a raw string so that "\d" is not an invalid string escape.
cols = df_from.columns.union(df_to.columns)
idx = cols.str.extract(r'(\d+)', expand=False).astype(int).argsort()
cols = cols[idx]

# Give both dataframes the same columns and add them. fill_value=0 keeps the
# counts as integers instead of introducing NaN (and therefore floats) for
# locations that only appear on one side.
df_out = df_from.reindex(columns=cols, fill_value=0).add(
    df_to.reindex(columns=cols, fill_value=0))

# Blank out leading zeroes only: a cell is masked while its column has not
# yet held a non-zero value. (Testing df_out.cumsum() == 0 would also hide
# genuine values whenever the running sum of the counts returns to zero.)
df_out = df_out.mask(df_out.ne(0).cumsum().eq(0), '').reset_index()
print(df_out)

输出:

                 Time Loc9 Loc16 Loc60 Loc66  Loc77 Loc83  Loc84
0 2008-01-01 17:56:00                             1           -1
1 2008-02-01 16:25:00                1            0           -1
2 2008-02-01 22:38:00    1           1            0    -1     -1
3 2008-02-01 23:44:00    2           1    -1      0    -1     -1
4 2008-03-01 13:59:00    2     1     1    -1      0    -1     -2