在 LSTM 自动编码器中需要帮助 - 异常检测
Need help in LSTM Autoencoder - Anomaly detection
我正在尝试使用 LSTM 进行异常检测。
我能够绘制具有局部和全局异常的所有特征,但我无法同时打印所有异常值、日期时间、损失、阈值和日期(如 table)。
训练和测试的 MAE 按以下方式计算:
Y_train_pred = self.model.predict(self.X_train)
train_mae_loss = np.mean(np.abs(self.Y_train_pred - self.Y_train), axis=1)
Y_test_pred = self.model.predict(self.X_test)
test_mae_loss = np.mean(np.abs(self.Y_test_pred - self.Y_test), axis=1)
test = self.test[:len(Y_test_pred)]
我尝试通过这种方式匹配日期、损失、阈值和异常来制作table:
test_score_df = pd.DataFrame(index=self.test.index)
print(test_score_df)
test_score_df['loss'] = loss_mean_vec
test_score_df['threshold'] = threshold_mean_exp
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['dckw'] = self.test[:].dckw
anomalies = test_score_df[test_score_df.anomaly == True]
print(anomalies.head())
但它抛出错误:
AttributeError: 'DataFrame' object has no attribute 'dckw'
当我打印 self.test 时,它包含所有特征,表头为 datetimeAt、dckw 等。
当我删除这一行 test_score_df['dckw'] = self.test[:].dckw 时,
它给了我这个结果:
loss threshold anomaly
0 0.106414 0.037134 True
1 0.107169 0.037134 True
2 0.107001 0.037134 True
3 0.105836 0.037134 True
4 0.103779 0.037134 True
那么我怎样才能得到最后一个 table 以及 csv 文件中的日期时间和其他特征,以便我可以绘制日期时间并查看异常出现的时间?
我的代码和文件很重,所以我将它们上传到 GitHub:
https://github.com/Laudarisd/test.git
print(self.test) 给我这个输出:
dckw ackw dayTotal wind_spd temp pres
datetimeAt
2021-12-08 19:00:00 1.880145e-39 0.000 70.0 0.5 3.5 1027.6
2021-12-08 19:00:00 1.875275e-39 0.000 70.8 0.5 3.5 1027.6
2021-12-08 19:00:00 1.879741e-39 0.000 68.9 0.5 3.5 1027.6
2021-12-08 19:00:00 1.881514e-39 0.000 69.8 0.5 3.5 1027.6
2021-12-08 20:00:00 1.881775e-39 0.000 69.9 1.0 3.1 1027.6
代码如下所示:
197 lines (166 sloc) 7.99 KB
from os import path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed, Activation
import matplotlib.pyplot as plt
import seaborn as sns
TIME_STEPS = 30
ALPHA = 0.9
DATA_POINT_TO_PREDICT = 3
def Data(path='./combined.csv'):
    """Load the combined sensor CSV as a DataFrame indexed by timestamp.

    Parameters
    ----------
    path : str
        CSV file to read; defaults to the original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        NaNs replaced by 0, the unused 'invno' and 'ts' columns dropped,
        and 'datetimeAt' set as the index.
    """
    dataset = pd.read_csv(path)
    # NOTE(review): filling NaNs with 0 (rather than interpolating) may
    # distort the MAE statistics downstream — confirm this is intended.
    dataset = dataset.fillna(0)
    dataset = dataset.drop(columns=['invno', 'ts'])
    dataset = dataset.set_index('datetimeAt')
    return dataset
#print(Data())
class AutoEncoder:
def __init__(self):
self.data = Data()
print(self.data.shape)
def create_dataset(self, X, y, time_steps=1):
Xs, ys = [], []
for i in range(len(X) - time_steps):
v = X.iloc[i:(i + time_steps)].values
Xs.append(v)
u = y.iloc[i:(i + time_steps)].values
ys.append(u)
return np.array(Xs), np.array(ys)
def split_train_test(self, test_size=0.2):
df = self.data
train_size = int(len(df) * (1 - test_size))
self.train, self.test = df.iloc[0:train_size], df.iloc[train_size:len(df)]
#print(self.test)
#index=self.test
#print(index)
def split_X_Y(self, data_point_to_predict=0):
self.X_train, self.Y_train = self.create_dataset(self.train, self.train, TIME_STEPS)
self.X_test, self.Y_test = self.create_dataset(self.test, self.test, TIME_STEPS)
if (data_point_to_predict > 0):
#print(self.X_train)
self.X_train = self.X_train[slice(None, self.X_train.shape[0] - data_point_to_predict)]
#print(self.X_train)
self.X_test = self.X_test[slice(None, self.X_test.shape[0] - data_point_to_predict)]
#print(self.Y_train)
self.Y_train = self.Y_train[slice(data_point_to_predict, None)]
#print(self.Y_train)
self.Y_test = self.Y_test[slice(data_point_to_predict,
def normalize(self):
scaler = MinMaxScaler().fit(self.train)
self.train = pd.DataFrame(scaler.transform(self.train))
self.test = pd.DataFrame(scaler.transform(self.test))
错误是因为这一步
def normalize(self):
scaler = MinMaxScaler().fit(self.train)
self.train = pd.DataFrame(scaler.transform(self.train))
self.test = pd.DataFrame(scaler.transform(self.test))
从 self.train 和 self.test 中删除索引和列名。要解决此问题,您需要按如下方式更新代码:
self.train = pd.DataFrame(
data=scaler.transform(self.train),
columns=self.train.columns,
index=self.train.index
)
self.test = pd.DataFrame(
data=scaler.transform(self.test),
columns=self.test.columns,
index=self.test.index
)
之后你还需要更新anomalies
数据框的定义如下:
test_score_df = pd.DataFrame(index=self.test.index)
test_score_df['loss'] = np.append(np.zeros(DATA_POINT_TO_PREDICT + TIME_STEPS), loss_mean_vec.values)
test_score_df['threshold'] = threshold_mean_exp
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['dckw'] = self.test[:].dckw
test_score_df.index = pd.DatetimeIndex(test_score_df.index)
anomalies = test_score_df[test_score_df.anomaly == True]
print(anomalies.head())
loss threshold anomaly dckw
# datetimeAt
# 2021-12-09 01:00:00 0.111500 0.037009 True 0.0
# 2021-12-09 02:00:00 0.113632 0.037009 True 0.0
# 2021-12-09 02:00:00 0.115057 0.037009 True 0.0
# 2021-12-09 02:00:00 0.115312 0.037009 True 0.0
# 2021-12-09 02:00:00 0.114501 0.037009 True 0.0
假设您没有损失测试集中前 DATA_POINT_TO_PREDICT + TIME_STEPS
个数据点。完成后,您可以绘制结果:
plt.plot(test_score_df.index, test_score_df['dckw'].values, color='black')
plt.scatter(anomalies.index, anomalies['dckw'].values, color='red')
plt.show()
我正在尝试使用 LSTM 进行异常检测。 我能够绘制具有局部和全局异常的所有特征,但我无法同时打印所有异常值、日期时间、损失、阈值和日期(如 table)。
训练和测试的 MAE 按以下方式计算:
Y_train_pred = self.model.predict(self.X_train)
train_mae_loss = np.mean(np.abs(self.Y_train_pred - self.Y_train), axis=1)
Y_test_pred = self.model.predict(self.X_test)
test_mae_loss = np.mean(np.abs(self.Y_test_pred - self.Y_test), axis=1)
test = self.test[:len(Y_test_pred)]
我尝试通过这种方式匹配日期、损失、阈值和异常来制作table:
test_score_df = pd.DataFrame(index=self.test.index)
print(test_score_df)
test_score_df['loss'] = loss_mean_vec
test_score_df['threshold'] = threshold_mean_exp
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['dckw'] = self.test[:].dckw
anomalies = test_score_df[test_score_df.anomaly == True]
print(anomalies.head())
但它抛出错误:
AttributeError: 'DataFrame' object has no attribute 'dckw'
当我打印 self.test 时,它包含所有特征,表头为 datetimeAt、dckw 等。
当我删除这一行 test_score_df['dckw'] = self.test[:].dckw 时,
它给了我这个结果:
loss threshold anomaly
0 0.106414 0.037134 True
1 0.107169 0.037134 True
2 0.107001 0.037134 True
3 0.105836 0.037134 True
4 0.103779 0.037134 True
那么我怎样才能得到最后一个 table 以及 csv 文件中的日期时间和其他特征,以便我可以绘制日期时间并查看异常出现的时间?
我的代码和文件很重,所以我将它们上传到 GitHub: https://github.com/Laudarisd/test.git
print(self.test) 给我这个输出:
dckw ackw dayTotal wind_spd temp pres
datetimeAt
2021-12-08 19:00:00 1.880145e-39 0.000 70.0 0.5 3.5 1027.6
2021-12-08 19:00:00 1.875275e-39 0.000 70.8 0.5 3.5 1027.6
2021-12-08 19:00:00 1.879741e-39 0.000 68.9 0.5 3.5 1027.6
2021-12-08 19:00:00 1.881514e-39 0.000 69.8 0.5 3.5 1027.6
2021-12-08 20:00:00 1.881775e-39 0.000 69.9 1.0 3.1 1027.6
代码如下所示:
197 lines (166 sloc) 7.99 KB
from os import path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, RepeatVector, TimeDistributed, Activation
import matplotlib.pyplot as plt
import seaborn as sns
TIME_STEPS = 30
ALPHA = 0.9
DATA_POINT_TO_PREDICT = 3
def Data(path='./combined.csv'):
    """Load the combined sensor CSV as a DataFrame indexed by timestamp.

    Parameters
    ----------
    path : str
        CSV file to read; defaults to the original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        NaNs replaced by 0, the unused 'invno' and 'ts' columns dropped,
        and 'datetimeAt' set as the index.
    """
    dataset = pd.read_csv(path)
    # NOTE(review): filling NaNs with 0 (rather than interpolating) may
    # distort the MAE statistics downstream — confirm this is intended.
    dataset = dataset.fillna(0)
    dataset = dataset.drop(columns=['invno', 'ts'])
    dataset = dataset.set_index('datetimeAt')
    return dataset
#print(Data())
class AutoEncoder:
def __init__(self):
self.data = Data()
print(self.data.shape)
def create_dataset(self, X, y, time_steps=1):
Xs, ys = [], []
for i in range(len(X) - time_steps):
v = X.iloc[i:(i + time_steps)].values
Xs.append(v)
u = y.iloc[i:(i + time_steps)].values
ys.append(u)
return np.array(Xs), np.array(ys)
def split_train_test(self, test_size=0.2):
df = self.data
train_size = int(len(df) * (1 - test_size))
self.train, self.test = df.iloc[0:train_size], df.iloc[train_size:len(df)]
#print(self.test)
#index=self.test
#print(index)
def split_X_Y(self, data_point_to_predict=0):
self.X_train, self.Y_train = self.create_dataset(self.train, self.train, TIME_STEPS)
self.X_test, self.Y_test = self.create_dataset(self.test, self.test, TIME_STEPS)
if (data_point_to_predict > 0):
#print(self.X_train)
self.X_train = self.X_train[slice(None, self.X_train.shape[0] - data_point_to_predict)]
#print(self.X_train)
self.X_test = self.X_test[slice(None, self.X_test.shape[0] - data_point_to_predict)]
#print(self.Y_train)
self.Y_train = self.Y_train[slice(data_point_to_predict, None)]
#print(self.Y_train)
self.Y_test = self.Y_test[slice(data_point_to_predict,
def normalize(self):
scaler = MinMaxScaler().fit(self.train)
self.train = pd.DataFrame(scaler.transform(self.train))
self.test = pd.DataFrame(scaler.transform(self.test))
错误是因为这一步
def normalize(self):
scaler = MinMaxScaler().fit(self.train)
self.train = pd.DataFrame(scaler.transform(self.train))
self.test = pd.DataFrame(scaler.transform(self.test))
从 self.train 和 self.test 中删除索引和列名。要解决此问题,您需要按如下方式更新代码:
self.train = pd.DataFrame(
data=scaler.transform(self.train),
columns=self.train.columns,
index=self.train.index
)
self.test = pd.DataFrame(
data=scaler.transform(self.test),
columns=self.test.columns,
index=self.test.index
)
之后你还需要更新anomalies
数据框的定义如下:
test_score_df = pd.DataFrame(index=self.test.index)
test_score_df['loss'] = np.append(np.zeros(DATA_POINT_TO_PREDICT + TIME_STEPS), loss_mean_vec.values)
test_score_df['threshold'] = threshold_mean_exp
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold
test_score_df['dckw'] = self.test[:].dckw
test_score_df.index = pd.DatetimeIndex(test_score_df.index)
anomalies = test_score_df[test_score_df.anomaly == True]
print(anomalies.head())
loss threshold anomaly dckw
# datetimeAt
# 2021-12-09 01:00:00 0.111500 0.037009 True 0.0
# 2021-12-09 02:00:00 0.113632 0.037009 True 0.0
# 2021-12-09 02:00:00 0.115057 0.037009 True 0.0
# 2021-12-09 02:00:00 0.115312 0.037009 True 0.0
# 2021-12-09 02:00:00 0.114501 0.037009 True 0.0
假设您没有损失测试集中前 DATA_POINT_TO_PREDICT + TIME_STEPS
个数据点。完成后,您可以绘制结果:
plt.plot(test_score_df.index, test_score_df['dckw'].values, color='black')
plt.scatter(anomalies.index, anomalies['dckw'].values, color='red')
plt.show()