使用 Pycaret 和 plotly 的奇怪时间序列图

Weird Time-Series Graph Using Pycaret and plotly

我正在尝试使用 pycaret 和 plotly dash python 库将空气质量数据可视化为时间序列图表,但我得到的图表非常奇怪,下面是我的代码:

import pandas as pd
import plotly.express as px
# Load the air-quality CSV and parse the Date column using an explicit
# day/month/year format.
data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
#data.set_index('Date', inplace=True)


# Build the per-location series key by prefixing each OBJECTID with "Location_".
data['OBJECTID'] = ['Location_' + str(i) for i in data['OBJECTID']]
#data['AQI_Bins_AI'] = ['Bin_' + str(i) for i in data['AQI_Bins_AI']]
# Only one column is selected here, so '_'.join just reproduces the prefixed
# OBJECTID string for each row.
data['time_series'] = data[['OBJECTID']].apply(lambda x: '_'.join(x), axis=1)
data.drop(['OBJECTID'], axis=1, inplace=True)
# Extract calendar features from the parsed date (not used by the plots below).
data['month'] = [i.month for i in data['Date']]
data['year'] = [i.year for i in data['Date']]
data['day_of_week'] = [i.dayofweek for i in data['Date']]
data['day_of_year'] = [i.dayofyear for i in data['Date']]
# Inspection expressions: their results are not stored anywhere.
data.head(4000)

data['time_series'].nunique()


# Draw one figure per location: raw CO plus its 30-row rolling mean.
# NOTE(review): rows are never sorted by Date before plotting, so the line is
# drawn in file order -- presumably the cause of the odd-looking graph; confirm
# against the data.
for i in data['time_series'].unique():
    subset = data[data['time_series'] == i]
    # NOTE(review): subset is a boolean-mask selection of data, so adding a
    # column to it can raise pandas' SettingWithCopyWarning -- confirm.
    subset['moving_average'] = subset['CO'].rolling(window = 30).mean()
    fig = px.line(subset, x="Date", y=["CO","moving_average"], title = i, template = 'plotly_dark')
    fig.show()

需要这方面的帮助。

这是我的样本数据Google Drive Link

  • 问题没有以可直接使用的方式提供数据,因此我寻找了公开可用的类似数据,找到了:https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
  • 使用这些数据,对您的代码进行几次清理,绘图没有问题。我怀疑您的数据存在以下问题之一
    1. 数据框中的日期列不是 datetime64[ns] 类型
    2. 日期未排序,导致以您注意到的方式绘制线条
  • 通过重构移动平均值的计算方式,您可以用一个动画代替大量单独的图表

获取一些数据

import kaggle.cli
import sys, math
import pandas as pd
from pathlib import Path
from zipfile import ZipFile
import plotly.express as px

# Download the "Air Quality Data in India" archive via the kaggle CLI entry point.
# Dataset page:
# https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=station_hour.csv
# sys.argv is overwritten before the call so that kaggle.cli.main() presumably
# sees the same arguments as a `kaggle datasets download ...` command line.
sys.argv = [
    sys.argv[0],
    *"datasets download rohanrao/air-quality-data-in-india".split(" "),
]
kaggle.cli.main()

# Open the downloaded archive and print the names of the files it contains.
zfile = ZipFile("air-quality-data-in-india.zip")
print(zfile.namelist())

使用问题中的代码绘图

import pandas as pd
import plotly.express as px
from pathlib import Path
from distutils.version import StrictVersion

# data = pd.read_csv('E:/Self Learning/Djang_Dash/2019-2020_5.csv')
# use kaggle data
# dfs = {f.filename:pd.read_csv(zfile.open(f)) for f in zfile.infolist() if f.filename in ['station_day.csv',"stations.csv"]}
# data = pd.merge(dfs['station_day.csv'],dfs["stations.csv"], on="StationId")

# data['Date'] = pd.to_datetime(data['Date'])
# # kaggle data is different from question, make it compatible with questions data
# data = data.assign(OBJECTID=lambda d: d["StationId"])

# Sample data from the Google Drive link in the question.
data2 = pd.read_csv(Path.home().joinpath("Downloads").joinpath("AQI.csv"))
# NOTE(review): the question parses Date with format='%d/%m/%Y'; if this file
# uses the same layout, pass that format here to avoid day/month swaps -- confirm.
data2["Date"] = pd.to_datetime(data2["Date"])

# Ordering matters: px.line connects points in row order, so sort by date first.
# Rows without a CO reading are dropped, and each row is tagged with its
# per-location series key. data2 itself is left untouched.
data = (
    data2.sort_values(["Date", "OBJECTID"])
    .dropna(subset=["CO"])
    .assign(time_series=lambda d: "Location_" + d["OBJECTID"].astype(str))
)

# 30-row moving average of CO, computed separately for every location.
# transform() returns a result aligned to data's own index, so the values land
# on the correct rows even though the frame is sorted by date rather than
# grouped by location. This replaces the former pandas-version switch, whose
# pre-1.3 branch copied `.values` positionally (only correct if rows are
# already grouped by location) and which depended on distutils' StrictVersion.
data["moving_average"] = data.groupby("time_series")["CO"].transform(
    lambda co: co.rolling(window=30).mean()
)

# Demonstration only: draw one figure for each of the first three locations.
for location in data["time_series"].unique()[:3]:
    location_rows = data.loc[data["time_series"] == location]
    fig = px.line(
        location_rows,
        x="Date",
        y=["CO", "moving_average"],
        title=location,
        template="plotly_dark",
    )
    fig.show()

可以使用动画

# Single animated figure: one animation frame per location instead of one
# figure per location. The y-axis is fixed to [min, 97th percentile] of CO over
# the whole frame -- presumably so a few extreme readings do not flatten the
# rest of the plot.
co_axis_range = [data["CO"].min(), data["CO"].quantile(.97)]
px.line(
    data,
    x="Date",
    y=["CO", "moving_average"],
    animation_frame="time_series",
    template="plotly_dark",
).update_layout(yaxis={"range": co_axis_range})