如何在 matplotlib/seaborn 中突出显示绘图的线段?
How to highlight line segments of a plot in matplotlib/seaborn?
我有多个时间序列和多个标签。每当某个时间段有可用的标签时,我想把对应的时间序列片段突出显示出来,例如用红色标出。
现有的图
我有一个折线图,可以在其中突出显示图中的某些部分,例如:
# Draw one time-series figure per cohort and shade the labelled intervals
# (gray for marker type 'b', orange for every other type).
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    figsize = (25, 9)
    fig, ax = plt.subplots(figsize=figsize)
    ax = sns.lineplot(x='hour', y='metrik_0', data=df[df.cohort_id == cohort_id], ax=ax)
    ax.xaxis.set_major_locator(aut_locator)
    ax.xaxis.set_major_formatter(aut_formatter)
    plt.title(f'cohort_id: {cohort_id}', fontsize=45)
    plt.xlabel('')
    plt.ylabel('metrik_0', fontsize=35)

    # Only shade intervals that belong to the cohort being plotted; without
    # this filter every cohort's figure would show every cohort's labels.
    cohort_markers = marker_labels[marker_labels.cohort_id == cohort_id]
    for marker in cohort_markers.itertuples(index=False):
        if marker.marker_type == 'b':
            ax.axvspan(marker.start, marker.end, color='gray', alpha=0.2)
        else:
            ax.axvspan(marker.start, marker.end, color='orange', alpha=0.5)
    plt.show()
此图可以转换为突出显示某些周期性的循环图,例如:
# One "hour of the day" figure per cohort: each calendar day ('dt') is drawn
# as its own line (units/style) and lines are coloured by device.
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    figsize = (25, 9)
    fig, ax = plt.subplots(figsize=figsize)
    in_cohort = (df.cohort_id == cohort_id)
    a1 = sns.lineplot(
        x=df['hour'].dt.hour,
        y='metrik_0',
        hue='device_id',
        units='dt',
        style='dt',
        estimator=None,  # no aggregation: draw the raw observations
        data=df[in_cohort],
        ax=ax,
    )
    handles, labels = a1.get_legend_handles_labels()
    # The first legend entry is dropped.
    # NOTE(review): presumably the title row that older seaborn versions
    # emit as the first handle -- verify against the seaborn version in use.
    a1.legend(
        handles=handles[1:],
        labels=labels[1:],
        loc='center',
        bbox_to_anchor=(0.5, -0.25),
        ncol=6,
        fontsize=20,
    )
    plt.title(f'cohort_id: {cohort_id}', fontsize=35)
    plt.xlabel('hour of the day', fontsize=35)
    plt.ylabel('metrik_0', fontsize=35)
    plt.show()
但是现在标签不能显示了。
问题
如何将标签重新添加到循环图中?任何方法都可以。但到目前为止,我认为最好用红色突出显示匹配的时间间隔
数据生成
生成一些示例数据:
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.dates as mdates
aut_locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
aut_formatter = mdates.ConciseDateFormatter(aut_locator)
import random
random_seed = 47
np.random.seed(random_seed)
random.seed(random_seed)
def generate_df_for_device(n_observations: int, n_metrics: int, device_id, geo_id, topology_id, cohort_id) -> pd.DataFrame:
    """Build an hourly frame of random metrics for a single device.

    Args:
        n_observations: Number of hourly rows; the index starts at '2020'.
        n_metrics: Number of random metric columns (`metrik_0`, `metrik_1`, ...).
        device_id: Value stored in the `device_id` column.
        geo_id: Value stored in the `geospatial_id` column.
        topology_id: Value stored in the `topology_id` column.
        cohort_id: Value stored in the `cohort_id` column.

    Returns:
        A DataFrame indexed by an hourly DatetimeIndex holding standard-normal
        metric columns followed by the four constant id columns.
    """
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in
    # pandas 2.2+.
    hours = pd.date_range('2020', freq='h', periods=n_observations)
    df = pd.DataFrame(np.random.randn(n_observations, n_metrics), index=hours)
    df.columns = [f'metrik_{c}' for c in df.columns]
    df['geospatial_id'] = geo_id
    df['topology_id'] = topology_id
    df['cohort_id'] = cohort_id
    df['device_id'] = device_id
    return df
def generate_multi_device(n_observations: int, n_metrics: int, n_devices: int, cohort_levels: int, topo_levels: int) -> pd.DataFrame:
    """Generate random hourly data for several devices and stack the frames.

    Each device gets ids 1..n_devices and randomly drawn geospatial, cohort
    and topology ids (stdlib `random`, upper bound exclusive).

    Args:
        n_observations: Rows generated per device.
        n_metrics: Metric columns generated per device.
        n_devices: Number of devices; also the exclusive upper bound of the
            random geospatial id.
        cohort_levels: Exclusive upper bound of the random cohort id.
        topo_levels: Exclusive upper bound of the random topology id.

    Returns:
        The per-device frames concatenated with `pd.concat`.

    Raises:
        ValueError: From `pd.concat` when `n_devices` is 0 (nothing to stack).
    """
    results = []
    for device_id in range(1, n_devices + 1):
        # `max(2, ...)` keeps the range non-empty when a bound is 1, so that
        # case yields id 1 instead of a ValueError; bounds >= 2 draw exactly
        # as before.
        geo_id = random.randrange(1, max(2, n_devices))
        cohort_id = random.randrange(1, max(2, cohort_levels))
        topology_id = random.randrange(1, max(2, topo_levels))
        df_single_device = generate_df_for_device(
            n_observations, n_metrics, device_id, geo_id, topology_id, cohort_id
        )
        results.append(df_single_device)
    return pd.concat(results)
# Sample data: one week of hourly observations for 20 devices.
n_observations = 7 * 24
n_metrics = 3
n_devices = 20
cohort_levels = 3
topo_levels = 5

# Stack all devices, order by timestamp, then move the timestamp out of the
# index into an 'hour' column.
df = (
    generate_multi_device(n_observations, n_metrics, n_devices, cohort_levels, topo_levels)
    .sort_index()
    .reset_index()
    .rename(columns={'index': 'hour'})
)
# Calendar date of each observation; the cyclic plots group lines by it.
df['dt'] = df['hour'].dt.date
以及标签:
# Label intervals per cohort. All timestamps use one format
# ('YYYY-MM-DD HH:MM') because pandas >= 2.0 infers a single format from the
# first value and can raise on mixed-format strings.
marker_labels = pd.DataFrame({
    'cohort_id': [1, 1, 1],
    'marker_type': ['a', 'b', 'a'],
    'start': ['2020-01-02 00:00', '2020-01-04 05:00', '2020-01-06 00:00'],
    'end': [np.nan, '2020-01-05 16:00', np.nan],
})
for column in ('start', 'end'):
    marker_labels[column] = pd.to_datetime(marker_labels[column])

# An open-ended label covers its start day: start + 1 day - 1 second.
default_end = marker_labels['start'] + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)
marker_labels['end'] = marker_labels['end'].fillna(default_end)
marker_labels
可以在此处找到包含示例数据和当前绘图代码的详细 Jupyter 笔记本:https://github.com/geoHeil/plotting_tricks
编辑
假设我们对时间段的标签进行左连接:
# Attach to each observation the label interval (if any) covering its timestamp.
# Step 1: pair every observation with every label of its cohort and keep only
# the pairs whose 'hour' lies inside [start, end]; 'index' is the row's
# original position in df.
labelled = (
    df.reset_index()
    .merge(marker_labels, on='cohort_id', how='left')
    .query('start <= hour <= end')
    .set_index('index')
)
# Step 2: go back to one row per original observation (unmatched rows are all
# NaN after the reindex) and fill those gaps from df itself.
# NOTE(review): reindex presumably fails if one observation matches several
# overlapping intervals (duplicate index labels) -- the sample labels do not
# overlap; confirm for real data.
merged_res = labelled.reindex(df.index).combine_first(df)
merged_res['marker_type'] = merged_res['marker_type'].fillna('no_labels_reported')
绘图代码为:
# Cyclic "hour of the day" figure per cohort, coloured by label type instead
# of by device; each calendar day ('dt') is still its own line.
for cohort_id in sorted(merged_res.cohort_id.unique()):
    print(cohort_id)
    figsize = (25, 9)
    fig, ax = plt.subplots(figsize=figsize)
    in_cohort = (merged_res.cohort_id == cohort_id)
    a1 = sns.lineplot(
        x=merged_res['hour'].dt.hour,
        y='metrik_0',
        hue='marker_type',
        units='dt',
        style='dt',
        estimator=None,  # no aggregation requested from seaborn
        data=merged_res[in_cohort],
        ax=ax,
    )
    handles, labels = a1.get_legend_handles_labels()
    # The first legend entry is dropped.
    # NOTE(review): presumably the title row that older seaborn versions
    # emit as the first handle -- verify against the seaborn version in use.
    a1.legend(
        handles=handles[1:],
        labels=labels[1:],
        loc='center',
        bbox_to_anchor=(0.5, -0.25),
        ncol=6,
        fontsize=20,
    )
    plt.title(f'cohort_id: {cohort_id}', fontsize=35)
    plt.xlabel('hour of the day', fontsize=35)
    plt.ylabel('metrik_0', fontsize=35)
    plt.show()
结果:
但是:
- 还是比较乱
- 各设备的时间序列在可视化中被聚合/取平均了(aggregated/averaged)
到目前为止最好的东西似乎是 hvplot:
# Interactive hvplot variant: one plot per cohort with hour-of-day on the
# x axis, split by label type.
# NOTE(review): `width`, `height` and the hvplot accessor (normally enabled
# via `import hvplot.pandas`) are not defined in the visible code -- confirm
# they are set up earlier in the notebook.
merged_res['hour_time'] = merged_res['hour'].dt.hour
merged_res.device_id = merged_res.device_id.astype(str)
plot_columns = ['metrik_0', 'marker_type', 'device_id', 'dt']
for cohort_id in sorted(merged_res.cohort_id.unique()):
    print(cohort_id)
    cohort_frame = merged_res[merged_res.cohort_id == cohort_id].set_index(['hour_time'])[plot_columns]
    current_plot = cohort_frame.hvplot(
        by=['marker_type'],
        hover_cols=['dt', 'device_id'],
        width=width,
        height=height,
    ).opts(active_tools=['box_zoom'])
    display(current_plot)
导致:
因为我仍然不完全满意 - 我会保持开放(未答复)以查看是否有人提出更好的解决方案。
特别是,我不喜欢它以线条的形式显示——用点可能会更好。也就是说,当某一段从无标签变为有标签时,时间序列并不是连续绘制(即仅仅改变颜色),而是发生了跳变(即生成了一条新的独立线条)。因此,使用点也只是一种变通办法(但可能比出现跳变的线条要好)。
我有多个时间序列和多个标签。每当某个时间段有可用的标签时,我想把对应的时间序列片段突出显示出来,例如用红色标出。
现有的图
我有一个折线图,可以在其中突出显示图中的某些部分,例如:
# Draw one time-series figure per cohort and shade the labelled intervals
# (gray for marker type 'b', orange for every other type).
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    figsize = (25, 9)
    fig, ax = plt.subplots(figsize=figsize)
    ax = sns.lineplot(x='hour', y='metrik_0', data=df[df.cohort_id == cohort_id], ax=ax)
    ax.xaxis.set_major_locator(aut_locator)
    ax.xaxis.set_major_formatter(aut_formatter)
    plt.title(f'cohort_id: {cohort_id}', fontsize=45)
    plt.xlabel('')
    plt.ylabel('metrik_0', fontsize=35)

    # Only shade intervals that belong to the cohort being plotted; without
    # this filter every cohort's figure would show every cohort's labels.
    cohort_markers = marker_labels[marker_labels.cohort_id == cohort_id]
    for marker in cohort_markers.itertuples(index=False):
        if marker.marker_type == 'b':
            ax.axvspan(marker.start, marker.end, color='gray', alpha=0.2)
        else:
            ax.axvspan(marker.start, marker.end, color='orange', alpha=0.5)
    plt.show()
此图可以转换为突出显示某些周期性的循环图,例如:
# One "hour of the day" figure per cohort: each calendar day ('dt') is drawn
# as its own line (units/style) and lines are coloured by device.
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    figsize = (25, 9)
    fig, ax = plt.subplots(figsize=figsize)
    in_cohort = (df.cohort_id == cohort_id)
    a1 = sns.lineplot(
        x=df['hour'].dt.hour,
        y='metrik_0',
        hue='device_id',
        units='dt',
        style='dt',
        estimator=None,  # no aggregation: draw the raw observations
        data=df[in_cohort],
        ax=ax,
    )
    handles, labels = a1.get_legend_handles_labels()
    # The first legend entry is dropped.
    # NOTE(review): presumably the title row that older seaborn versions
    # emit as the first handle -- verify against the seaborn version in use.
    a1.legend(
        handles=handles[1:],
        labels=labels[1:],
        loc='center',
        bbox_to_anchor=(0.5, -0.25),
        ncol=6,
        fontsize=20,
    )
    plt.title(f'cohort_id: {cohort_id}', fontsize=35)
    plt.xlabel('hour of the day', fontsize=35)
    plt.ylabel('metrik_0', fontsize=35)
    plt.show()
但是现在标签不能显示了。
问题
如何将标签重新添加到循环图中?任何方法都可以。但到目前为止,我认为最好用红色突出显示匹配的时间间隔
数据生成
生成一些示例数据:
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.dates as mdates
aut_locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
aut_formatter = mdates.ConciseDateFormatter(aut_locator)
import random
random_seed = 47
np.random.seed(random_seed)
random.seed(random_seed)
def generate_df_for_device(n_observations: int, n_metrics: int, device_id, geo_id, topology_id, cohort_id) -> pd.DataFrame:
    """Build an hourly frame of random metrics for a single device.

    Args:
        n_observations: Number of hourly rows; the index starts at '2020'.
        n_metrics: Number of random metric columns (`metrik_0`, `metrik_1`, ...).
        device_id: Value stored in the `device_id` column.
        geo_id: Value stored in the `geospatial_id` column.
        topology_id: Value stored in the `topology_id` column.
        cohort_id: Value stored in the `cohort_id` column.

    Returns:
        A DataFrame indexed by an hourly DatetimeIndex holding standard-normal
        metric columns followed by the four constant id columns.
    """
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in
    # pandas 2.2+.
    hours = pd.date_range('2020', freq='h', periods=n_observations)
    df = pd.DataFrame(np.random.randn(n_observations, n_metrics), index=hours)
    df.columns = [f'metrik_{c}' for c in df.columns]
    df['geospatial_id'] = geo_id
    df['topology_id'] = topology_id
    df['cohort_id'] = cohort_id
    df['device_id'] = device_id
    return df
def generate_multi_device(n_observations: int, n_metrics: int, n_devices: int, cohort_levels: int, topo_levels: int) -> pd.DataFrame:
    """Generate random hourly data for several devices and stack the frames.

    Each device gets ids 1..n_devices and randomly drawn geospatial, cohort
    and topology ids (stdlib `random`, upper bound exclusive).

    Args:
        n_observations: Rows generated per device.
        n_metrics: Metric columns generated per device.
        n_devices: Number of devices; also the exclusive upper bound of the
            random geospatial id.
        cohort_levels: Exclusive upper bound of the random cohort id.
        topo_levels: Exclusive upper bound of the random topology id.

    Returns:
        The per-device frames concatenated with `pd.concat`.

    Raises:
        ValueError: From `pd.concat` when `n_devices` is 0 (nothing to stack).
    """
    results = []
    for device_id in range(1, n_devices + 1):
        # `max(2, ...)` keeps the range non-empty when a bound is 1, so that
        # case yields id 1 instead of a ValueError; bounds >= 2 draw exactly
        # as before.
        geo_id = random.randrange(1, max(2, n_devices))
        cohort_id = random.randrange(1, max(2, cohort_levels))
        topology_id = random.randrange(1, max(2, topo_levels))
        df_single_device = generate_df_for_device(
            n_observations, n_metrics, device_id, geo_id, topology_id, cohort_id
        )
        results.append(df_single_device)
    return pd.concat(results)
# Sample data: one week of hourly observations for 20 devices.
n_observations = 7 * 24
n_metrics = 3
n_devices = 20
cohort_levels = 3
topo_levels = 5

# Stack all devices, order by timestamp, then move the timestamp out of the
# index into an 'hour' column.
df = (
    generate_multi_device(n_observations, n_metrics, n_devices, cohort_levels, topo_levels)
    .sort_index()
    .reset_index()
    .rename(columns={'index': 'hour'})
)
# Calendar date of each observation; the cyclic plots group lines by it.
df['dt'] = df['hour'].dt.date
以及标签:
# Label intervals per cohort. All timestamps use one format
# ('YYYY-MM-DD HH:MM') because pandas >= 2.0 infers a single format from the
# first value and can raise on mixed-format strings.
marker_labels = pd.DataFrame({
    'cohort_id': [1, 1, 1],
    'marker_type': ['a', 'b', 'a'],
    'start': ['2020-01-02 00:00', '2020-01-04 05:00', '2020-01-06 00:00'],
    'end': [np.nan, '2020-01-05 16:00', np.nan],
})
for column in ('start', 'end'):
    marker_labels[column] = pd.to_datetime(marker_labels[column])

# An open-ended label covers its start day: start + 1 day - 1 second.
default_end = marker_labels['start'] + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)
marker_labels['end'] = marker_labels['end'].fillna(default_end)
marker_labels
可以在此处找到包含示例数据和当前绘图代码的详细 Jupyter 笔记本:https://github.com/geoHeil/plotting_tricks
编辑
假设我们对时间段的标签进行左连接:
# Attach to each observation the label interval (if any) covering its timestamp.
# Step 1: pair every observation with every label of its cohort and keep only
# the pairs whose 'hour' lies inside [start, end]; 'index' is the row's
# original position in df.
labelled = (
    df.reset_index()
    .merge(marker_labels, on='cohort_id', how='left')
    .query('start <= hour <= end')
    .set_index('index')
)
# Step 2: go back to one row per original observation (unmatched rows are all
# NaN after the reindex) and fill those gaps from df itself.
# NOTE(review): reindex presumably fails if one observation matches several
# overlapping intervals (duplicate index labels) -- the sample labels do not
# overlap; confirm for real data.
merged_res = labelled.reindex(df.index).combine_first(df)
merged_res['marker_type'] = merged_res['marker_type'].fillna('no_labels_reported')
绘图代码为:
# Cyclic "hour of the day" figure per cohort, coloured by label type instead
# of by device; each calendar day ('dt') is still its own line.
for cohort_id in sorted(merged_res.cohort_id.unique()):
    print(cohort_id)
    figsize = (25, 9)
    fig, ax = plt.subplots(figsize=figsize)
    in_cohort = (merged_res.cohort_id == cohort_id)
    a1 = sns.lineplot(
        x=merged_res['hour'].dt.hour,
        y='metrik_0',
        hue='marker_type',
        units='dt',
        style='dt',
        estimator=None,  # no aggregation requested from seaborn
        data=merged_res[in_cohort],
        ax=ax,
    )
    handles, labels = a1.get_legend_handles_labels()
    # The first legend entry is dropped.
    # NOTE(review): presumably the title row that older seaborn versions
    # emit as the first handle -- verify against the seaborn version in use.
    a1.legend(
        handles=handles[1:],
        labels=labels[1:],
        loc='center',
        bbox_to_anchor=(0.5, -0.25),
        ncol=6,
        fontsize=20,
    )
    plt.title(f'cohort_id: {cohort_id}', fontsize=35)
    plt.xlabel('hour of the day', fontsize=35)
    plt.ylabel('metrik_0', fontsize=35)
    plt.show()
结果:
但是:
- 还是比较乱
- 各设备的时间序列在可视化中被聚合/取平均了(aggregated/averaged)
到目前为止最好的东西似乎是 hvplot:
# Interactive hvplot variant: one plot per cohort with hour-of-day on the
# x axis, split by label type.
# NOTE(review): `width`, `height` and the hvplot accessor (normally enabled
# via `import hvplot.pandas`) are not defined in the visible code -- confirm
# they are set up earlier in the notebook.
merged_res['hour_time'] = merged_res['hour'].dt.hour
merged_res.device_id = merged_res.device_id.astype(str)
plot_columns = ['metrik_0', 'marker_type', 'device_id', 'dt']
for cohort_id in sorted(merged_res.cohort_id.unique()):
    print(cohort_id)
    cohort_frame = merged_res[merged_res.cohort_id == cohort_id].set_index(['hour_time'])[plot_columns]
    current_plot = cohort_frame.hvplot(
        by=['marker_type'],
        hover_cols=['dt', 'device_id'],
        width=width,
        height=height,
    ).opts(active_tools=['box_zoom'])
    display(current_plot)
导致:
因为我仍然不完全满意 - 我会保持开放(未答复)以查看是否有人提出更好的解决方案。
特别是,我不喜欢它以线条的形式显示——用点可能会更好。也就是说,当某一段从无标签变为有标签时,时间序列并不是连续绘制(即仅仅改变颜色),而是发生了跳变(即生成了一条新的独立线条)。因此,使用点也只是一种变通办法(但可能比出现跳变的线条要好)。