Plotly - 在悬停时突出显示数据点和最近的三个点

Plotly - Highlight data point and nearest three points on hover

我用 plotly 绘制了 word2vec 模型的散点图。
我想实现这样的功能:悬停在某个数据点上时,突出显示该数据点以及与它最接近的 3 个向量。如果有人能给我指导或提出其他可行的方案,将非常有帮助。

model
csv

代码:

import gensim
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px

def get_2d_coordinates(model, words):
    """Project the word vectors of *words* onto two dimensions with t-SNE.

    Args:
        model: Trained gensim Word2Vec model; each word is looked up in its
            ``wv`` keyed vectors.
        words: Iterable of words (keys) to look up.

    Returns:
        A ``(x_coords, y_coords)`` tuple of 1-D arrays with the t-SNE
        embedding. Words that are missing from the model's vocabulary are
        skipped, so the arrays can be shorter than ``words``.
    """
    keyed_vectors = model.wv
    vectors = []
    for word in words:
        try:
            vectors.append(keyed_vectors.get_vector(word))
        except KeyError:
            # Out-of-vocabulary word: skip it (best-effort lookup).
            continue
    # Use the model's real dimensionality instead of a hard-coded 100; the
    # reshape keeps a well-formed (0, dim) matrix when no word was found.
    arr = np.asarray(vectors, dtype='f').reshape(-1, keyed_vectors.vector_size)
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    return Y[:, 0], Y[:, 1]

# Load the trained Word2Vec model and the product table.
ic_model = gensim.models.Word2Vec.load("w2v_IceCream.model")
ic = pd.read_csv('ic_prods.csv')

# 2-D t-SNE coordinates computed from the item descriptions.
icx, icy = get_2d_coordinates(ic_model, ic['ITEM_DESC'])

# Combine category, description and 2-D position into one frame.
ic_data = dict(
    Category=ic['SUB_CATEGORY'],
    Words=ic['ITEM_DESC'],
    X=icx,
    Y=icy,
)
ic_df = pd.DataFrame(ic_data)
ic_df.head()

# Scatter plot coloured by category; the hover title is the item description.
ic_fig = px.scatter(
    ic_df,
    x=icx,
    y=icy,
    color=ic_df['Category'],
    hover_name=ic_df['Words'],
    title='IceCream Data',
)
ic_fig.show()

在 plotly-python 中,我认为没有简单的方法可以获取光标的位置。您可以尝试使用 go.FigureWidget,按照相关回答中描述的方法来突出显示某条轨迹(trace),但我认为这样做会受到 plotly-python 的限制,而且我不确定能否突出显示最接近的 n 个点。

但是,我相信您可以在 plotly-dash 中实现您想要的功能,因为它支持回调 - 这意味着您可以获取光标的位置,然后计算距离光标最近的 n 个数据点,并根据需要突出显示这些数据点。

下面是此类解决方案的示例。如果您以前没见过这种写法,它看起来可能有些复杂,但实际做的事情是:我把您点击的那个点作为输入。plotly 的底层是 plotly.js,所以输入是以字典的形式传进来的(而不是某种 plotly-python 对象)。然后,我把点击的输入点与数据框中其他每个点的坐标进行比较,计算出离它最近的三个数据点,把这三个最近点的信息作为颜色为 teal(或您选择的任何颜色)的轨迹添加到输入的图形中,再把修改后的输入作为输出返回,从而更新图形。

我使用的是点击而不是悬停,因为悬停会导致突出显示的点在您拖动鼠标经过这些点时闪烁太多。

此外,这个 dash 应用程序并不能完美运行,因为我认为双击数据点时会出现一些问题(您可以在下面的 gif 中看到,我先单击了一次,它才开始正常工作),但希望这个基本框架已经足够接近您想要的效果。干杯!

import gensim
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go

import json

import dash
from dash import dcc, html, Input, Output

# Stylesheet URL(s) passed to Dash through its external_stylesheets option.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
# Application object; the layout and the callback below are attached to it.
app = dash.Dash(
    __name__,
    external_stylesheets=external_stylesheets,
)


def get_2d_coordinates(model, words):
    """Project the word vectors of *words* onto two dimensions with t-SNE.

    Args:
        model: Trained gensim Word2Vec model; each word is looked up in its
            ``wv`` keyed vectors.
        words: Iterable of words (keys) to look up.

    Returns:
        A ``(x_coords, y_coords)`` tuple of 1-D arrays with the t-SNE
        embedding. Words that are missing from the model's vocabulary are
        skipped, so the arrays can be shorter than ``words``.
    """
    keyed_vectors = model.wv
    vectors = []
    for word in words:
        try:
            vectors.append(keyed_vectors.get_vector(word))
        except KeyError:
            # Out-of-vocabulary word: skip it (best-effort lookup).
            continue
    # Use the model's real dimensionality instead of a hard-coded 100; the
    # reshape keeps a well-formed (0, dim) matrix when no word was found.
    arr = np.asarray(vectors, dtype='f').reshape(-1, keyed_vectors.vector_size)
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    return Y[:, 0], Y[:, 1]

# Load the trained Word2Vec model and the product table.
ic_model = gensim.models.Word2Vec.load("w2v_IceCream.model")
ic = pd.read_csv('ic_prods.csv')

# 2-D t-SNE coordinates computed from the item descriptions.
icx, icy = get_2d_coordinates(ic_model, ic['ITEM_DESC'])
ic_data = {'Category': ic['SUB_CATEGORY'],
            'Words':ic['ITEM_DESC'],
            'X':icx,
            'Y':icy}

ic_df = pd.DataFrame(ic_data)
ic_fig = px.scatter(ic_df, x=icx, y=icy, color=ic_df['Category'], hover_name=ic_df['Words'], title='IceCream Data')

# Number of traces in the base figure. It is read from the figure itself
# rather than inferred from the category column, so it stays correct even if
# the trace count differs from the number of unique values (e.g. missing
# categories). The callback uses it to strip previously added highlight traces.
NUMBER_OF_TRACES = len(ic_fig.data)
# 'event+select' makes clicks emit clickData in addition to selecting points.
ic_fig.update_layout(clickmode='event+select')

app.layout = html.Div([
    dcc.Graph(
        id='ic_figure',
        figure=ic_fig)
    ])

## we take the 4 closest points because the 1st closest point will be the point itself
def get_n_closest_points(x0, y0, df=None, n=4):
    """Return the coordinates of the points nearest to ``(x0, y0)``.

    Points are ranked by squared Euclidean distance,
    ``(x1 - x0)**2 + (y1 - y0)**2``; skipping the square root saves work and
    does not change the ordering.

    Args:
        x0: X coordinate of the reference point.
        y0: Y coordinate of the reference point.
        df: DataFrame with "X" and "Y" columns. Defaults to the module-level
            ``ic_df``. The frame is not modified.
        n: Number of nearest rows to rank, *including* the closest one, which
            is dropped because it is expected to be the reference point
            itself. At most ``n - 1`` points are returned.

    Returns:
        A 2-D array with one ``[X, Y]`` row per neighbour, nearest first.
    """
    if df is None:
        # Resolved at call time so no shared default object is ever mutated.
        df = ic_df
    coords = df[["X", "Y"]].to_numpy()
    dist_sq = (coords[:, 0] - x0) ** 2 + (coords[:, 1] - y0) ** 2
    # Stable sort keeps the original row order among equidistant points.
    order = np.argsort(dist_sq, kind="stable")

    ## we don't return the point itself which will always be closest to itself
    return coords[order[1:n]]

# The current figure is passed as State rather than Input: it is also this
# callback's output, and only a click should trigger the callback.
@app.callback(
    Output('ic_figure', 'figure'),
    [Input('ic_figure', 'clickData')],
    [dash.State('ic_figure', 'figure')]
    )
def display_hover_data(clickData, figure):
    """Highlight the nearest neighbours of the clicked point.

    Args:
        clickData: Click event payload of the graph (a dict), or ``None``
            when nothing has been clicked yet.
        figure: The graph's current figure as a plain dict.

    Returns:
        The figure dict, with one extra teal marker trace per point returned
        by ``get_n_closest_points``; unchanged when ``clickData`` is ``None``.
    """
    print(clickData)
    if clickData is None:
        return figure

    clicked_point = clickData['points'][0]
    hover_x, hover_y = clicked_point['x'], clicked_point['y']
    closest_points = get_n_closest_points(hover_x, hover_y)

    # Anything beyond the original traces was added by an earlier click, so
    # cut the list back to the base figure before adding new highlights.
    figure['data'] = figure['data'][:NUMBER_OF_TRACES]

    new_traces = [{
        'marker': {'color': 'teal', 'symbol': 'circle'},
        'mode': 'markers',
        'orientation': 'v',
        'showlegend': False,
        'x': [x],
        'xaxis': 'x',
        'y': [y],
        'yaxis': 'y',
        'type': 'scatter',
        'selectedpoints': [0]
    } for x, y in closest_points]

    figure['data'].extend(new_traces)
    return figure

# Start the Dash server (debug mode on) only when this file is run as a script.
if __name__ == '__main__':
    app.run_server(debug=True)