为什么 Seaborn 在图例中显示的值不在数据框中

Why is Seaborn showing values in the legend that aren't in the data frame

我正在绘制通过 API 获取的一些 Covid 数据。我已经设法将其放入一个 DataFrame(df)中,尽管我的做法不太可能是最佳实践,但这不是本问题的重点。

# import packages
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# set options
# Passing None removes the limit on how many columns pandas prints.
pd.set_option('display.max_columns', None)


# list of indicators
import re

# The indicator names are kept as one string of single-quoted names separated
# by "or". It is written as adjacent string literals inside parentheses
# because a plain "..." literal cannot span several physical lines.
indicators = (
    "'covid' or 'flu' or 'cli_w11' or 'ili_W11' or 'mask' or'contact' or 'finance'  or 'anosmia'  or 'vaccine_acpt' or 'access_wash' or 'covid_vaccine' or 'trust_fam'  or 'trust_healthcare' or 'trust_who' or 'trust_govt'  or'trust_politicians'  or 'twodoses' or 'concerned_sideeffects' or 'wash_hands_24h_3to6' or 'wash_hands_24h_7orMore'  or 'hesitant_sideeffects'  or 'modified_acceptance'  or 'cmty_covid' or 'barrier_reason_side_effects' or 'barrier_reason_wontwork' or 'barrier_reason_dontbelieve' or 'barrier_reason_dontlike' or 'barrier_reason_waitlater' or 'barrier_reason_otherpeople' or 'barrier_reason_cost' or 'barrier_reason_religious' or 'barrier_reason_other' or 'trust_doctors' or 'barrier_reason_dontneed_alreadyhad' or 'barrier_reason_dontneed_dontspendtime' or 'barrier_reason_dontneed_nothighrisk' or 'barrier_reason_dontneed_takeprecautions' or 'barrier_reason_dontneed_notserious' or 'barrier_reason_dontneed_notbeneficial' or 'barrier_reason_dontneed_other' or 'informed_access' or 'appointment_have' or 'appointment_tried' or 'barrier_reason_government' or 'activity_work_outside_home' or 'activity_shop' or 'activity_restaurant_bar' or 'activity_spent_time' or 'activity_large_event' or 'activity_public_transit' or 'food_security' or 'anxious_7d' or 'depressed_7d' or 'worried_become_ill' or 'symp_fever' or 'symp_cough' or 'symp_diff_breathing' or 'symp_fatigue' or 'symp_stuffy_nose' or 'symp_aches' or 'symp_sore_throat' or 'symp_chest_pain' or 'symp_nausea' or 'symp_eye_pain' or 'symp_headache' or 'sick_spend_time_7d' or 'ever_tested' or 'pay_test' or 'reduce_spending' or 'symp_chills' or 'symp_changes' or 'testing_rate' or 'tested_positive_14d' or 'tested_positive_recent' or 'flu_vaccine_thisyr' or 'flu_vaccine_lastyr' or 'avoid_contact' or 'vaccinated_appointment_or_accept' or 'appointment_or_accept_covid_vaccine' or 'accept_covid_vaccine_no_appointment' or 'appointment_not_vaccinated' or 'vaccine_tried' or 'had_covid_ever' or 'worried_catch_covid' "
    "or 'belief_distancing_effective' or 'belief_masking_effective' or 'others_distanced_public' or 'others_masked_public' or 'covid_vaccinated_friends' or 'belief_vaccinated_mask_unnecessary' or 'belief_children_immune' or 'belief_no_spread_hot_humid' or 'received_news_local_health' or 'received_news_experts' or 'received_news_who' or 'received_news_govt_health' or 'received_news_politicians' or 'received_news_journalists' or 'received_news_friends' or 'received_news_religious' or 'received_news_none' or 'trust_covid_info_local_health' or 'trust_covid_info_experts' or 'trust_covid_info_who' or 'trust_covid_info_govt_health' or 'trust_covid_info_politicians' or 'trust_covid_info_journalists' or 'trust_covid_info_friends' or 'trust_covid_info_religious' or 'want_info_covid_treatment' or 'want_info_vaccine_access' or 'want_info_covid_variants' or 'want_info_children_education' or 'want_info_economic_impact' or 'want_info_mental_health' or 'want_info_relationships' or 'want_info_employment' or 'want_info_none' or 'news_online' or 'news_messaging' or 'news_newspaper' or 'news_television' or 'news_radio' or 'news_none' or 'trust_news_online' or 'trust_news_messaging' or 'trust_news_newspaper' or 'trust_news_television' or 'trust_news_radio' or 'vaccinate_children' or 'delayed_care_cost' or 'vaccine_barrier_eligible' or 'vaccine_barrier_no_appointments' or 'vaccine_barrier_appointment_time' or 'vaccine_barrier_technical_difficulties' or 'vaccine_barrier_document' or 'vaccine_barrier_technology_access' or 'vaccine_barrier_travel' or 'vaccine_barrier_language' or 'vaccine_barrier_childcare' or 'vaccine_barrier_time' or 'vaccine_barrier_type' or 'vaccine_barrier_none' or 'try_vaccinate_1m' or 'vaccine_barrier_eligible_has' or 'vaccine_barrier_no_appointments_has' or 'vaccine_barrier_appointment_time_has' or 'vaccine_barrier_technical_difficulties_has' or 'vaccine_barrier_document_has' or 'vaccine_barrier_technology_access_has' or 'vaccine_barrier_travel_has' or "
    "'vaccine_barrier_language_has' or 'vaccine_barrier_childcare_has' or 'vaccine_barrier_time_has' or 'vaccine_barrier_type_has' or 'vaccine_barrier_none_has' or 'vaccine_barrier_eligible_tried' or 'vaccine_barrier_no_appointments_tried' or 'vaccine_barrier_appointment_time_tried' or 'vaccine_barrier_technical_difficulties_tried' or 'vaccine_barrier_document_tried' or 'vaccine_barrier_technology_access_tried' or 'vaccine_barrier_travel_tried' or 'vaccine_barrier_language_tried' or 'vaccine_barrier_childcare_tried' or 'vaccine_barrier_time_tried' or 'vaccine_barrier_type_tried' or 'vaccine_barrier_none_tried' or 'mask_work_outside_home_1d' or 'mask_shop_1d' or 'mask_restaurant_1d' or 'mask_spent_time_1d' or 'mask_large_event_1d' or 'mask_public_transit_1d'"
)

# Extract every single-quoted name. Unlike splitting on "' or '", this is
# insensitive to the irregular spacing around "or" and does not leave a stray
# quote on the first and last entries.
inds = re.findall(r"'([^']+)'", indicators)


# request list of countries
# The timeout stops the script from waiting forever on an unresponsive server,
# and raise_for_status() reports an HTTP error here instead of letting it show
# up later as a missing 'data' key.
r = requests.get('https://covidmap.umd.edu/api/country', timeout=30)
r.raise_for_status()
json_countries = r.json()
# The payload's 'data' entry is loaded into a frame and its 'country' column
# is flattened into a plain Python list of names.
countries = pd.DataFrame.from_dict(json_countries['data'])
countries = countries['country'].tolist()


# loop through all countries for given indicator + append to list
ind = 'covid'

# NOTE(review): `url` was never assigned in the original snippet, so the loop
# raised NameError. This is presumably the data endpoint of the same API —
# confirm against the API documentation.
url = 'https://covidmap.umd.edu/api/resources'

response_list = []
for country in countries:
    r = requests.get(
        url,
        # `ind` is used here so that changing the indicator above takes effect.
        params={'type': 'daily', 'indicator': ind, 'country': country, 'daterange': '20190101-20221231'},
        timeout=30,
    )
    # Fail on an HTTP error rather than on a missing 'data' key below.
    r.raise_for_status()
    response = r.json()['data']
    response_list.append(response)
    
    
# reshape list of dictionaries to list of dfs
# One DataFrame is built per API response (one response per country).
df_list = [pd.DataFrame(records) for records in response_list]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every per-country frame contributes its own 0-based labels, so the result
# has repeated index values.
covid_df = pd.concat(df_list, ignore_index=True)
    

# change df data types
# Store the three identifier columns ('country', 'iso_code', 'gid_0') with the
# pandas 'category' dtype.
covid_df = covid_df.astype({'country' : 'category', 'iso_code' : 'category', 'gid_0' : 'category'})
# Parse 'survey_date' into datetimes using the YYYYMMDD format.
covid_df['survey_date'] = pd.to_datetime(covid_df['survey_date'], format = '%Y%m%d')


# sample plot data
# Keep only the rows for Australia and Poland. Each comparison needs its own
# parentheses because `|` binds more tightly than `==`.
filtered_data = covid_df.loc[(covid_df['country'] == 'Australia') | (covid_df['country'] == 'Poland'), :]

首先,我试图在一个图形上绘制几个国家(澳大利亚和波兰),以色调区分。

我已经对包含所有国家/地区的 df 取了子集,为了简单起见,只保留了两个国家。但当我绘制数据时,图例显示了原始 df 中的每个国家,即使我已指定使用子集 df (filtered_data)。

# plot data
from IPython.display import set_matplotlib_formats

# ask the inline backend for 'retina' figure output
set_matplotlib_formats('retina')

# sns.set(rc={"figure.dpi":300, 'savefig.dpi':300}) # increase dpi for higher res plot
sns.set_theme(context='paper')
sns.set_style(style='white')

# line plot of pct_covid over survey_date, coloured by country
# (figure width = height * aspect)
g = sns.relplot(
    data=filtered_data,
    x="survey_date",
    y='pct_covid',
    hue='country',
    kind='line',
    height=10,
    aspect=2,
)
g.set(ylabel="Covid Rate (%)")
g.fig.suptitle("Covid-19 Rates")

plt.show()

知道为什么图例显示的值不在 filtered_data df 中吗?

我认为问题的原因是您将国家名称转换成了分类(category)变量。我运行你的代码后,筛选得到的数据共有 1370 行,但是当我统计结果数据框中各个国家名称出现的次数时,结果里仍然包含所有国家名称(未被选中的国家计数为 0)。当我禁用创建分类变量的那一行后,就得到了预期的输出。

编辑

禁用以下行

#covid_df = covid_df.astype({'country' : 'category', 'iso_code' : 'category', 'gid_0' : 'category'})

修正前

filtered_data['country'].value_counts()

Poland         686
Australia      684
Albania          0
Norway           0
Qatar            0
              ... 
France           0
Finland          0
Ethiopia         0
El Salvador      0

修正后

filtered_data['country'].value_counts()

Poland       686
Australia    684
Name: country, dtype: int64