从 class 字符的混合长格式列中绘制 (ggplot) 数值

Plotting (ggplot) numeric values from mixed long format column of class character

遵循整洁的数据标准,我的数据是长格式的,带有一个键和一个值列。某些键的值是数字,其他键的值是字符,因此 R 将整个列设置为字符 class.

当我使用 filter() 仅将数字数据通过管道传输到 ggplot(带有键 'a' 的数据是数字),然后在值定义上使用 as.numeric() 时,它不会正确转换 - 我看到只是获取序列号而不是值。我究竟做错了什么?

filter(data, measure == "a") %>% 
  ggplot(aes(x = as.numeric(value), 
             x = as.factor(subject_instance), 
             color = as.factor(subject_instance))) + 
  geom_boxplot()

以长格式处理混合 classes 的最佳方法是什么(即它不是我正在做的方式)。

如何让 ggplot 正确转换为数字?

可行的示例(来自较大集合的 40 行样本):

    mydata <- structure(list(ResponseID = c("R_40LUIW7O8Lnj7Cd", "R_aXo4IXJ2eRTyThr", 
"R_9sHFiKGtn4ZhNiJ", "R_0BMN3JynUPiB0dn", "R_9mqmDcAKzae6ko5", 
"R_4T7qN9appsgbnxj", "R_5BeXW1ygKISxISV", "R_3JJY4UGvbzzDYTX", 
"R_0AN81Cdgz7ncPDD", "R_aXo4IXJ2eRTyThr", "R_40LUIW7O8Lnj7Cd", 
"R_8BOtUltxr8O6AeN", "R_40LUIW7O8Lnj7Cd", "R_1KUj25KpGbKOaGh", 
"R_5BeXW1ygKISxISV", "R_0AN81Cdgz7ncPDD", "R_aXo4IXJ2eRTyThr", 
"R_aXo4IXJ2eRTyThr", "R_0N8LUMfEP12P4Wh", "R_0wuddsG9KJhHkRn", 
"R_1R3kGCm3vPWi4dL", "R_50W5K8wp8m1yOZ7", "R_0wuddsG9KJhHkRn", 
"R_ctKujSc0Zr5fldz", "R_4SDzTFmolPaB8wt", "R_0Ng4gEnCnkCTuoB", 
"R_0Ng4gEnCnkCTuoB", "R_eb5LkAh0nBVqc9n", "R_0vqNorszrDGN6MB", 
"R_40LUIW7O8Lnj7Cd", "R_6s1Q2hFaqRLMKKF", "R_8BOtUltxr8O6AeN", 
"R_4SDzTFmolPaB8wt", "R_3JJY4UGvbzzDYTX", "R_3JJY4UGvbzzDYTX", 
"R_77mJUnh0OPtvCEl", "R_bxtLgQnlf4iaCWx", "R_6s1Q2hFaqRLMKKF", 
"R_7X8L8LwKo6UdWgR", "R_9mqmDcAKzae6ko5"), ID = c("R_3I0G7xzqlA4lUmm", 
"R_12m5J3hXrv8ObMa", "R_3PmEIrRgCUr0X3L", "R_YQuCAn43cgRMHy9", 
"R_51GdFWDnxQ7zvpv", "R_x9g4FVQzeqAJG8h", "R_QmDHvIxNJUypJip", 
"R_2cuyzZ8C4khOGs8", "R_3fUUNvffCN7GUrn", "R_12m5J3hXrv8ObMa", 
"R_3I0G7xzqlA4lUmm", "R_xgbhYoALaqQ9TDX", "R_3I0G7xzqlA4lUmm", 
"R_28I21bSyxgRcyGo", "R_QmDHvIxNJUypJip", "R_3fUUNvffCN7GUrn", 
"R_12m5J3hXrv8ObMa", "R_12m5J3hXrv8ObMa", "R_9L8RxssmQOGrPAR", 
"R_3iExjba1az5mpLw", "R_2wodtnGyQkaGTbX", "R_dnln2Bzdjahd3ax", 
"R_3iExjba1az5mpLw", "R_29gE0fK7dB6HENJ", "R_2E0mBlZmT618zQp", 
"R_3EVZt1ncuzTbVRr", "R_3EVZt1ncuzTbVRr", "R_2anUpVhXXReyZAX", 
"R_1dz55WFaXZ3Lm3Y", "R_3I0G7xzqlA4lUmm", "R_vUJsBPPRxV9J6CJ", 
"R_xgbhYoALaqQ9TDX", "R_2E0mBlZmT618zQp", "R_2cuyzZ8C4khOGs8", 
"R_2cuyzZ8C4khOGs8", "R_3LYcR4i5YB2k0N0", "R_yL9qi0TMXHfuJK9", 
"R_vUJsBPPRxV9J6CJ", "R_1DqckuFAYHkKjDg", "R_51GdFWDnxQ7zvpv"
), icon = c(".rprt", ".mddm", ".cnsl", "ord.cnsl", "sgn.alrt", 
"ent.advr", "flg.lab2", "ord.lab2", ".mddm", ".mds2", "rmv.prb2", 
"sch.imgn", "edt.not2", "edt.prb4", "ord.lab", "grp.lab", "src.note", 
"sgn.alrt", "sgn.imgn", "sch.lab", "sch.lab", ".note", "viw.imgn", 
"flg.lab2", ".mddm", "ent.prbl", "ent.vtls", "ord.med", ".hstr", 
"rnw.alrt", "ent.vtls", "viw.vtls", "sch.lab2", "edt.note", "rnw.med", 
"ord.prcd", "rmv.prbl", "crt.grph", "edt.prb3", "ent.prb2"), 
    measure = c("firstclick", "lastclick", "subject", "clickcount", 
    "pagesubmit", "firstclick", "subject", "lastclick", "subject", 
    "pagesubmit", "firstclick", "subject", "clickcount", "lastclick", 
    "clickcount", "firstclick", "pagesubmit", "pagesubmit", "pagesubmit", 
    "lastclick", "action", "clickcount", "firstclick", "clickcount", 
    "subject", "clickcount", "firstclick", "lastclick", "subject", 
    "pagesubmit", "action", "lastclick", "lastclick", "pagesubmit", 
    "clickcount", "firstclick", "firstclick", "action", "pagesubmit", 
    "subject"), value = c("2.602", "4.849", "Consult(s)", "6", 
    "180", "1.456", "Lab / Imaging / Diagnostic", "70.335", "Medication(s)", 
    "180", "1.133", "Lab / Imaging / Diagnostic", "4", "3.938", 
    "4", "3.003", "180", "180", "180", "20.519", "Schedule", 
    "4", "4.758", "4", "Medication(s)", "4", "1.706", "8.582", 
    "Patient history", "11.599", "Enter", "9.098", "11.897", 
    "180", "4", "1.728", "2.423", "Search", "180", "Problem(s)"
    ), file = structure(c(60L, 37L, 4L, 41L, 67L, 17L, 25L, 44L, 
    37L, 39L, 57L, 63L, 11L, 15L, 43L, 29L, 66L, 67L, 68L, 64L, 
    64L, 40L, 78L, 25L, 37L, 20L, 22L, 45L, 33L, 58L, 22L, 87L, 
    65L, 10L, 59L, 47L, 56L, 7L, 14L, 21L), .Label = c("alert.png", 
    "allergies.png", "check-order.png", "consult.png", "copy-graph.png", 
    "create-encounter.png", "create-graph.png", "create-note.png", 
    "create-report.png", "edit-note.png", "edit-note2.png", "edit-problem.png", 
    "edit-problem2.png", "edit-problem3.png", "edit-problem4.png", 
    "encounter.png", "enter-adverse.png", "enter-med.png", "enter-medadmin.png", 
    "enter-problem.png", "enter-problem2.png", "enter-vitals.png", 
    "flag-imaging.png", "flag-lab.png", "flag-lab2.png", "flag-order.png", 
    "followup.png", "forward-alert.png", "graph-lab.png", "graph-lab2.png", 
    "graph-vitals.png", "graph.png", "history.png", "imaging.png", 
    "lab1.png", "lab2.png", "medadmin.png", "meds1.png", "meds2.png", 
    "note.png", "order-consult.png", "order-imaging.png", "order-lab.png", 
    "order-lab2.png", "order-med.png", "order-med2.png", "order-procedure.png", 
    "order-procedure2.png", "order.png", "problem1.png", "problem2.png", 
    "procedure1.png", "procedure2.png", "refill-med.png", "refill-med2.png", 
    "remove-problem.png", "remove-problem2.png", "renew-alert.png", 
    "renew-med.png", "report.png", "schedule-consult.png", "schedule-followup.png", 
    "schedule-imaging.png", "schedule-lab.png", "schedule-lab2.png", 
    "search-note.png", "sign-alert.png", "sign-imaging.png", 
    "sign-lab.png", "sign-lab2.png", "sign-note.png", "sign-order.png", 
    "sign-report.png", "sort-alert.png", "sort-vitals.png", "view-adverse.png", 
    "view-history.png", "view-imaging.png", "view-lab.png", "view-lab2.png", 
    "view-med.png", "view-note.png", "view-order.png", "view-problem.png", 
    "view-problem2.png", "view-report.png", "view-vitals.png", 
    "vitals.png"), class = "factor"), icon_action = c("", "", 
    "", "order", "sign", "enter", "flag", "order", "", "", "remove", 
    "schedule", "edit", "edit", "order", "graph", "search", "sign", 
    "sign", "schedule", "schedule", "", "view", "flag", "", "enter", 
    "enter", "order", "", "renew", "enter", "view", "schedule", 
    "edit", "renew", "order", "remove", "create", "edit", "enter"
    ), icon_subject = c("report", "medadmin", "consult", "consult", 
    "alert", "adverse", "lab2", "lab2", "medadmin", "meds2", 
    "problem2", "imaging", "note2", "problem4", "lab", "lab", 
    "note", "alert", "imaging", "lab", "lab", "note", "imaging", 
    "lab2", "medadmin", "problem", "vitals", "med", "history", 
    "alert", "vitals", "vitals", "lab2", "note", "med", "procedure", 
    "problem", "graph", "problem3", "problem2"), instance = structure(c(2L, 
    8L, 7L, 26L, 80L, 49L, 78L, 24L, 11L, 7L, 83L, 24L, 77L, 
    43L, 26L, 67L, 73L, 38L, 31L, 74L, 27L, 12L, 26L, 87L, 15L, 
    31L, 53L, 42L, 2L, 53L, 88L, 57L, 47L, 62L, 54L, 37L, 40L, 
    78L, 32L, 33L), .Label = c("1", "2", "3", "4", "5", "6", 
    "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", 
    "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", 
    "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", 
    "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", 
    "47", "48", "49", "50", "51", "52", "53", "54", "55", "56", 
    "57", "58", "59", "60", "61", "62", "63", "64", "65", "66", 
    "67", "68", "69", "70", "71", "72", "73", "74", "75", "76", 
    "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", 
    "87", "88"), class = "factor"), action_instance = c(NA, NA, 
    NA, 2L, 7L, 3L, 4L, 1L, NA, NA, 2L, 1L, 5L, 2L, 2L, 2L, 1L, 
    6L, 2L, 4L, 1L, NA, 2L, 4L, NA, 2L, 3L, 4L, NA, 1L, 6L, 9L, 
    1L, 5L, 2L, 2L, 1L, 3L, 2L, 1L), subject_instance = c(1L, 
    1L, 1L, 2L, 4L, 2L, 7L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 6L, 
    5L, 3L, 2L, 3L, 2L, 1L, 2L, 7L, 1L, 2L, 4L, 2L, 1L, 5L, 5L, 
    3L, 4L, 4L, 3L, 1L, 3L, 3L, 1L, 2L)), .Names = c("ResponseID", 
"ID", "icon", "measure", "value", "file", "icon_action", "icon_subject", 
"instance", "action_instance", "subject_instance"), class = c("tbl_df", 
"data.frame"), row.names = c(NA, -40L))

filter(mydata, measure=="pagesubmit") %>% ggplot(aes(y=as.numeric(value), x=as.factor(subject_instance), color=as.factor(subject_instance))) + geom_boxplot()

另外,关于半相关的说明,为什么这不起作用?:

filter(icon, measure=="pagesubmit") %>% mean(value)

如@Spacedman 所述,您的示例不是 reproducible/consistent。

第 1 部分

这个有效:

filter(mydata, measure == "pagesubmit") %>% 
  ggplot(aes(
    y = as.numeric(value), 
    x = as.factor(subject_instance), 
    color = as.factor(subject_instance))) + 
  geom_boxplot()

第 2 部分

如果你想要所有measure的平均值等于pagesubmit,你可以这样做:

filter(mydata, measure == "pagesubmit") %>% 
  summarise(mean = mean(as.numeric(value)))

您最初的尝试没有成功,因为您正试图从整个数据框中取出 mean()。事实上,当您执行 ... %>% mean(value) 时,您会收到以下错误消息:

# Warning message:
# In mean.default(., value) : argument is not numeric or logical: returning NA

其中 . 是左侧参数(mydata 过滤后仅保留 measure 等于 pagesubmit),%>% 通过管道转发到 mean()。按照相同的逻辑,您应该改为:

mydata %>% 
  filter(measure == "pagesubmit") %>%
  .$value %>% ## extract a character vector of values
  as.numeric() %>% ## convert it to numeric
  mean() ## calculate the mean