根据因子水平中的下划线“_”将一列拆分为三列
Split a column into three columns based on the underscore "_" in the levels
我有一个 data.frame,想用 ggplot2 绘图:
# Reproducible example data: one row per scenario label with its value.
# Each label has the form "<cat>_<scenario id>", optionally followed by
# "_drain". `header = TRUE` is spelled out because `T` is an ordinary,
# reassignable variable; the redundant c() around the single string is gone.
df <- read.table(text = "
scenario value
measured_raw21 55
measured_raw22 60
simulated_raw21 54
measured_raw21_drain 55
measured_raw22_drain 60
measured_adj21 23
simulated_raw22 59
simulated_raw21_drain 54.5
simulated_raw22_drain 60.2
measured_adj21_drain 23
measured_adj22 27
simulated_adj21 22
measured_adj22_drain 27
simulated_adj21_drain 23.4
simulated_adj22 27.5
simulated_adj22_drain 27.2
measured_0.5 40
measured_0.8 55
measured_0.5_drain 40
measured_0.8_drain 55
simulated_0.5 41
simulated_0.8 56
simulated_0.5_drain 39.9
simulated_0.8_drain 55.3", header = TRUE)
scenario 列包含三类信息:是 simulated 还是 measured、场景 ID(如 raw21、raw22),以及是 drain 还是 nodrain。
我想将场景栏拆分为如下 3 栏
scenario cat scenario1 drain value
1 measured_raw21 measured raw21 nodrain 55.0
2 measured_raw22 measured raw22 nodrain 60.0
3 simulated_raw21 simulated raw21 nodrain 54.0
4 measured_raw21_drain measured raw21 drain 55.0
5 measured_raw22_drain measured raw22 drain 60.0
6 measured_adj21 measured adj21 nodrain 23.0
7 simulated_raw22 simulated raw22 nodrain 59.0
8 simulated_raw21_drain simulated raw21 drain 54.5
9 simulated_raw22_drain simulated raw22 drain 60.2
10 measured_adj21_drain measured adj21 drain 23.0
11 measured_adj22 measured adj22 nodrain 27.0
12 simulated_adj21 simulated adj21 nodrain 22.0
13 measured_adj22_drain measured adj22 drain 27.0
14 simulated_adj21_drain simulated adj21 drain 23.4
15 simulated_adj22 simulated adj22 nodrain 27.5
16 simulated_adj22_drain simulated adj22 drain 27.2
17 measured_0.5 measured 0.5 nodrain 40.0
18 measured_0.8 measured 0.8 nodrain 55.0
19 measured_0.5_drain measured 0.5 drain 40.0
20 measured_0.8_drain measured 0.8 drain 55.0
21 simulated_0.5 simulated 0.5 nodrain 41.0
22 simulated_0.8 simulated 0.8 nodrain 56.0
23 simulated_0.5_drain simulated 0.5 drain 39.9
24 simulated_0.8_drain simulated 0.8 drain 55.3
我是这样做的
# Hand-typed helper columns, one value per row of `df` (24 rows, in row
# order). Each vector is laid out four values per line.

# Category part of the label: "measured" or "simulated".
df[["cat"]] <- c(
  "measured", "measured", "simulated", "measured",
  "measured", "measured", "simulated", "simulated",
  "simulated", "measured", "measured", "simulated",
  "measured", "simulated", "simulated", "simulated",
  "measured", "measured", "measured", "measured",
  "simulated", "simulated", "simulated", "simulated"
)

# Scenario id part of the label (e.g. "raw21", "adj22", "0.5").
df[["scenario1"]] <- c(
  "raw21", "raw22", "raw21", "raw21",
  "raw22", "adj21", "raw22", "raw21",
  "raw22", "adj21", "adj22", "adj21",
  "adj22", "adj21", "adj22", "adj22",
  "0.5", "0.8", "0.5", "0.8",
  "0.5", "0.8", "0.5", "0.8"
)

# Drain flag: "drain" when the label ends in "_drain", otherwise "nodrain".
df[["drain"]] <- c(
  "nodrain", "nodrain", "nodrain", "drain",
  "drain", "nodrain", "nodrain", "drain",
  "drain", "drain", "nodrain", "nodrain",
  "drain", "drain", "nodrain", "drain",
  "nodrain", "nodrain", "drain", "drain",
  "nodrain", "nodrain", "drain", "drain"
)
这是我想要的最终图形:
library(tidyr)
library(dplyr)
library(ggplot2)
# Reshape to one row per (scenario1, drain) pair, with the `measured` and
# `simulated` values side by side in their own columns.
# pivot_wider() is used in place of the superseded spread().
df_fin <- df %>%
  select(scenario1, drain, cat, value) %>%
  pivot_wider(names_from = cat, values_from = value)

# Scatter of simulated against measured values, coloured by scenario id,
# with one panel per drain level.
ggplot(df_fin, aes(measured, simulated, col = scenario1)) +
  geom_point() +
  facet_wrap(~drain)
我手动将 df 中的 scenario 列拆分成了 3 列。不过,这个可重现示例只有 24 条观测,而我的原始 data.frame 中有大量观测,按上面的方式手动拆分会非常耗时。
如果有人能提供将 scenario 列拆分为 3 列的高效方法,我将不胜感激。
您可以使用 tidyr 的 separate:
library(tidyr)
# Split `scenario` on "_" into category, scenario id and drain flag, keeping
# the original column (remove = FALSE). Labels without a "_drain" suffix have
# only two pieces: fill = "right" pads the missing `drain` piece with NA
# without emitting the "Expected 3 pieces" warning, and replace_na() then
# replaces that NA with "nodrain".
df %>%
  separate(
    scenario,
    into = c("cat", "scenario1", "drain"),
    sep = "_",
    remove = FALSE,
    fill = "right"
  ) %>%
  replace_na(list(drain = "nodrain"))
# scenario cat scenario1 drain value
#1 measured_raw21 measured raw21 nodrain 55.0
#2 measured_raw22 measured raw22 nodrain 60.0
#3 simulated_raw21 simulated raw21 nodrain 54.0
#4 measured_raw21_drain measured raw21 drain 55.0
#5 measured_raw22_drain measured raw22 drain 60.0
#6 measured_adj21 measured adj21 nodrain 23.0
#7 simulated_raw22 simulated raw22 nodrain 59.0
#8 simulated_raw21_drain simulated raw21 drain 54.5
#9 simulated_raw22_drain simulated raw22 drain 60.2
#10 measured_adj21_drain measured adj21 drain 23.0
#11 measured_adj22 measured adj22 nodrain 27.0
#12 simulated_adj21 simulated adj21 nodrain 22.0
#13 measured_adj22_drain measured adj22 drain 27.0
# ...
这里有一个基于 base R 的可能解决方案,我认为只要 _drain 只出现在字符串的末尾就可以使用。做法是:先给不包含 _drain 的字符串末尾添加 _nodrain,然后按 _ 拆分:
# Base R approach. Read the example data with `scenario` kept as character
# (strsplit() below requires a character vector).
df <- read.table(text = "
scenario value
measured_raw21 55
measured_raw22 60
simulated_raw21 54
measured_raw21_drain 55
measured_raw22_drain 60
measured_adj21 23
simulated_raw22 59
simulated_raw21_drain 54.5
simulated_raw22_drain 60.2
measured_adj21_drain 23
measured_adj22 27
simulated_adj21 22
measured_adj22_drain 27
simulated_adj21_drain 23.4
simulated_adj22 27.5
simulated_adj22_drain 27.2
measured_0.5 40
measured_0.8 55
measured_0.5_drain 40
measured_0.8_drain 55
simulated_0.5 41
simulated_0.8 56
simulated_0.5_drain 39.9
simulated_0.8_drain 55.3", header = TRUE, stringsAsFactors = FALSE)

# Append "_nodrain" to every label that lacks "_drain", so that each label
# has three "_"-separated pieces. Note that this overwrites df$scenario.
# The pattern is matched literally (fixed = TRUE), as in strsplit() below.
df$scenario <- ifelse(
  !grepl("_drain", df$scenario, fixed = TRUE),
  paste0(df$scenario, "_nodrain"),
  df$scenario
)

# One character vector of pieces per row, stacked into a 24 x 3 matrix
# whose columns are category, scenario id and drain flag.
x <- strsplit(df$scenario, "_", fixed = TRUE)
do.call(rbind, x)
# [,1] [,2] [,3]
# [1,] "measured" "raw21" "nodrain"
# [2,] "measured" "raw22" "nodrain"
# [3,] "simulated" "raw21" "nodrain"
# [4,] "measured" "raw21" "drain"
# [5,] "measured" "raw22" "drain"
# [6,] "measured" "adj21" "nodrain"
# [7,] "simulated" "raw22" "nodrain"
# [8,] "simulated" "raw21" "drain"
# [9,] "simulated" "raw22" "drain"
#[10,] "measured" "adj21" "drain"
#[11,] "measured" "adj22" "nodrain"
#[12,] "simulated" "adj21" "nodrain"
#[13,] "measured" "adj22" "drain"
#[14,] "simulated" "adj21" "drain"
#[15,] "simulated" "adj22" "nodrain"
#[16,] "simulated" "adj22" "drain"
#[17,] "measured" "0.5" "nodrain"
#[18,] "measured" "0.8" "nodrain"
#[19,] "measured" "0.5" "drain"
#[20,] "measured" "0.8" "drain"
#[21,] "simulated" "0.5" "nodrain"
#[22,] "simulated" "0.8" "nodrain"
#[23,] "simulated" "0.5" "drain"
#[24,] "simulated" "0.8" "drain"
我有一个 data.frame,想用 ggplot2 绘图:
# Reproducible example data: one row per scenario label with its value.
# Each label has the form "<cat>_<scenario id>", optionally followed by
# "_drain". `header = TRUE` is spelled out because `T` is an ordinary,
# reassignable variable; the redundant c() around the single string is gone.
df <- read.table(text = "
scenario value
measured_raw21 55
measured_raw22 60
simulated_raw21 54
measured_raw21_drain 55
measured_raw22_drain 60
measured_adj21 23
simulated_raw22 59
simulated_raw21_drain 54.5
simulated_raw22_drain 60.2
measured_adj21_drain 23
measured_adj22 27
simulated_adj21 22
measured_adj22_drain 27
simulated_adj21_drain 23.4
simulated_adj22 27.5
simulated_adj22_drain 27.2
measured_0.5 40
measured_0.8 55
measured_0.5_drain 40
measured_0.8_drain 55
simulated_0.5 41
simulated_0.8 56
simulated_0.5_drain 39.9
simulated_0.8_drain 55.3", header = TRUE)
scenario 列包含三类信息:是 simulated 还是 measured、场景 ID(如 raw21、raw22),以及是 drain 还是 nodrain。
我想将场景栏拆分为如下 3 栏
scenario cat scenario1 drain value
1 measured_raw21 measured raw21 nodrain 55.0
2 measured_raw22 measured raw22 nodrain 60.0
3 simulated_raw21 simulated raw21 nodrain 54.0
4 measured_raw21_drain measured raw21 drain 55.0
5 measured_raw22_drain measured raw22 drain 60.0
6 measured_adj21 measured adj21 nodrain 23.0
7 simulated_raw22 simulated raw22 nodrain 59.0
8 simulated_raw21_drain simulated raw21 drain 54.5
9 simulated_raw22_drain simulated raw22 drain 60.2
10 measured_adj21_drain measured adj21 drain 23.0
11 measured_adj22 measured adj22 nodrain 27.0
12 simulated_adj21 simulated adj21 nodrain 22.0
13 measured_adj22_drain measured adj22 drain 27.0
14 simulated_adj21_drain simulated adj21 drain 23.4
15 simulated_adj22 simulated adj22 nodrain 27.5
16 simulated_adj22_drain simulated adj22 drain 27.2
17 measured_0.5 measured 0.5 nodrain 40.0
18 measured_0.8 measured 0.8 nodrain 55.0
19 measured_0.5_drain measured 0.5 drain 40.0
20 measured_0.8_drain measured 0.8 drain 55.0
21 simulated_0.5 simulated 0.5 nodrain 41.0
22 simulated_0.8 simulated 0.8 nodrain 56.0
23 simulated_0.5_drain simulated 0.5 drain 39.9
24 simulated_0.8_drain simulated 0.8 drain 55.3
我是这样做的
# Hand-typed helper columns, one value per row of `df` (24 rows, in row
# order). Each vector is laid out four values per line.

# Category part of the label: "measured" or "simulated".
df[["cat"]] <- c(
  "measured", "measured", "simulated", "measured",
  "measured", "measured", "simulated", "simulated",
  "simulated", "measured", "measured", "simulated",
  "measured", "simulated", "simulated", "simulated",
  "measured", "measured", "measured", "measured",
  "simulated", "simulated", "simulated", "simulated"
)

# Scenario id part of the label (e.g. "raw21", "adj22", "0.5").
df[["scenario1"]] <- c(
  "raw21", "raw22", "raw21", "raw21",
  "raw22", "adj21", "raw22", "raw21",
  "raw22", "adj21", "adj22", "adj21",
  "adj22", "adj21", "adj22", "adj22",
  "0.5", "0.8", "0.5", "0.8",
  "0.5", "0.8", "0.5", "0.8"
)

# Drain flag: "drain" when the label ends in "_drain", otherwise "nodrain".
df[["drain"]] <- c(
  "nodrain", "nodrain", "nodrain", "drain",
  "drain", "nodrain", "nodrain", "drain",
  "drain", "drain", "nodrain", "nodrain",
  "drain", "drain", "nodrain", "drain",
  "nodrain", "nodrain", "drain", "drain",
  "nodrain", "nodrain", "drain", "drain"
)
这是我想要的最终图形:
library(tidyr)
library(dplyr)
library(ggplot2)
# Reshape to one row per (scenario1, drain) pair, with the `measured` and
# `simulated` values side by side in their own columns.
# pivot_wider() is used in place of the superseded spread().
df_fin <- df %>%
  select(scenario1, drain, cat, value) %>%
  pivot_wider(names_from = cat, values_from = value)

# Scatter of simulated against measured values, coloured by scenario id,
# with one panel per drain level.
ggplot(df_fin, aes(measured, simulated, col = scenario1)) +
  geom_point() +
  facet_wrap(~drain)
我手动将 df 中的 scenario 列拆分成了 3 列。不过,这个可重现示例只有 24 条观测,而我的原始 data.frame 中有大量观测,按上面的方式手动拆分会非常耗时。
如果有人能提供将 scenario 列拆分为 3 列的高效方法,我将不胜感激。
您可以使用 tidyr 的 separate:
library(tidyr)
# Split `scenario` on "_" into category, scenario id and drain flag, keeping
# the original column (remove = FALSE). Labels without a "_drain" suffix have
# only two pieces: fill = "right" pads the missing `drain` piece with NA
# without emitting the "Expected 3 pieces" warning, and replace_na() then
# replaces that NA with "nodrain".
df %>%
  separate(
    scenario,
    into = c("cat", "scenario1", "drain"),
    sep = "_",
    remove = FALSE,
    fill = "right"
  ) %>%
  replace_na(list(drain = "nodrain"))
# scenario cat scenario1 drain value
#1 measured_raw21 measured raw21 nodrain 55.0
#2 measured_raw22 measured raw22 nodrain 60.0
#3 simulated_raw21 simulated raw21 nodrain 54.0
#4 measured_raw21_drain measured raw21 drain 55.0
#5 measured_raw22_drain measured raw22 drain 60.0
#6 measured_adj21 measured adj21 nodrain 23.0
#7 simulated_raw22 simulated raw22 nodrain 59.0
#8 simulated_raw21_drain simulated raw21 drain 54.5
#9 simulated_raw22_drain simulated raw22 drain 60.2
#10 measured_adj21_drain measured adj21 drain 23.0
#11 measured_adj22 measured adj22 nodrain 27.0
#12 simulated_adj21 simulated adj21 nodrain 22.0
#13 measured_adj22_drain measured adj22 drain 27.0
# ...
这里有一个基于 base R 的可能解决方案,我认为只要 _drain 只出现在字符串的末尾就可以使用。做法是:先给不包含 _drain 的字符串末尾添加 _nodrain,然后按 _ 拆分:
# Base R approach. Read the example data with `scenario` kept as character
# (strsplit() below requires a character vector).
df <- read.table(text = "
scenario value
measured_raw21 55
measured_raw22 60
simulated_raw21 54
measured_raw21_drain 55
measured_raw22_drain 60
measured_adj21 23
simulated_raw22 59
simulated_raw21_drain 54.5
simulated_raw22_drain 60.2
measured_adj21_drain 23
measured_adj22 27
simulated_adj21 22
measured_adj22_drain 27
simulated_adj21_drain 23.4
simulated_adj22 27.5
simulated_adj22_drain 27.2
measured_0.5 40
measured_0.8 55
measured_0.5_drain 40
measured_0.8_drain 55
simulated_0.5 41
simulated_0.8 56
simulated_0.5_drain 39.9
simulated_0.8_drain 55.3", header = TRUE, stringsAsFactors = FALSE)

# Append "_nodrain" to every label that lacks "_drain", so that each label
# has three "_"-separated pieces. Note that this overwrites df$scenario.
# The pattern is matched literally (fixed = TRUE), as in strsplit() below.
df$scenario <- ifelse(
  !grepl("_drain", df$scenario, fixed = TRUE),
  paste0(df$scenario, "_nodrain"),
  df$scenario
)

# One character vector of pieces per row, stacked into a 24 x 3 matrix
# whose columns are category, scenario id and drain flag.
x <- strsplit(df$scenario, "_", fixed = TRUE)
do.call(rbind, x)
# [,1] [,2] [,3]
# [1,] "measured" "raw21" "nodrain"
# [2,] "measured" "raw22" "nodrain"
# [3,] "simulated" "raw21" "nodrain"
# [4,] "measured" "raw21" "drain"
# [5,] "measured" "raw22" "drain"
# [6,] "measured" "adj21" "nodrain"
# [7,] "simulated" "raw22" "nodrain"
# [8,] "simulated" "raw21" "drain"
# [9,] "simulated" "raw22" "drain"
#[10,] "measured" "adj21" "drain"
#[11,] "measured" "adj22" "nodrain"
#[12,] "simulated" "adj21" "nodrain"
#[13,] "measured" "adj22" "drain"
#[14,] "simulated" "adj21" "drain"
#[15,] "simulated" "adj22" "nodrain"
#[16,] "simulated" "adj22" "drain"
#[17,] "measured" "0.5" "nodrain"
#[18,] "measured" "0.8" "nodrain"
#[19,] "measured" "0.5" "drain"
#[20,] "measured" "0.8" "drain"
#[21,] "simulated" "0.5" "nodrain"
#[22,] "simulated" "0.8" "nodrain"
#[23,] "simulated" "0.5" "drain"
#[24,] "simulated" "0.8" "drain"