在 R 中使用 pivot_wider() 时是否有正则表达式将列定向到特定行?
Is there a regular expression to direct columns to specific rows when using pivot_wider() in R?
我的真实数据的前三行是这样的;
# Real data example: one row per species (`fullname`) and one column per
# measurement/treatment pair. The treatment level (0, 10, 20 or 35) is the
# numeric suffix of each column name.
fullname <- c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis")

# Mean.Germ.coef_<treatment>
Mean.Germ.coef_0 <- c(0.31, 0.768015267, 0.555758514)
Mean.Germ.coef_10 <- c(0.119356725, 0.624444444, 0.479357585)
Mean.Germ.coef_20 <- c(0.01, 0.202431661, 0.01)
Mean.Germ.coef_35 <- c(0.01, 0.021111111, 0.01)

# sd.germ.coef_<treatment>
sd.germ.coef_0 <- c(0.055079106, 0.148040638, 0.199485791)
sd.germ.coef_10 <- c(0.15341342, 0.079546759, 0.068405754)
sd.germ.coef_20 <- c(0, 0.059160256, 0)
sd.germ.coef_35 <- c(0, 0.022308189, 0)

# n_<treatment>
n_0 <- c(5, 5, 5)
n_10 <- c(5, 5, 5)
n_20 <- c(5, 5, 5)
n_35 <- c(5, 5, 5)

# LRR<treatment> / LRR_var<treatment>: no underscore before the number and
# no column for treatment 0.
LRR10 <- c(-0.954455598, -0.206947247, -0.147887029)
LRR_var10 <- c(0.336731047, 0.010676627, 0.029840885)
LRR20 <- c(-3.433987204, -1.333407261, -4.017748779)
LRR_var20 <- c(0.006313648, 0.024512868, 0.025768057)
LRR35 <- c(-3.433987204, -3.594010117, -4.017748779)
LRR_var35 <- c(0.006313648, 0.230755613, 0.025768057)

# The n_* vectors are part of the data frame as well; leaving them out would
# make it impossible for the reshaped result to contain an `n` column.
df <- data.frame(
  fullname,
  Mean.Germ.coef_0, Mean.Germ.coef_10, Mean.Germ.coef_20, Mean.Germ.coef_35,
  sd.germ.coef_0, sd.germ.coef_10, sd.germ.coef_20, sd.germ.coef_35,
  LRR10, LRR_var10, LRR20, LRR_var20, LRR35, LRR_var35,
  n_0, n_10, n_20, n_35
)
我需要一个长格式的 data.frame(或 tibble),其中特定的列被转换到特定的行:
# Example output: the long layout, four rows per species (one per treatment).
# Values are listed species by species in treatment order 0, 10, 20, 35.
fullname <- rep(
  c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis"),
  each = 4
)
Treat <- rep(c(0, 10, 20, 35), times = 3)
Mean.Germ.coef <- c(
  0.31, 0.119356725, 0.01, 0.01,
  0.768015267, 0.624444444, 0.202431661, 0.021111111,
  0.555758514, 0.479357585, 0.01, 0.01
)
sd.germ.coef <- c(
  0.055079106, 0.15341342, 0, 0,
  0.148040638, 0.079546759, 0.059160256, 0.022308189,
  0.199485791, 0.068405754, 0, 0
)
n <- rep(5, 12)
# There is no LRR / LRR_var measurement for treatment 0, so those rows are
# missing. A real NA keeps the columns numeric; the string "NA" would coerce
# the whole vector to character.
LRR <- c(
  NA, -0.954455598, -3.433987204, -3.433987204,
  NA, -0.206947247, -1.333407261, -3.594010117,
  NA, -0.147887029, -4.017748779, -4.017748779
)
LRR_var <- c(
  NA, 0.336731047, 0.006313648, 0.006313648,
  NA, 0.010676627, 0.024512868, 0.230755613,
  NA, 0.029840885, 0.025768057, 0.025768057
)
output <- data.frame(
  fullname, Treat, Mean.Germ.coef, sd.germ.coef,
  n, LRR, LRR_var
)
我正在尝试使用 tidyr::pivot_longer(),但我认为我的问题在于对正则表达式的理解不足。
我们可以使用 `pivot_longer` 和 `names_pattern` 来捕获列名中的子字符串:第二个捕获组 `(\d+)` 只匹配字符串末尾 (`$`) 的数字。有些列名在数字前面有 `_`,有些则没有,因此我们在该捕获组之前使用 `[_\D]` 来去掉它,并把前面其余的字符捕获为第一组 `(.*)`。
library(dplyr)
library(tidyr)
# Reshape to long format. Every column except `fullname` is split into a
# measurement name, which becomes an output column via the ".value" sentinel,
# and its trailing number, which is stored (as character) in `Treat`.
#   (.*?)    lazy prefix         -> measurement name
#   _?       optional separator  -> dropped ("n_0" has one, "LRR10" has none)
#   (\\d+)$  trailing digits     -> Treat
# Backslashes are doubled because "\D" and "\d" are not valid escapes in an R
# string literal. A separator written as `[_\\D]` always consumes one
# character, so "LRR10" would come out as "LR" and "LRR_var10" as "LRR_va";
# the optional underscore keeps the full names `LRR` and `LRR_var`.
df %>%
  pivot_longer(
    cols = -fullname,
    names_to = c(".value", "Treat"),
    names_pattern = "(.*?)_?(\\d+)$"
  )
-输出
# A tibble: 12 × 7
fullname Treat Mean.Germ.coef sd.germ.coef LR LRR_va n
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Argemone glauca 0 0.31 0.0551 NA NA 5
2 Argemone glauca 10 0.119 0.153 -0.954 0.337 5
3 Argemone glauca 20 0.01 0 -3.43 0.00631 5
4 Argemone glauca 35 0.01 0 -3.43 0.00631 5
5 Bacopa monnieri 0 0.768 0.148 NA NA 5
6 Bacopa monnieri 10 0.624 0.0795 -0.207 0.0107 5
7 Bacopa monnieri 20 0.202 0.0592 -1.33 0.0245 5
8 Bacopa monnieri 35 0.0211 0.0223 -3.59 0.231 5
9 Brighamia insignis 0 0.556 0.199 NA NA 5
10 Brighamia insignis 10 0.479 0.0684 -0.148 0.0298 5
11 Brighamia insignis 20 0.01 0 -4.02 0.0258 5
12 Brighamia insignis 35 0.01 0 -4.02 0.0258 5
或者使用 `names_sep` 的另一种写法:
library(stringr)
# Alternative: split each column name at the zero-width position between a
# non-digit and the trailing run of digits. Backslashes are doubled because
# "\D" and "\d" are not valid escapes in an R string literal.
# The split leaves the separator on the measurement name ("n_", "sd.germ.coef_"),
# so it is stripped afterwards. Anchoring the removal to `_$` touches only that
# trailing underscore; an unanchored "_" would also turn `LRR_var` into `LRRvar`.
df %>%
  pivot_longer(
    cols = -fullname,
    names_to = c(".value", "Treat"),
    names_sep = "(?<=\\D)(?=\\d+$)"
  ) %>%
  rename_with(~ str_remove(.x, "_$"))
-输出
# A tibble: 12 × 7
fullname Treat Mean.Germ.coef sd.germ.coef LRR LRRvar n
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Argemone glauca 0 0.31 0.0551 NA NA 5
2 Argemone glauca 10 0.119 0.153 -0.954 0.337 5
3 Argemone glauca 20 0.01 0 -3.43 0.00631 5
4 Argemone glauca 35 0.01 0 -3.43 0.00631 5
5 Bacopa monnieri 0 0.768 0.148 NA NA 5
6 Bacopa monnieri 10 0.624 0.0795 -0.207 0.0107 5
7 Bacopa monnieri 20 0.202 0.0592 -1.33 0.0245 5
8 Bacopa monnieri 35 0.0211 0.0223 -3.59 0.231 5
9 Brighamia insignis 0 0.556 0.199 NA NA 5
10 Brighamia insignis 10 0.479 0.0684 -0.148 0.0298 5
11 Brighamia insignis 20 0.01 0 -4.02 0.0258 5
12 Brighamia insignis 35 0.01 0 -4.02 0.0258 5
数据
# Reproducible input data: three species, one column per
# measurement/treatment pair, automatic row names.
df <- data.frame(
  fullname = c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis"),
  Mean.Germ.coef_0 = c(0.31, 0.768015267, 0.555758514),
  Mean.Germ.coef_10 = c(0.119356725, 0.624444444, 0.479357585),
  Mean.Germ.coef_20 = c(0.01, 0.202431661, 0.01),
  Mean.Germ.coef_35 = c(0.01, 0.021111111, 0.01),
  sd.germ.coef_0 = c(0.055079106, 0.148040638, 0.199485791),
  sd.germ.coef_10 = c(0.15341342, 0.079546759, 0.068405754),
  sd.germ.coef_20 = c(0, 0.059160256, 0),
  sd.germ.coef_35 = c(0, 0.022308189, 0),
  LRR10 = c(-0.954455598, -0.206947247, -0.147887029),
  LRR_var10 = c(0.336731047, 0.010676627, 0.029840885),
  LRR20 = c(-3.433987204, -1.333407261, -4.017748779),
  LRR_var20 = c(0.006313648, 0.024512868, 0.025768057),
  LRR35 = c(-3.433987204, -3.594010117, -4.017748779),
  LRR_var35 = c(0.006313648, 0.230755613, 0.025768057),
  n_0 = c(5, 5, 5),
  n_10 = c(5, 5, 5),
  n_20 = c(5, 5, 5),
  n_35 = c(5, 5, 5),
  # keep `fullname` a character column on every R version
  stringsAsFactors = FALSE
)
我的真实数据的前三行是这样的;
# Real data example: one row per species (`fullname`) and one column per
# measurement/treatment pair. The treatment level (0, 10, 20 or 35) is the
# numeric suffix of each column name.
fullname <- c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis")

# Mean.Germ.coef_<treatment>
Mean.Germ.coef_0 <- c(0.31, 0.768015267, 0.555758514)
Mean.Germ.coef_10 <- c(0.119356725, 0.624444444, 0.479357585)
Mean.Germ.coef_20 <- c(0.01, 0.202431661, 0.01)
Mean.Germ.coef_35 <- c(0.01, 0.021111111, 0.01)

# sd.germ.coef_<treatment>
sd.germ.coef_0 <- c(0.055079106, 0.148040638, 0.199485791)
sd.germ.coef_10 <- c(0.15341342, 0.079546759, 0.068405754)
sd.germ.coef_20 <- c(0, 0.059160256, 0)
sd.germ.coef_35 <- c(0, 0.022308189, 0)

# n_<treatment>
n_0 <- c(5, 5, 5)
n_10 <- c(5, 5, 5)
n_20 <- c(5, 5, 5)
n_35 <- c(5, 5, 5)

# LRR<treatment> / LRR_var<treatment>: no underscore before the number and
# no column for treatment 0.
LRR10 <- c(-0.954455598, -0.206947247, -0.147887029)
LRR_var10 <- c(0.336731047, 0.010676627, 0.029840885)
LRR20 <- c(-3.433987204, -1.333407261, -4.017748779)
LRR_var20 <- c(0.006313648, 0.024512868, 0.025768057)
LRR35 <- c(-3.433987204, -3.594010117, -4.017748779)
LRR_var35 <- c(0.006313648, 0.230755613, 0.025768057)

# The n_* vectors are part of the data frame as well; leaving them out would
# make it impossible for the reshaped result to contain an `n` column.
df <- data.frame(
  fullname,
  Mean.Germ.coef_0, Mean.Germ.coef_10, Mean.Germ.coef_20, Mean.Germ.coef_35,
  sd.germ.coef_0, sd.germ.coef_10, sd.germ.coef_20, sd.germ.coef_35,
  LRR10, LRR_var10, LRR20, LRR_var20, LRR35, LRR_var35,
  n_0, n_10, n_20, n_35
)
我需要一个长格式的 data.frame(或 tibble),其中特定的列被转换到特定的行:
# Example output: the long layout, four rows per species (one per treatment).
# Values are listed species by species in treatment order 0, 10, 20, 35.
fullname <- rep(
  c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis"),
  each = 4
)
Treat <- rep(c(0, 10, 20, 35), times = 3)
Mean.Germ.coef <- c(
  0.31, 0.119356725, 0.01, 0.01,
  0.768015267, 0.624444444, 0.202431661, 0.021111111,
  0.555758514, 0.479357585, 0.01, 0.01
)
sd.germ.coef <- c(
  0.055079106, 0.15341342, 0, 0,
  0.148040638, 0.079546759, 0.059160256, 0.022308189,
  0.199485791, 0.068405754, 0, 0
)
n <- rep(5, 12)
# There is no LRR / LRR_var measurement for treatment 0, so those rows are
# missing. A real NA keeps the columns numeric; the string "NA" would coerce
# the whole vector to character.
LRR <- c(
  NA, -0.954455598, -3.433987204, -3.433987204,
  NA, -0.206947247, -1.333407261, -3.594010117,
  NA, -0.147887029, -4.017748779, -4.017748779
)
LRR_var <- c(
  NA, 0.336731047, 0.006313648, 0.006313648,
  NA, 0.010676627, 0.024512868, 0.230755613,
  NA, 0.029840885, 0.025768057, 0.025768057
)
output <- data.frame(
  fullname, Treat, Mean.Germ.coef, sd.germ.coef,
  n, LRR, LRR_var
)
我正在尝试使用 tidyr::pivot_longer(),但我认为我的问题在于对正则表达式的理解不足。
我们可以使用 `pivot_longer` 和 `names_pattern` 来捕获列名中的子字符串:第二个捕获组 `(\d+)` 只匹配字符串末尾 (`$`) 的数字。有些列名在数字前面有 `_`,有些则没有,因此我们在该捕获组之前使用 `[_\D]` 来去掉它,并把前面其余的字符捕获为第一组 `(.*)`。
library(dplyr)
library(tidyr)
# Reshape to long format. Every column except `fullname` is split into a
# measurement name, which becomes an output column via the ".value" sentinel,
# and its trailing number, which is stored (as character) in `Treat`.
#   (.*?)    lazy prefix         -> measurement name
#   _?       optional separator  -> dropped ("n_0" has one, "LRR10" has none)
#   (\\d+)$  trailing digits     -> Treat
# Backslashes are doubled because "\D" and "\d" are not valid escapes in an R
# string literal. A separator written as `[_\\D]` always consumes one
# character, so "LRR10" would come out as "LR" and "LRR_var10" as "LRR_va";
# the optional underscore keeps the full names `LRR` and `LRR_var`.
df %>%
  pivot_longer(
    cols = -fullname,
    names_to = c(".value", "Treat"),
    names_pattern = "(.*?)_?(\\d+)$"
  )
-输出
# A tibble: 12 × 7
fullname Treat Mean.Germ.coef sd.germ.coef LR LRR_va n
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Argemone glauca 0 0.31 0.0551 NA NA 5
2 Argemone glauca 10 0.119 0.153 -0.954 0.337 5
3 Argemone glauca 20 0.01 0 -3.43 0.00631 5
4 Argemone glauca 35 0.01 0 -3.43 0.00631 5
5 Bacopa monnieri 0 0.768 0.148 NA NA 5
6 Bacopa monnieri 10 0.624 0.0795 -0.207 0.0107 5
7 Bacopa monnieri 20 0.202 0.0592 -1.33 0.0245 5
8 Bacopa monnieri 35 0.0211 0.0223 -3.59 0.231 5
9 Brighamia insignis 0 0.556 0.199 NA NA 5
10 Brighamia insignis 10 0.479 0.0684 -0.148 0.0298 5
11 Brighamia insignis 20 0.01 0 -4.02 0.0258 5
12 Brighamia insignis 35 0.01 0 -4.02 0.0258 5
或者使用 `names_sep` 的另一种写法:
library(stringr)
# Alternative: split each column name at the zero-width position between a
# non-digit and the trailing run of digits. Backslashes are doubled because
# "\D" and "\d" are not valid escapes in an R string literal.
# The split leaves the separator on the measurement name ("n_", "sd.germ.coef_"),
# so it is stripped afterwards. Anchoring the removal to `_$` touches only that
# trailing underscore; an unanchored "_" would also turn `LRR_var` into `LRRvar`.
df %>%
  pivot_longer(
    cols = -fullname,
    names_to = c(".value", "Treat"),
    names_sep = "(?<=\\D)(?=\\d+$)"
  ) %>%
  rename_with(~ str_remove(.x, "_$"))
-输出
# A tibble: 12 × 7
fullname Treat Mean.Germ.coef sd.germ.coef LRR LRRvar n
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Argemone glauca 0 0.31 0.0551 NA NA 5
2 Argemone glauca 10 0.119 0.153 -0.954 0.337 5
3 Argemone glauca 20 0.01 0 -3.43 0.00631 5
4 Argemone glauca 35 0.01 0 -3.43 0.00631 5
5 Bacopa monnieri 0 0.768 0.148 NA NA 5
6 Bacopa monnieri 10 0.624 0.0795 -0.207 0.0107 5
7 Bacopa monnieri 20 0.202 0.0592 -1.33 0.0245 5
8 Bacopa monnieri 35 0.0211 0.0223 -3.59 0.231 5
9 Brighamia insignis 0 0.556 0.199 NA NA 5
10 Brighamia insignis 10 0.479 0.0684 -0.148 0.0298 5
11 Brighamia insignis 20 0.01 0 -4.02 0.0258 5
12 Brighamia insignis 35 0.01 0 -4.02 0.0258 5
数据
# Reproducible input data: three species, one column per
# measurement/treatment pair, automatic row names.
df <- data.frame(
  fullname = c("Argemone glauca", "Bacopa monnieri", "Brighamia insignis"),
  Mean.Germ.coef_0 = c(0.31, 0.768015267, 0.555758514),
  Mean.Germ.coef_10 = c(0.119356725, 0.624444444, 0.479357585),
  Mean.Germ.coef_20 = c(0.01, 0.202431661, 0.01),
  Mean.Germ.coef_35 = c(0.01, 0.021111111, 0.01),
  sd.germ.coef_0 = c(0.055079106, 0.148040638, 0.199485791),
  sd.germ.coef_10 = c(0.15341342, 0.079546759, 0.068405754),
  sd.germ.coef_20 = c(0, 0.059160256, 0),
  sd.germ.coef_35 = c(0, 0.022308189, 0),
  LRR10 = c(-0.954455598, -0.206947247, -0.147887029),
  LRR_var10 = c(0.336731047, 0.010676627, 0.029840885),
  LRR20 = c(-3.433987204, -1.333407261, -4.017748779),
  LRR_var20 = c(0.006313648, 0.024512868, 0.025768057),
  LRR35 = c(-3.433987204, -3.594010117, -4.017748779),
  LRR_var35 = c(0.006313648, 0.230755613, 0.025768057),
  n_0 = c(5, 5, 5),
  n_10 = c(5, 5, 5),
  n_20 = c(5, 5, 5),
  n_35 = c(5, 5, 5),
  # keep `fullname` a character column on every R version
  stringsAsFactors = FALSE
)