如何使用 tidyr 工具整理大型数据框
how to tidy up a large dataframe with the tidyr tool
目前,我有这个宽数据框。
> dput(head(data))
structure(list(Host.H = c("Human", "Human", "Human", "Human",
"Human", "Human"), Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA",
">H3-USA", ">H3-USA", ">H3-USA"), Start = c(1L, 121L, 161L, 401L,
721L, 1081L), End = c(160L, 240L, 280L, 520L, 1040L, 1280L),
Strand.H = c("Forward", "Forward", "Forward", "Forward",
"Forward", "Forward"), P.H = c(0.995962, 0.985782, 0.997249,
0.983122, 0.998574, 0.993674), Locus_ID.H = c("id \"locus1\"",
"id \"locus5\"", "id \"locus7\"", "id \"locus8\"", "id \"locus10\"",
"id \"locus12\""), Host.I = c(NA, "Intermediate", NA, NA,
NA, NA), Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA),
Strand.I = c(NA, "Forward", NA, NA, NA, NA), P.I = c(NA,
0.988441, NA, NA, NA, NA), Locus_ID.I = c(NA, "id \"locus5\"",
NA, NA, NA, NA), Host.B = c(NA, "Bat", "Bat", "Bat", "Bat",
NA), Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA",
">B2-RS-CHA", NA), Strand.B = c(NA, "Forward", "Forward",
"Forward", "Forward", NA), P.B = c(NA, 0.987457, 0.997273,
0.975433, 0.998187, NA), Locus_ID.B = c(NA, "id \"locus7\"",
"id \"locus9\"", "id \"locus10\"", "id \"locus11\"", NA),
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus",
NA), Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA",
">I3-MM-CHA", NA), Strand.C = c(NA, "Forward", "Forward",
"Forward", "Forward", NA), P.C = c(NA, 0.98647, 0.997287,
0.981532, 0.998712, NA), Locus_ID.C = c(NA, "id \"locus7\"",
"id \"locus9\"", "id \"locus10\"", "id \"locus12\"", NA),
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared",
"Unique")), row.names = c(NA, 6L), class = "data.frame")
我一直在寻找整理这些数据的方法。要执行此操作,我需要将所有后缀为 (.H、.I、.B 和 .C) 的列按如下方式分组到每一列中 (Host, Seq_ID, Start, End, Strand, P , Locus_ID 和类型)。
注意“Type”的值必须按照每一行对应的赋值来分配,我用下面的例子解释一下想要的输出
Host Seq_ID Start End Strand P Locus_ID Type
Human >H3-USA 1 160 Forward 0.99 id "locus1" Unique
Human >H3-USA 121 240 Forward 0.98 id "locus5" Conserved
Intermediate>I3-MM-CHA 121 240 Forward 0.98 id "locus5" Conserved
Bat >B2-RS-CHA 121 240 Forward 0.98 id "locus7" Conserved
Consensus >I3-MM-CHA 121 240 Forward 0.98 id "locus7" Conserved
Human >H3-USA 161 280 Forward 0.99 id "locus7" Shared
so on...
那我总结一下“开始”和“结束”的位置是否相等,只要“Host”相同即可。
我曾尝试使用“旋转更长”功能,但无法正常工作。为此,我希望你能与我合作。我将不胜感激
这是 tidyverse
中的解决方案,它一般利用 dplyr
,具体利用 tidyr::pivot_longer()
:
library(tidyverse)
# ...
# Code to generate 'data'.
# ...
data_2 <- data %>% pivot_longer(# "Unpivot" all columns except these:
cols = !c(Start, End, Type),
# Uses regex lookaround to target the last ".",
# used to separate column prefixes from suffixes.
names_sep = "\.(?!.*\.)",
# "Unpivot" to consolidate columns with the same
# prefix, and then record the suffixes in a new
# 'Suffix' column.
names_to = c(".value", "Suffix")) %>%
# Remove rows with missing 'Host'.
filter(!is.na(Host)) %>%
# Within each group of 'Host' and 'Type', select only the first row with each
# combo of 'Start' and 'End'.
group_by(Host, Type, Start, End) %>% filter(row_number() == 1) %>% ungroup() %>%
# Select the columns in proper order.
select(Host, Suffix, Seq_ID, Start, End, Strand, P, Locus_ID, Type) %>%
# Sort as desired.
arrange(Start, End) %>%
# Convert back into data.frame, if desired.
as.data.frame()
# Print the results.
data_2
给定一个数据集data
,就像您提供的示例
data <- structure(list(Host.H = c("Human", "Human", "Human", "Human", "Human", "Human"),
Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA"),
Start = c(1, 121, 161, 401, 721, 1081),
End = c(160, 240, 280, 520, 1040, 1280),
Strand.H = c("Forward", "Forward", "Forward", "Forward", "Forward", "Forward"),
P.H = c(0.995962, 0.985782, 0.997249, 0.983122, 0.998574, 0.993674),
Locus_ID.H = c("id \"locus1\"", "id \"locus5\"", "id \"locus7\"", "id \"locus8\"", "id \"locus10\"", "id \"locus12\""),
Host.I = c(NA, "Intermediate", NA, NA, NA, NA),
Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA),
Strand.I = c(NA, "Forward", NA, NA, NA, NA),
P.I = c(NA, 0.988441, NA, NA, NA, NA),
Locus_ID.I = c(NA, "id \"locus5\"", NA, NA, NA, NA),
Host.B = c(NA, "Bat", "Bat", "Bat", "Bat", NA),
Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", NA),
Strand.B = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.B = c(NA, 0.987457, 0.997273, 0.975433, 0.998187, NA),
Locus_ID.B = c(NA, "id \"locus7\"", "id \"locus9\"", "id \"locus10\"", "id \"locus11\"", NA),
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus", NA),
Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", NA),
Strand.C = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.C = c(NA, 0.98647, 0.997287, 0.981532, 0.998712, NA),
Locus_ID.C = c(NA, "id \"locus7\"", "id \"locus9\"", "id \"locus10\"", "id \"locus12\"", NA),
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared", "Unique")),
row.names = c(NA, -6L),
class = "data.frame")
您应该像这样获得 data_2
的输出:
Start End Type Suffix Host Seq_ID Strand P Locus_ID
1 1 160 Unique H Human >H3-USA Forward 0.995962 id "locus1"
2 121 240 Conserved H Human >H3-USA Forward 0.985782 id "locus5"
3 121 240 Conserved I Intermediate >I3-MM-CHA Forward 0.988441 id "locus5"
4 121 240 Conserved B Bat >B2-RS-CHA Forward 0.987457 id "locus7"
5 121 240 Conserved C Consensus >I3-MM-CHA Forward 0.986470 id "locus7"
6 161 280 Shared H Human >H3-USA Forward 0.997249 id "locus7"
7 161 280 Shared B Bat >B2-RS-CHA Forward 0.997273 id "locus9"
8 161 280 Shared C Consensus >I3-MM-CHA Forward 0.997287 id "locus9"
9 401 520 Shared H Human >H3-USA Forward 0.983122 id "locus8"
10 401 520 Shared B Bat >B2-RS-CHA Forward 0.975433 id "locus10"
11 401 520 Shared C Consensus >I3-MM-CHA Forward 0.981532 id "locus10"
12 721 1040 Shared H Human >H3-USA Forward 0.998574 id "locus10"
13 721 1040 Shared B Bat >B2-RS-CHA Forward 0.998187 id "locus11"
14 721 1040 Shared C Consensus >I3-MM-CHA Forward 0.998712 id "locus12"
15 1081 1280 Unique H Human >H3-USA Forward 0.993674 id "locus12"
目前,我有这个宽数据框。
> dput(head(data))
structure(list(Host.H = c("Human", "Human", "Human", "Human",
"Human", "Human"), Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA",
">H3-USA", ">H3-USA", ">H3-USA"), Start = c(1L, 121L, 161L, 401L,
721L, 1081L), End = c(160L, 240L, 280L, 520L, 1040L, 1280L),
Strand.H = c("Forward", "Forward", "Forward", "Forward",
"Forward", "Forward"), P.H = c(0.995962, 0.985782, 0.997249,
0.983122, 0.998574, 0.993674), Locus_ID.H = c("id \"locus1\"",
"id \"locus5\"", "id \"locus7\"", "id \"locus8\"", "id \"locus10\"",
"id \"locus12\""), Host.I = c(NA, "Intermediate", NA, NA,
NA, NA), Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA),
Strand.I = c(NA, "Forward", NA, NA, NA, NA), P.I = c(NA,
0.988441, NA, NA, NA, NA), Locus_ID.I = c(NA, "id \"locus5\"",
NA, NA, NA, NA), Host.B = c(NA, "Bat", "Bat", "Bat", "Bat",
NA), Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA",
">B2-RS-CHA", NA), Strand.B = c(NA, "Forward", "Forward",
"Forward", "Forward", NA), P.B = c(NA, 0.987457, 0.997273,
0.975433, 0.998187, NA), Locus_ID.B = c(NA, "id \"locus7\"",
"id \"locus9\"", "id \"locus10\"", "id \"locus11\"", NA),
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus",
NA), Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA",
">I3-MM-CHA", NA), Strand.C = c(NA, "Forward", "Forward",
"Forward", "Forward", NA), P.C = c(NA, 0.98647, 0.997287,
0.981532, 0.998712, NA), Locus_ID.C = c(NA, "id \"locus7\"",
"id \"locus9\"", "id \"locus10\"", "id \"locus12\"", NA),
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared",
"Unique")), row.names = c(NA, 6L), class = "data.frame")
我一直在寻找整理这些数据的方法。要执行此操作,我需要将所有后缀为 (.H、.I、.B 和 .C) 的列按如下方式分组到每一列中 (Host, Seq_ID, Start, End, Strand, P , Locus_ID 和类型)。 注意“Type”的值必须按照每一行对应的赋值来分配,我用下面的例子解释一下想要的输出
Host Seq_ID Start End Strand P Locus_ID Type
Human >H3-USA 1 160 Forward 0.99 id "locus1" Unique
Human >H3-USA 121 240 Forward 0.98 id "locus5" Conserved
Intermediate>I3-MM-CHA 121 240 Forward 0.98 id "locus5" Conserved
Bat >B2-RS-CHA 121 240 Forward 0.98 id "locus7" Conserved
Consensus >I3-MM-CHA 121 240 Forward 0.98 id "locus7" Conserved
Human >H3-USA 161 280 Forward 0.99 id "locus7" Shared
so on...
那我总结一下“开始”和“结束”的位置是否相等,只要“Host”相同即可。
我曾尝试使用“旋转更长”功能,但无法正常工作。为此,我希望你能与我合作。我将不胜感激
这是 tidyverse
中的解决方案,它一般利用 dplyr
,具体利用 tidyr::pivot_longer()
:
library(tidyverse)
# ...
# Code to generate 'data'.
# ...
data_2 <- data %>% pivot_longer(# "Unpivot" all columns except these:
cols = !c(Start, End, Type),
# Uses regex lookaround to target the last ".",
# used to separate column prefixes from suffixes.
names_sep = "\.(?!.*\.)",
# "Unpivot" to consolidate columns with the same
# prefix, and then record the suffixes in a new
# 'Suffix' column.
names_to = c(".value", "Suffix")) %>%
# Remove rows with missing 'Host'.
filter(!is.na(Host)) %>%
# Within each group of 'Host' and 'Type', select only the first row with each
# combo of 'Start' and 'End'.
group_by(Host, Type, Start, End) %>% filter(row_number() == 1) %>% ungroup() %>%
# Select the columns in proper order.
select(Host, Suffix, Seq_ID, Start, End, Strand, P, Locus_ID, Type) %>%
# Sort as desired.
arrange(Start, End) %>%
# Convert back into data.frame, if desired.
as.data.frame()
# Print the results.
data_2
给定一个数据集data
,就像您提供的示例
data <- structure(list(Host.H = c("Human", "Human", "Human", "Human", "Human", "Human"),
Seq_ID.H = c(">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA", ">H3-USA"),
Start = c(1, 121, 161, 401, 721, 1081),
End = c(160, 240, 280, 520, 1040, 1280),
Strand.H = c("Forward", "Forward", "Forward", "Forward", "Forward", "Forward"),
P.H = c(0.995962, 0.985782, 0.997249, 0.983122, 0.998574, 0.993674),
Locus_ID.H = c("id \"locus1\"", "id \"locus5\"", "id \"locus7\"", "id \"locus8\"", "id \"locus10\"", "id \"locus12\""),
Host.I = c(NA, "Intermediate", NA, NA, NA, NA),
Seq_ID.I = c(NA, ">I3-MM-CHA", NA, NA, NA, NA),
Strand.I = c(NA, "Forward", NA, NA, NA, NA),
P.I = c(NA, 0.988441, NA, NA, NA, NA),
Locus_ID.I = c(NA, "id \"locus5\"", NA, NA, NA, NA),
Host.B = c(NA, "Bat", "Bat", "Bat", "Bat", NA),
Seq_ID.B = c(NA, ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", ">B2-RS-CHA", NA),
Strand.B = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.B = c(NA, 0.987457, 0.997273, 0.975433, 0.998187, NA),
Locus_ID.B = c(NA, "id \"locus7\"", "id \"locus9\"", "id \"locus10\"", "id \"locus11\"", NA),
Host.C = c(NA, "Consensus", "Consensus", "Consensus", "Consensus", NA),
Seq_ID.C = c(NA, ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", ">I3-MM-CHA", NA),
Strand.C = c(NA, "Forward", "Forward", "Forward", "Forward", NA),
P.C = c(NA, 0.98647, 0.997287, 0.981532, 0.998712, NA),
Locus_ID.C = c(NA, "id \"locus7\"", "id \"locus9\"", "id \"locus10\"", "id \"locus12\"", NA),
Type = c("Unique", "Conserved", "Shared", "Shared", "Shared", "Unique")),
row.names = c(NA, -6L),
class = "data.frame")
您应该像这样获得 data_2
的输出:
Start End Type Suffix Host Seq_ID Strand P Locus_ID
1 1 160 Unique H Human >H3-USA Forward 0.995962 id "locus1"
2 121 240 Conserved H Human >H3-USA Forward 0.985782 id "locus5"
3 121 240 Conserved I Intermediate >I3-MM-CHA Forward 0.988441 id "locus5"
4 121 240 Conserved B Bat >B2-RS-CHA Forward 0.987457 id "locus7"
5 121 240 Conserved C Consensus >I3-MM-CHA Forward 0.986470 id "locus7"
6 161 280 Shared H Human >H3-USA Forward 0.997249 id "locus7"
7 161 280 Shared B Bat >B2-RS-CHA Forward 0.997273 id "locus9"
8 161 280 Shared C Consensus >I3-MM-CHA Forward 0.997287 id "locus9"
9 401 520 Shared H Human >H3-USA Forward 0.983122 id "locus8"
10 401 520 Shared B Bat >B2-RS-CHA Forward 0.975433 id "locus10"
11 401 520 Shared C Consensus >I3-MM-CHA Forward 0.981532 id "locus10"
12 721 1040 Shared H Human >H3-USA Forward 0.998574 id "locus10"
13 721 1040 Shared B Bat >B2-RS-CHA Forward 0.998187 id "locus11"
14 721 1040 Shared C Consensus >I3-MM-CHA Forward 0.998712 id "locus12"
15 1081 1280 Unique H Human >H3-USA Forward 0.993674 id "locus12"