单个 sheet 中的多个数据表,名称有一些重叠和分隔列 headers
Multiple data tables in single sheet with some overlap of names and separating column headers
我有一个包含多个表的数据集,我想将它们合并到同一个 Excel sheet 中。 sheet 的数据格式如下:
group_no group variable marker1 marker2 marker3
1 GR-01-1 1 - - A
1 GR-01-1 2 B A B
1 GR-01-1 3 A B -
1 GR-01-1 4 - - -
1 GR-01-1 5 B - B
group_no. group variable marker 4 marker5 marker6 marker7
2 GR-01-1a 1 B - A -
2 GR-01-1a 2 B A - B
2 GR-01-1a 3 A B - B
2 GR-01-1a 4 A A A A
group_no group variable marker8
3 GR-01-2 1 B
3 GR-01-2 2 A
3 GR-01-2 3 -
group_no group variable marker9 marker10
4 GR-02-1 1 B A
4 GR-02-1 2 A A
4 GR-02-1 3 - B
4 GR-02-1 4 B B
4 GR-02-1 5 A B
4 GR-02-1 6 - A
group_no group variable marker11 marker12 marker13
5 GR-02-2 d B B A
5 GR-02-2 e A B B
5 GR-02-2 f B - -
5 GR-02-2 g - B -
从上面可以看出,数据提供者没有将标记的名称分离到唯一的列中。所有的桌子都叠在一起。我想要做的是将数据转换成这样:
marker_name group_no group 1 2 3 4 5 6 a b c d e f g
marker1 GR-01-1 1 - B A - B
marker2 GR-01-1 1 - A B - -
marker3 GR-01-1 1 A B - - B
marker4 GR-01-1a 2 B B A A
marker5 GR-01-1a 2 - A B A
marker6 GR-01-1a 2 A - - A
marker7 GR-01-1a 2 - B B A
marker8 GR-01-2 3 B A -
marker9 GR-02-1 4 B A - B A -
marker10 GR-02-1 4 A A B B B A
marker11 GR-02-2 5 B A B -
marker12 GR-02-2 5 B B - B
marker13 GR-02-2 5 A B - -
有没有办法在 R 中做到这一点?也许是一个循环?我从 R 中的转置开始,但那没有用,我想知道是否有一种方法可以通过 group_no 读取数据。但没有运气。我想我还是个新手。单sheet我的数据集中有917张表。我希望我不必将每个分配给新的 R object one-by-one 然后 rbind 或 cbind.
#Reading data into R
library(readr)
df1 <- read_csv("C:/Users/userm/Desktop/df1.csv",
col_names = FALSE)
#Loading libraries:
library(plyr)
library(tidyr)
spl <- diff(c(which(df1$X1 == 'group_no'), nrow(df1))) #defining the split for each table
#Splitting to a list of dataframes, where each dataframe is one of the tables of interest:
dl2 <- split(df1,
c(rep(1, spl[1]),
rep(2, spl[2]),
rep(3, spl[3]),
rep(4, spl[4]),
rep(5, spl[5])))
#Assigning the first row as the dataframe header:
dl3 <- lapply(dl2, function(d){colnames(d) <- d[1,]; d <- d[-1,]})
#Removing columns with All NAs:
dl4 <- lapply(dl3, function(d){d <- d[,colSums(is.na(d))<nrow(d)]})
#Binding dataframes with different number of columns (plyr):
df5 <- rbind.fill(dl4)
#Converting from Wide to Long (tidyr):
df6 <- pivot_longer(df5, -c("group_no", "group", "variable"),
names_to = "marker_name", values_to = "value")
#Convert from Long to Wide with a different arrangement (tidyr):
df7 <- pivot_wider(df6, names_from = "variable", values_from = "value")
#> # A tibble: 65 x 16
#> group_no group marker_name a b c `4` `5` g `1`
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 GR-0~ marker1 - B A - B <NA> <NA>
#> 2 1 GR-0~ marker2 - A B - - <NA> <NA>
#> 3 1 GR-0~ marker3 A B - - B <NA> <NA>
#> 4 1 GR-0~ marker4 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 5 1 GR-0~ marker5 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 6 1 GR-0~ marker6 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 7 1 GR-0~ marker7 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 8 1 GR-0~ marker8 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 9 1 GR-0~ marker9 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 10 1 GR-0~ marker10 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> # ... with 55 more rows, and 6 more variables: `2` <chr>, `3` <chr>,
#> # `6` <chr>, d <chr>, e <chr>, f <chr>
数据:
df1 <- structure(list(X1 = c("group_no", "1", "1", "1", "1", "1", "group_no",
"2", "2", "2", "2", "group_no", "3", "3", "3", "group_no", "4",
"4", "4", "4", "4", "4", "group_no", "5", "5", "5", "5"), X2 = c("group",
"GR-01-1", "GR-01-1", "GR-01-1", "GR-01-1", "GR-01-1", "group",
"GR-01-1a", "GR-01-1a", "GR-01-1a", "GR-01-1a", "group", "GR-01-2",
"GR-01-2", "GR-01-2", "group", "GR-02-1", "GR-02-1", "GR-02-1",
"GR-02-1", "GR-02-1", "GR-02-1", "group", "GR-02-2", "GR-02-2",
"GR-02-2", "GR-02-2"), X3 = c("variable", "a", "b", "c", "4",
"5", "variable", "1", "2", "3", "4", "variable", "1", "2", "3",
"variable", "1", "2", "3", "4", "5", "6", "variable", "d", "e",
"f", "g"), X4 = c("marker1", "-", "B", "A", "-", "B", "marker4",
"B", "B", "A", "A", "marker8", "B", "A", "-", "marker9", "B",
"A", "-", "B", "A", "-", "marker11", "B", "A", "B", "-"), X5 = c("marker2",
"-", "A", "B", "-", "-", "marker5", "-", "A", "B", "A", NA, NA,
NA, NA, "marker10", "A", "A", "B", "B", "B", "A", "marker12",
"B", "B", "-", "B"), X6 = c("marker3", "A", "B", "-", "-", "B",
"marker6", "A", "-", "-", "A", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "marker13", "A", "B", "-", "-"), X7 = c(NA, NA, NA,
NA, NA, NA, "marker7", "-", "B", "B", "A", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -27L), spec = structure(list(
cols = list(X1 = structure(list(), class = c("collector_character",
"collector")), X2 = structure(list(), class = c("collector_character",
"collector")), X3 = structure(list(), class = c("collector_character",
"collector")), X4 = structure(list(), class = c("collector_character",
"collector")), X5 = structure(list(), class = c("collector_character",
"collector")), X6 = structure(list(), class = c("collector_character",
"collector")), X7 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec"))
X1 X2 X3 X4 X5 X6 X7
1 group_no group variable marker1 marker2 marker3 NA
2 1 GR-01-1 a - - A NA
3 1 GR-01-1 b B A B NA
4 1 GR-01-1 c A B - NA
5 1 GR-01-1 4 - - - NA
6 1 GR-01-1 5 B - B NA
7 group_no group variable marker4 marker5 marker6 marker7
8 2 GR-01-1a 1 B - A -
9 2 GR-01-1a 2 B A - B
10 2 GR-01-1a 3 A B - B
# ... with 17 more rows
我有一个包含多个表的数据集,我想将它们合并到同一个 Excel sheet 中。 sheet 的数据格式如下:
group_no group variable marker1 marker2 marker3
1 GR-01-1 1 - - A
1 GR-01-1 2 B A B
1 GR-01-1 3 A B -
1 GR-01-1 4 - - -
1 GR-01-1 5 B - B
group_no. group variable marker 4 marker5 marker6 marker7
2 GR-01-1a 1 B - A -
2 GR-01-1a 2 B A - B
2 GR-01-1a 3 A B - B
2 GR-01-1a 4 A A A A
group_no group variable marker8
3 GR-01-2 1 B
3 GR-01-2 2 A
3 GR-01-2 3 -
group_no group variable marker9 marker10
4 GR-02-1 1 B A
4 GR-02-1 2 A A
4 GR-02-1 3 - B
4 GR-02-1 4 B B
4 GR-02-1 5 A B
4 GR-02-1 6 - A
group_no group variable marker11 marker12 marker13
5 GR-02-2 d B B A
5 GR-02-2 e A B B
5 GR-02-2 f B - -
5 GR-02-2 g - B -
从上面可以看出,数据提供者没有将标记的名称分离到唯一的列中。所有的桌子都叠在一起。我想要做的是将数据转换成这样:
marker_name group_no group 1 2 3 4 5 6 a b c d e f g
marker1 GR-01-1 1 - B A - B
marker2 GR-01-1 1 - A B - -
marker3 GR-01-1 1 A B - - B
marker4 GR-01-1a 2 B B A A
marker5 GR-01-1a 2 - A B A
marker6 GR-01-1a 2 A - - A
marker7 GR-01-1a 2 - B B A
marker8 GR-01-2 3 B A -
marker9 GR-02-1 4 B A - B A -
marker10 GR-02-1 4 A A B B B A
marker11 GR-02-2 5 B A B -
marker12 GR-02-2 5 B B - B
marker13 GR-02-2 5 A B - -
有没有办法在 R 中做到这一点?也许是一个循环?我从 R 中的转置开始,但那没有用,我想知道是否有一种方法可以通过 group_no 读取数据。但没有运气。我想我还是个新手。单sheet我的数据集中有917张表。我希望我不必将每个分配给新的 R object one-by-one 然后 rbind 或 cbind.
#Reading data into R
library(readr)
df1 <- read_csv("C:/Users/userm/Desktop/df1.csv",
col_names = FALSE)
#Loading libraries:
library(plyr)
library(tidyr)
spl <- diff(c(which(df1$X1 == 'group_no'), nrow(df1))) #defining the split for each table
#Splitting to a list of dataframes, where each dataframe is one of the tables of interest:
dl2 <- split(df1,
c(rep(1, spl[1]),
rep(2, spl[2]),
rep(3, spl[3]),
rep(4, spl[4]),
rep(5, spl[5])))
#Assigning the first row as the dataframe header:
dl3 <- lapply(dl2, function(d){colnames(d) <- d[1,]; d <- d[-1,]})
#Removing columns with All NAs:
dl4 <- lapply(dl3, function(d){d <- d[,colSums(is.na(d))<nrow(d)]})
#Binding dataframes with different number of columns (plyr):
df5 <- rbind.fill(dl4)
#Converting from Wide to Long (tidyr):
df6 <- pivot_longer(df5, -c("group_no", "group", "variable"),
names_to = "marker_name", values_to = "value")
#Convert from Long to Wide with a different arrangement (tidyr):
df7 <- pivot_wider(df6, names_from = "variable", values_from = "value")
#> # A tibble: 65 x 16
#> group_no group marker_name a b c `4` `5` g `1`
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 GR-0~ marker1 - B A - B <NA> <NA>
#> 2 1 GR-0~ marker2 - A B - - <NA> <NA>
#> 3 1 GR-0~ marker3 A B - - B <NA> <NA>
#> 4 1 GR-0~ marker4 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 5 1 GR-0~ marker5 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 6 1 GR-0~ marker6 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 7 1 GR-0~ marker7 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 8 1 GR-0~ marker8 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 9 1 GR-0~ marker9 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 10 1 GR-0~ marker10 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> # ... with 55 more rows, and 6 more variables: `2` <chr>, `3` <chr>,
#> # `6` <chr>, d <chr>, e <chr>, f <chr>
数据:
df1 <- structure(list(X1 = c("group_no", "1", "1", "1", "1", "1", "group_no",
"2", "2", "2", "2", "group_no", "3", "3", "3", "group_no", "4",
"4", "4", "4", "4", "4", "group_no", "5", "5", "5", "5"), X2 = c("group",
"GR-01-1", "GR-01-1", "GR-01-1", "GR-01-1", "GR-01-1", "group",
"GR-01-1a", "GR-01-1a", "GR-01-1a", "GR-01-1a", "group", "GR-01-2",
"GR-01-2", "GR-01-2", "group", "GR-02-1", "GR-02-1", "GR-02-1",
"GR-02-1", "GR-02-1", "GR-02-1", "group", "GR-02-2", "GR-02-2",
"GR-02-2", "GR-02-2"), X3 = c("variable", "a", "b", "c", "4",
"5", "variable", "1", "2", "3", "4", "variable", "1", "2", "3",
"variable", "1", "2", "3", "4", "5", "6", "variable", "d", "e",
"f", "g"), X4 = c("marker1", "-", "B", "A", "-", "B", "marker4",
"B", "B", "A", "A", "marker8", "B", "A", "-", "marker9", "B",
"A", "-", "B", "A", "-", "marker11", "B", "A", "B", "-"), X5 = c("marker2",
"-", "A", "B", "-", "-", "marker5", "-", "A", "B", "A", NA, NA,
NA, NA, "marker10", "A", "A", "B", "B", "B", "A", "marker12",
"B", "B", "-", "B"), X6 = c("marker3", "A", "B", "-", "-", "B",
"marker6", "A", "-", "-", "A", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "marker13", "A", "B", "-", "-"), X7 = c(NA, NA, NA,
NA, NA, NA, "marker7", "-", "B", "B", "A", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -27L), spec = structure(list(
cols = list(X1 = structure(list(), class = c("collector_character",
"collector")), X2 = structure(list(), class = c("collector_character",
"collector")), X3 = structure(list(), class = c("collector_character",
"collector")), X4 = structure(list(), class = c("collector_character",
"collector")), X5 = structure(list(), class = c("collector_character",
"collector")), X6 = structure(list(), class = c("collector_character",
"collector")), X7 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec"))
X1 X2 X3 X4 X5 X6 X7
1 group_no group variable marker1 marker2 marker3 NA
2 1 GR-01-1 a - - A NA
3 1 GR-01-1 b B A B NA
4 1 GR-01-1 c A B - NA
5 1 GR-01-1 4 - - - NA
6 1 GR-01-1 5 B - B NA
7 group_no group variable marker4 marker5 marker6 marker7
8 2 GR-01-1a 1 B - A -
9 2 GR-01-1a 2 B A - B
10 2 GR-01-1a 3 A B - B
# ... with 17 more rows