使用重复列名称拆分数据框
split data frame with recurring column names
我有一个大型数据集,它是由多个列名相同的数据框拼接而成的。为了说明,它看起来大致是这样的:
# Illustrative data: every column mixes random numbers with its own column
# name, so `c()` coerces each column to character. The rows holding the
# column names (rows 4 and 9) mark where a new sub-table begins.
# `tibble()` is the supported constructor; `data_frame()` is deprecated.
df <- tibble::tibble(
  A = c(runif(3), "A", runif(4), "A", runif(5)),
  B = c(runif(3), "B", runif(4), "B", runif(5)),
  C = c(runif(3), "C", runif(4), "C", runif(5)),
  D = c(runif(3), "D", runif(4), "D", runif(5))
)
如何在每次出现包含原始列名的行时开始一个新的数据框,从而得到如下所示的结果:
df1:
A B C D id
0.668 0.411 0.553 0.477 1
0.794 0.821 0.530 0.732 1
0.108 0.647 0.789 0.693 1
df2:
A B C D id
0.724 0.783 0.023 0.478 2
0.861 0.099 0.407 0.332 2
0.438 0.316 0.913 0.651 2
0.245 0.519 0.294 0.258 2
0.070 0.662 0.459 0.479 2
0.766 0.839 0.892 0.961 2
df3:
A B C D id
0.084 0.347 0.864 0.435 3
0.875 0.334 0.390 0.713 3
0.339 0.476 0.777 0.400 3
0.084 0.347 0.864 0.435 3
谢谢!
一种方法是在数据框中找到与列名相同的行,并以这些行为界进行拆分。我们可以使用 `group_split` 将数据拆分为数据框列表,再用 `map_at` 从除第一个之外的每个列表元素中删除第一行(因为它是重复的列名),最后用 `type.convert` 将各列转换为相应的类型。
library(dplyr)
library(purrr)
# Start a new group at every row whose `A` value equals the first column
# name (i.e. a repeated header row), then split into one tibble per group.
temp <- df %>%
  group_split(id = cumsum(A == names(.)[1]) + 1) %>%
  # Every piece except the first begins with the header row; drop it.
  map_at(-1, tail, -1) %>%
  # Re-infer column types. `as.is = TRUE` is passed explicitly because
  # `type.convert()` warns in R >= 4.0 when it is left unspecified.
  map(type.convert, as.is = TRUE)
temp
#[[1]]
# A tibble: 3 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.668 0.411 0.553 0.477 1
#2 0.794 0.821 0.530 0.732 1
#3 0.108 0.647 0.789 0.693 1
#[[2]]
# A tibble: 6 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.724 0.783 0.023 0.478 2
#2 0.861 0.099 0.407 0.332 2
#3 0.438 0.316 0.913 0.651 2
#4 0.245 0.519 0.294 0.258 2
#5 0.07 0.662 0.459 0.479 2
#6 0.766 0.839 0.892 0.961 2
#[[3]]
# A tibble: 4 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.084 0.347 0.864 0.435 3
#2 0.875 0.334 0.39 0.713 3
#3 0.339 0.476 0.777 0.4 3
#4 0.084 0.347 0.864 0.435 3
在 base R 中使用相同的逻辑,可以这样写:
# Label each row with a running group number that increases at every row
# whose `A` value equals the first column name (a repeated header row).
df$id <- cumsum(df$A == names(df)[1]) + 1
# One data frame per group.
temp <- split(df, df$id)
# Every piece except the first starts with the header row; drop it.
temp[-1] <- lapply(temp[-1], tail, -1)
# Re-infer column types. `as.is = TRUE` is passed explicitly because
# `type.convert()` warns in R >= 4.0 when it is left unspecified.
temp <- lapply(temp, type.convert, as.is = TRUE)
如果您需要把它们作为单独的数据框放入全局环境,
# Label the pieces df1, df2, ... and copy each one into the global
# environment as a variable of that name.
temp <- setNames(temp, paste0("df", seq_along(temp)))
list2env(temp, envir = .GlobalEnv)
数据
# Reproducible sample data (dput output): a 15-row data.frame whose columns
# A-D are factors. Rows 4 and 11 hold the column names "A".."D" again,
# i.e. the repeated header rows on which the data is split.
df <- structure(list(A = structure(c(7L, 10L, 3L, 13L, 8L, 11L, 6L,
4L, 1L, 9L, 13L, 2L, 12L, 5L, 2L), .Label = c("0.070", "0.084",
"0.108", "0.245", "0.339", "0.438", "0.668", "0.724", "0.766",
"0.794", "0.861", "0.875", "A"), class = "factor"), B = structure(c(5L,
11L, 8L, 13L, 10L, 1L, 2L, 7L, 9L, 12L, 13L, 4L, 3L, 6L, 4L), .Label = c("0.099",
"0.316", "0.334", "0.347", "0.411", "0.476", "0.519", "0.647",
"0.662", "0.783", "0.821", "0.839", "B"), class = "factor"),
C = structure(c(7L, 6L, 9L, 13L, 1L, 4L, 12L, 2L, 5L, 11L,
13L, 10L, 3L, 8L, 10L), .Label = c("0.023", "0.294", "0.390",
"0.407", "0.459", "0.530", "0.553", "0.777", "0.789", "0.864",
"0.892", "0.913", "C"), class = "factor"), D = structure(c(5L,
11L, 9L, 13L, 6L, 2L, 8L, 1L, 7L, 12L, 13L, 4L, 10L, 3L,
4L), .Label = c("0.258", "0.332", "0.400", "0.435", "0.477",
"0.478", "0.479", "0.651", "0.693", "0.713", "0.732", "0.961",
"D"), class = "factor")), class = "data.frame", row.names = c(NA, -15L))
这是一个使用 `apply()` + `split()` 的 base R 解决方案:
# Flag the rows in which every cell equals its own column name, i.e. the
# repeated header rows. `isTRUE()` turns an NA comparison (from a missing
# cell) into FALSE so `idx` is always a clean logical vector.
idx <- apply(df, 1, function(v) isTRUE(all(v == names(df))))
# Drop the header rows and tag each remaining row with its group number
# (1 + the number of header rows seen so far). Logical negation is used
# instead of `-which(idx)`, which would select zero rows when the data
# contains no header row at all.
df <- within(df[!idx, ], id <- (cumsum(idx) + 1)[!idx])
# One data frame per group, named by the group number.
res <- split(df, df$id)
这样
> res
$`1`
A B C D id
1 0.668 0.411 0.553 0.477 1
2 0.794 0.821 0.530 0.732 1
3 0.108 0.647 0.789 0.693 1
$`2`
A B C D id
5 0.724 0.783 0.023 0.478 2
6 0.861 0.099 0.407 0.332 2
7 0.438 0.316 0.913 0.651 2
8 0.245 0.519 0.294 0.258 2
9 0.070 0.662 0.459 0.479 2
10 0.766 0.839 0.892 0.961 2
$`3`
A B C D id
12 0.084 0.347 0.864 0.435 3
13 0.875 0.334 0.390 0.713 3
14 0.339 0.476 0.777 0.400 3
15 0.084 0.347 0.864 0.435 3
我有一个大型数据集,它是由多个列名相同的数据框拼接而成的。为了说明,它看起来大致是这样的:
# Illustrative data: every column mixes random numbers with its own column
# name, so `c()` coerces each column to character. The rows holding the
# column names (rows 4 and 9) mark where a new sub-table begins.
# `tibble()` is the supported constructor; `data_frame()` is deprecated.
df <- tibble::tibble(
  A = c(runif(3), "A", runif(4), "A", runif(5)),
  B = c(runif(3), "B", runif(4), "B", runif(5)),
  C = c(runif(3), "C", runif(4), "C", runif(5)),
  D = c(runif(3), "D", runif(4), "D", runif(5))
)
如何在每次出现包含原始列名的行时开始一个新的数据框,从而得到如下所示的结果:
df1:
A B C D id
0.668 0.411 0.553 0.477 1
0.794 0.821 0.530 0.732 1
0.108 0.647 0.789 0.693 1
df2:
A B C D id
0.724 0.783 0.023 0.478 2
0.861 0.099 0.407 0.332 2
0.438 0.316 0.913 0.651 2
0.245 0.519 0.294 0.258 2
0.070 0.662 0.459 0.479 2
0.766 0.839 0.892 0.961 2
df3:
A B C D id
0.084 0.347 0.864 0.435 3
0.875 0.334 0.390 0.713 3
0.339 0.476 0.777 0.400 3
0.084 0.347 0.864 0.435 3
谢谢!
一种方法是在数据框中找到与列名相同的行,并以这些行为界进行拆分。我们可以使用 `group_split` 将数据拆分为数据框列表,再用 `map_at` 从除第一个之外的每个列表元素中删除第一行(因为它是重复的列名),最后用 `type.convert` 将各列转换为相应的类型。
library(dplyr)
library(purrr)
# Start a new group at every row whose `A` value equals the first column
# name (i.e. a repeated header row), then split into one tibble per group.
temp <- df %>%
  group_split(id = cumsum(A == names(.)[1]) + 1) %>%
  # Every piece except the first begins with the header row; drop it.
  map_at(-1, tail, -1) %>%
  # Re-infer column types. `as.is = TRUE` is passed explicitly because
  # `type.convert()` warns in R >= 4.0 when it is left unspecified.
  map(type.convert, as.is = TRUE)
temp
#[[1]]
# A tibble: 3 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.668 0.411 0.553 0.477 1
#2 0.794 0.821 0.530 0.732 1
#3 0.108 0.647 0.789 0.693 1
#[[2]]
# A tibble: 6 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.724 0.783 0.023 0.478 2
#2 0.861 0.099 0.407 0.332 2
#3 0.438 0.316 0.913 0.651 2
#4 0.245 0.519 0.294 0.258 2
#5 0.07 0.662 0.459 0.479 2
#6 0.766 0.839 0.892 0.961 2
#[[3]]
# A tibble: 4 x 5
# A B C D id
# <dbl> <dbl> <dbl> <dbl> <int>
#1 0.084 0.347 0.864 0.435 3
#2 0.875 0.334 0.39 0.713 3
#3 0.339 0.476 0.777 0.4 3
#4 0.084 0.347 0.864 0.435 3
在 base R 中使用相同的逻辑,可以这样写:
# Label each row with a running group number that increases at every row
# whose `A` value equals the first column name (a repeated header row).
df$id <- cumsum(df$A == names(df)[1]) + 1
# One data frame per group.
temp <- split(df, df$id)
# Every piece except the first starts with the header row; drop it.
temp[-1] <- lapply(temp[-1], tail, -1)
# Re-infer column types. `as.is = TRUE` is passed explicitly because
# `type.convert()` warns in R >= 4.0 when it is left unspecified.
temp <- lapply(temp, type.convert, as.is = TRUE)
如果您需要把它们作为单独的数据框放入全局环境,
# Label the pieces df1, df2, ... and copy each one into the global
# environment as a variable of that name.
temp <- setNames(temp, paste0("df", seq_along(temp)))
list2env(temp, envir = .GlobalEnv)
数据
# Reproducible sample data (dput output): a 15-row data.frame whose columns
# A-D are factors. Rows 4 and 11 hold the column names "A".."D" again,
# i.e. the repeated header rows on which the data is split.
df <- structure(list(A = structure(c(7L, 10L, 3L, 13L, 8L, 11L, 6L,
4L, 1L, 9L, 13L, 2L, 12L, 5L, 2L), .Label = c("0.070", "0.084",
"0.108", "0.245", "0.339", "0.438", "0.668", "0.724", "0.766",
"0.794", "0.861", "0.875", "A"), class = "factor"), B = structure(c(5L,
11L, 8L, 13L, 10L, 1L, 2L, 7L, 9L, 12L, 13L, 4L, 3L, 6L, 4L), .Label = c("0.099",
"0.316", "0.334", "0.347", "0.411", "0.476", "0.519", "0.647",
"0.662", "0.783", "0.821", "0.839", "B"), class = "factor"),
C = structure(c(7L, 6L, 9L, 13L, 1L, 4L, 12L, 2L, 5L, 11L,
13L, 10L, 3L, 8L, 10L), .Label = c("0.023", "0.294", "0.390",
"0.407", "0.459", "0.530", "0.553", "0.777", "0.789", "0.864",
"0.892", "0.913", "C"), class = "factor"), D = structure(c(5L,
11L, 9L, 13L, 6L, 2L, 8L, 1L, 7L, 12L, 13L, 4L, 10L, 3L,
4L), .Label = c("0.258", "0.332", "0.400", "0.435", "0.477",
"0.478", "0.479", "0.651", "0.693", "0.713", "0.732", "0.961",
"D"), class = "factor")), class = "data.frame", row.names = c(NA, -15L))
这是一个使用 `apply()` + `split()` 的 base R 解决方案:
# Flag the rows in which every cell equals its own column name, i.e. the
# repeated header rows. `isTRUE()` turns an NA comparison (from a missing
# cell) into FALSE so `idx` is always a clean logical vector.
idx <- apply(df, 1, function(v) isTRUE(all(v == names(df))))
# Drop the header rows and tag each remaining row with its group number
# (1 + the number of header rows seen so far). Logical negation is used
# instead of `-which(idx)`, which would select zero rows when the data
# contains no header row at all.
df <- within(df[!idx, ], id <- (cumsum(idx) + 1)[!idx])
# One data frame per group, named by the group number.
res <- split(df, df$id)
这样
> res
$`1`
A B C D id
1 0.668 0.411 0.553 0.477 1
2 0.794 0.821 0.530 0.732 1
3 0.108 0.647 0.789 0.693 1
$`2`
A B C D id
5 0.724 0.783 0.023 0.478 2
6 0.861 0.099 0.407 0.332 2
7 0.438 0.316 0.913 0.651 2
8 0.245 0.519 0.294 0.258 2
9 0.070 0.662 0.459 0.479 2
10 0.766 0.839 0.892 0.961 2
$`3`
A B C D id
12 0.084 0.347 0.864 0.435 3
13 0.875 0.334 0.390 0.713 3
14 0.339 0.476 0.777 0.400 3
15 0.084 0.347 0.864 0.435 3