从 data.frame 中的行中提取 data.frame 列名称
Extract data.frame column names from rows in data.frame
我有几个结构大致相同的 data.frame。为了给出可重现的示例,我创建了两个示例数据框 df1 和 df2。
# Example input 1: an id column, three integer data columns (data1..data3)
# and three factor columns whose single level is the name intended for the
# matching data column. Built with data.frame() so every column is named
# exactly once (the dput form tagged the third data column `data6` while
# `.Names` called it `data3`).
df1 <- data.frame(
  sample = c(2L, 6L),
  data1 = c(56L, 78L),
  data2 = c(59L, 27L),
  data3 = c(90L, 28L),
  data1namet = factor(rep("Sam1", 2L)),
  data2namab = factor(rep("Test2", 2L)),
  dataame = factor(rep("Ex3", 2L)),
  stringsAsFactors = FALSE
)
df1
sample data1 data2 data3 data1namet data2namab dataame
1 2 56 59 90 Sam1 Test2 Ex3
2 6 78 27 28 Sam1 Test2 Ex3
# Example input 2: an id column, two integer data columns and two factor
# columns whose single level is the name intended for data1 and data2.
df2 <- data.frame(
  sample = c(12L, 13L, 17L),
  data1 = c(56L, 78L, 3L),
  data2 = c(59L, 27L, 2L),
  datest = factor(rep("Exa9", 3L)),
  dattestr = factor(rep("cz1", 3L)),
  stringsAsFactors = FALSE
)
df2
sample data1 data2 datest dattestr
1 12 56 59 Exa9 cz1
2 13 78 27 Exa9 cz1
3 17 3 2 Exa9 cz1
数据的名称保存在数据列之后的列中。我想知道是否有办法重构这些 data.frame(大约 40 个),把这些名称用作相应数据列的列名?期望的结果如下:
df1
sample Sam1 Test2 Ex3
1 2 56 59 90
2 6 78 27 28
和
df2
sample Exa9 cz1
1 12 56 59
2 13 78 27
3 17 3 2
编辑
我刚刚意识到我在数据列之后还有其他列,因此我的输入数据看起来像这样
# Edited example input 1: same layout as before (id, three data columns,
# three name columns) plus a trailing character column `ma` that is not part
# of the data/name pairs.
df1 <- data.frame(
  sample = c(2L, 6L),
  data1 = c(56L, 78L),
  data2 = c(59L, 27L),
  data3 = c(90L, 28L),
  data1namet = factor(rep("Sam1", 2L)),
  data2namab = factor(rep("Test2", 2L)),
  dataame = factor(rep("Ex3", 2L)),
  ma = c("Jay", "Jay"),
  stringsAsFactors = FALSE
)
df1
sample data1 data2 data3 data1namet data2namab dataame ma
1 2 56 59 90 Sam1 Test2 Ex3 Jay
2 6 78 27 28 Sam1 Test2 Ex3 Jay
# Edited example input 2: id, two data columns, two name columns, plus a
# trailing numeric (double) column `add` that is not part of the data/name
# pairs.
df2 <- data.frame(
  sample = c(12L, 13L, 17L),
  data1 = c(56L, 78L, 3L),
  data2 = c(59L, 27L, 2L),
  datest = factor(rep("Exa9", 3L)),
  dattestr = factor(rep("cz1", 3L)),
  add = c(2, 2, 2),
  stringsAsFactors = FALSE
)
df2
sample data1 data2 datest dattestr add
1 12 56 59 Exa9 cz1 2
2 13 78 27 Exa9 cz1 2
3 17 3 2 Exa9 cz1 2
在这种情况下,ma
和 add
列不是数据的一部分,应该像这样添加在末尾:
df1
sample Sam1 Test2 Ex3 ma
1 2 56 59 90 Jay
2 6 78 27 28 Jay
和
df2
sample Exa9 cz1 add
1 12 56 59 2
2 13 78 27 2
3 17 3 2 2
可以从确定应保留哪些列开始:
# Positions of the numeric columns (the id column plus the data columns).
# vapply() always yields a logical vector, whereas sapply() returns a list for
# a zero-column data frame and would make which() fail.
keep_col <- which(vapply(df2, is.numeric, logical(1)))
之后,需要做一些工作来提取新的列名并重命名数据框中的相应列:
# Each name column sits (number of numeric columns - 1) positions to the
# right of its data column; the first row of those columns holds the names.
names <- df2[1, keep_col[-1] + (length(keep_col) - 1)]
colnames(df2)[keep_col[-1]] <- as.character(unlist(names, use.names = FALSE))
最后,可以通过仅保留所需的列来重新组合数据框:
# Keep only the numeric columns; drop = FALSE guarantees a data frame even
# when a single column is selected.
df2 <- df2[, keep_col, drop = FALSE]
#> df2
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
为了将此转换用于多个不同的数据帧,可以将代码包装到一个函数中:
#' Rename data columns using the names stored in trailing name columns
#'
#' The first numeric column is left untouched (the id column); every other
#' numeric column is renamed to the first-row value of the column located
#' `(number of numeric columns - 1)` positions to its right. Only the numeric
#' columns are kept in the result.
#'
#' @param x A data.frame laid out as: id column, k numeric data columns,
#'   k name columns (factor or character).
#' @return A data.frame holding the numeric columns of `x`, the data columns
#'   renamed. The result is returned visibly and is always a data.frame, even
#'   when only one column is kept.
summarize_table <- function(x) {
  # vapply() is type-stable; sapply() returns a list for zero-column input.
  keep_col <- which(vapply(x, is.numeric, logical(1)))
  data_col <- keep_col[-1]
  name_col <- data_col + length(keep_col) - 1
  # Read each name column on its own: unlist() over a mix of factor and
  # character columns would produce the factor's integer codes.
  new_names <- vapply(
    x[name_col],
    function(col) as.character(col[1L]),
    character(1),
    USE.NAMES = FALSE
  )
  colnames(x)[data_col] <- new_names
  # drop = FALSE prevents collapsing to a bare vector for a single column.
  x[, keep_col, drop = FALSE]
}
如果各种数据帧存储在列表中,函数summarize_table()
可以与lapply()
一起使用以获得每个数据帧的结果:
# Collect the data frames in a list and run the helper on each element;
# `out` is a list of transformed data frames in the same order.
my_dfs <- list(df1, df2)
out <- lapply(X = my_dfs, FUN = summarize_table)
#> out
#[[1]]
# sample Sam1 Test2 Ex3
#1 2 56 59 90
#2 6 78 27 28
#
#[[2]]
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
编辑/附录
下面的修改版本应该也能处理问题编辑后提到的情况(数据列之后还有其他列):
#' Rename data columns from trailing name columns, keeping extra columns
#'
#' Expected layout: a leading run of adjacent numeric columns (the id column
#' followed by k data columns), then k name columns whose first-row value is
#' the new name of the matching data column, then any number of additional
#' columns of any type. The name columns are dropped; the additional columns
#' are appended unchanged, in their original order.
#'
#' Unlike the earlier version, this handles several trailing numeric columns,
#' a mix of numeric and non-numeric trailing columns, and inputs with no data
#' columns at all.
#'
#' @param x A data.frame in the layout described above.
#' @return A data.frame: id column, renamed data columns, then every column
#'   located after the name columns. If `x` has no numeric column it is
#'   returned unchanged.
summarize_tab2 <- function(x) {
  stopifnot(is.data.frame(x))
  num_col <- unname(which(vapply(x, is.numeric, logical(1))))
  if (length(num_col) == 0L) {
    return(x)
  }

  # Length of the leading run of adjacent numeric columns: the position of the
  # first gap in num_col, or all of num_col when there is no gap.
  run_len <- match(FALSE, diff(num_col) == 1L, nomatch = length(num_col))
  block <- num_col[seq_len(run_len)]
  data_col <- block[-1L]
  # The i-th name column sits k positions to the right of the i-th data column.
  name_col <- data_col + length(data_col)

  if (length(name_col) > 0L) {
    if (max(name_col) > ncol(x)) {
      stop(
        "expected ", length(data_col),
        " name column(s) after the data columns",
        call. = FALSE
      )
    }
    # Per-column as.character() so factor labels (not codes) are used even
    # when factor and character name columns are mixed.
    colnames(x)[data_col] <- vapply(
      x[name_col],
      function(col) as.character(col[1L]),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Everything to the right of the name columns is passed through untouched.
  last_used <- max(block, name_col)
  all_col <- seq_len(ncol(x))
  trailing <- all_col[all_col > last_used]
  x[c(block, trailing)]
}
# Run the extended helper over all four inputs. df3 and df4 are not created
# in this snippet; per the accompanying text they are the original
# (pre-edit) versions of df1 and df2.
my_dfs <- list(df1, df2, df3, df4)
out <- lapply(X = my_dfs, FUN = summarize_tab2)
#> out
#[[1]]
# sample Sam1 Test2 Ex3 ma
#1 2 56 59 90 Jay
#2 6 78 27 28 Jay
#
#[[2]]
# sample Exa9 cz1 add
#1 12 56 59 2
#2 13 78 27 2
#3 17 3 2 2
#
#[[3]]
# sample Sam1 Test2 Ex3
#1 2 56 59 90
#2 6 78 27 28
#
#[[4]]
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
这里的数据框 df3 和 df4 分别是原始问题(编辑之前)中的数据框 df1 和 df2。
以下应该有效:
library(plyr)
# Positions of the data columns: names made of "data" plus exactly one
# character (data1, data2, ...).
cols.to.rename <- grep('^data(.)$', colnames(df1))
# Fail early with a clear condition instead of max(integer(0)) == -Inf.
stopifnot(length(cols.to.rename) > 0L)
# The name columns are assumed to follow the last data column directly, one
# per data column. seq_along() replaces seq(1, n), which counts down when n is 0.
cols.of.names <- max(cols.to.rename) + seq_along(cols.to.rename)
# First-row value of every name column, as a plain character vector.
the.names <- vapply(
  df1[cols.of.names],
  function(col) as.character(col[1L]),
  character(1),
  USE.NAMES = FALSE
)
df1.mod <- df1
colnames(df1.mod)[cols.to.rename] <- the.names
# Drop the name columns now that their values live in the column names.
df1.mod <- df1.mod[-cols.of.names]
它将所有 dataX 列重命名为最后一个 dataX 列之后的列中的(第一个)值。然后它从数据框中删除所有名称列。
我有几个结构大致相同的 data.frame。为了给出可重现的示例,我创建了两个示例数据框 df1 和 df2。
# Example input 1: an id column, three integer data columns (data1..data3)
# and three factor columns whose single level is the name intended for the
# matching data column. Built with data.frame() so every column is named
# exactly once (the dput form tagged the third data column `data6` while
# `.Names` called it `data3`).
df1 <- data.frame(
  sample = c(2L, 6L),
  data1 = c(56L, 78L),
  data2 = c(59L, 27L),
  data3 = c(90L, 28L),
  data1namet = factor(rep("Sam1", 2L)),
  data2namab = factor(rep("Test2", 2L)),
  dataame = factor(rep("Ex3", 2L)),
  stringsAsFactors = FALSE
)
df1
sample data1 data2 data3 data1namet data2namab dataame
1 2 56 59 90 Sam1 Test2 Ex3
2 6 78 27 28 Sam1 Test2 Ex3
# Example input 2: an id column, two integer data columns and two factor
# columns whose single level is the name intended for data1 and data2.
df2 <- data.frame(
  sample = c(12L, 13L, 17L),
  data1 = c(56L, 78L, 3L),
  data2 = c(59L, 27L, 2L),
  datest = factor(rep("Exa9", 3L)),
  dattestr = factor(rep("cz1", 3L)),
  stringsAsFactors = FALSE
)
df2
sample data1 data2 datest dattestr
1 12 56 59 Exa9 cz1
2 13 78 27 Exa9 cz1
3 17 3 2 Exa9 cz1
数据的名称保存在数据列之后的列中。我想知道是否有办法重构这些 data.frame(大约 40 个),把这些名称用作相应数据列的列名?期望的结果如下:
df1
sample Sam1 Test2 Ex3
1 2 56 59 90
2 6 78 27 28
和
df2
sample Exa9 cz1
1 12 56 59
2 13 78 27
3 17 3 2
编辑
我刚刚意识到我在数据列之后还有其他列,因此我的输入数据看起来像这样
# Edited example input 1: same layout as before (id, three data columns,
# three name columns) plus a trailing character column `ma` that is not part
# of the data/name pairs.
df1 <- data.frame(
  sample = c(2L, 6L),
  data1 = c(56L, 78L),
  data2 = c(59L, 27L),
  data3 = c(90L, 28L),
  data1namet = factor(rep("Sam1", 2L)),
  data2namab = factor(rep("Test2", 2L)),
  dataame = factor(rep("Ex3", 2L)),
  ma = c("Jay", "Jay"),
  stringsAsFactors = FALSE
)
df1
sample data1 data2 data3 data1namet data2namab dataame ma
1 2 56 59 90 Sam1 Test2 Ex3 Jay
2 6 78 27 28 Sam1 Test2 Ex3 Jay
# Edited example input 2: id, two data columns, two name columns, plus a
# trailing numeric (double) column `add` that is not part of the data/name
# pairs.
df2 <- data.frame(
  sample = c(12L, 13L, 17L),
  data1 = c(56L, 78L, 3L),
  data2 = c(59L, 27L, 2L),
  datest = factor(rep("Exa9", 3L)),
  dattestr = factor(rep("cz1", 3L)),
  add = c(2, 2, 2),
  stringsAsFactors = FALSE
)
df2
sample data1 data2 datest dattestr add
1 12 56 59 Exa9 cz1 2
2 13 78 27 Exa9 cz1 2
3 17 3 2 Exa9 cz1 2
在这种情况下,ma
和 add
列不是数据的一部分,应该像这样添加在末尾:
df1
sample Sam1 Test2 Ex3 ma
1 2 56 59 90 Jay
2 6 78 27 28 Jay
和
df2
sample Exa9 cz1 add
1 12 56 59 2
2 13 78 27 2
3 17 3 2 2
可以从确定应保留哪些列开始:
# Positions of the numeric columns (the id column plus the data columns).
# vapply() always yields a logical vector, whereas sapply() returns a list for
# a zero-column data frame and would make which() fail.
keep_col <- which(vapply(df2, is.numeric, logical(1)))
之后,需要做一些工作来提取新的列名并重命名数据框中的相应列:
# Each name column sits (number of numeric columns - 1) positions to the
# right of its data column; the first row of those columns holds the names.
names <- df2[1, keep_col[-1] + (length(keep_col) - 1)]
colnames(df2)[keep_col[-1]] <- as.character(unlist(names, use.names = FALSE))
最后,可以通过仅保留所需的列来重新组合数据框:
# Keep only the numeric columns; drop = FALSE guarantees a data frame even
# when a single column is selected.
df2 <- df2[, keep_col, drop = FALSE]
#> df2
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
为了将此转换用于多个不同的数据帧,可以将代码包装到一个函数中:
#' Rename data columns using the names stored in trailing name columns
#'
#' The first numeric column is left untouched (the id column); every other
#' numeric column is renamed to the first-row value of the column located
#' `(number of numeric columns - 1)` positions to its right. Only the numeric
#' columns are kept in the result.
#'
#' @param x A data.frame laid out as: id column, k numeric data columns,
#'   k name columns (factor or character).
#' @return A data.frame holding the numeric columns of `x`, the data columns
#'   renamed. The result is returned visibly and is always a data.frame, even
#'   when only one column is kept.
summarize_table <- function(x) {
  # vapply() is type-stable; sapply() returns a list for zero-column input.
  keep_col <- which(vapply(x, is.numeric, logical(1)))
  data_col <- keep_col[-1]
  name_col <- data_col + length(keep_col) - 1
  # Read each name column on its own: unlist() over a mix of factor and
  # character columns would produce the factor's integer codes.
  new_names <- vapply(
    x[name_col],
    function(col) as.character(col[1L]),
    character(1),
    USE.NAMES = FALSE
  )
  colnames(x)[data_col] <- new_names
  # drop = FALSE prevents collapsing to a bare vector for a single column.
  x[, keep_col, drop = FALSE]
}
如果各种数据帧存储在列表中,函数summarize_table()
可以与lapply()
一起使用以获得每个数据帧的结果:
# Collect the data frames in a list and run the helper on each element;
# `out` is a list of transformed data frames in the same order.
my_dfs <- list(df1, df2)
out <- lapply(X = my_dfs, FUN = summarize_table)
#> out
#[[1]]
# sample Sam1 Test2 Ex3
#1 2 56 59 90
#2 6 78 27 28
#
#[[2]]
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
编辑/附录
下面的修改版本应该也能处理问题编辑后提到的情况(数据列之后还有其他列):
#' Rename data columns from trailing name columns, keeping extra columns
#'
#' Expected layout: a leading run of adjacent numeric columns (the id column
#' followed by k data columns), then k name columns whose first-row value is
#' the new name of the matching data column, then any number of additional
#' columns of any type. The name columns are dropped; the additional columns
#' are appended unchanged, in their original order.
#'
#' Unlike the earlier version, this handles several trailing numeric columns,
#' a mix of numeric and non-numeric trailing columns, and inputs with no data
#' columns at all.
#'
#' @param x A data.frame in the layout described above.
#' @return A data.frame: id column, renamed data columns, then every column
#'   located after the name columns. If `x` has no numeric column it is
#'   returned unchanged.
summarize_tab2 <- function(x) {
  stopifnot(is.data.frame(x))
  num_col <- unname(which(vapply(x, is.numeric, logical(1))))
  if (length(num_col) == 0L) {
    return(x)
  }

  # Length of the leading run of adjacent numeric columns: the position of the
  # first gap in num_col, or all of num_col when there is no gap.
  run_len <- match(FALSE, diff(num_col) == 1L, nomatch = length(num_col))
  block <- num_col[seq_len(run_len)]
  data_col <- block[-1L]
  # The i-th name column sits k positions to the right of the i-th data column.
  name_col <- data_col + length(data_col)

  if (length(name_col) > 0L) {
    if (max(name_col) > ncol(x)) {
      stop(
        "expected ", length(data_col),
        " name column(s) after the data columns",
        call. = FALSE
      )
    }
    # Per-column as.character() so factor labels (not codes) are used even
    # when factor and character name columns are mixed.
    colnames(x)[data_col] <- vapply(
      x[name_col],
      function(col) as.character(col[1L]),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Everything to the right of the name columns is passed through untouched.
  last_used <- max(block, name_col)
  all_col <- seq_len(ncol(x))
  trailing <- all_col[all_col > last_used]
  x[c(block, trailing)]
}
# Run the extended helper over all four inputs. df3 and df4 are not created
# in this snippet; per the accompanying text they are the original
# (pre-edit) versions of df1 and df2.
my_dfs <- list(df1, df2, df3, df4)
out <- lapply(X = my_dfs, FUN = summarize_tab2)
#> out
#[[1]]
# sample Sam1 Test2 Ex3 ma
#1 2 56 59 90 Jay
#2 6 78 27 28 Jay
#
#[[2]]
# sample Exa9 cz1 add
#1 12 56 59 2
#2 13 78 27 2
#3 17 3 2 2
#
#[[3]]
# sample Sam1 Test2 Ex3
#1 2 56 59 90
#2 6 78 27 28
#
#[[4]]
# sample Exa9 cz1
#1 12 56 59
#2 13 78 27
#3 17 3 2
这里的数据框 df3 和 df4 分别是原始问题(编辑之前)中的数据框 df1 和 df2。
以下应该有效:
library(plyr)
# Positions of the data columns: names made of "data" plus exactly one
# character (data1, data2, ...).
cols.to.rename <- grep('^data(.)$', colnames(df1))
# Fail early with a clear condition instead of max(integer(0)) == -Inf.
stopifnot(length(cols.to.rename) > 0L)
# The name columns are assumed to follow the last data column directly, one
# per data column. seq_along() replaces seq(1, n), which counts down when n is 0.
cols.of.names <- max(cols.to.rename) + seq_along(cols.to.rename)
# First-row value of every name column, as a plain character vector.
the.names <- vapply(
  df1[cols.of.names],
  function(col) as.character(col[1L]),
  character(1),
  USE.NAMES = FALSE
)
df1.mod <- df1
colnames(df1.mod)[cols.to.rename] <- the.names
# Drop the name columns now that their values live in the column names.
df1.mod <- df1.mod[-cols.of.names]
它将所有 dataX 列重命名为最后一个 dataX 列之后的列中的(第一个)值。然后它从数据框中删除所有名称列。