循环数据帧以创建新变量
Loop over dataframes to create new variable
我有 30 个数据帧,我需要根据一些条件在每个数据帧中创建一个新变量。
我尝试用 for 循环来实现,但没有成功。我在这里搜索过一些例子,仍然没能解决。能帮帮我吗?
我所做的是:
# Asker's attempt (kept exactly as posted): add a binned column 'Dist2' to
# each of 30 data frames, where Dist <= 10000 -> 1, (10000, 20000] -> 2, ...,
# (90000, 100000] -> 10, and anything else -> NA.
#
# NOTE: 'dflist' holds the *names* of the data frames as character strings,
# not the data frames themselves.
dflist <- c("chr1", "chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10",
"chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18",
"chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26","chr27",
"chr28","chr29","chr30")
# Each 'df' is therefore a length-1 character vector such as "chr1".
for (df in dflist){
# length(df) is 1 here; on a real data frame length() would count columns,
# not rows.
for (i in 1:length(df)){
# 'df[i,]' is two-dimensional indexing applied to a plain character vector,
# which R rejects, so this loop cannot work as written. Assigning into the
# loop variable 'df' would in any case not modify the chr1..chr30 objects.
df[i,]$Dist2 = ifelse(df[i,]$Dist <= 10000,1,
ifelse(df[i,]$Dist > 10000 & df[i,]$Dist <= 20000 ,2,
ifelse(df[i,]$Dist > 20000 & df[i,]$Dist <= 30000 ,3,
ifelse(df[i,]$Dist > 30000 & df[i,]$Dist <= 40000 ,4,
ifelse(df[i,]$Dist > 40000 & df[i,]$Dist <= 50000 ,5,
ifelse(df[i,]$Dist > 50000 & df[i,]$Dist <= 60000 ,6,
ifelse(df[i,]$Dist > 60000 & df[i,]$Dist <= 70000 ,7,
ifelse(df[i,]$Dist > 70000 & df[i,]$Dist <= 80000 ,8,
ifelse(df[i,]$Dist > 80000 & df[i,]$Dist <= 90000 ,9,
ifelse(df[i,]$Dist > 90000 & df[i,]$Dist <= 100000 ,10,NA))))))))))}}
每个文件如下所示:
Chr SNP1 SNP2 Dist Sign r2
1 26 507478 507479 9727 + 0.789
2 26 507478 507480 13907 - 0.093
3 26 507478 507481 23618 - 0.002
4 26 507478 507482 59349 - 0.245
5 26 507478 507483 62804 + 0.266
6 26 507478 507484 65323 + 0.029
非常感谢。
干杯。
宝拉
我们可以使用 cut。我们将各个 data.frame 保存在一个 list('lst')中,用 lapply 遍历该 list,并通过 transform 为每个 data.frame 用 cut 创建新的一列 'Dist2'。
# Add a factor column 'Dist2' to every data.frame in 'lst': 'Dist' is binned
# into 10,000-wide intervals labelled 1..10, with (-Inf, 1e4] as bin 1 and
# (9e4, 1e5] as bin 10.
# The breaks deliberately stop at 1e5: cut() returns a genuine NA for values
# that fall outside the breaks, so Dist > 1e5 is reported as missing by
# is.na(). Adding a final (1e5, Inf] interval labelled NA would instead
# create a factor *level* named NA, which is.na() does not detect.
lapply(lst, transform, Dist2 = cut(Dist,
    breaks = c(-Inf, seq(1e4, 1e5, by = 1e4)), labels = 1:10))
#[[1]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
#1 26 507478 507479 123300 + 0.789 <NA>
#2 26 507478 507480 13907 - 0.093 2
#3 26 507478 507481 23618 - 0.002 3
#4 26 507478 507482 59349 - 0.245 6
#5 26 507478 507483 62804 + 0.266 7
#6 26 507478 507484 65323 + 0.029 7
#[[2]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
#1 26 507478 507479 9727 + 0.789 1
#2 26 507478 507480 13907 - 0.093 2
#3 26 507478 507481 23618 - 0.002 3
#4 26 507478 507482 59349 - 0.245 6
#5 26 507478 507483 62804 + 0.266 7
#6 26 507478 507484 65323 + 0.029 7
数据
# Example input: a list of two data.frames with the same six columns and
# character row names "1".."6".
# The first stores 'Dist' as double (its first value, 123300, exceeds 1e5);
# the second stores 'Dist' as integer.
lst <- list(
  structure(
    list(
      Chr  = rep(26L, 6L),
      SNP1 = rep(507478L, 6L),
      SNP2 = 507479:507484,
      Dist = c(123300, 13907, 23618, 59349, 62804, 65323),
      Sign = c("+", "-", "-", "-", "+", "+"),
      r2   = c(0.789, 0.093, 0.002, 0.245, 0.266, 0.029)
    ),
    row.names = as.character(1:6),
    class = "data.frame"
  ),
  structure(
    list(
      Chr  = rep(26L, 6L),
      SNP1 = rep(507478L, 6L),
      SNP2 = 507479:507484,
      Dist = c(9727L, 13907L, 23618L, 59349L, 62804L, 65323L),
      Sign = c("+", "-", "-", "-", "+", "+"),
      r2   = c(0.789, 0.093, 0.002, 0.245, 0.266, 0.029)
    ),
    row.names = as.character(1:6),
    class = "data.frame"
  )
)
使用 akrun 回答中的 lst 数据,下面是另一种使用 ceiling() 的可行方法:
## Add a numeric 'Dist2' column (bin 1..10, NA above 100,000) to every
## data.frame in 'lst'; returns a new list and leaves 'lst' unchanged.
lapply(lst, function(x) {
  ## divide 'Dist' by 10,000 and push to the next integer
  y <- ceiling(x$Dist / 1e4)
  ## a 'Dist' of 0 (or below) would otherwise land in bin 0 or lower, while
  ## the question's rule maps every Dist <= 10000 to 1
  y <- pmax(y, 1)
  ## replace the values over 10 with NA
  is.na(y) <- y > 10
  ## bind the data to the new vector
  cbind(x, Dist2 = y)
})
# [[1]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
# 1 26 507478 507479 123300 + 0.789 NA
# 2 26 507478 507480 13907 - 0.093 2
# 3 26 507478 507481 23618 - 0.002 3
# 4 26 507478 507482 59349 - 0.245 6
# 5 26 507478 507483 62804 + 0.266 7
# 6 26 507478 507484 65323 + 0.029 7
#
# [[2]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
# 1 26 507478 507479 9727 + 0.789 1
# 2 26 507478 507480 13907 - 0.093 2
# 3 26 507478 507481 23618 - 0.002 3
# 4 26 507478 507482 59349 - 0.245 6
# 5 26 507478 507483 62804 + 0.266 7
# 6 26 507478 507484 65323 + 0.029 7
我有 30 个数据帧,我需要根据一些条件在每个数据帧中创建一个新变量。 我尝试用 for 循环来实现,但没有成功。我在这里搜索过一些例子,仍然没能解决。能帮帮我吗?
我所做的是:
# Asker's attempt (kept exactly as posted): add a binned column 'Dist2' to
# each of 30 data frames, where Dist <= 10000 -> 1, (10000, 20000] -> 2, ...,
# (90000, 100000] -> 10, and anything else -> NA.
#
# NOTE: 'dflist' holds the *names* of the data frames as character strings,
# not the data frames themselves.
dflist <- c("chr1", "chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10",
"chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18",
"chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26","chr27",
"chr28","chr29","chr30")
# Each 'df' is therefore a length-1 character vector such as "chr1".
for (df in dflist){
# length(df) is 1 here; on a real data frame length() would count columns,
# not rows.
for (i in 1:length(df)){
# 'df[i,]' is two-dimensional indexing applied to a plain character vector,
# which R rejects, so this loop cannot work as written. Assigning into the
# loop variable 'df' would in any case not modify the chr1..chr30 objects.
df[i,]$Dist2 = ifelse(df[i,]$Dist <= 10000,1,
ifelse(df[i,]$Dist > 10000 & df[i,]$Dist <= 20000 ,2,
ifelse(df[i,]$Dist > 20000 & df[i,]$Dist <= 30000 ,3,
ifelse(df[i,]$Dist > 30000 & df[i,]$Dist <= 40000 ,4,
ifelse(df[i,]$Dist > 40000 & df[i,]$Dist <= 50000 ,5,
ifelse(df[i,]$Dist > 50000 & df[i,]$Dist <= 60000 ,6,
ifelse(df[i,]$Dist > 60000 & df[i,]$Dist <= 70000 ,7,
ifelse(df[i,]$Dist > 70000 & df[i,]$Dist <= 80000 ,8,
ifelse(df[i,]$Dist > 80000 & df[i,]$Dist <= 90000 ,9,
ifelse(df[i,]$Dist > 90000 & df[i,]$Dist <= 100000 ,10,NA))))))))))}}
每个文件如下所示:
Chr SNP1 SNP2 Dist Sign r2
1 26 507478 507479 9727 + 0.789
2 26 507478 507480 13907 - 0.093
3 26 507478 507481 23618 - 0.002
4 26 507478 507482 59349 - 0.245
5 26 507478 507483 62804 + 0.266
6 26 507478 507484 65323 + 0.029
非常感谢。 干杯。 宝拉
我们可以使用 cut。我们将各个 data.frame 保存在一个 list('lst')中,用 lapply 遍历该 list,并通过 transform 为每个 data.frame 用 cut 创建新的一列 'Dist2'。
# Add a factor column 'Dist2' to every data.frame in 'lst': 'Dist' is binned
# into 10,000-wide intervals labelled 1..10, with (-Inf, 1e4] as bin 1 and
# (9e4, 1e5] as bin 10.
# The breaks deliberately stop at 1e5: cut() returns a genuine NA for values
# that fall outside the breaks, so Dist > 1e5 is reported as missing by
# is.na(). Adding a final (1e5, Inf] interval labelled NA would instead
# create a factor *level* named NA, which is.na() does not detect.
lapply(lst, transform, Dist2 = cut(Dist,
    breaks = c(-Inf, seq(1e4, 1e5, by = 1e4)), labels = 1:10))
#[[1]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
#1 26 507478 507479 123300 + 0.789 <NA>
#2 26 507478 507480 13907 - 0.093 2
#3 26 507478 507481 23618 - 0.002 3
#4 26 507478 507482 59349 - 0.245 6
#5 26 507478 507483 62804 + 0.266 7
#6 26 507478 507484 65323 + 0.029 7
#[[2]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
#1 26 507478 507479 9727 + 0.789 1
#2 26 507478 507480 13907 - 0.093 2
#3 26 507478 507481 23618 - 0.002 3
#4 26 507478 507482 59349 - 0.245 6
#5 26 507478 507483 62804 + 0.266 7
#6 26 507478 507484 65323 + 0.029 7
数据
# Example input: a list of two data.frames with the same six columns and
# character row names "1".."6".
# The first stores 'Dist' as double (its first value, 123300, exceeds 1e5);
# the second stores 'Dist' as integer.
lst <- list(
  structure(
    list(
      Chr  = rep(26L, 6L),
      SNP1 = rep(507478L, 6L),
      SNP2 = 507479:507484,
      Dist = c(123300, 13907, 23618, 59349, 62804, 65323),
      Sign = c("+", "-", "-", "-", "+", "+"),
      r2   = c(0.789, 0.093, 0.002, 0.245, 0.266, 0.029)
    ),
    row.names = as.character(1:6),
    class = "data.frame"
  ),
  structure(
    list(
      Chr  = rep(26L, 6L),
      SNP1 = rep(507478L, 6L),
      SNP2 = 507479:507484,
      Dist = c(9727L, 13907L, 23618L, 59349L, 62804L, 65323L),
      Sign = c("+", "-", "-", "-", "+", "+"),
      r2   = c(0.789, 0.093, 0.002, 0.245, 0.266, 0.029)
    ),
    row.names = as.character(1:6),
    class = "data.frame"
  )
)
使用 akrun 回答中的 lst 数据,下面是另一种使用 ceiling() 的可行方法:
## Add a numeric 'Dist2' column (bin 1..10, NA above 100,000) to every
## data.frame in 'lst'; returns a new list and leaves 'lst' unchanged.
lapply(lst, function(x) {
  ## divide 'Dist' by 10,000 and push to the next integer
  y <- ceiling(x$Dist / 1e4)
  ## a 'Dist' of 0 (or below) would otherwise land in bin 0 or lower, while
  ## the question's rule maps every Dist <= 10000 to 1
  y <- pmax(y, 1)
  ## replace the values over 10 with NA
  is.na(y) <- y > 10
  ## bind the data to the new vector
  cbind(x, Dist2 = y)
})
# [[1]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
# 1 26 507478 507479 123300 + 0.789 NA
# 2 26 507478 507480 13907 - 0.093 2
# 3 26 507478 507481 23618 - 0.002 3
# 4 26 507478 507482 59349 - 0.245 6
# 5 26 507478 507483 62804 + 0.266 7
# 6 26 507478 507484 65323 + 0.029 7
#
# [[2]]
# Chr SNP1 SNP2 Dist Sign r2 Dist2
# 1 26 507478 507479 9727 + 0.789 1
# 2 26 507478 507480 13907 - 0.093 2
# 3 26 507478 507481 23618 - 0.002 3
# 4 26 507478 507482 59349 - 0.245 6
# 5 26 507478 507483 62804 + 0.266 7
# 6 26 507478 507484 65323 + 0.029 7