寻求一种更简洁的方法来避免嵌套的 if 语句(使用 sapply)
Seeking a cleaner way to avoid nested if statements (with sapply)
我有2个数据框
# Per-country thresholds: column A/B/C holds the lower bound of that class.
lookup_table <- data.frame(
  Country = c("UK", "France", "Germany"),
  A = c(0, 0, 1),
  B = c(1, 6, 7),
  C = c(4, 8, 9)
)

# Fix the RNG seed so the random Values below are reproducible.
set.seed(123)

# Nine observations, each tagged with a country and a uniform draw in [1, 10].
df <- data.frame(
  Country = c(
    "UK", "UK", "France", "France", "Germany",
    "Germany", "Germany", "France", "UK"
  ),
  Values = runif(9, min = 1, max = 10)
)
我想在 df 中添加第 3 列,根据第 2 列中的值和所属国家分配 Class。
类似于下面的代码,但阈值不应写死:它们应取决于查找表 lookup_table 中对应国家的值。
# Map numeric values to a class label using fixed, right-closed intervals:
#   (0, 1] -> "A", (1, 4] -> "B", (4, Inf] -> "C".
#
# x: numeric vector (a single value works too).
# Returns a character vector the same length as x. Values outside every
# interval (x <= 0) and NA inputs yield NA_character_ instead of raising
# "object 'y' not found".
Class <- function(x) {
  # cut() uses right-closed intervals by default, matching `> lower & <= upper`.
  as.character(cut(x, breaks = c(0, 1, 4, Inf), labels = c("A", "B", "C")))
}
# vapply pins the result to one character value per element, so the column
# type cannot silently change (e.g. to a list) the way it can with sapply.
df$Class <- vapply(df$Values, Class, character(1))
提前感谢您的帮助
我们可以按 'Country' 对 'lookup_table' 和 'df' 做 join(使用 on 参数),再用 melt 转换为 'long' 格式。
正如 @zx8754 评论的那样,按 'Country' 分组使用 cut 函数(或 findInterval)获取数字索引,
再用该索引取出相应的 'variable',并将其赋值给 'newVar':
library(data.table)
# Join the thresholds onto df by Country, reshape to long form (one row per
# observation x threshold), then label each observation per country.
# NOTE: setDT() converts lookup_table to a data.table by reference.
d1 <- melt(
  setDT(lookup_table)[df, on = "Country"],
  id.vars = c("Country", "Values")
)[, newVar := {
  # left.open = TRUE gives (lower, upper] intervals, so a value exactly equal
  # to a threshold stays in the lower class.
  idx <- findInterval(Values, unique(value), left.open = TRUE)
  # Index 0 means "not above the first threshold": map it to NA rather than
  # letting a zero subscript silently drop the element.
  unique(variable)[replace(idx, idx == 0L, NA_integer_)]
}, by = Country]
对感兴趣的列取子集,并用 unique 去除重复行:
# Keep only the identifying columns and collapse the rows duplicated by melt.
unique(d1[, .(Country, Values, newVar)])
# Country Values newVar
#1: UK 3.588198 B
#2: UK 8.094746 C
#3: France 4.680792 A
#4: France 8.947157 C
#5: Germany 9.464206 C
#6: Germany 1.410008 A
#7: Germany 5.752949 A
#8: France 9.031771 C
#9: UK 5.962915 C
另一个选项:
# Attach each row's country thresholds (A, B, C), keeping every row of df.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
df <- merge(df, lookup_table, by = "Country", all.x = TRUE)
# Start every row at "A", then promote rows whose value clears a higher bound.
# NOTE: rows that match neither condition below keep the default "A", even if
# their value is not above threshold A.
df$Class <- "A" # default
df$Class <- with(df, replace(Class, Values > B & Values <= C, "B"))
df$Class <- with(df, replace(Class, Values > C, "C"))
df
# Country Values A B C Class
#1 France 2.371120 0 6 8 A
#2 France 6.155804 0 6 8 B
#3 France 5.635268 0 6 8 A
#4 Germany 9.661230 1 7 9 C
#5 Germany 6.412292 1 7 9 A
#6 Germany 3.148534 1 7 9 A
#7 UK 4.661493 0 1 4 C
#8 UK 6.933073 0 1 4 C
#9 UK 4.623160 0 1 4 C
您可以从结果中删除任何不需要的列。
这是一个仅使用 base R 的解决方案:
# Natural join on the shared column(s); lookup_table's columns come first, so
# Class() below can address the thresholds and the value by position.
dfa <- merge(x = lookup_table, y = df)
# Classify one merged row by position.
#
# x: a single row (or list/vector) whose 2nd, 3rd and 4th elements are the
#    lower bounds of classes A, B and C, and whose 5th element is the value.
# Returns "A", "B" or "C" using right-closed intervals (lower, upper], or
# NA_character_ when the value is not above the A bound or anything is NA
# (previously this raised "object 'y' not found").
Class <- function(x) {
  # `[[` extracts the bare scalar instead of a one-cell data.frame.
  value <- x[[5]]
  lower_a <- x[[2]]
  lower_b <- x[[3]]
  lower_c <- x[[4]]

  if (anyNA(c(value, lower_a, lower_b, lower_c))) {
    return(NA_character_)
  }

  # Test the highest bound first; each later branch already knows the value
  # is at or below the bound above it.
  if (value > lower_c) {
    "C"
  } else if (value > lower_b) {
    "B"
  } else if (value > lower_a) {
    "A"
  } else {
    NA_character_
  }
}
# Classify row by row. seq_len() is safe for a zero-row dfa (1:0 would iterate
# over c(1, 0)), and vapply guarantees a character vector result.
dfa$Class <- vapply(
  seq_len(nrow(dfa)),
  function(ri) Class(dfa[ri, ]),
  character(1)
)
# Drop columns 2 to 4 from the displayed result.
dfa[, -c(2:4)]
> dfa[,-c(2:4)]
Country Values Class
1 France 4.680792 A
2 France 8.947157 C
3 France 9.031771 C
4 Germany 1.410008 A
5 Germany 5.752949 A
6 Germany 9.464206 C
7 UK 3.588198 B
8 UK 8.094746 C
9 UK 5.962915 C
这是一个 dplyr 解决方案。
library(dplyr)
# Join each observation to its country's thresholds, then label it.
# Upper bounds are inclusive (<=) so a value exactly equal to B or C is
# classified instead of falling through to "Not_found".
df %>%
  inner_join(lookup_table, by = "Country") %>%
  mutate(Class = ifelse(Values > A & Values <= B, "A",
    ifelse(Values > B & Values <= C, "B",
      ifelse(Values > C, "C", "Not_found")
    )
  ))
在管道的末尾添加 select(-c(A,B,C)) 可以得到更简洁的输出 data.frame。这种方法的一个额外好处是,任何不在范围内的值都会被标记为 "Not_found"。
如果您改变 lookup_table 的形式并显式指定区间,则可以使用 data.table 自开发版本 v1.9.7 起提供的 non-equi join 轻松完成此任务(参见安装说明):
# library() fails loudly if data.table is missing; require() would only
# return FALSE and let the next line fail with a less helpful error.
library(data.table) # v1.9.7+
# Non-equi join: for each interval row in `lookup`, update the df rows with the
# same Country whose Values lie in (value1, value2]. `lookup` must exist first.
setDT(df)[
  lookup,
  Class := i.Class,
  on = .(Country, Values > value1, Values <= value2)
]
# Country Values Class
# 1: UK 3.588198 B
# 2: UK 8.094746 C
# 3: France 4.680792 A
# 4: France 8.947157 C
# 5: Germany 9.464206 C
# 6: Germany 1.410008 A
# 7: Germany 5.752949 A
# 8: France 9.031771 C
# 9: UK 5.962915 C
## i.Class refers to Class from i argument = lookup$Class
其中 lookup 由 lookup_table 按如下方式构造:
# Convert in place and add an open-ended upper bound for the last class.
setDT(lookup_table)
lookup_table[, D := Inf]

# Stack the thresholds into one (value1, value2] interval row per country and
# class; Country is recycled across the three classes.
lookup <- lookup_table[, list(
  Country,
  Class = rep(c("A", "B", "C"), each = .N),
  value1 = c(A, B, C),
  value2 = c(B, C, D)
)]
# Country Class value1 value2
# 1: UK A 0 1
# 2: France A 0 6
# 3: Germany A 1 7
# 4: UK B 1 4
# 5: France B 6 8
# 6: Germany B 7 9
# 7: UK C 4 Inf
# 8: France C 8 Inf
# 9: Germany C 9 Inf
我有2个数据框
# Per-country thresholds: column A/B/C holds the lower bound of that class.
lookup_table <- data.frame(
  Country = c("UK", "France", "Germany"),
  A = c(0, 0, 1),
  B = c(1, 6, 7),
  C = c(4, 8, 9)
)

# Fix the RNG seed so the random Values below are reproducible.
set.seed(123)

# Nine observations, each tagged with a country and a uniform draw in [1, 10].
df <- data.frame(
  Country = c(
    "UK", "UK", "France", "France", "Germany",
    "Germany", "Germany", "France", "UK"
  ),
  Values = runif(9, min = 1, max = 10)
)
我想在 df 中添加第 3 列,根据第 2 列中的值和所属国家分配 Class。
类似于下面的代码,但阈值不应写死:它们应取决于查找表 lookup_table 中对应国家的值。
# Map numeric values to a class label using fixed, right-closed intervals:
#   (0, 1] -> "A", (1, 4] -> "B", (4, Inf] -> "C".
#
# x: numeric vector (a single value works too).
# Returns a character vector the same length as x. Values outside every
# interval (x <= 0) and NA inputs yield NA_character_ instead of raising
# "object 'y' not found".
Class <- function(x) {
  # cut() uses right-closed intervals by default, matching `> lower & <= upper`.
  as.character(cut(x, breaks = c(0, 1, 4, Inf), labels = c("A", "B", "C")))
}
# vapply pins the result to one character value per element, so the column
# type cannot silently change (e.g. to a list) the way it can with sapply.
df$Class <- vapply(df$Values, Class, character(1))
提前感谢您的帮助
我们可以按 'Country' 对 'lookup_table' 和 'df' 做 join(使用 on 参数),再用 melt 转换为 'long' 格式。
正如 @zx8754 评论的那样,按 'Country' 分组使用 cut 函数(或 findInterval)获取数字索引,
再用该索引取出相应的 'variable',并将其赋值给 'newVar':
library(data.table)
# Join the thresholds onto df by Country, reshape to long form (one row per
# observation x threshold), then label each observation per country.
# NOTE: setDT() converts lookup_table to a data.table by reference.
d1 <- melt(
  setDT(lookup_table)[df, on = "Country"],
  id.vars = c("Country", "Values")
)[, newVar := {
  # left.open = TRUE gives (lower, upper] intervals, so a value exactly equal
  # to a threshold stays in the lower class.
  idx <- findInterval(Values, unique(value), left.open = TRUE)
  # Index 0 means "not above the first threshold": map it to NA rather than
  # letting a zero subscript silently drop the element.
  unique(variable)[replace(idx, idx == 0L, NA_integer_)]
}, by = Country]
对感兴趣的列取子集,并用 unique 去除重复行:
# Keep only the identifying columns and collapse the rows duplicated by melt.
unique(d1[, .(Country, Values, newVar)])
# Country Values newVar
#1: UK 3.588198 B
#2: UK 8.094746 C
#3: France 4.680792 A
#4: France 8.947157 C
#5: Germany 9.464206 C
#6: Germany 1.410008 A
#7: Germany 5.752949 A
#8: France 9.031771 C
#9: UK 5.962915 C
另一个选项:
# Attach each row's country thresholds (A, B, C), keeping every row of df.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
df <- merge(df, lookup_table, by = "Country", all.x = TRUE)
# Start every row at "A", then promote rows whose value clears a higher bound.
# NOTE: rows that match neither condition below keep the default "A", even if
# their value is not above threshold A.
df$Class <- "A" # default
df$Class <- with(df, replace(Class, Values > B & Values <= C, "B"))
df$Class <- with(df, replace(Class, Values > C, "C"))
df
# Country Values A B C Class
#1 France 2.371120 0 6 8 A
#2 France 6.155804 0 6 8 B
#3 France 5.635268 0 6 8 A
#4 Germany 9.661230 1 7 9 C
#5 Germany 6.412292 1 7 9 A
#6 Germany 3.148534 1 7 9 A
#7 UK 4.661493 0 1 4 C
#8 UK 6.933073 0 1 4 C
#9 UK 4.623160 0 1 4 C
您可以从结果中删除任何不需要的列。
这是一个仅使用 base R 的解决方案:
# Natural join on the shared column(s); lookup_table's columns come first, so
# Class() below can address the thresholds and the value by position.
dfa <- merge(x = lookup_table, y = df)
# Classify one merged row by position.
#
# x: a single row (or list/vector) whose 2nd, 3rd and 4th elements are the
#    lower bounds of classes A, B and C, and whose 5th element is the value.
# Returns "A", "B" or "C" using right-closed intervals (lower, upper], or
# NA_character_ when the value is not above the A bound or anything is NA
# (previously this raised "object 'y' not found").
Class <- function(x) {
  # `[[` extracts the bare scalar instead of a one-cell data.frame.
  value <- x[[5]]
  lower_a <- x[[2]]
  lower_b <- x[[3]]
  lower_c <- x[[4]]

  if (anyNA(c(value, lower_a, lower_b, lower_c))) {
    return(NA_character_)
  }

  # Test the highest bound first; each later branch already knows the value
  # is at or below the bound above it.
  if (value > lower_c) {
    "C"
  } else if (value > lower_b) {
    "B"
  } else if (value > lower_a) {
    "A"
  } else {
    NA_character_
  }
}
# Classify row by row. seq_len() is safe for a zero-row dfa (1:0 would iterate
# over c(1, 0)), and vapply guarantees a character vector result.
dfa$Class <- vapply(
  seq_len(nrow(dfa)),
  function(ri) Class(dfa[ri, ]),
  character(1)
)
# Drop columns 2 to 4 from the displayed result.
dfa[, -c(2:4)]
> dfa[,-c(2:4)]
Country Values Class
1 France 4.680792 A
2 France 8.947157 C
3 France 9.031771 C
4 Germany 1.410008 A
5 Germany 5.752949 A
6 Germany 9.464206 C
7 UK 3.588198 B
8 UK 8.094746 C
9 UK 5.962915 C
这是一个 dplyr 解决方案。
library(dplyr)
# Join each observation to its country's thresholds, then label it.
# Upper bounds are inclusive (<=) so a value exactly equal to B or C is
# classified instead of falling through to "Not_found".
df %>%
  inner_join(lookup_table, by = "Country") %>%
  mutate(Class = ifelse(Values > A & Values <= B, "A",
    ifelse(Values > B & Values <= C, "B",
      ifelse(Values > C, "C", "Not_found")
    )
  ))
在管道的末尾添加 select(-c(A,B,C)) 可以得到更简洁的输出 data.frame。这种方法的一个额外好处是,任何不在范围内的值都会被标记为 "Not_found"。
如果您改变 lookup_table 的形式并显式指定区间,则可以使用 data.table 自开发版本 v1.9.7 起提供的 non-equi join 轻松完成此任务(参见安装说明):
# library() fails loudly if data.table is missing; require() would only
# return FALSE and let the next line fail with a less helpful error.
library(data.table) # v1.9.7+
# Non-equi join: for each interval row in `lookup`, update the df rows with the
# same Country whose Values lie in (value1, value2]. `lookup` must exist first.
setDT(df)[
  lookup,
  Class := i.Class,
  on = .(Country, Values > value1, Values <= value2)
]
# Country Values Class
# 1: UK 3.588198 B
# 2: UK 8.094746 C
# 3: France 4.680792 A
# 4: France 8.947157 C
# 5: Germany 9.464206 C
# 6: Germany 1.410008 A
# 7: Germany 5.752949 A
# 8: France 9.031771 C
# 9: UK 5.962915 C
## i.Class refers to Class from i argument = lookup$Class
其中 lookup 由 lookup_table 按如下方式构造:
# Convert in place and add an open-ended upper bound for the last class.
setDT(lookup_table)
lookup_table[, D := Inf]

# Stack the thresholds into one (value1, value2] interval row per country and
# class; Country is recycled across the three classes.
lookup <- lookup_table[, list(
  Country,
  Class = rep(c("A", "B", "C"), each = .N),
  value1 = c(A, B, C),
  value2 = c(B, C, D)
)]
# Country Class value1 value2
# 1: UK A 0 1
# 2: France A 0 6
# 3: Germany A 1 7
# 4: UK B 1 4
# 5: France B 6 8
# 6: Germany B 7 9
# 7: UK C 4 Inf
# 8: France C 8 Inf
# 9: Germany C 9 Inf