在 R 中根据列 + 条件 + 阈值的组合创建新列
Create a new column based on a combination of columns + conditions + thresholds in R
我有一段代码有问题,首先我分享一个数据集:
# Example dataset: two rows per PatientID, sex, then six indicators measured
# at visit 1 (A1..G1, columns 3-8) and the same six at visit 2 (A2..G2,
# columns 9-14). NA marks a missing measurement.
# `stringsAsFactors = FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
df <- data.frame(
  PatientID = c("0002", "0002", "0005", "0005", "0009", "0009", "0018", "0018",
                "0039", "0039", "0043", "0043", "0046", "0046", "0048", "0048"),
  sex = c("F", "F", "M", "M", "F", "F", "M", "M",
          "F", "F", "M", "M", "M", "M", "F", "F"),
  A1 = c(1961.810, 929.466, 978.166, 1005.820, 925.752, 969.469, 943.398, 965.292,
         1996.404, 967.047, NA, 893.428, 921.606, 976.192, 929.590, 950.493),
  B1 = c(998.988, NA, 1998.680, NA, 1020.560, 955.540, 1911.606, 964.039,
         988.087, 1902.367, 959.338, 1029.050, 1987.374, 1066.400, 957.512, 917.597),
  C1 = c(1987.140, 961.810, 929.466, 978.166, 969.469, 943.398, 936.034, 965.292,
         996.404, 1920.610, 967.047, 913.517, 893.428, 921.606, 929.590, 950.493),
  D1 = c(1961.810, 929.466, 978.166, 1005.820, 925.752, 969.469, 1943.398, 965.292,
         996.404, 967.047, NA, 1893.428, 921.606, 976.192, 929.590, 950.493),
  E1 = c(1006.330, 1028.070, 954.274, 1005.910, 949.969, 992.820, 934.407, 948.913,
         961.375, 955.296, 961.128, 998.119, 1009.110, 994.891, 1000.170, 982.763),
  G1 = c(987.140, 961.810, 929.466, 978.166, 969.469, 943.398, 936.034, 965.292,
         996.404, 920.610, 967.047, 913.517, 893.428, 921.606, 1929.590, 950.493),
  A2 = c(NA, 977.146, NA, 964.315, NA, 952.311, NA, NA,
         947.465, 902.852, NA, NA, 930.141, 1007.790, NA, 999.414),
  B2 = c(1998.988, NA, 1998.680, NA, NA, 955.540, NA, 964.039,
         988.087, 1902.367, NA, 1029.050, NA, 1066.400, NA, 917.597),
  C2 = c(NA, NA, NA, NA, 969.469, NA, 936.034, 965.292,
         NA, 1920.610, 967.047, NA, 1893.428, 921.606, 929.590, 950.493),
  D2 = c(961.810, NA, 978.166, NA, 925.752, NA, 943.398, 1965.292,
         NA, 1967.047, NA, 1893.428, 921.606, 976.192, NA, 1950.493),
  E2 = c(1006.330, 1028.070, NA, 1005.910, 949.969, 992.820, 1934.407, 948.913,
         961.375, 955.296, NA, 998.119, NA, 994.891, 1000.170, 982.763),
  G2 = c(NA, 958.990, 924.680, 955.927, NA, NA, 973.348, 984.392,
         NA, NA, 995.368, 1994.997, 979.454, 952.605, NA, 956.507),
  stringsAsFactors = FALSE
)
把问题具体化:我有两组指标——第 1 次访视时测得的一组(A1、B1、C1、D1、E1、G1),以及第 2 次访视时对相同指标重复测量得到的一组(A2、B2、C2、D2、E2、G2)。
要在访问 1 时诊断某人,我使用以下代码:
# Visit-1 diagnosis: "Yes" when at least three of the six visit-1 indicators
# (columns 3-8) exceed the sex-specific cut-off (1004 for "F", 986 for "M"),
# otherwise "No". Missing measurements are ignored when counting.
cols <- 3:8
df$sex <- as.factor(df$sex)
df <- mutate(
  df,
  Diagnosis = ifelse(
    sex == "F" & (rowSums(df[cols] > 1004, na.rm = TRUE) >= 3),
    "Yes",
    ifelse(
      sex == "M" & (rowSums(df[cols] > 986, na.rm = TRUE) >= 3),
      "Yes",
      "No"
    )
  )
)
这段代码可以满足我的要求,非常完美! :)
如您所见,我为女性设置了一个阈值 (1004),为男性设置了一个阈值 (986)。根据等式,当患者有 3 个或更多指标高于阈值时,诊断结果为 'Yes'。
现在,问题来自访问 2。在这种情况下,诊断有 4 个选项,患者可以诊断为“持续”、“已解决”、“新发”或“从未”疾病。
理论上,解决方案应该像应用这段代码一样简单:
# Visit-2 diagnosis: same rule as for visit 1, applied to the six visit-2
# indicators (columns 9-14): "Yes" when at least three exceed the
# sex-specific cut-off (1004 for "F", 986 for "M"), otherwise "No".
# NOTE(review): the result is stored in `Diagnosis`, the same column the
# visit-1 step writes to, so running both steps in sequence keeps only the
# visit-2 result.
cols <- 9:14
df$sex <- as.factor(df$sex)
df <- mutate(
  df,
  Diagnosis = ifelse(
    sex == "F" & (rowSums(df[cols] > 1004, na.rm = TRUE) >= 3),
    "Yes",
    ifelse(
      sex == "M" & (rowSums(df[cols] > 986, na.rm = TRUE) >= 3),
      "Yes",
      "No"
    )
  )
)
然后是一个非常简单的 ifelse
其中:
- 如果访视 1 为“是”且访视 2 为“是” = 持续(ongoing)
- 如果访视 1 为“是”且访视 2 为“否” = 已解决(resolved)
- 如果访视 1 为“否”且访视 2 为“是” = 新发(new onset)
- 如果访视 1 为“否”且访视 2 为“否” = 从未(never)
但实际情况要复杂一些:还有一个新选项,称为“NPA”(无法评估),因为存在两种潜在的例外情况。为了做出可靠的判断,我们需要查看那些在访视 1 时升高的指标在访视 2 时的情况。我创建了简化的示例来说明每种例外:
A) 例如,该患者在第 1 次就诊时 C1、D1 和 E1 升高,但 C2 为 NA,因此该患者在第 2 次就诊时为 NPA
# Example A: a female patient whose C1, D1 and E1 exceed the female cut-off
# (1004) at visit 1, while C2 is missing at visit 2 and A2 is missing too.
# Expected classification: "NPA".
# The original call was missing the commas between several arguments and
# therefore did not parse. NA_real_ keeps the missing columns numeric.
df <- data.frame(
  PatientID = "112",
  sex = "F",
  A1 = 961.810,
  B1 = 998.988,
  C1 = 1019.330,
  D1 = 1046.0,
  E1 = 1006.330,
  G1 = 987.140,
  A2 = NA_real_,
  B2 = 998.988,
  C2 = NA_real_,
  D2 = 961.810,
  E2 = 1006.330,
  G2 = NA_real_,
  stringsAsFactors = FALSE
)
B) 在这种情况下,C1、D1 和 E1 在访视 1 时升高,C2 为 NA,但 A2 升高;因此无论 C2 是否缺失,该患者在访视 2 时都表现出明确的“是”,再结合访视 1 的“是”,这就是一个“持续”(ongoing)病例。
# Example B: same patient as example A (C1, D1 and E1 above the female
# cut-off of 1004 at visit 1, C2 missing at visit 2), except that A2 is
# clearly elevated. Expected classification: "ongoing".
# The original call was missing the commas between several arguments and
# therefore did not parse. NA_real_ keeps the missing columns numeric.
df <- data.frame(
  PatientID = "112",
  sex = "F",
  A1 = 961.810,
  B1 = 998.988,
  C1 = 1019.330,
  D1 = 1046.0,
  E1 = 1006.330,
  G1 = 987.140,
  A2 = 1800.810,
  B2 = 998.988,
  C2 = NA_real_,
  D2 = 961.810,
  E2 = 1006.330,
  G2 = NA_real_,
  stringsAsFactors = FALSE
)
我该如何用代码实现这一点?抱歉,我知道这个问题有点混乱!
谢谢!
我附上解决方案
你用语言表达了你的逻辑,做得很好;我只是把它转换成一个很大的 if else。看看这是否适合你:
(你期待这么多 ongoing
吗?)
# Names of the six visit-1 and six visit-2 indicator columns, taken by
# position from `df` (columns 3-8 and 9-14).
cols1 <- names(df)[3:8]
cols2 <- names(df)[9:14]

#' Classify one patient row by comparing visit 1 with visit 2.
#'
#' An indicator counts as elevated when it is strictly above the sex-specific
#' cut-off (986 for "M", 1004 for "F"); missing values never count as
#' elevated. A visit is positive when at least three of its indicators are
#' elevated.
#'
#' Two special cases are checked first, both for rows where C1, D1 and E1 are
#' all elevated:
#'   * A2 elevated              -> "ongoing" (takes precedence over NPA, so a
#'                                 missing C2 does not mask it);
#'   * otherwise, C2 is missing -> "NPA" (cannot be assessed).
#'
#' @param x Row index into `data` (a single integer).
#' @param data Data frame holding `sex`, C1, D1, E1, A2, C2 and the indicator
#'   columns; defaults to the global `df`.
#' @param visit1,visit2 Character vectors naming the visit-1 and visit-2
#'   indicator columns; default to the global `cols1` / `cols2`.
#' @return One of "ongoing", "resolved", "new onset", "never", "NPA", or
#'   "error" (with a warning) when `sex` is neither "M" nor "F".
plogic <- function(x, data = df, visit1 = cols1, visit2 = cols2) {
  # Sex-specific cut-off; an unknown or missing sex looks up as NA.
  cutoffs <- c(M = 986, F = 1004)
  sex <- as.character(data[x, "sex"])
  thresh <- unname(cutoffs[sex])
  if (length(thresh) != 1L || is.na(thresh)) {
    warning("no threshold specified", call. = FALSE)
    return("error")
  }

  # isTRUE() turns an NA comparison into FALSE so `if` never sees NA.
  is_up <- function(col) isTRUE(data[x, col] > thresh)
  core_up <- is_up("C1") && is_up("D1") && is_up("E1")

  # The A2 check must come before the NPA check: a row with C2 missing but
  # A2 elevated is a clear positive at visit 2.
  if (core_up && is_up("A2")) {
    return("ongoing")
  }
  if (core_up && is.na(data[x, "C2"])) {
    return("NPA")
  }

  # Count elevated indicators per visit (sum of TRUEs, not length()).
  yes1 <- sum(data[x, visit1] > thresh, na.rm = TRUE) >= 3
  yes2 <- sum(data[x, visit2] > thresh, na.rm = TRUE) >= 3

  if (yes1 && yes2) {
    "ongoing"
  } else if (yes1) {
    "resolved"
  } else if (yes2) {
    "new onset"
  } else {
    "never"
  }
}
# Classify every row of df. seq_len() yields an empty sequence for a
# zero-row data frame (1:0 would visit rows 1 and 0), and vapply() always
# returns a character vector.
vapply(seq_len(nrow(df)), plogic, character(1))
#> [1] "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing"
#> [8] "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing"
#> [15] "ongoing" "ongoing" "NPA"
Created on 2021-09-23 by the reprex package (v2.0.1)
我有一段代码有问题,首先我分享一个数据集:
# Example dataset: two rows per PatientID, sex, then six indicators measured
# at visit 1 (A1..G1, columns 3-8) and the same six at visit 2 (A2..G2,
# columns 9-14). NA marks a missing measurement.
# `stringsAsFactors = FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
df <- data.frame(
  PatientID = c("0002", "0002", "0005", "0005", "0009", "0009", "0018", "0018",
                "0039", "0039", "0043", "0043", "0046", "0046", "0048", "0048"),
  sex = c("F", "F", "M", "M", "F", "F", "M", "M",
          "F", "F", "M", "M", "M", "M", "F", "F"),
  A1 = c(1961.810, 929.466, 978.166, 1005.820, 925.752, 969.469, 943.398, 965.292,
         1996.404, 967.047, NA, 893.428, 921.606, 976.192, 929.590, 950.493),
  B1 = c(998.988, NA, 1998.680, NA, 1020.560, 955.540, 1911.606, 964.039,
         988.087, 1902.367, 959.338, 1029.050, 1987.374, 1066.400, 957.512, 917.597),
  C1 = c(1987.140, 961.810, 929.466, 978.166, 969.469, 943.398, 936.034, 965.292,
         996.404, 1920.610, 967.047, 913.517, 893.428, 921.606, 929.590, 950.493),
  D1 = c(1961.810, 929.466, 978.166, 1005.820, 925.752, 969.469, 1943.398, 965.292,
         996.404, 967.047, NA, 1893.428, 921.606, 976.192, 929.590, 950.493),
  E1 = c(1006.330, 1028.070, 954.274, 1005.910, 949.969, 992.820, 934.407, 948.913,
         961.375, 955.296, 961.128, 998.119, 1009.110, 994.891, 1000.170, 982.763),
  G1 = c(987.140, 961.810, 929.466, 978.166, 969.469, 943.398, 936.034, 965.292,
         996.404, 920.610, 967.047, 913.517, 893.428, 921.606, 1929.590, 950.493),
  A2 = c(NA, 977.146, NA, 964.315, NA, 952.311, NA, NA,
         947.465, 902.852, NA, NA, 930.141, 1007.790, NA, 999.414),
  B2 = c(1998.988, NA, 1998.680, NA, NA, 955.540, NA, 964.039,
         988.087, 1902.367, NA, 1029.050, NA, 1066.400, NA, 917.597),
  C2 = c(NA, NA, NA, NA, 969.469, NA, 936.034, 965.292,
         NA, 1920.610, 967.047, NA, 1893.428, 921.606, 929.590, 950.493),
  D2 = c(961.810, NA, 978.166, NA, 925.752, NA, 943.398, 1965.292,
         NA, 1967.047, NA, 1893.428, 921.606, 976.192, NA, 1950.493),
  E2 = c(1006.330, 1028.070, NA, 1005.910, 949.969, 992.820, 1934.407, 948.913,
         961.375, 955.296, NA, 998.119, NA, 994.891, 1000.170, 982.763),
  G2 = c(NA, 958.990, 924.680, 955.927, NA, NA, 973.348, 984.392,
         NA, NA, 995.368, 1994.997, 979.454, 952.605, NA, 956.507),
  stringsAsFactors = FALSE
)
把问题具体化:我有两组指标——第 1 次访视时测得的一组(A1、B1、C1、D1、E1、G1),以及第 2 次访视时对相同指标重复测量得到的一组(A2、B2、C2、D2、E2、G2)。
要在访问 1 时诊断某人,我使用以下代码:
# Visit-1 diagnosis: "Yes" when at least three of the six visit-1 indicators
# (columns 3-8) exceed the sex-specific cut-off (1004 for "F", 986 for "M"),
# otherwise "No". Missing measurements are ignored when counting.
cols <- 3:8
df$sex <- as.factor(df$sex)
df <- mutate(
  df,
  Diagnosis = ifelse(
    sex == "F" & (rowSums(df[cols] > 1004, na.rm = TRUE) >= 3),
    "Yes",
    ifelse(
      sex == "M" & (rowSums(df[cols] > 986, na.rm = TRUE) >= 3),
      "Yes",
      "No"
    )
  )
)
这段代码可以满足我的要求,非常完美! :) 如您所见,我为女性设置了一个阈值 (1004),为男性设置了一个阈值 (986)。根据等式,当患者有 3 个或更多指标高于阈值时,诊断结果为 'Yes'。
现在,问题来自访问 2。在这种情况下,诊断有 4 个选项,患者可以诊断为“持续”、“已解决”、“新发”或“从未”疾病。
理论上,解决方案应该像应用这段代码一样简单:
# Visit-2 diagnosis: same rule as for visit 1, applied to the six visit-2
# indicators (columns 9-14): "Yes" when at least three exceed the
# sex-specific cut-off (1004 for "F", 986 for "M"), otherwise "No".
# NOTE(review): the result is stored in `Diagnosis`, the same column the
# visit-1 step writes to, so running both steps in sequence keeps only the
# visit-2 result.
cols <- 9:14
df$sex <- as.factor(df$sex)
df <- mutate(
  df,
  Diagnosis = ifelse(
    sex == "F" & (rowSums(df[cols] > 1004, na.rm = TRUE) >= 3),
    "Yes",
    ifelse(
      sex == "M" & (rowSums(df[cols] > 986, na.rm = TRUE) >= 3),
      "Yes",
      "No"
    )
  )
)
然后是一个非常简单的 ifelse
其中:
- 如果访视 1 为“是”且访视 2 为“是” = 持续(ongoing)
- 如果访视 1 为“是”且访视 2 为“否” = 已解决(resolved)
- 如果访视 1 为“否”且访视 2 为“是” = 新发(new onset)
- 如果访视 1 为“否”且访视 2 为“否” = 从未(never)
但实际情况要复杂一些:还有一个新选项,称为“NPA”(无法评估),因为存在两种潜在的例外情况。为了做出可靠的判断,我们需要查看那些在访视 1 时升高的指标在访视 2 时的情况。我创建了简化的示例来说明每种例外:
A) 例如,该患者在第 1 次就诊时 C1、D1 和 E1 升高,但 C2 为 NA,因此该患者在第 2 次就诊时为 NPA
# Example A: a female patient whose C1, D1 and E1 exceed the female cut-off
# (1004) at visit 1, while C2 is missing at visit 2 and A2 is missing too.
# Expected classification: "NPA".
# The original call was missing the commas between several arguments and
# therefore did not parse. NA_real_ keeps the missing columns numeric.
df <- data.frame(
  PatientID = "112",
  sex = "F",
  A1 = 961.810,
  B1 = 998.988,
  C1 = 1019.330,
  D1 = 1046.0,
  E1 = 1006.330,
  G1 = 987.140,
  A2 = NA_real_,
  B2 = 998.988,
  C2 = NA_real_,
  D2 = 961.810,
  E2 = 1006.330,
  G2 = NA_real_,
  stringsAsFactors = FALSE
)
B) 在这种情况下,C1、D1 和 E1 在访视 1 时升高,C2 为 NA,但 A2 升高;因此无论 C2 是否缺失,该患者在访视 2 时都表现出明确的“是”,再结合访视 1 的“是”,这就是一个“持续”(ongoing)病例。
# Example B: same patient as example A (C1, D1 and E1 above the female
# cut-off of 1004 at visit 1, C2 missing at visit 2), except that A2 is
# clearly elevated. Expected classification: "ongoing".
# The original call was missing the commas between several arguments and
# therefore did not parse. NA_real_ keeps the missing columns numeric.
df <- data.frame(
  PatientID = "112",
  sex = "F",
  A1 = 961.810,
  B1 = 998.988,
  C1 = 1019.330,
  D1 = 1046.0,
  E1 = 1006.330,
  G1 = 987.140,
  A2 = 1800.810,
  B2 = 998.988,
  C2 = NA_real_,
  D2 = 961.810,
  E2 = 1006.330,
  G2 = NA_real_,
  stringsAsFactors = FALSE
)
我该如何用代码实现这一点?抱歉,我知道这个问题有点混乱!谢谢!
我附上解决方案
你用语言表达了你的逻辑,做得很好;我只是把它转换成一个很大的 if else。看看这是否适合你:
(你期待这么多 ongoing
吗?)
# Names of the six visit-1 and six visit-2 indicator columns, taken by
# position from `df` (columns 3-8 and 9-14).
cols1 <- names(df)[3:8]
cols2 <- names(df)[9:14]

#' Classify one patient row by comparing visit 1 with visit 2.
#'
#' An indicator counts as elevated when it is strictly above the sex-specific
#' cut-off (986 for "M", 1004 for "F"); missing values never count as
#' elevated. A visit is positive when at least three of its indicators are
#' elevated.
#'
#' Two special cases are checked first, both for rows where C1, D1 and E1 are
#' all elevated:
#'   * A2 elevated              -> "ongoing" (takes precedence over NPA, so a
#'                                 missing C2 does not mask it);
#'   * otherwise, C2 is missing -> "NPA" (cannot be assessed).
#'
#' @param x Row index into `data` (a single integer).
#' @param data Data frame holding `sex`, C1, D1, E1, A2, C2 and the indicator
#'   columns; defaults to the global `df`.
#' @param visit1,visit2 Character vectors naming the visit-1 and visit-2
#'   indicator columns; default to the global `cols1` / `cols2`.
#' @return One of "ongoing", "resolved", "new onset", "never", "NPA", or
#'   "error" (with a warning) when `sex` is neither "M" nor "F".
plogic <- function(x, data = df, visit1 = cols1, visit2 = cols2) {
  # Sex-specific cut-off; an unknown or missing sex looks up as NA.
  cutoffs <- c(M = 986, F = 1004)
  sex <- as.character(data[x, "sex"])
  thresh <- unname(cutoffs[sex])
  if (length(thresh) != 1L || is.na(thresh)) {
    warning("no threshold specified", call. = FALSE)
    return("error")
  }

  # isTRUE() turns an NA comparison into FALSE so `if` never sees NA.
  is_up <- function(col) isTRUE(data[x, col] > thresh)
  core_up <- is_up("C1") && is_up("D1") && is_up("E1")

  # The A2 check must come before the NPA check: a row with C2 missing but
  # A2 elevated is a clear positive at visit 2.
  if (core_up && is_up("A2")) {
    return("ongoing")
  }
  if (core_up && is.na(data[x, "C2"])) {
    return("NPA")
  }

  # Count elevated indicators per visit (sum of TRUEs, not length()).
  yes1 <- sum(data[x, visit1] > thresh, na.rm = TRUE) >= 3
  yes2 <- sum(data[x, visit2] > thresh, na.rm = TRUE) >= 3

  if (yes1 && yes2) {
    "ongoing"
  } else if (yes1) {
    "resolved"
  } else if (yes2) {
    "new onset"
  } else {
    "never"
  }
}
# Classify every row of df. seq_len() yields an empty sequence for a
# zero-row data frame (1:0 would visit rows 1 and 0), and vapply() always
# returns a character vector.
vapply(seq_len(nrow(df)), plogic, character(1))
#> [1] "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing"
#> [8] "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing" "ongoing"
#> [15] "ongoing" "ongoing" "NPA"
Created on 2021-09-23 by the reprex package (v2.0.1)