R中基于多列的指标特征创建
Indicator feature creation in R based on multiple columns
我有一个包含 10 列的数据集,其中 10 列中有 3 列与创建新指标功能有关。这些特征是 "pT"、"pN" 和 "M",它们都采用不同的值。除去这 3 个特征所取的所有值,总共有 9 个独特的组合需要在新变量中捕获。
PATHOT PATHON PATHOM
1 pT2 pN1 M0
4 pT1 pN1 M0
13 pT3 pN1 M0
161 pT1 *pN2 M0
391 pT1 pN1 *M1
810 *pTIS pN1 M0
948 pT3 *pN2 M0
1043 pT2 pN1 *M1
1067 *pT4 pN1 M0
例如,当 PATHOT=pT2、PATHON=pN1 和 PATHOM=M0 时,新变量的值为“1”,依此类推,直到值为 9。我已经完成了任务,但是在花费了将近 20 行代码之后涉及所有独特组合的矢量化操作。
diag3_bs$sfd[diag3_bs$pathot=="pT2" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 1
diag3_bs$sfd[diag3_bs$pathot=="pT1" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 2
diag3_bs$sfd[diag3_bs$pathot=="pT3" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 3... so on upto 9.
我想问一下是否有更好的自动化方法来获得相同的结果?
dput(data.frame)
如下
structure(list(F_STATUS = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Y", class = "factor"), EVENT_ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BASELINE", class =
"factor"),
PAG_NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "BR2", class = "factor"), PTSIZE = c(3, 4,
2.7, 2, 0.9, 3, 3, 0.9, 3, 4.5), PTSIZE_U = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CM", class = "factor"),
PT_SYM = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "-", "<", ">"), class = "factor"), PATHOT = structure(c(4L,
4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L, 4L), .Label = c("*pT4", "*pTIS",
"pT1", "pT2", "pT3"), class = "factor"), PATHON = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*pN2", "pN1"
), class = "factor"), PATHOM = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*M1", "M0"), class = "factor"),
RSUBJID = 901000:901009, RUSUBJID = structure(1:10, .Label = c(
"000301-000-901-251", "000301-000-901-252", "000301-000-901-253",
"000301-000-901-254", "000301-000-901-255", "000301-000-901-256",
"000301-000-901-257", "000301-000-901-258", "000301-000-901-259",
"000301-000-901-260", "000301-000-901-261", "000301-000-901-262")
, class = "factor")), .Names = c("F_STATUS", "EVENT_ID", "PAG_NAME", "PTSIZE", "PTSIZE_U", "PT_SYM", "PATHOT",
"PATHON", "PATHOM", "RSUBJID", "RUSUBJID"), row.names = c(NA, 10L),
class = "data.frame")
谢谢。
我尝试编辑数据,以免在输入时引发错误。还创建了一个可能组合列表的版本:
stg_tbl <- structure(list(PATHOT = structure(c(4L, 3L, 5L, 3L, 3L, 2L, 5L,
4L, 1L), .Label = c("*pT4", "*pTIS", "pT1", "pT2", "pT3"), class = "factor"),
PATHON = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("*pN2",
"pN1"), class = "factor"), PATHOM = structure(c(2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L), .Label = c("*M1", "M0"), class = "factor")), .Names = c("PATHOT",
"PATHON", "PATHOM"), class = "data.frame", row.names = c("1",
"4", "13", "161", "391", "810", "948", "1043", "1067"))
创建类别的等价文本向量:
stg_lbls <- with(stg_tbl, paste(PATHOT, PATHON, PATHOM, sep="_") )
那么使用这些水平创建的因子的 as.numeric 值将是所需的结果:
dat$stg <- with(dat, factor( paste(PATHOT, PATHON, PATHOM, sep="_"), levels=stg_lbls))
as.numeric(dat$stg)
#[1] 1 1 1 2 2 1 1 2 1 1
您可以按照通常的方式分配这些值:
dat$sfd <- as.numeric(dat$stg)
我做了一些新数据,应该对你的问题有用。
k<-expand.grid(data.frame(a=letters[1:3],b=letters[4:6],c=letters[7:9]))
library(dplyr)
k %>% mutate(groups=paste0(a,b,c))->k2
k2$groups<-as.numeric(factor(k2$groups))
k2
这很粗糙,而且您没有选择哪个组合得到哪个数字,所以之后需要一些挖掘,但很快。
我有一个包含 10 列的数据集,其中 10 列中有 3 列与创建新指标功能有关。这些特征是 "pT"、"pN" 和 "M",它们都采用不同的值。除去这 3 个特征所取的所有值,总共有 9 个独特的组合需要在新变量中捕获。
PATHOT PATHON PATHOM
1 pT2 pN1 M0
4 pT1 pN1 M0
13 pT3 pN1 M0
161 pT1 *pN2 M0
391 pT1 pN1 *M1
810 *pTIS pN1 M0
948 pT3 *pN2 M0
1043 pT2 pN1 *M1
1067 *pT4 pN1 M0
例如,当 PATHOT=pT2、PATHON=pN1 和 PATHOM=M0 时,新变量的值为“1”,依此类推,直到值为 9。我已经完成了任务,但是在花费了将近 20 行代码之后涉及所有独特组合的矢量化操作。
diag3_bs$sfd[diag3_bs$pathot=="pT2" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 1
diag3_bs$sfd[diag3_bs$pathot=="pT1" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 2
diag3_bs$sfd[diag3_bs$pathot=="pT3" & diag3_bs$pathon=="pN1" &
diag3_bs$pathom=="M0"] <- 3... so on upto 9.
我想问一下是否有更好的自动化方法来获得相同的结果?
dput(data.frame)
如下
structure(list(F_STATUS = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Y", class = "factor"), EVENT_ID = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BASELINE", class =
"factor"),
PAG_NAME = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "BR2", class = "factor"), PTSIZE = c(3, 4,
2.7, 2, 0.9, 3, 3, 0.9, 3, 4.5), PTSIZE_U = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CM", class = "factor"),
PT_SYM = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("", "-", "<", ">"), class = "factor"), PATHOT = structure(c(4L,
4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L, 4L), .Label = c("*pT4", "*pTIS",
"pT1", "pT2", "pT3"), class = "factor"), PATHON = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*pN2", "pN1"
), class = "factor"), PATHOM = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("*M1", "M0"), class = "factor"),
RSUBJID = 901000:901009, RUSUBJID = structure(1:10, .Label = c(
"000301-000-901-251", "000301-000-901-252", "000301-000-901-253",
"000301-000-901-254", "000301-000-901-255", "000301-000-901-256",
"000301-000-901-257", "000301-000-901-258", "000301-000-901-259",
"000301-000-901-260", "000301-000-901-261", "000301-000-901-262")
, class = "factor")), .Names = c("F_STATUS", "EVENT_ID", "PAG_NAME", "PTSIZE", "PTSIZE_U", "PT_SYM", "PATHOT",
"PATHON", "PATHOM", "RSUBJID", "RUSUBJID"), row.names = c(NA, 10L),
class = "data.frame")
谢谢。
我尝试编辑数据,以免在输入时引发错误。还创建了一个可能组合列表的版本:
stg_tbl <- structure(list(PATHOT = structure(c(4L, 3L, 5L, 3L, 3L, 2L, 5L,
4L, 1L), .Label = c("*pT4", "*pTIS", "pT1", "pT2", "pT3"), class = "factor"),
PATHON = structure(c(2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L), .Label = c("*pN2",
"pN1"), class = "factor"), PATHOM = structure(c(2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L), .Label = c("*M1", "M0"), class = "factor")), .Names = c("PATHOT",
"PATHON", "PATHOM"), class = "data.frame", row.names = c("1",
"4", "13", "161", "391", "810", "948", "1043", "1067"))
创建类别的等价文本向量:
stg_lbls <- with(stg_tbl, paste(PATHOT, PATHON, PATHOM, sep="_") )
那么使用这些水平创建的因子的 as.numeric 值将是所需的结果:
dat$stg <- with(dat, factor( paste(PATHOT, PATHON, PATHOM, sep="_"), levels=stg_lbls))
as.numeric(dat$stg)
#[1] 1 1 1 2 2 1 1 2 1 1
您可以按照通常的方式分配这些值:
dat$sfd <- as.numeric(dat$stg)
我做了一些新数据,应该对你的问题有用。
k<-expand.grid(data.frame(a=letters[1:3],b=letters[4:6],c=letters[7:9]))
library(dplyr)
k %>% mutate(groups=paste0(a,b,c))->k2
k2$groups<-as.numeric(factor(k2$groups))
k2
这很粗糙,而且您没有选择哪个组合得到哪个数字,所以之后需要一些挖掘,但很快。