用存储在另一个 data.table 中的标签快速替换 data.table 值

Question

这个问题与另外两个已有的问题相关，只是规模更大。我有两个 data.table：

第一个带有市场研究数据，包含存储为整数的答案；
第二个可以称为字典，类别标签与上述整数相关联。

查看可重现的示例：

编辑：添加新变量以包含“0”情况。

编辑 2：修改了 'age_group' 变量，以涵盖因子（factor）的部分水平（level）未在数据中出现的情况。

library(data.table)
library(magrittr)

# Survey data: one row per respondent.
# - columns describe the sample population (gender, age, ...)
# - every column is stored as character, including truly numeric ones
#   such as age
# The codes are written as numbers and converted to character in one pass.
repex_DT <- as.data.table(lapply(
  list(
    country = c(
      1, 3, 4, 2, NA, 1, 2, 2, 2, 4,
      NA, 2, 1, 1, 3, 4, 4, 4, NA, 1
    ),
    gender = c(
      NA, 2, 2, NA, 1, 1, 1, 2, 2, 1,
      NA, 2, 1, 1, 1, 2, 2, 1, 2, NA
    ),
    age = c(
      18, 40, 50, NA, NA, 22, 30, 52, 64, 24,
      NA, 38, 16, 20, 30, 40, 41, 33, 59, NA
    ),
    age_group = c(
      2, 2, 2, NA, NA, 2, 2, 2, 2, 2,
      NA, 2, 2, 2, 2, 2, 2, 2, 2, NA
    ),
    status = c(
      1, NA, 2, 9, 2, 1, 9, 2, 2, 1,
      9, 2, 1, 1, NA, 2, 2, 1, 2, 9
    ),
    children = c(
      0, 2, 3, 1, 6, 1, 4, 2, 4, NA,
      NA, 2, 1, 1, NA, NA, 3, 5, 2, 1
    )
  ),
  as.character
))

# Dictionary of labels for the categorical variables.
# Row i holds, for each variable, the label associated with code `label_id[i]`;
# codes that a variable does not use are padded with NA.
labels_DT <- data.table(
  label_id = as.character(1:9),
  country = c(
    "COUNTRY 1", "COUNTRY 2", "COUNTRY 3", "COUNTRY 4",
    rep(NA_character_, 5L)
  ),
  gender = c("Male", "Female", rep(NA_character_, 7L)),
  age_group = c("Less than 35", "35 and more", rep(NA_character_, 7L)),
  status = c(
    "Employed", "Unemployed",
    rep(NA_character_, 6L),
    "Do not want to say"
  ),
  children = c(
    "0", "1", "2", "3", "4", "5 and more",
    rep(NA_character_, 3L)
  )
)

# Nature of each column of repex_DT, in column order
# ("numeric" marks columns that hold real numbers stored as text).
var_type <- c(
  "character", # country
  "character", # gender
  "numeric",   # age
  "character", # age_group
  "character", # status
  "character"  # children
)

# Names of the categorical columns, i.e. those that need label replacement.
is_categorical <- var_type == "character"
categorical_var <- names(repex_DT)[is_categorical]

可以看到，字典表比调查数据表小，这在意料之中。此外，尽管所有变量都以字符形式存储，但其中有些其实是数值变量（例如年龄），因此不会出现在字典表中。我的目标是：对第一个 data.table 中的每个变量，用字典表中同名列里对应的标签替换其取值。

我实际上是使用循环实现的，如下所示：

# Replace the codes of every categorical column by their labels, one column
# at a time, working on a copy so that repex_DT itself is left untouched.
result_DT1 <- copy(repex_DT)
for (x in categorical_var) {
  labels_x <- labels_DT[[x]]
  n_labels <- length(labels_x)

  if (any(repex_DT[[x]] == "0", na.rm = TRUE)) {
    # The column uses code "0": prepend it to the codes and shift the labels
    # by one position, keeping the last label attached to the last code.
    values_vector <- c("0", labels_DT$label_id)
    labels_vector <- c(
      labels_x[seq_len(n_labels - 1L)],
      NA,
      labels_x[n_labels]
    )
  } else {
    values_vector <- labels_DT$label_id
    labels_vector <- labels_x
  }

  # set() updates the column by reference without the overhead of
  # `[.data.table` and without needing get() on the column name.
  set(
    result_DT1,
    j = x,
    value = plyr::mapvalues(
      x = repex_DT[[x]],
      from = values_vector,
      to = labels_vector,
      warn_missing = FALSE
    )
  )
}

我想要一种更快的方法（如果有的话，最好是最快的方法），因为我需要对数十万条记录中的数千个变量执行这种替换。任何性能上的改进都非常欢迎。我尝试过 stringi，但除非把变量名硬编码，否则无法让函数无错误地运行。参见下面的示例：

# stringi attempt with the column name hard-coded: each code that has a
# non-missing country label is replaced, pattern by pattern, in a copy of
# the survey data.
test_stringi <- copy(repex_DT)
test_stringi[
  ,
  country := stringi::stri_replace_all_fixed(
    str = country,
    pattern = unique(labels_DT$label_id)[!is.na(labels_DT[["country"]])],
    replacement = unique(na.omit(labels_DT[["country"]])),
    vectorize_all = FALSE
  )
]

Answer 1

你的第 2 个 data.table 的列只是查找向量：

# Columns present in both tables, i.e. the variables that have a dictionary.
same_cols <- intersect(names(repex_DT), names(labels_DT))

# Replace the codes by reference, column by column.
# Each code is looked up in `label_id` rather than used directly as a
# position: a zero index is silently dropped by `[`, which would shorten the
# result and make the assignment fail for columns containing "0". With
# match(), any code that is absent from `label_id` simply becomes NA.
repex_DT[
  ,
  (same_cols) := mapply(
    function(x, y) y[match(x, labels_DT$label_id)],
    repex_DT[, same_cols, with = FALSE],
    labels_DT[, same_cols, with = FALSE],
    SIMPLIFY = FALSE
  )
]

编辑

您可以在 labels_DT 列的第一个位置添加 NA（类似于您对其他缺失值所做的操作）或者更好的是您可以将标签保留在列表中：

# Labels kept in a list, one named vector per variable: the names are the
# codes used in the data and the values are the labels to display.
labels_list <- list(
  country = c(
    "1" = "COUNTRY 1", "2" = "COUNTRY 2",
    "3" = "COUNTRY 3", "4" = "COUNTRY 4"
  ),
  gender = c("1" = "Male", "2" = "Female"),
  age_group = c("1" = "Less than 35", "2" = "35 and more"),
  status = c(
    "1" = "Employed", "2" = "Unemployed", "9" = "Do not want to say"
  ),
  children = c(
    "0" = "0", "1" = "1", "2" = "2", "3" = "3", "4" = "4",
    "5" = "5 and more"
  )
)

same_cols <- names(labels_list)

# The factor levels are taken from the dictionary, not from the data, so a
# code keeps its own label even when lower codes never occur in the column
# (e.g. age_group containing only "2"). Codes without a label become NA.
repex_DT[
  ,
  (same_cols) := mapply(
    function(x, y) unname(y[factor(x, levels = names(y))]),
    repex_DT[, same_cols, with = FALSE],
    labels_list,
    SIMPLIFY = FALSE
  )
]

请注意，这种方式需要先转换为因子，因为 repex_DT 中的值可能不是序列 1、2、3...

Answer 2

一种计算效率很高的方法是：先把两张表 melt 成长格式，进行匹配，然后再用 dcast 还原成宽格式：

# Row index used as the id when melting (added to repex_DT by reference)
repex_DT[, idx := .I]

# Reshape both tables to long format: one row per (row id, variable, code)
# and one row per (code, variable, label)
repex_melt <- melt(repex_DT, id.vars = "idx")
labels_melt <- melt(labels_DT, id.vars = "label_id")

# Update join: attach the label whose variable and code both match
repex_melt[
  labels_melt,
  value2 := i.value,
  on = c("variable", "value==label_id")
]

# Variables without a dictionary column (e.g. age) keep their raw values;
# otherwise they would come back as all-NA columns after the join.
repex_melt[!(variable %in% names(labels_DT)), value2 := value]

# Back to the original wide shape, one column per variable
result <- dcast(repex_melt, idx ~ variable, value.var = "value2")

Answer 3

我终于抽出时间来回答这个问题了。我改变了做法，改用 fastmatch::fmatch 来确定需要替换成的标签。正如 @det 所指出的，标签从“0”开始的变量无法与其他标准分类变量放在同一个循环中处理，因此同一条指令基本上要写两遍。不过，这仍然比我最初的 for 循环方法快得多。

下面的答案：

library(data.table)
library(magrittr)
library(stringi)
library(fastmatch)

# Split the shared columns according to whether the data use code "0".
# The shared column names are computed once, and a plain equality test
# replaces the regular expression used to detect an exact "0".
same_cols <- intersect(names(repex_DT), names(labels_DT))
has_zero <- vapply(
  same_cols,
  function(col) any(repex_DT[[col]] == "0", na.rm = TRUE),
  logical(1L)
)

same_cols_with0 <- same_cols[has_zero]
same_cols_standard <- same_cols[!has_zero]

# Dictionary columns for each group, and the label ids as integers
labels_std <- labels_DT[, same_cols_standard, with = FALSE]
labels_0   <- labels_DT[, same_cols_with0, with = FALSE]
levels_id  <- as.integer(labels_DT$label_id)

# Update via matching IDs (credit to @det for the mapply syntax).

# For each code, return the label stored at the position where the code is
# found in `ids`; codes that are not found give NA.
relabel <- function(codes, labels, ids) {
  labels[fastmatch::fmatch(x = as.integer(codes), table = ids, nomatch = NA)]
}

# Standard columns: codes are matched against the label ids as they are.
standard_labels <- mapply(
  relabel,
  repex_DT[, same_cols_standard, with = FALSE],
  labels_std,
  MoreArgs = list(ids = levels_id),
  SIMPLIFY = FALSE
)

# Columns using code "0": the ids are shifted down by one before matching.
with0_labels <- mapply(
  relabel,
  repex_DT[, same_cols_with0, with = FALSE],
  labels_0,
  MoreArgs = list(ids = (levels_id - 1)),
  SIMPLIFY = FALSE
)

# Write both groups of columns into a copy of the survey data.
result_DT <- data.table::copy(repex_DT)
result_DT[, (same_cols_standard) := standard_labels]
result_DT[, (same_cols_with0) := with0_labels]

用存储在另一个 data.table 中的标签快速替换 data.table 值

fast replacement of data.table values by labels stored in another data.table

performance

replace

r

function

data.table