用存储在另一个 data.table 中的标签快速替换 data.table 值
fast replacement of data.table values by labels stored in another data.table
它与 and 有关,虽然规模更大。
我有两个 data.tables:
- 第一个带有市场研究数据,包含存储为整数的答案;
- 第二个可以称为字典,类别标签与上述整数相关联。
查看可重现的示例:
编辑:添加新变量以包含“0”情况。
编辑 2:修改 'age_group' 变量以包括一个因素的所有唯一水平未出现在数据中的情况。
library(data.table)
library(magrittr)
# Table with survey data :
# - each observation contains the answers of a person
# - variables describe the sample population characteristics (gender, age...)
# - numeric variables (like age) are also stored as character vectors
repex_DT <- data.table (
country = as.character(c(1,3,4,2,NA,1,2,2,2,4,NA,2,1,1,3,4,4,4,NA,1)),
gender = as.character(c(NA,2,2,NA,1,1,1,2,2,1,NA,2,1,1,1,2,2,1,2,NA)),
age = as.character(c(18,40,50,NA,NA,22,30,52,64,24,NA,38,16,20,30,40,41,33,59,NA)),
age_group = as.character(c(2,2,2,NA,NA,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,NA)),
status = as.character(c(1,NA,2,9,2,1,9,2,2,1,9,2,1,1,NA,2,2,1,2,9)),
children = as.character(c(0,2,3,1,6,1,4,2,4,NA,NA,2,1,1,NA,NA,3,5,2,1))
)
# Table of the labels associated to categorical variables, plus 'label_id' to match the values
labels_DT <- data.table (
label_id = as.character(c(1:9)),
country = as.character(c("COUNTRY 1","COUNTRY 2","COUNTRY 3","COUNTRY 4",NA,NA,NA,NA,NA)),
gender = as.character(c("Male","Female",NA,NA,NA,NA,NA,NA,NA)),
age_group = as.character(c("Less than 35","35 and more",NA,NA,NA,NA,NA,NA,NA)),
status = as.character(c("Employed","Unemployed",NA,NA,NA,NA,NA,NA,"Do not want to say")),
children = as.character(c("0","1","2","3","4","5 and more",NA,NA,NA))
)
# Identification of the variable nature (numeric or character)
var_type <- c("character","character","numeric","character","character","character")
# Identification of the categorical variable names
categorical_var <- names(repex_DT)[which(var_type == "character")]
您可以看到字典 table 小于调查数据 table,这是预期的。
此外,尽管所有变量都存储为字符,但有些变量是真正的数字变量,如年龄,因此不会出现在字典 table 中。
我的objective是将第一个data.table的所有变量的值替换为字典table中匹配的名称的相应标签。
我实际上是使用循环实现的,如下所示:
result_DT1 <- copy(repex_DT)
for (x in categorical_var){
if(length(which(repex_DT[[x]]=="0"))==0){
values_vector <- labels_DT$label_id
labels_vector <- labels_DT[[x]]
}else{
values_vector <- c("0",labels_DT$label_id)
labels_vector <- c(labels_DT[[x]][1:(length(labels_DT[[x]])-1)], NA, labels_DT[[x]][length(labels_DT[[x]])])}
result_DT1[, (c(x)) := plyr::mapvalues(x=get(x), from=values_vector, to=labels_vector, warn_missing = F)]
}
我想要的是一种更快的方法(如果存在的话,最快的方法),因为我有数千个变量来符合数十万条记录的条件。
任何性能改进都将非常受欢迎。我与 stringi
作斗争,但除非使用硬编码变量名,否则无法毫无错误地使用函数 运行。参见示例:
test_stringi <- copy(repex_DT) %>%
.[, (c("country")) := lapply(.SD, function(x) stringi::stri_replace_all_fixed(
str=x, pattern=unique(labels_DT$label_id)[!is.na(labels_DT[["country"]])],
replacement=unique(na.omit(labels_DT[["country"]])), vectorize_all=FALSE)),
.SDcols = c("country")]
你的第 2 个 data.table 的列只是查找向量:
same_cols <- intersect(names(repex_DT), names(labels_DT))
repex_DT[
,
(same_cols) := mapply(
function(x, y) y[as.integer(x)],
repex_DT[, same_cols, with = FALSE],
labels_DT[, same_cols, with = FALSE],
SIMPLIFY = FALSE
)
]
编辑
您可以在 labels_DT
列的第一个位置添加 NA
(类似于您对其他缺失值所做的操作)或者更好的是您可以将标签保留在列表中:
labels_list <- list(
country = c("COUNTRY 1","COUNTRY 2","COUNTRY 3","COUNTRY 4"),
gender = c("Male","Female"),
age_group = c("Less than 35","35 and more"),
status = c("Employed","Unemployed","Do not want to say"),
children = c("0","1","2","3","4","5 and more")
)
same_cols <- names(labels_list)
repex_DT[
,
(same_cols) := mapply(
function(x, y) y[factor(as.integer(x))],
repex_DT[, same_cols, with = FALSE],
labels_list,
SIMPLIFY = FALSE
)
]
请注意,这种方式需要先转换为因子,因为 repex_DT
中的值可能不是序列 1、2、3...
一个计算上非常有效的方法是先融化你的表格,匹配它们并再次投射:
repex_DT[, idx:= .I] # Create an index used for melting
# Melt
repex_melt <- melt(repex_DT, id.vars = "idx")
labels_melt <- melt(labels_DT, id.vars = "label_id")
# Match variables and value/label_id
repex_melt[labels_melt, value2:= i.value, on= c("variable", "value==label_id")]
# Put the data back into its original shape
result <- dcast(repex_melt, idx~variable, value.var = "value2")
我终于抽出时间来回答这个问题了。
我改变了我的方法并使用 fastmatch::fmatch
来识别要更新的标签。
正如@det 所指出的,与其他标准分类变量相比,不可能在同一循环中考虑具有起始“0”标签的变量,因此该指令基本上重复两次。
不过,这比我最初的 for loop
方法要快得多。
下面的答案:
library(data.table)
library(magrittr)
library(stringi)
library(fastmatch)
#Selection of variable names depending on the presence of '0' labels
same_cols_with0 <- intersect(names(repex_DT), names(labels_DT))[
which(intersect(names(repex_DT), names(labels_DT)) %fin%
names(repex_DT)[which(unlist(lapply(repex_DT, function(x)
sum(stri_detect_regex(x, pattern="^0$", negate=FALSE), na.rm=TRUE)),
use.names=FALSE)>=1)])]
same_cols_standard <- intersect(names(repex_DT), names(labels_DT))[
which(!(intersect(names(repex_DT), names(labels_DT)) %fin% same_cols_with0))]
labels_std <- labels_DT[, same_cols_standard, with=FALSE]
labels_0 <- labels_DT[, same_cols_with0, with=FALSE]
levels_id <- as.integer(labels_DT$label_id)
#Update joins via matching IDs (credit to @det for mapply syntax).
result_DT <- data.table::copy(repex_DT) %>%
.[, (same_cols_standard) := mapply(
function(x, y) y[fastmatch::fmatch(x=as.integer(x), table=levels_id, nomatch=NA)],
repex_DT[, same_cols_standard, with=FALSE], labels_std, SIMPLIFY=FALSE)] %>%
.[, (same_cols_with0) := mapply(
function(x, y) y[fastmatch::fmatch(x=as.integer(x), table=(levels_id - 1), nomatch=NA)],
repex_DT[, same_cols_with0, with=FALSE], labels_0, SIMPLIFY=FALSE)]
它与
- 第一个带有市场研究数据,包含存储为整数的答案;
- 第二个可以称为字典,类别标签与上述整数相关联。
查看可重现的示例:
编辑:添加新变量以包含“0”情况。
编辑 2:修改 'age_group' 变量以包括一个因素的所有唯一水平未出现在数据中的情况。
library(data.table)
library(magrittr)
# Table with survey data :
# - each observation contains the answers of a person
# - variables describe the sample population characteristics (gender, age...)
# - numeric variables (like age) are also stored as character vectors
repex_DT <- data.table (
country = as.character(c(1,3,4,2,NA,1,2,2,2,4,NA,2,1,1,3,4,4,4,NA,1)),
gender = as.character(c(NA,2,2,NA,1,1,1,2,2,1,NA,2,1,1,1,2,2,1,2,NA)),
age = as.character(c(18,40,50,NA,NA,22,30,52,64,24,NA,38,16,20,30,40,41,33,59,NA)),
age_group = as.character(c(2,2,2,NA,NA,2,2,2,2,2,NA,2,2,2,2,2,2,2,2,NA)),
status = as.character(c(1,NA,2,9,2,1,9,2,2,1,9,2,1,1,NA,2,2,1,2,9)),
children = as.character(c(0,2,3,1,6,1,4,2,4,NA,NA,2,1,1,NA,NA,3,5,2,1))
)
# Table of the labels associated to categorical variables, plus 'label_id' to match the values
labels_DT <- data.table (
label_id = as.character(c(1:9)),
country = as.character(c("COUNTRY 1","COUNTRY 2","COUNTRY 3","COUNTRY 4",NA,NA,NA,NA,NA)),
gender = as.character(c("Male","Female",NA,NA,NA,NA,NA,NA,NA)),
age_group = as.character(c("Less than 35","35 and more",NA,NA,NA,NA,NA,NA,NA)),
status = as.character(c("Employed","Unemployed",NA,NA,NA,NA,NA,NA,"Do not want to say")),
children = as.character(c("0","1","2","3","4","5 and more",NA,NA,NA))
)
# Identification of the variable nature (numeric or character)
var_type <- c("character","character","numeric","character","character","character")
# Identification of the categorical variable names
categorical_var <- names(repex_DT)[which(var_type == "character")]
您可以看到字典 table 小于调查数据 table,这是预期的。 此外,尽管所有变量都存储为字符,但有些变量是真正的数字变量,如年龄,因此不会出现在字典 table 中。 我的objective是将第一个data.table的所有变量的值替换为字典table中匹配的名称的相应标签。
我实际上是使用循环实现的,如下所示:
result_DT1 <- copy(repex_DT)
for (x in categorical_var){
if(length(which(repex_DT[[x]]=="0"))==0){
values_vector <- labels_DT$label_id
labels_vector <- labels_DT[[x]]
}else{
values_vector <- c("0",labels_DT$label_id)
labels_vector <- c(labels_DT[[x]][1:(length(labels_DT[[x]])-1)], NA, labels_DT[[x]][length(labels_DT[[x]])])}
result_DT1[, (c(x)) := plyr::mapvalues(x=get(x), from=values_vector, to=labels_vector, warn_missing = F)]
}
我想要的是一种更快的方法(如果存在的话,最快的方法),因为我有数千个变量来符合数十万条记录的条件。
任何性能改进都将非常受欢迎。我与 stringi
作斗争,但除非使用硬编码变量名,否则无法毫无错误地使用函数 运行。参见示例:
test_stringi <- copy(repex_DT) %>%
.[, (c("country")) := lapply(.SD, function(x) stringi::stri_replace_all_fixed(
str=x, pattern=unique(labels_DT$label_id)[!is.na(labels_DT[["country"]])],
replacement=unique(na.omit(labels_DT[["country"]])), vectorize_all=FALSE)),
.SDcols = c("country")]
你的第 2 个 data.table 的列只是查找向量:
same_cols <- intersect(names(repex_DT), names(labels_DT))
repex_DT[
,
(same_cols) := mapply(
function(x, y) y[as.integer(x)],
repex_DT[, same_cols, with = FALSE],
labels_DT[, same_cols, with = FALSE],
SIMPLIFY = FALSE
)
]
编辑
您可以在 labels_DT
列的第一个位置添加 NA
(类似于您对其他缺失值所做的操作)或者更好的是您可以将标签保留在列表中:
labels_list <- list(
country = c("COUNTRY 1","COUNTRY 2","COUNTRY 3","COUNTRY 4"),
gender = c("Male","Female"),
age_group = c("Less than 35","35 and more"),
status = c("Employed","Unemployed","Do not want to say"),
children = c("0","1","2","3","4","5 and more")
)
same_cols <- names(labels_list)
repex_DT[
,
(same_cols) := mapply(
function(x, y) y[factor(as.integer(x))],
repex_DT[, same_cols, with = FALSE],
labels_list,
SIMPLIFY = FALSE
)
]
请注意,这种方式需要先转换为因子,因为 repex_DT
中的值可能不是序列 1、2、3...
一个计算上非常有效的方法是先融化你的表格,匹配它们并再次投射:
repex_DT[, idx:= .I] # Create an index used for melting
# Melt
repex_melt <- melt(repex_DT, id.vars = "idx")
labels_melt <- melt(labels_DT, id.vars = "label_id")
# Match variables and value/label_id
repex_melt[labels_melt, value2:= i.value, on= c("variable", "value==label_id")]
# Put the data back into its original shape
result <- dcast(repex_melt, idx~variable, value.var = "value2")
我终于抽出时间来回答这个问题了。
我改变了我的方法并使用 fastmatch::fmatch
来识别要更新的标签。
正如@det 所指出的,与其他标准分类变量相比,不可能在同一循环中考虑具有起始“0”标签的变量,因此该指令基本上重复两次。
不过,这比我最初的 for loop
方法要快得多。
下面的答案:
library(data.table)
library(magrittr)
library(stringi)
library(fastmatch)
#Selection of variable names depending on the presence of '0' labels
same_cols_with0 <- intersect(names(repex_DT), names(labels_DT))[
which(intersect(names(repex_DT), names(labels_DT)) %fin%
names(repex_DT)[which(unlist(lapply(repex_DT, function(x)
sum(stri_detect_regex(x, pattern="^0$", negate=FALSE), na.rm=TRUE)),
use.names=FALSE)>=1)])]
same_cols_standard <- intersect(names(repex_DT), names(labels_DT))[
which(!(intersect(names(repex_DT), names(labels_DT)) %fin% same_cols_with0))]
labels_std <- labels_DT[, same_cols_standard, with=FALSE]
labels_0 <- labels_DT[, same_cols_with0, with=FALSE]
levels_id <- as.integer(labels_DT$label_id)
#Update joins via matching IDs (credit to @det for mapply syntax).
result_DT <- data.table::copy(repex_DT) %>%
.[, (same_cols_standard) := mapply(
function(x, y) y[fastmatch::fmatch(x=as.integer(x), table=levels_id, nomatch=NA)],
repex_DT[, same_cols_standard, with=FALSE], labels_std, SIMPLIFY=FALSE)] %>%
.[, (same_cols_with0) := mapply(
function(x, y) y[fastmatch::fmatch(x=as.integer(x), table=(levels_id - 1), nomatch=NA)],
repex_DT[, same_cols_with0, with=FALSE], labels_0, SIMPLIFY=FALSE)]