在 data.table 中将某些字符替换为 NA
Replace some characters by NA in a data.table
我正在编写一个函数,用于把 R 的 data.table 中的某些字符(例如“-”)替换为真正的 NA。
我的函数如下:
# Attempt from the question: meant to turn the given marker strings into NA in
# every character/factor column of `data_set`.
#   data_set              - table whose text columns should be cleaned
#   characters_to_replace - character vector of values to treat as missing
# NOTE(review): this is the non-working version the question asks about; the
# inline notes list the likely causes.
na_replacer <- function(data_set, characters_to_replace) {
# Names of the columns whose class() is "character" or "factor".
text_features <- names(data_set)[sapply(data_set, class) %in% c("character","factor")]
for (x in text_features) {
# NOTE(review): the result of this expression is never assigned (no `:=`, no
# `<-`), so `data_set` is not modified here. The inner `function(x)` shadows
# the loop variable, `.SD` is not restricted to the text columns, and `any()`
# is designed for logical input rather than membership tests -- `%in%` is
# presumably what was intended.
data_set[, lapply(.SD, function(x) replace(x, which(x==any(characters_to_replace)), NA))]
}
return (data_set)
}
当我运行这个函数时,我得到以下异常:
Error in charToDate(x) :
character string is not in a standard unambiguous format
能否请您帮助我使此函数按预期工作?或者,是否有更简短的写法可以完成我想做的事情?
这是一个调用函数的示例数据集
# Sample table: the text columns ID and d contain the placeholder values
# "-", "[]" and "_".
DT <- data.table(
  ID = c("foo", "bar", "-", "foo", "[]", "bah"),
  a = 1:6,
  b = 7:12,
  c = 13:18,
  d = c("aaa", "bbb", "ccc", "_", "eeee", "ffff")
)
# Ask na_replacer() to turn those placeholders into NA.
DT <- na_replacer(
  data_set = DT,
  characters_to_replace = c("-", "_", "[]")
)
之前的数据集:
ID a b c d
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: - 3 9 15 ccc
4: foo 4 10 16 _
5: [] 5 11 17 eeee
6: bah 6 12 18 ffff
之后的预期数据集:
ID a b c d
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: NA 3 9 15 ccc
4: foo 4 10 16 NA
5: NA 5 11 17 eeee
6: bah 6 12 18 ffff
请试一下下面这个可在 data.table 上运行的修改版函数。
# Replace placeholder strings with NA in every character or factor column.
#
# Args:
#   data_set: a data.frame or data.table. It is converted to a data.table by
#     reference with setDT() and then modified in place.
#   characters_to_replace: character vector of values to be treated as
#     missing. Only cells equal to one of these values are replaced.
#
# Returns:
#   `data_set` as a data.table; factor columns remain factors.
na_replacer <- function(data_set, characters_to_replace = c('-', '_')) {
  # Namespaced calls instead of library() inside the function body, so that
  # calling the function does not alter the caller's search path.
  data.table::setDT(data_set)
  is_text <- vapply(
    data_set,
    function(column) is.character(column) || is.factor(column),
    logical(1L)
  )
  for (x in names(data_set)[is_text]) {
    # %in% never yields NA, so cells that are already NA are left untouched.
    rows <- which(data_set[[x]] %in% characters_to_replace)
    # Assign by reference on the matching rows only. Rebuilding the whole
    # column with ifelse() would replace a factor column by its integer codes.
    data.table::set(data_set, i = rows, j = x, value = NA_character_)
  }
  data_set
}
像下面这样写可行吗?
# Replace placeholder strings with NA in every character or factor column.
#
# Args:
#   data_set: a data.frame (or data.table).
#   characters_to_replace: character vector of values to treat as missing.
#     Values are compared literally and against whole cell values.
#
# Returns:
#   `data_set` with the matching cells set to NA; all other columns and cells
#   are returned as they were.
na_replacer <- function(data_set, characters_to_replace) {
  is_text <- vapply(
    data_set,
    function(column) is.character(column) || is.factor(column),
    logical(1L)
  )
  for (x in names(data_set)[is_text]) {
    # Exact matching with %in% instead of a regular expression: gluing the
    # values into one pattern requires all of them to occur one after another
    # inside a cell, so a cell that is just "-" would never match, and values
    # such as "[]" contain regex metacharacters.
    data_set[[x]][data_set[[x]] %in% characters_to_replace] <- NA
  }
  data_set
}
检查这个:
# Replace every NA cell of a data.frame by `replacer`.
#   dt       - data.frame (any list of equally long columns)
#   replacer - value substituted for each NA cell
# Returns a data.frame assembled from a cell-by-cell list matrix, so each
# column of the result is a list of single values.
solution <- function(dt, replacer) {
  fill_cell <- function(cell) {
    ifelse(is.na(cell), replacer, cell)
  }
  filled_columns <- lapply(dt, function(column) lapply(column, fill_cell))
  as.data.frame(do.call(cbind, filled_columns))
}
# example: a small numeric data.frame with NA values in both columns
dt <- data.frame(x = c(1, 4, NA, NA, 54), y = c(5, NA, -1, 0, 5))
cat("before:")
# A bare `dt` relies on top-level auto-printing to show the input.
dt
cat("after:")
# Auto-prints the result of replacing the NA cells with "-".
solution(dt, "-")
它会把 data.frame 中所有的 NA 值替换为给定的符号。
OP 要求把 data.table 中所有 character 或 factor 类型的列里的某些字符串替换为 NA。
其他一些答案在因子(factor)列上会失败。
以下两种方法也适用于因子列:
在连接中更新(update in join)
library(data.table)
options(datatable.print.class = TRUE)
# Update in a join: for each character/factor column, join DT against a
# one-column lookup of the placeholder strings and set the matched cells to NA
# by reference.
for (col in DT[, names(.SD)[lapply(.SD, class) %in% c("character", "factor")]]) {
# `on = "<col>==chr"` matches the rows of DT whose value in `col` equals a
# lookup value; `(col) := NA_character_` assigns NA to `col` on those rows.
# NOTE(review): the trailing `[]` presumably just keeps the later `DT` line
# printing normally after `:=` -- confirm before removing it.
DT[.(chr = c("-", "_", "[]")), on = paste0(col, "==chr"), (col) := NA_character_][]
}
# Show the updated table.
DT
ID a b c d
<char> <int> <int> <int> <fctr>
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: NA 3 9 15 ccc
4: foo 4 10 16 NA
5: NA 5 11 17 eeee
6: bah 6 12 18 ffff
使用set()
# Same replacement with set(): find the matching rows of each
# character/factor column and overwrite them with NA by reference.
for (col in names(DT)[lapply(DT, class) %in% c("character", "factor")]) {
  set(
    DT,
    i = which(DT[[col]] %in% c("-", "_", "[]")),
    j = col,
    value = NA_character_
  )
}
DT
ID a b c d
<char> <int> <int> <int> <fctr>
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: NA 3 9 15 ccc
4: foo 4 10 16 NA
5: NA 5 11 17 eeee
6: bah 6 12 18 ffff
数据
这里使用的是 OP 在最新更新中提供的示例数据集,但做了一处修改:列 d 被强制转换为 factor:
# Sample data: the question's example table, except that column d is stored
# as a factor.
DT <- data.table(
  ID = c("foo", "bar", "-", "foo", "[]", "bah"),
  a = 1:6,
  b = 7:12,
  c = 13:18,
  d = factor(c("aaa", "bbb", "ccc", "_", "eeee", "ffff"))
)
我正在编写一个函数,用于把 R 的 data.table 中的某些字符(例如“-”)替换为真正的 NA。
我的函数如下:
# Attempt from the question: meant to turn the given marker strings into NA in
# every character/factor column of `data_set`.
#   data_set              - table whose text columns should be cleaned
#   characters_to_replace - character vector of values to treat as missing
# NOTE(review): this is the non-working version the question asks about; the
# inline notes list the likely causes.
na_replacer <- function(data_set, characters_to_replace) {
# Names of the columns whose class() is "character" or "factor".
text_features <- names(data_set)[sapply(data_set, class) %in% c("character","factor")]
for (x in text_features) {
# NOTE(review): the result of this expression is never assigned (no `:=`, no
# `<-`), so `data_set` is not modified here. The inner `function(x)` shadows
# the loop variable, `.SD` is not restricted to the text columns, and `any()`
# is designed for logical input rather than membership tests -- `%in%` is
# presumably what was intended.
data_set[, lapply(.SD, function(x) replace(x, which(x==any(characters_to_replace)), NA))]
}
return (data_set)
}
当我运行这个函数时,我得到以下异常:
Error in charToDate(x) :
character string is not in a standard unambiguous format
能否请您帮助我使此函数按预期工作?或者,是否有更简短的写法可以完成我想做的事情?
这是一个调用函数的示例数据集
# Sample table: the text columns ID and d contain the placeholder values
# "-", "[]" and "_".
DT <- data.table(
  ID = c("foo", "bar", "-", "foo", "[]", "bah"),
  a = 1:6,
  b = 7:12,
  c = 13:18,
  d = c("aaa", "bbb", "ccc", "_", "eeee", "ffff")
)
# Ask na_replacer() to turn those placeholders into NA.
DT <- na_replacer(
  data_set = DT,
  characters_to_replace = c("-", "_", "[]")
)
之前的数据集:
ID a b c d
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: - 3 9 15 ccc
4: foo 4 10 16 _
5: [] 5 11 17 eeee
6: bah 6 12 18 ffff
之后的预期数据集:
ID a b c d
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: NA 3 9 15 ccc
4: foo 4 10 16 NA
5: NA 5 11 17 eeee
6: bah 6 12 18 ffff
请试一下下面这个可在 data.table 上运行的修改版函数。
# Replace placeholder strings with NA in every character or factor column.
#
# Args:
#   data_set: a data.frame or data.table. It is converted to a data.table by
#     reference with setDT() and then modified in place.
#   characters_to_replace: character vector of values to be treated as
#     missing. Only cells equal to one of these values are replaced.
#
# Returns:
#   `data_set` as a data.table; factor columns remain factors.
na_replacer <- function(data_set, characters_to_replace = c('-', '_')) {
  # Namespaced calls instead of library() inside the function body, so that
  # calling the function does not alter the caller's search path.
  data.table::setDT(data_set)
  is_text <- vapply(
    data_set,
    function(column) is.character(column) || is.factor(column),
    logical(1L)
  )
  for (x in names(data_set)[is_text]) {
    # %in% never yields NA, so cells that are already NA are left untouched.
    rows <- which(data_set[[x]] %in% characters_to_replace)
    # Assign by reference on the matching rows only. Rebuilding the whole
    # column with ifelse() would replace a factor column by its integer codes.
    data.table::set(data_set, i = rows, j = x, value = NA_character_)
  }
  data_set
}
像下面这样写可行吗?
# Replace placeholder strings with NA in every character or factor column.
#
# Args:
#   data_set: a data.frame (or data.table).
#   characters_to_replace: character vector of values to treat as missing.
#     Values are compared literally and against whole cell values.
#
# Returns:
#   `data_set` with the matching cells set to NA; all other columns and cells
#   are returned as they were.
na_replacer <- function(data_set, characters_to_replace) {
  is_text <- vapply(
    data_set,
    function(column) is.character(column) || is.factor(column),
    logical(1L)
  )
  for (x in names(data_set)[is_text]) {
    # Exact matching with %in% instead of a regular expression: gluing the
    # values into one pattern requires all of them to occur one after another
    # inside a cell, so a cell that is just "-" would never match, and values
    # such as "[]" contain regex metacharacters.
    data_set[[x]][data_set[[x]] %in% characters_to_replace] <- NA
  }
  data_set
}
检查这个:
# Replace every NA cell of a data.frame by `replacer`.
#   dt       - data.frame (any list of equally long columns)
#   replacer - value substituted for each NA cell
# Returns a data.frame assembled from a cell-by-cell list matrix, so each
# column of the result is a list of single values.
solution <- function(dt, replacer) {
  fill_cell <- function(cell) {
    ifelse(is.na(cell), replacer, cell)
  }
  filled_columns <- lapply(dt, function(column) lapply(column, fill_cell))
  as.data.frame(do.call(cbind, filled_columns))
}
# example: a small numeric data.frame with NA values in both columns
dt <- data.frame(x = c(1, 4, NA, NA, 54), y = c(5, NA, -1, 0, 5))
cat("before:")
# A bare `dt` relies on top-level auto-printing to show the input.
dt
cat("after:")
# Auto-prints the result of replacing the NA cells with "-".
solution(dt, "-")
它会把 data.frame 中所有的 NA 值替换为给定的符号。
OP 要求把 data.table 中所有 character 或 factor 类型的列里的某些字符串替换为 NA。
其他一些答案在因子(factor)列上会失败。
以下两种方法也适用于因子列:
在连接中更新(update in join)
library(data.table)
options(datatable.print.class = TRUE)
# Update in a join: for each character/factor column, join DT against a
# one-column lookup of the placeholder strings and set the matched cells to NA
# by reference.
for (col in DT[, names(.SD)[lapply(.SD, class) %in% c("character", "factor")]]) {
# `on = "<col>==chr"` matches the rows of DT whose value in `col` equals a
# lookup value; `(col) := NA_character_` assigns NA to `col` on those rows.
# NOTE(review): the trailing `[]` presumably just keeps the later `DT` line
# printing normally after `:=` -- confirm before removing it.
DT[.(chr = c("-", "_", "[]")), on = paste0(col, "==chr"), (col) := NA_character_][]
}
# Show the updated table.
DT
ID a b c d
<char> <int> <int> <int> <fctr>
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: NA 3 9 15 ccc
4: foo 4 10 16 NA
5: NA 5 11 17 eeee
6: bah 6 12 18 ffff
使用set()
# Same replacement with set(): find the matching rows of each
# character/factor column and overwrite them with NA by reference.
for (col in names(DT)[lapply(DT, class) %in% c("character", "factor")]) {
  set(
    DT,
    i = which(DT[[col]] %in% c("-", "_", "[]")),
    j = col,
    value = NA_character_
  )
}
DT
ID a b c d
<char> <int> <int> <int> <fctr>
1: foo 1 7 13 aaa
2: bar 2 8 14 bbb
3: NA 3 9 15 ccc
4: foo 4 10 16 NA
5: NA 5 11 17 eeee
6: bah 6 12 18 ffff
数据
这里使用的是 OP 在最新更新中提供的示例数据集,但做了一处修改:列 d 被强制转换为 factor:
# Sample data: the question's example table, except that column d is stored
# as a factor.
DT <- data.table(
  ID = c("foo", "bar", "-", "foo", "[]", "bah"),
  a = 1:6,
  b = 7:12,
  c = 13:18,
  d = factor(c("aaa", "bbb", "ccc", "_", "eeee", "ffff"))
)