从数字序列中获取法术和统计数据
getting spells and statistics from sequence of numbers
我有一个字符串,我想从中提取法术,例如,
A<- c('000001111000', '0110011', '110001')
我想以序列格式获取0和1的连续拼写长度。然后使用法术的长度我想计算描述性统计数据,如均值、模式、sd 等,(spell_0 和 spell_1 是 A 向量中的序列。
例如,
spell_0 spell_1 mean_spell_0 mean_spell_1
5-3 4 4 4
1-2 2-2 1.5 2
3 2-1 3 1.5
有什么建议吗?
您可以尝试这样的操作:
do.call(rbind,
lapply(strsplit(A, ""),
function(x) {
lengths <- rle(x)$lengths
values <- rle(x)$values
data.frame(spell_0 = paste(lengths[values == "0"], collapse = "-"),
spell_1 = paste(lengths[values == "1"], collapse = "-"),
mean_spell_0 = mean(lengths[values == "0"]),
mean_spell_1 = mean(lengths[values == "1"]))
}))
#> spell_0 spell_1 mean_spell_0 mean_spell_1
#> 1 5-3 4 4.0 4.0
#> 2 1-2 2-2 1.5 2.0
#> 3 3 2-1 3.0 1.5
你的问题其实包含了好几个问题
在将字符串拆分为字符后,您首先需要从原始向量中获取不同的序列。正如评论中指出的那样,这可以通过 rle
来实现。然后,对于您的示例中的每个值(“0”和“1”),您需要获取与该值对应的每个序列的 lengths
。然后你需要把它们放在你想要的格式中(尽管这可能不是最合适的。
这是我的提议:
seqA <- lapply(strsplit(A, ""), rle)
do.call(cbind,lapply(c("0", "1"), # this can be made more general, for example using unique(unlist(strsplit(A, "")))
function(i){
do.call(rbind, lapply(seqA,
function(x){
lesSeq <- x$lengths[x$values==i]
res <- data.frame(paste(lesSeq, collapse="-"), mean(lesSeq))
colnames(res) <- paste(c("spell", "mean_spell"), i, sep="_")
return(res)
}))
}))[, c(1, 3, 2, 4)] # this rearrangment may not be needed...
# spell_0 spell_1 mean_spell_0 mean_spell_1
#1 5-3 4 4.0 4.0
#2 1-2 2-2 1.5 2.0
#3 3 2-1 3.0 1.5
首先我们提取并计算 0
s 和 1
s:
library(stringr)
spell_0a <- sapply(str_extract_all(A, "0+"), function(x) str_count(x, "0"))
spell_1a <- sapply(str_extract_all(A, "1+"), function(x) str_count(x, "1"))
然后我们折叠结果并进行数学运算:
df <- data.frame(
# collapse results:
spell_0 = unlist(lapply(spell_0a, function(x) paste0(x, collapse = "-"))),
spell_1 = unlist(lapply(spell_1a, function(x) paste0(x, collapse = "-"))),
# calculate means:
mean_spell_0 = unlist(lapply(spell_0a, function(x) ifelse(length(x)==1, x[1], sum(x[1]+x[2])/2))),
mean_spell_1 = unlist(lapply(spell_1a, function(x) ifelse(length(x)==1, x[1],sum(x[1]+x[2])/2)))
)
结果:
df
spell_0 spell_1 mean_spell_0 mean_spell_1
1 5-3 4 4.0 4.0
2 1-2 2-2 1.5 2.0
3 3 2-1 3.0 1.5
这是一个 tidyverse 友好的解决方案,它避免了 apply
函数。
library(tidyverse)
library(stringr)
A <- c('000001111000', '0110011', '110001')
data.frame(A) %>%
mutate(A = str_replace_all(A, "01", "0-1"),
A = str_replace_all(A, "10", "1-0")) %>%
mutate(A_split = str_split(A, "-")) %>%
unnest(A_split) %>%
mutate(n_0 = str_count(A_split, "0"),
n_0 = ifelse(n_0 == 0, NA, n_0),
n_1 = str_count(A_split, "1"),
n_1 = ifelse(n_1 == 0, NA, n_1)) %>%
group_by(A) %>%
summarise(spell_0 = paste(na.omit(n_0), collapse = "-"),
spell_1 = paste(na.omit(n_1), collapse = "-"),
mean_spell_0 = mean(n_0, na.rm = T),
mean_spell_1 = mean(n_1, na.rm = T))
结果:
#> A spell_0 spell_1 mean_spell_0 mean_spell_1
#> 1 0-11-00-11 1-2 2-2 1.5 2.0
#> 2 00000-1111-000 5-3 4 4.0 4.0
#> 3 11-000-1 3 2-1 3.0 1.5
由 reprex package (v2.0.1)
于 2021-10-25 创建
get_spells <- function(x, char){
s <- sapply(gregexpr(paste0(char, "+"), x), attr, "match")
u <- sapply(s, paste0, collapse = "-")
v <- sapply(s, mean)
nms <- paste0("spell_", c(char, paste0("mean_", char)))
setNames(data.frame(u, v) ,nms)
}
do.call(cbind, lapply(0:1, get_spells, x = A))
spell_0 spell_mean_0 spell_1 spell_mean_1
1 5-3 4.0 4 4.0
2 1-2 1.5 2-2 2.0
3 3 3.0 2-1 1.5
另一种方式可能是:
a <- strsplit(A, "(?<=(.))(?!\1)", perl=TRUE)
b <- lapply(a, function(x)
unlist(tapply(nchar(x),sub("(.)+", "\1", x), function(x)
c(setNames(paste(x, collapse = '-'), "spell"),
setNames(mean(x), "mean_spell")))))
d <- type.convert(data.frame(do.call(rbind, b)), as.is = TRUE)
d
X0.spell X0.mean_spell X1.spell X1.mean_spell
1 5-3 4.0 4 4.0
2 1-2 1.5 2-2 2.0
3 3 3.0 2-1 1.5
我有一个字符串,我想从中提取法术,例如,
A<- c('000001111000', '0110011', '110001')
我想以序列格式获取0和1的连续拼写长度。然后使用法术的长度我想计算描述性统计数据,如均值、模式、sd 等,(spell_0 和 spell_1 是 A 向量中的序列。
例如,
spell_0 spell_1 mean_spell_0 mean_spell_1
5-3 4 4 4
1-2 2-2 1.5 2
3 2-1 3 1.5
有什么建议吗?
您可以尝试这样的操作:
do.call(rbind,
lapply(strsplit(A, ""),
function(x) {
lengths <- rle(x)$lengths
values <- rle(x)$values
data.frame(spell_0 = paste(lengths[values == "0"], collapse = "-"),
spell_1 = paste(lengths[values == "1"], collapse = "-"),
mean_spell_0 = mean(lengths[values == "0"]),
mean_spell_1 = mean(lengths[values == "1"]))
}))
#> spell_0 spell_1 mean_spell_0 mean_spell_1
#> 1 5-3 4 4.0 4.0
#> 2 1-2 2-2 1.5 2.0
#> 3 3 2-1 3.0 1.5
你的问题其实包含了好几个问题
在将字符串拆分为字符后,您首先需要从原始向量中获取不同的序列。正如评论中指出的那样,这可以通过 rle
来实现。然后,对于您的示例中的每个值(“0”和“1”),您需要获取与该值对应的每个序列的 lengths
。然后你需要把它们放在你想要的格式中(尽管这可能不是最合适的。
这是我的提议:
seqA <- lapply(strsplit(A, ""), rle)
do.call(cbind,lapply(c("0", "1"), # this can be made more general, for example using unique(unlist(strsplit(A, "")))
function(i){
do.call(rbind, lapply(seqA,
function(x){
lesSeq <- x$lengths[x$values==i]
res <- data.frame(paste(lesSeq, collapse="-"), mean(lesSeq))
colnames(res) <- paste(c("spell", "mean_spell"), i, sep="_")
return(res)
}))
}))[, c(1, 3, 2, 4)] # this rearrangment may not be needed...
# spell_0 spell_1 mean_spell_0 mean_spell_1
#1 5-3 4 4.0 4.0
#2 1-2 2-2 1.5 2.0
#3 3 2-1 3.0 1.5
首先我们提取并计算 0
s 和 1
s:
library(stringr)
spell_0a <- sapply(str_extract_all(A, "0+"), function(x) str_count(x, "0"))
spell_1a <- sapply(str_extract_all(A, "1+"), function(x) str_count(x, "1"))
然后我们折叠结果并进行数学运算:
df <- data.frame(
# collapse results:
spell_0 = unlist(lapply(spell_0a, function(x) paste0(x, collapse = "-"))),
spell_1 = unlist(lapply(spell_1a, function(x) paste0(x, collapse = "-"))),
# calculate means:
mean_spell_0 = unlist(lapply(spell_0a, function(x) ifelse(length(x)==1, x[1], sum(x[1]+x[2])/2))),
mean_spell_1 = unlist(lapply(spell_1a, function(x) ifelse(length(x)==1, x[1],sum(x[1]+x[2])/2)))
)
结果:
df
spell_0 spell_1 mean_spell_0 mean_spell_1
1 5-3 4 4.0 4.0
2 1-2 2-2 1.5 2.0
3 3 2-1 3.0 1.5
这是一个 tidyverse 友好的解决方案,它避免了 apply
函数。
library(tidyverse)
library(stringr)
A <- c('000001111000', '0110011', '110001')
data.frame(A) %>%
mutate(A = str_replace_all(A, "01", "0-1"),
A = str_replace_all(A, "10", "1-0")) %>%
mutate(A_split = str_split(A, "-")) %>%
unnest(A_split) %>%
mutate(n_0 = str_count(A_split, "0"),
n_0 = ifelse(n_0 == 0, NA, n_0),
n_1 = str_count(A_split, "1"),
n_1 = ifelse(n_1 == 0, NA, n_1)) %>%
group_by(A) %>%
summarise(spell_0 = paste(na.omit(n_0), collapse = "-"),
spell_1 = paste(na.omit(n_1), collapse = "-"),
mean_spell_0 = mean(n_0, na.rm = T),
mean_spell_1 = mean(n_1, na.rm = T))
结果:
#> A spell_0 spell_1 mean_spell_0 mean_spell_1
#> 1 0-11-00-11 1-2 2-2 1.5 2.0
#> 2 00000-1111-000 5-3 4 4.0 4.0
#> 3 11-000-1 3 2-1 3.0 1.5
由 reprex package (v2.0.1)
于 2021-10-25 创建get_spells <- function(x, char){
s <- sapply(gregexpr(paste0(char, "+"), x), attr, "match")
u <- sapply(s, paste0, collapse = "-")
v <- sapply(s, mean)
nms <- paste0("spell_", c(char, paste0("mean_", char)))
setNames(data.frame(u, v) ,nms)
}
do.call(cbind, lapply(0:1, get_spells, x = A))
spell_0 spell_mean_0 spell_1 spell_mean_1
1 5-3 4.0 4 4.0
2 1-2 1.5 2-2 2.0
3 3 3.0 2-1 1.5
另一种方式可能是:
a <- strsplit(A, "(?<=(.))(?!\1)", perl=TRUE)
b <- lapply(a, function(x)
unlist(tapply(nchar(x),sub("(.)+", "\1", x), function(x)
c(setNames(paste(x, collapse = '-'), "spell"),
setNames(mean(x), "mean_spell")))))
d <- type.convert(data.frame(do.call(rbind, b)), as.is = TRUE)
d
X0.spell X0.mean_spell X1.spell X1.mean_spell
1 5-3 4.0 4 4.0
2 1-2 1.5 2-2 2.0
3 3 3.0 2-1 1.5