缩短(限制)句子的长度
Shorten (Limit) the length of a sentence
我有一列很长的名字,我想将它们剪切到最大 40 个字符 长度。
示例数据:
x <- c("This is the longest sentence in world, so now just make it longer",
"No in fact, this is the longest sentence in entire world, world, world, world, the whole world")
我想将句子长度缩短到大约 40 (-/+ 3 nchar),这样我就不会在单词中间缩短句子。 (所以长度是根据单词之间的空 space 决定的)。
另外我想在短句后加3点
期望的输出是这样的:
c("This is the longest sentence...","No in fact, this is the longest...")
这个函数只会盲目地缩短 40 个字符。:
strtrim(x, 40)
好的,我现在有更好的解决方案:)
x <- c("This is the longest sentence in world, so now just make it longer","No in fact, this is the longest sentence in entire world, world, world, world, the whole world")
extract <- function(x){
result <- stri_extract_first_regex(x, "^.{0,40}( |$)")
longer <- stri_length(x) > 40
result[longer] <- stri_paste(result[longer], "...")
result
}
extract(x)
## [1] "This is the longest sentence in world, ..." "No in fact, this is the longest sentence ..."
新旧基准(32000 句):
microbenchmark(sapply(x, cutAndAddDots, USE.NAMES = FALSE), extract(x), times=5)
Unit: milliseconds
expr min lq median uq max neval
sapply(x, cutAndAddDots, USE.NAMES = FALSE) 3762.51134 3762.92163 3767.87134 3776.03706 3788.139 5
extract(x) 56.01727 57.18771 58.50321 79.55759 97.924 5
旧版本
此解决方案需要 stringi
包并且总是在字符串末尾添加三个点 ...
。
require(stringi)
sapply(x, function(x) stri_paste(stri_wrap(x, 40)[1],"..."),USE.NAMES = FALSE)
## [1] "This is the longest sentence in world..." "No in fact, this is the longest..."
这个只对超过 40 个字符的句子添加三个点:
require(stringi)
cutAndAddDots <- function(x){
w <- stri_wrap(x, 40)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
sapply(x, cutAndAddDots, USE.NAMES = FALSE)
## [1] "This is the longest sentence in world" "No in fact, this is the longest..."
性能说明
在 stri_wrap
中设置 normalize=FALSE
可能会加快大约 3 倍(在 30 000 个句子上测试)
测试数据:
x <- stri_rand_lipsum(3000)
x <- unlist(stri_split_regex(x,"(?<=\.) "))
head(x)
[1] "Lorem ipsum dolor sit amet, vel commodo in."
[2] "Ultricies mauris sapien lectus dignissim."
[3] "Id pellentesque semper turpis habitasse egestas rutrum ligula vulputate laoreet mollis id."
[4] "Curabitur volutpat efficitur parturient nibh sociosqu, faucibus tellus, eleifend pretium, quis."
[5] "Feugiat vel mollis ultricies ut auctor."
[6] "Massa neque auctor lacus ridiculus."
stri_length(head(x))
[1] 43 41 90 95 39 35
cutAndAddDots <- function(x){
w <- stri_wrap(x, 40, normalize = FALSE)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
cutAndAddDotsNormalize <- function(x){
w <- stri_wrap(x, 40, normalize = TRUE)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
require(microbenchmark)
microbenchmark(sapply(x, cutAndAddDots, USE.NAMES = FALSE),sapply(x, cutAndAddDotsNormalize, USE.NAMES = FALSE),times=3)
Unit: seconds
expr min lq median uq max
sapply(x, cutAndAddDots, USE.NAMES = FALSE) 3.917858 3.967411 4.016964 4.055571 4.094178
sapply(x, cutAndAddDotsNormalize, USE.NAMES = FALSE) 13.493732 13.651451 13.809170 13.917854 14.026538
基础 R 解决方案:
baseR <- function(x){
m <- regexpr("^.{0,40}( |$)", x)
result <- regmatches(x,m)
longer <- nchar(x)>40
result[longer] <- paste(result[longer],"...",sep = "")
result
}
baseR(x)==extract(x)
[1] TRUE TRUE
就像@bartektartanus extract
一样工作 :) 但是它更慢......我用他的代码生成的数据测试了这个。不过,如果您不想使用任何外部包 - 这个可以!
microbenchmark(baseR(x), extract(x))
Unit: milliseconds
expr min lq median uq max neval
baseR(x) 101.20905 107.0264 108.79086 111.03229 162.6375 100
extract(x) 52.83951 54.6931 55.46628 59.37808 103.0631 100
我想我也会 post 这个。绝对不是 stringi
速度,但也不算太寒酸。我需要一个绕过 str
的打印方法,所以我写了这个。
charTrunc <- function(x, width, end = " ...") {
ncw <- nchar(x) >= width
trm <- strtrim(x[ncw], width - nchar(end))
trimmed <- gsub("\s+$", "", trm)
replace(x, ncw, paste0(trimmed, end))
}
测试来自@bartektartanus 答案的字符串:
x <- stri_rand_lipsum(3000)
x <- unlist(stri_split_regex(x,"(?<=\.) "))
library(microbenchmark)
microbenchmark(charTrunc = {
out <- charTrunc(x, 40L)
},
times = 3
)
Unit: milliseconds
expr min lq mean median uq max neval
charTrunc 506.553 510.988 513.4603 515.423 516.9139 518.4049 3
head(out)
# [1] "Lorem ipsum dolor sit amet, venenati ..."
# [2] "Tincidunt at pellentesque id sociosq ..."
# [3] "At etiam quis et mauris non tincidun ..."
# [4] "In viverra aenean nisl ex aliquam du ..."
# [5] "Dui mi mauris ac lacus sit hac."
# [6] "Ultrices faucibus sed justo ridiculu ..."
我有一列很长的名字,我想将它们剪切到最大 40 个字符 长度。
示例数据:
x <- c("This is the longest sentence in world, so now just make it longer",
"No in fact, this is the longest sentence in entire world, world, world, world, the whole world")
我想将句子长度缩短到大约 40 (-/+ 3 nchar),这样我就不会在单词中间缩短句子。 (所以长度是根据单词之间的空 space 决定的)。
另外我想在短句后加3点
期望的输出是这样的:
c("This is the longest sentence...","No in fact, this is the longest...")
这个函数只会盲目地缩短 40 个字符。:
strtrim(x, 40)
好的,我现在有更好的解决方案:)
x <- c("This is the longest sentence in world, so now just make it longer","No in fact, this is the longest sentence in entire world, world, world, world, the whole world")
extract <- function(x){
result <- stri_extract_first_regex(x, "^.{0,40}( |$)")
longer <- stri_length(x) > 40
result[longer] <- stri_paste(result[longer], "...")
result
}
extract(x)
## [1] "This is the longest sentence in world, ..." "No in fact, this is the longest sentence ..."
新旧基准(32000 句):
microbenchmark(sapply(x, cutAndAddDots, USE.NAMES = FALSE), extract(x), times=5)
Unit: milliseconds
expr min lq median uq max neval
sapply(x, cutAndAddDots, USE.NAMES = FALSE) 3762.51134 3762.92163 3767.87134 3776.03706 3788.139 5
extract(x) 56.01727 57.18771 58.50321 79.55759 97.924 5
旧版本
此解决方案需要 stringi
包并且总是在字符串末尾添加三个点 ...
。
require(stringi)
sapply(x, function(x) stri_paste(stri_wrap(x, 40)[1],"..."),USE.NAMES = FALSE)
## [1] "This is the longest sentence in world..." "No in fact, this is the longest..."
这个只对超过 40 个字符的句子添加三个点:
require(stringi)
cutAndAddDots <- function(x){
w <- stri_wrap(x, 40)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
sapply(x, cutAndAddDots, USE.NAMES = FALSE)
## [1] "This is the longest sentence in world" "No in fact, this is the longest..."
性能说明
在 stri_wrap
中设置 normalize=FALSE
可能会加快大约 3 倍(在 30 000 个句子上测试)
测试数据:
x <- stri_rand_lipsum(3000)
x <- unlist(stri_split_regex(x,"(?<=\.) "))
head(x)
[1] "Lorem ipsum dolor sit amet, vel commodo in."
[2] "Ultricies mauris sapien lectus dignissim."
[3] "Id pellentesque semper turpis habitasse egestas rutrum ligula vulputate laoreet mollis id."
[4] "Curabitur volutpat efficitur parturient nibh sociosqu, faucibus tellus, eleifend pretium, quis."
[5] "Feugiat vel mollis ultricies ut auctor."
[6] "Massa neque auctor lacus ridiculus."
stri_length(head(x))
[1] 43 41 90 95 39 35
cutAndAddDots <- function(x){
w <- stri_wrap(x, 40, normalize = FALSE)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
cutAndAddDotsNormalize <- function(x){
w <- stri_wrap(x, 40, normalize = TRUE)
if(length(w) > 1){
stri_paste(w[1],"...")
}else{
w[1]
}
}
require(microbenchmark)
microbenchmark(sapply(x, cutAndAddDots, USE.NAMES = FALSE),sapply(x, cutAndAddDotsNormalize, USE.NAMES = FALSE),times=3)
Unit: seconds
expr min lq median uq max
sapply(x, cutAndAddDots, USE.NAMES = FALSE) 3.917858 3.967411 4.016964 4.055571 4.094178
sapply(x, cutAndAddDotsNormalize, USE.NAMES = FALSE) 13.493732 13.651451 13.809170 13.917854 14.026538
基础 R 解决方案:
baseR <- function(x){
m <- regexpr("^.{0,40}( |$)", x)
result <- regmatches(x,m)
longer <- nchar(x)>40
result[longer] <- paste(result[longer],"...",sep = "")
result
}
baseR(x)==extract(x)
[1] TRUE TRUE
就像@bartektartanus extract
一样工作 :) 但是它更慢......我用他的代码生成的数据测试了这个。不过,如果您不想使用任何外部包 - 这个可以!
microbenchmark(baseR(x), extract(x))
Unit: milliseconds
expr min lq median uq max neval
baseR(x) 101.20905 107.0264 108.79086 111.03229 162.6375 100
extract(x) 52.83951 54.6931 55.46628 59.37808 103.0631 100
我想我也会 post 这个。绝对不是 stringi
速度,但也不算太寒酸。我需要一个绕过 str
的打印方法,所以我写了这个。
charTrunc <- function(x, width, end = " ...") {
ncw <- nchar(x) >= width
trm <- strtrim(x[ncw], width - nchar(end))
trimmed <- gsub("\s+$", "", trm)
replace(x, ncw, paste0(trimmed, end))
}
测试来自@bartektartanus 答案的字符串:
x <- stri_rand_lipsum(3000)
x <- unlist(stri_split_regex(x,"(?<=\.) "))
library(microbenchmark)
microbenchmark(charTrunc = {
out <- charTrunc(x, 40L)
},
times = 3
)
Unit: milliseconds
expr min lq mean median uq max neval
charTrunc 506.553 510.988 513.4603 515.423 516.9139 518.4049 3
head(out)
# [1] "Lorem ipsum dolor sit amet, venenati ..."
# [2] "Tincidunt at pellentesque id sociosq ..."
# [3] "At etiam quis et mauris non tincidun ..."
# [4] "In viverra aenean nisl ex aliquam du ..."
# [5] "Dui mi mauris ac lacus sit hac."
# [6] "Ultrices faucibus sed justo ridiculu ..."