R中的文本清理
text cleaning in R
我在 R 中有一列如下所示:
Path Column
ag.1.4->ao.5.5->iv.9.12->ag.4.35
ao.11.234->iv.345.455.1.2->ag.9.531
我想将其转换为:
Path Column
ag->ao->iv->ag
ao->iv->ag
我该怎么做?
谢谢
这是我的全部数据输出:
# Asker's sample data (looks like dput() output -- TODO confirm): a two-row
# data.frame with columns Rank, Count, Percent and Path, where Path holds
# "->"-separated identifiers.
structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122), Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")), .Names = c("Rank",
"Count", "Percent", "Path"), class = "data.frame", row.names = c(NA,
-2L))
您可以使用 gsub 来匹配 . 以及 . 之后的数字（即 \.[0-9]+），并将其替换为空字符串 ''。
# Strip every ".<digits>" suffix from the path identifiers, in place.
# The backslash is doubled because the R string literal consumes one level of
# escaping; the regex engine then sees \.[0-9]+ (a literal dot plus digits).
# A single backslash ('\.') is an unrecognized escape and does not parse.
df1$Path.Column <- gsub("\\.[0-9]+", "", df1$Path.Column)
df1
# Path.Column
#1 ag -> ao -> iv -> ag
#2 ao -> iv -> ag
更新
对于新数据集df2
# Remove everything from the first dot of each path segment up to (but not
# including) the next "->" or the end of the string.
# Backslashes are doubled because R string literals consume one level of
# escaping: '\.' does not parse, and '\b' would be a literal backspace
# character instead of the regex word-boundary assertion.
# perl = TRUE is needed for the (?=...) lookahead.
gsub("\\.[^->]+(?=(->|\\b))", "", df2$Path, perl = TRUE)
#[1] "ao->ao->ao" "ao->agent"
以及针对 OP 帖子中显示的字符串：
# The example strings from the original question.
str2 <- c(
  "ag.1.4->ao.5.5->iv.9.12->ag.4.35",
  "ao.11.234->iv.345.455.1.2->ag.9.531"
)
# Drop the dotted suffix of every segment, keeping the "->" separators.
# Backslashes are doubled for the R string literal: '\.' does not parse and
# '\b' would be a backspace character rather than a word boundary.
# perl = TRUE is needed for the (?=...) lookahead.
gsub("\\.[^->]+(?=(->|\\b))", "", str2, perl = TRUE)
#[1] "ag->ao->iv->ag" "ao->iv->ag"
数据
# Sample data used by the snippets in this file.

# df1: a single character column of " -> "-separated identifiers, each
# carrying one numeric suffix.
df1 <- data.frame(
  Path.Column = c(
    "ag.1 -> ao.5 -> iv.9 -> ag.4",
    "ao.11 -> iv.345 -> ag.9"
  ),
  stringsAsFactors = FALSE
)

# df2: the asker's data, where the identifiers in Path carry dotted text
# suffixes that may contain spaces.
df2 <- data.frame(
  Rank = c(10394749L, 36749879L),
  Count = c(1L, 1L),
  Percent = c(0.001011122, 0.001011122),
  Path = c(
    "ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
    "ao.legacy payment.not_completed->agent.payment.completed"
  ),
  stringsAsFactors = FALSE
)
按 '->' 拆分字符串，然后分别处理各个子字符串，可能会更容易：
# Split each path into its "->"-separated segments.
# df2 is the sample data frame defined in this file; the bare name `df` is not
# defined here and would resolve to the stats::df function.
subStrings <- strsplit(df2$Path, "->", fixed = TRUE)
# Remove everything from the first dot onwards in each segment. The backslash
# is doubled for the R string literal ('\.' does not parse).
subStrings <- lapply(subStrings, function(x) gsub("\\..*", "", x))
# Paste the cleaned segments of each path back together. vapply() guarantees
# a character vector with one element per path.
vapply(subStrings, paste, character(1), collapse = "->")
#> "ao->ao->ao" "ao->agent"
或
# Split each path into its "->"-separated segments.
# df2 is the sample data frame defined in this file; the bare name `df` is not
# defined here and would resolve to the stats::df function.
subStrings <- strsplit(df2$Path, "->", fixed = TRUE)
# Remove each dot together with the run of non-blank characters after it, so
# words separated from the suffix by a space or tab are kept. The backslash is
# doubled for the R string literal ('\.' does not parse).
subStrings <- lapply(subStrings, function(x) gsub("\\.[^ \t]*", "", x))
# Paste the cleaned segments of each path back together. vapply() guarantees
# a character vector with one element per path.
vapply(subStrings, paste, character(1), collapse = "->")
#> "ao payment->ao payment->ao payment" "ao payment->agent"
我在 R 中有一列如下所示:
Path Column
ag.1.4->ao.5.5->iv.9.12->ag.4.35
ao.11.234->iv.345.455.1.2->ag.9.531
我想将其转换为:
Path Column
ag->ao->iv->ag
ao->iv->ag
我该怎么做?
谢谢
这是我的全部数据输出:
# Asker's sample data (looks like dput() output -- TODO confirm): a two-row
# data.frame with columns Rank, Count, Percent and Path, where Path holds
# "->"-separated identifiers.
structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122), Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")), .Names = c("Rank",
"Count", "Percent", "Path"), class = "data.frame", row.names = c(NA,
-2L))
您可以使用 gsub 来匹配 . 以及 . 之后的数字（即 \.[0-9]+），并将其替换为空字符串 ''。
# Strip every ".<digits>" suffix from the path identifiers, in place.
# The backslash is doubled because the R string literal consumes one level of
# escaping; the regex engine then sees \.[0-9]+ (a literal dot plus digits).
# A single backslash ('\.') is an unrecognized escape and does not parse.
df1$Path.Column <- gsub("\\.[0-9]+", "", df1$Path.Column)
df1
# Path.Column
#1 ag -> ao -> iv -> ag
#2 ao -> iv -> ag
更新
对于新数据集df2
# Remove everything from the first dot of each path segment up to (but not
# including) the next "->" or the end of the string.
# Backslashes are doubled because R string literals consume one level of
# escaping: '\.' does not parse, and '\b' would be a literal backspace
# character instead of the regex word-boundary assertion.
# perl = TRUE is needed for the (?=...) lookahead.
gsub("\\.[^->]+(?=(->|\\b))", "", df2$Path, perl = TRUE)
#[1] "ao->ao->ao" "ao->agent"
以及针对 OP 帖子中显示的字符串：
# The example strings from the original question.
str2 <- c(
  "ag.1.4->ao.5.5->iv.9.12->ag.4.35",
  "ao.11.234->iv.345.455.1.2->ag.9.531"
)
# Drop the dotted suffix of every segment, keeping the "->" separators.
# Backslashes are doubled for the R string literal: '\.' does not parse and
# '\b' would be a backspace character rather than a word boundary.
# perl = TRUE is needed for the (?=...) lookahead.
gsub("\\.[^->]+(?=(->|\\b))", "", str2, perl = TRUE)
#[1] "ag->ao->iv->ag" "ao->iv->ag"
数据
# Sample data used by the snippets in this file.

# df1: a single character column of " -> "-separated identifiers, each
# carrying one numeric suffix.
df1 <- data.frame(
  Path.Column = c(
    "ag.1 -> ao.5 -> iv.9 -> ag.4",
    "ao.11 -> iv.345 -> ag.9"
  ),
  stringsAsFactors = FALSE
)

# df2: the asker's data, where the identifiers in Path carry dotted text
# suffixes that may contain spaces.
df2 <- data.frame(
  Rank = c(10394749L, 36749879L),
  Count = c(1L, 1L),
  Percent = c(0.001011122, 0.001011122),
  Path = c(
    "ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
    "ao.legacy payment.not_completed->agent.payment.completed"
  ),
  stringsAsFactors = FALSE
)
按 '->' 拆分字符串，然后分别处理各个子字符串，可能会更容易：
# Split each path into its "->"-separated segments.
# df2 is the sample data frame defined in this file; the bare name `df` is not
# defined here and would resolve to the stats::df function.
subStrings <- strsplit(df2$Path, "->", fixed = TRUE)
# Remove everything from the first dot onwards in each segment. The backslash
# is doubled for the R string literal ('\.' does not parse).
subStrings <- lapply(subStrings, function(x) gsub("\\..*", "", x))
# Paste the cleaned segments of each path back together. vapply() guarantees
# a character vector with one element per path.
vapply(subStrings, paste, character(1), collapse = "->")
#> "ao->ao->ao" "ao->agent"
或
# Split each path into its "->"-separated segments.
# df2 is the sample data frame defined in this file; the bare name `df` is not
# defined here and would resolve to the stats::df function.
subStrings <- strsplit(df2$Path, "->", fixed = TRUE)
# Remove each dot together with the run of non-blank characters after it, so
# words separated from the suffix by a space or tab are kept. The backslash is
# doubled for the R string literal ('\.' does not parse).
subStrings <- lapply(subStrings, function(x) gsub("\\.[^ \t]*", "", x))
# Paste the cleaned segments of each path back together. vapply() guarantees
# a character vector with one element per path.
vapply(subStrings, paste, character(1), collapse = "->")
#> "ao payment->ao payment->ao payment" "ao payment->agent"