R分号将一列分隔成行
R semicolon delimited a column into rows
我正在使用 RStudio 2.15.0 并使用 XLConnect 从 Excel 创建了一个对象,其中有 3000 多行和 12 列知道这是否可能或如何去做。以下数据示例使用 3 列连接。在这方面的任何帮助都会很棒。
适用于其中 2 列的代码如下。
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\b\S+\b)(?=.*\b\1\b.*);', '', x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1), str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)
示例数据
PolId Description Document.Type
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
AAA123; TEST1 End1
AAA123; TEST2 End2
ABB123;ABC123; TEST3 End1
ABB123;ABC123; TEST3 End1
我希望输出是这样的(替换重复的 Polid 的)
PolId Description Document.Type
ABC123 TEST1 Pol1
ABC456 TEST1 Pol1
ABC789 TEST1 Pol1
AAA123 TEST1 End1
AAA123 TEST2 End2
ABB123 TEST3 End1
ABC123 TEST3 End1
这是一个快速 data.table
可能的解决方案
library(data.table)
unique(setDT(df)[, .(PolId = unlist(strsplit(as.character(PolId), ";"))), by = Description])
# Description PolId
# 1: TEST1 ABC123
# 2: TEST1 ABC456
# 3: TEST1 ABC789
# 4: TEST1 AAA123
# 5: TEST2 AAA123
# 6: TEST3 ABB123
# 7: TEST3 ABC123
根据您的编辑 - 另一个选项(如果您有超过两列)
library(splitstackshape)
unique(cSplit(df, "PolId", ";", "long"))
# PolId Description Document.Type
# 1: ABC123 TEST1 Pol1
# 2: ABC456 TEST1 Pol1
# 3: ABC789 TEST1 Pol1
# 4: AAA123 TEST1 End1
# 5: AAA123 TEST2 End2
# 6: ABB123 TEST3 End1
# 7: ABC123 TEST3 End1
您可以在拆分 "PolId" 列后从 tidyr
尝试 unnest
并获得 unique
行
library(dplyr)
library(tidyr)
unnest(setNames(strsplit(df$PolId, ';'), df$Description),
Description) %>% unique()
或使用 base R
和 stack/strsplit/duplicated
。用定界符(;
)分割"PolId"(strsplit
),用"Description"列命名输出列表元素,stack
列表得到一个'data.frame' 并使用 duplicated
删除重复行。
df1 <- stack(setNames(strsplit(df$PolId, ';'), df$Description))
setNames(df1[!duplicated(df1),], names(df))
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#10 AAA123 TEST1
#11 AAA123 TEST2
#12 ABB123 TEST3
#13 ABC123 TEST3
或不使用 strsplit
的其他选项
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\b\S+\b)(?=.*\b\1\b.*);', '', x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1), str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#4 AAA123 TEST1
#5 AAA123 TEST2
#6 ABB123 TEST3
#7 ABC123 TEST3
这是一个基本的 R 解决方案。使用 strplit
拆分 PolId
字段,并为每个这样的拆分字段将其与相应的 Description
绑定。这给出了我们 rbind
在一起的矩阵列表。最后设置列名。
out <- do.call(rbind, Map(cbind, strsplit(DF$PolId, ";"), DF$Description))
colnames(out) <- colnames(DF)
给予:
> out
PolId Description
[1,] "ABC123" "TEST1"
[2,] "ABC456" "TEST1"
[3,] "ABC789" "TEST1"
[4,] "ABC123" "TEST1"
[5,] "ABC456" "TEST1"
[6,] "ABC789" "TEST1"
[7,] "ABC123" "TEST1"
[8,] "ABC456" "TEST1"
[9,] "ABC789" "TEST1"
[10,] "AAA123" "TEST1"
[11,] "AAA123" "TEST2"
[12,] "ABB123" "TEST3"
[13,] "ABC123" "TEST3"
[14,] "ABB123" "TEST3"
[15,] "ABC123" "TEST3"
注:我们用这个作为输入:
DF <-
structure(list(PolId = c("ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;",
"ABC123;ABC456;ABC789;", "AAA123;", "AAA123;", "ABB123;ABC123;",
"ABB123;ABC123;"), Description = c("TEST1", "TEST1", "TEST1",
"TEST1", "TEST2", "TEST3", "TEST3")), .Names = c("PolId", "Description"
), class = "data.frame", row.names = c(NA, -7L))
我正在使用 RStudio 2.15.0 并使用 XLConnect 从 Excel 创建了一个对象,其中有 3000 多行和 12 列知道这是否可能或如何去做。以下数据示例使用 3 列连接。在这方面的任何帮助都会很棒。
适用于其中 2 列的代码如下。
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\b\S+\b)(?=.*\b\1\b.*);', '', x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1), str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)
示例数据
PolId Description Document.Type
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
AAA123; TEST1 End1
AAA123; TEST2 End2
ABB123;ABC123; TEST3 End1
ABB123;ABC123; TEST3 End1
我希望输出是这样的(替换重复的 Polid 的)
PolId Description Document.Type
ABC123 TEST1 Pol1
ABC456 TEST1 Pol1
ABC789 TEST1 Pol1
AAA123 TEST1 End1
AAA123 TEST2 End2
ABB123 TEST3 End1
ABC123 TEST3 End1
这是一个快速 data.table
可能的解决方案
library(data.table)
unique(setDT(df)[, .(PolId = unlist(strsplit(as.character(PolId), ";"))), by = Description])
# Description PolId
# 1: TEST1 ABC123
# 2: TEST1 ABC456
# 3: TEST1 ABC789
# 4: TEST1 AAA123
# 5: TEST2 AAA123
# 6: TEST3 ABB123
# 7: TEST3 ABC123
根据您的编辑 - 另一个选项(如果您有超过两列)
library(splitstackshape)
unique(cSplit(df, "PolId", ";", "long"))
# PolId Description Document.Type
# 1: ABC123 TEST1 Pol1
# 2: ABC456 TEST1 Pol1
# 3: ABC789 TEST1 Pol1
# 4: AAA123 TEST1 End1
# 5: AAA123 TEST2 End2
# 6: ABB123 TEST3 End1
# 7: ABC123 TEST3 End1
您可以在拆分 "PolId" 列后从 tidyr
尝试 unnest
并获得 unique
行
library(dplyr)
library(tidyr)
unnest(setNames(strsplit(df$PolId, ';'), df$Description),
Description) %>% unique()
或使用 base R
和 stack/strsplit/duplicated
。用定界符(;
)分割"PolId"(strsplit
),用"Description"列命名输出列表元素,stack
列表得到一个'data.frame' 并使用 duplicated
删除重复行。
df1 <- stack(setNames(strsplit(df$PolId, ';'), df$Description))
setNames(df1[!duplicated(df1),], names(df))
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#10 AAA123 TEST1
#11 AAA123 TEST2
#12 ABB123 TEST3
#13 ABC123 TEST3
或不使用 strsplit
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\b\S+\b)(?=.*\b\1\b.*);', '', x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1), str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#4 AAA123 TEST1
#5 AAA123 TEST2
#6 ABB123 TEST3
#7 ABC123 TEST3
这是一个基本的 R 解决方案。使用 strplit
拆分 PolId
字段,并为每个这样的拆分字段将其与相应的 Description
绑定。这给出了我们 rbind
在一起的矩阵列表。最后设置列名。
out <- do.call(rbind, Map(cbind, strsplit(DF$PolId, ";"), DF$Description))
colnames(out) <- colnames(DF)
给予:
> out
PolId Description
[1,] "ABC123" "TEST1"
[2,] "ABC456" "TEST1"
[3,] "ABC789" "TEST1"
[4,] "ABC123" "TEST1"
[5,] "ABC456" "TEST1"
[6,] "ABC789" "TEST1"
[7,] "ABC123" "TEST1"
[8,] "ABC456" "TEST1"
[9,] "ABC789" "TEST1"
[10,] "AAA123" "TEST1"
[11,] "AAA123" "TEST2"
[12,] "ABB123" "TEST3"
[13,] "ABC123" "TEST3"
[14,] "ABB123" "TEST3"
[15,] "ABC123" "TEST3"
注:我们用这个作为输入:
DF <-
structure(list(PolId = c("ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;",
"ABC123;ABC456;ABC789;", "AAA123;", "AAA123;", "ABB123;ABC123;",
"ABB123;ABC123;"), Description = c("TEST1", "TEST1", "TEST1",
"TEST1", "TEST2", "TEST3", "TEST3")), .Names = c("PolId", "Description"
), class = "data.frame", row.names = c(NA, -7L))