字符串拆分、合并和堆叠多列
String Split, Merge, and Stack multiple columns
我有以下原始数据,如下所示:
# Sample input: one row per ID. `Name` packs several companies into one
# ";"-separated string, and `Name_location` packs the matching
# "Company (Location (Primary))" entries, possibly in a different order.
rawData <- data.frame(
  ID = c(1, 2, 3),
  Name = c(
    "Company B; Company A; Company C",
    "Company A; Company D",
    "Company E"
  ),
  Name_location = c(
    "Company A (USA (Primary)); Company B (Japan(Primary)); Company C (Korea,South (Primary))",
    "Company A (USA (Primary)); Company D (USA (Primary))",
    "European (Primary)"
  )
)
ID Name Name_location
1 Company B; Company A;Company C Company A (USA (Primary)); Company B (Japan(Primary)); Company C (Korea,South (Primary))
2 Company A; Company D Company A (USA (Primary)); Company D (USA (Primary))
3 Company E European (Primary)
我需要将数据转换为如下所示:
Name_location 字段包含 Name 字段中每家公司的位置信息,但顺序可能与 Name 字段不一致。此外,如果 Name 字段中只有 1 家公司,Name_location 字段只包含位置(例如 "European (Primary)");如果 Name 字段中有多家公司,Name_location 字段遵循语法“公司 (位置 (Primary)); 公司 (位置 (Primary))”。
我需要一种方法将公司及其位置隔离为单独的行,可通过 ID 识别。
# Expected result: one row per (ID, company) pair with that company's
# location already extracted.
IdealData <- data.frame(
  ID = c(1, 1, 1, 2, 2, 3),
  Name = c(
    "Company B", "Company A", "Company C",
    "Company A", "Company D",
    "Company E"
  ),
  Location = c(
    "Japan", "USA", "Korea,South",
    "USA", "USA",
    "European"
  )
)
ID Name Location
1 Company B Japan
1 Company A USA
1 Company C Korea,South
2 Company A USA
2 Company D USA
3 Company E European
希望在 R 中实现这一点
使用 separate_rows 将各行拆分之后,我们可以用 str_extract 提取具体的位置部分:
library(stringr)
library(dplyr)
library(tidyr)
# Expand rawData to one row per company, then look up each company's
# location from its "Company (Location (Primary))" entry.
#
# Backslashes in the patterns are doubled: "\s" and "\(" are not valid
# escapes in an R string literal, so the regex engine must be handed
# "\\s" and "\\(".
rawData %>%
  # Split both ";"-separated columns in parallel, one piece per row.
  separate_rows(c(Name, Name_location), sep = ";\\s*") %>%
  # Cut "Company A (USA (Primary))" at the whitespace before the first "(";
  # extra = "merge" keeps everything after that cut in `Location`.
  separate(Name_location, into = c('Name1', 'Location'), sep = "\\s+(?=\\()",
           extra = "merge") %>%
  # Match names within each ID only; Name_location may list the companies
  # in a different order than Name, and other IDs must not leak in.
  group_by(ID) %>%
  mutate(Location = case_when(
    # Single-company rows carry only the location, which lands in Name1.
    Name1 == 'European' ~ Name1,
    # Otherwise take the text right after the first "(" and reorder it so
    # it lines up with `Name`.
    TRUE ~ trimws(str_extract(Location, "(?<=\\()[^(]+"))[match(Name, Name1)]
  )) %>%
  ungroup() %>%
  select(-Name1)
# A tibble: 6 x 3
# ID Name Location
# <dbl> <chr> <chr>
#1 1 Company B Japan
#2 1 Company A USA
#3 1 Company C Korea,South
#4 2 Company A USA
#5 2 Company D USA
#6 3 Company E European
如果你不想依赖任何额外的包,也可以只用基础 R 逐行处理各个条目,并构建一个新的 data.frame:
# Base-R version: expand rawData to one row per (ID, company) and look up
# each company's location from the matching "Company (Location (Primary))"
# entry in Name_location.
rawData <- data.frame(
  "ID" = c(1, 2, 3),
  "Name" = c(
    "Company B; Company A; Company C",
    "Company A; Company D",
    "Company E"
  ),
  "Name_location" = c(
    "Company A (USA (Primary)); Company B (Japan(Primary)); Company C (Korea,South (Primary))",
    "Company A (USA (Primary)); Company D (USA (Primary))",
    "European (Primary)"
  )
)
# Guard against factor columns (the data.frame() default before R 4.0).
rawData$Name <- as.character(rawData$Name)
rawData$Name_location <- as.character(rawData$Name_location)

# Split one string on a literal separator and trim each piece.
# fixed = TRUE avoids regex escaping issues with "(".
split_trimmed <- function(x, sep) {
  trimws(strsplit(x, sep, fixed = TRUE)[[1]])
}

# One character vector of company names per input row.
company_list <- lapply(rawData$Name, split_trimmed, sep = ";")

# One character vector of locations per input row, aligned with company_list.
location_list <- Map(function(companies, name_location) {
  if (length(companies) == 1) {
    # Single company: the field is just "Location (Primary)".
    return(split_trimmed(name_location, "(")[1])
  }
  # Several companies: each entry is "Company (Location (Primary))"; split
  # on "(" so piece 1 is the company name and piece 2 is the location.
  parts <- lapply(split_trimmed(name_location, ";"), split_trimmed, sep = "(")
  entry_names <- vapply(parts, `[`, character(1), 1L)
  entry_locations <- vapply(parts, `[`, character(1), 2L)
  # Exact name match, so "Company A" never picks up "Company AB";
  # a company without an entry yields NA.
  entry_locations[match(companies, entry_names)]
}, company_list, rawData$Name_location)

idealData <- data.frame(
  ID = rep(rawData$ID, lengths(company_list)),
  Company = as.character(unlist(company_list, use.names = FALSE)),
  Location = as.character(unlist(location_list, use.names = FALSE)),
  stringsAsFactors = FALSE
)
给出输出:
> idealData
ID Company Location
1 1 Company B Japan
2 1 Company A USA
3 1 Company C Korea,South
4 2 Company A USA
5 2 Company D USA
6 3 Company E European
我有以下原始数据,如下所示:
# Sample input: one row per ID. `Name` packs several companies into one
# ";"-separated string, and `Name_location` packs the matching
# "Company (Location (Primary))" entries, possibly in a different order.
rawData <- data.frame(
  ID = c(1, 2, 3),
  Name = c(
    "Company B; Company A; Company C",
    "Company A; Company D",
    "Company E"
  ),
  Name_location = c(
    "Company A (USA (Primary)); Company B (Japan(Primary)); Company C (Korea,South (Primary))",
    "Company A (USA (Primary)); Company D (USA (Primary))",
    "European (Primary)"
  )
)
ID Name Name_location
1 Company B; Company A;Company C Company A (USA (Primary)); Company B (Japan(Primary)); Company C (Korea,South (Primary))
2 Company A; Company D Company A (USA (Primary)); Company D (USA (Primary))
3 Company E European (Primary)
我需要将数据转换为如下所示:
Name_location 字段包含 Name 字段中每家公司的位置信息,但顺序可能与 Name 字段不一致。此外,如果 Name 字段中只有 1 家公司,Name_location 字段只包含位置(例如 "European (Primary)");如果 Name 字段中有多家公司,Name_location 字段遵循语法“公司 (位置 (Primary)); 公司 (位置 (Primary))”。
我需要一种方法将公司及其位置隔离为单独的行,可通过 ID 识别。
# Expected result: one row per (ID, company) pair with that company's
# location already extracted.
IdealData <- data.frame(
  ID = c(1, 1, 1, 2, 2, 3),
  Name = c(
    "Company B", "Company A", "Company C",
    "Company A", "Company D",
    "Company E"
  ),
  Location = c(
    "Japan", "USA", "Korea,South",
    "USA", "USA",
    "European"
  )
)
ID Name Location
1 Company B Japan
1 Company A USA
1 Company C Korea,South
2 Company A USA
2 Company D USA
3 Company E European
希望在 R 中实现这一点
使用 separate_rows 将各行拆分之后,我们可以用 str_extract 提取具体的位置部分:
library(stringr)
library(dplyr)
library(tidyr)
# Expand rawData to one row per company, then look up each company's
# location from its "Company (Location (Primary))" entry.
#
# Backslashes in the patterns are doubled: "\s" and "\(" are not valid
# escapes in an R string literal, so the regex engine must be handed
# "\\s" and "\\(".
rawData %>%
  # Split both ";"-separated columns in parallel, one piece per row.
  separate_rows(c(Name, Name_location), sep = ";\\s*") %>%
  # Cut "Company A (USA (Primary))" at the whitespace before the first "(";
  # extra = "merge" keeps everything after that cut in `Location`.
  separate(Name_location, into = c('Name1', 'Location'), sep = "\\s+(?=\\()",
           extra = "merge") %>%
  # Match names within each ID only; Name_location may list the companies
  # in a different order than Name, and other IDs must not leak in.
  group_by(ID) %>%
  mutate(Location = case_when(
    # Single-company rows carry only the location, which lands in Name1.
    Name1 == 'European' ~ Name1,
    # Otherwise take the text right after the first "(" and reorder it so
    # it lines up with `Name`.
    TRUE ~ trimws(str_extract(Location, "(?<=\\()[^(]+"))[match(Name, Name1)]
  )) %>%
  ungroup() %>%
  select(-Name1)
# A tibble: 6 x 3
# ID Name Location
# <dbl> <chr> <chr>
#1 1 Company B Japan
#2 1 Company A USA
#3 1 Company C Korea,South
#4 2 Company A USA
#5 2 Company D USA
#6 3 Company E European
如果你不想依赖任何额外的包,也可以只用基础 R 逐行处理各个条目,并构建一个新的 data.frame:
# Base-R version: expand rawData to one row per (ID, company) and look up
# each company's location from the matching "Company (Location (Primary))"
# entry in Name_location.
rawData <- data.frame(
  "ID" = c(1, 2, 3),
  "Name" = c(
    "Company B; Company A; Company C",
    "Company A; Company D",
    "Company E"
  ),
  "Name_location" = c(
    "Company A (USA (Primary)); Company B (Japan(Primary)); Company C (Korea,South (Primary))",
    "Company A (USA (Primary)); Company D (USA (Primary))",
    "European (Primary)"
  )
)
# Guard against factor columns (the data.frame() default before R 4.0).
rawData$Name <- as.character(rawData$Name)
rawData$Name_location <- as.character(rawData$Name_location)

# Split one string on a literal separator and trim each piece.
# fixed = TRUE avoids regex escaping issues with "(".
split_trimmed <- function(x, sep) {
  trimws(strsplit(x, sep, fixed = TRUE)[[1]])
}

# One character vector of company names per input row.
company_list <- lapply(rawData$Name, split_trimmed, sep = ";")

# One character vector of locations per input row, aligned with company_list.
location_list <- Map(function(companies, name_location) {
  if (length(companies) == 1) {
    # Single company: the field is just "Location (Primary)".
    return(split_trimmed(name_location, "(")[1])
  }
  # Several companies: each entry is "Company (Location (Primary))"; split
  # on "(" so piece 1 is the company name and piece 2 is the location.
  parts <- lapply(split_trimmed(name_location, ";"), split_trimmed, sep = "(")
  entry_names <- vapply(parts, `[`, character(1), 1L)
  entry_locations <- vapply(parts, `[`, character(1), 2L)
  # Exact name match, so "Company A" never picks up "Company AB";
  # a company without an entry yields NA.
  entry_locations[match(companies, entry_names)]
}, company_list, rawData$Name_location)

idealData <- data.frame(
  ID = rep(rawData$ID, lengths(company_list)),
  Company = as.character(unlist(company_list, use.names = FALSE)),
  Location = as.character(unlist(location_list, use.names = FALSE)),
  stringsAsFactors = FALSE
)
给出输出:
> idealData
ID Company Location
1 1 Company B Japan
2 1 Company A USA
3 1 Company C Korea,South
4 2 Company A USA
5 2 Company D USA
6 3 Company E European