通过检测一个字符串中的任何多个字符串模式来创建变量
Making a variable from detection of any multiple string pattern in one string
end_result_tbl
此 end_result_tbl 是来自不同选民文件的理想格式示例。
ID GEN_16 GEN_14 GEN_08 PP_16 PR_16 PR_15 PR_14
0001 1 1 1 1 0 0 0
0002 0 0 0 0 1 0 1
0003 1 1 1 0 0 0 0
0004 1 0 1 0 0 0 1
0005 1 0 1 1 1 0 1
raw_data_tbl
ID Voter_History
0001 GE 20161108;20121106 GE;20081104 GE;20080205 PP;General Election 2004
0002 2016 GENERAL ELECTION;2014 GENERAL ELECTION
0003 20121106 GE;20081104 GE;General Election 2006
0004 GE 20150910
0005 16 GENERAL ELECTION; 14 PRIMARY ELECTION
希望根据每个文本字符串的条件字符串匹配为每次选举创建变量。
每次选举在数据中大约有 9 种不同的写法。只要其中任何一种写法出现在选民的投票记录中,就填入“1”,表示该选民在该次选举中投了票;如果一种都不匹配,则填入“0”,表示未投票。
以下是 2016 年 11 月大选的各种写法:
# Known spellings of the November 2016 general election as they appear in the
# raw voter-history strings. "16 GENERAL ELECTION" is listed twice, exactly as
# in the original list.
GEN_16 <- c(
  "20161108 GE",
  "16 GENERAL ELECTION",
  "GENERAL 2016",
  "GENERAL ELECTION 2016",
  "2016 GENERAL ELECTION",
  "GENERAL ELECTION, 2016",
  "16 GENERAL ELECTION",
  "GE 20161108"
)
这是我尝试过的方法(仅尝试 2016 年大选):
# Flag each voter with 1 when the history contains ANY of the GEN_16 spellings
# and 0 otherwise. The spellings are joined into a single regex alternation;
# this is safe because none of them contains a regex metacharacter.
raw_data_tbl$GEN_16 <- as.integer(
  grepl(paste(GEN_16, collapse = "|"), raw_data_tbl$Voter_History)
)
# Positions of the spellings that equal an ENTIRE history string. `%in%` is an
# exact match, so it does not find spellings embedded in a longer history.
which(GEN_16 %in% raw_data_tbl$Voter_History)
require(dplyr)
# Count how many voter histories contain each known spelling of the election.
Sequences <- GEN_16
Database <- raw_data_tbl$Voter_History
# One logical column per spelling, one row per voter. grepl() always returns a
# vector as long as Database, so the columns can be bound together safely
# (grep() returns match indices of varying length and cannot).
df <- as.data.frame(
  do.call(
    cbind,
    lapply(Sequences, function(x) grepl(x, Database, fixed = TRUE))
  )
)
names(df) <- Sequences
# Number of matching voters per spelling.
stats <- colSums(df)
cbind(Sequences, as.numeric(stats))
这实际上是 sql 中的一段非常简单但超长的代码,但很难找到它在 R 中的等效代码。
raw_data_tbl 大约有 1700 万条选民记录。
任何方向都非常感谢,提前致谢。
你可以试试这个 -
library(stringr)
library(tidyverse)
# Read the raw lines of the input file.
txt <- readLines("test.txt")
# Replace only the FIRST run of whitespace on each line with a comma, so the
# ID is separated from the history while spaces inside the history are kept.
txt <- sub("\\s+", ",", txt)
# Parse the delimited lines into a data frame; ID is read as character so the
# leading zeros are preserved.
# NOTE(review): assumes the history text itself contains no commas - confirm
# against the real data before using "," as the separator.
df <- read.table(
  text = txt,
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = ",",
  colClasses = c("ID" = "character")
)
初始数据框看起来像
> df
# ID Voter_History
#1 0001 GE 20161108;20121106 GE;20081104 GE;20080205 PP;General Election 2004
#2 0002 2016 GENERAL ELECTION;2014 GENERAL ELECTION
#3 0003 20121106 GE;20081104 GE;General Election 2006
#4 0004 GE 20150910
#5 0005 16 GENERAL ELECTION; 14 PRIMARY ELECTION
清理 Voter_History
列的数据以提取有用的信息
#' Normalise one voter-history string into sorted "TYPE_YY" election codes.
#'
#' Each ";"-separated entry is reduced to the first two letters of its first
#' word (upper-cased) and the two-digit year taken from its first run of
#' digits, e.g. "20161108 GE" -> "GE_16", "14 PRIMARY ELECTION" -> "PR_14".
#' Entries without a digit run or without letters are dropped.
#'
#' @param x A single character string of election entries separated by ";".
#' @return A single string with the codes sorted and joined by ";".
election_func <- function(x) {
  entries <- strsplit(x, split = ";")[[1]]

  # First regex match in each element; NA where the pattern does not occur.
  first_match <- function(pattern, text) {
    hit <- regexpr(pattern, text)
    found <- !is.na(hit) & hit > 0
    out <- rep(NA_character_, length(text))
    out[found] <- regmatches(text, hit)
    out
  }

  # extract year: keep the first four digits (the year of a yyyymmdd date or a
  # plain yyyy), then its last two digits. Taking the tail instead of deleting
  # every "20" keeps years such as 2020 intact.
  digits <- substr(first_match("[0-9]+", entries), 1, 4)
  yr <- ifelse(
    nchar(digits) > 2,
    substr(digits, nchar(digits) - 1, nchar(digits)),
    digits
  )

  # extract election type: first two letters of the first word
  elec_type <- toupper(substr(first_match("[A-Za-z]+", entries), 1, 2))

  # Skip entries that could not be parsed rather than emitting "NA" codes.
  usable <- !is.na(yr) & !is.na(elec_type)
  paste(sort(paste(elec_type[usable], yr[usable], sep = "_")), collapse = ";")
}
# Normalise every voter's history. vapply() guarantees a plain character
# vector with one string per row (also for a zero-row data frame), instead of
# the one-column matrix that do.call(rbind, ...) produces.
df$Voter_History <- vapply(
  df$Voter_History,
  election_func,
  character(1),
  USE.NAMES = FALSE
)
清理后的数据是
> df
# ID Voter_History
#1 0001 GE_04;GE_08;GE_12;GE_16;PP_08
#2 0002 GE_14;GE_16
#3 0003 GE_06;GE_08;GE_12
#4 0004 GE_15
#5 0005 GE_16;PR_14
最终将此数据转换为所需格式
# Reshape to one row per voter and one 0/1 column per election code.
df1 <- df %>%
  # one row per (ID, election code)
  separate_rows(Voter_History, sep = ";") %>%
  # drop repeated (ID, code) pairs so each output cell has a single value
  distinct(ID, Voter_History) %>%
  mutate(value = 1) %>%
  # names_sort = TRUE orders the election columns alphabetically; elections
  # a voter has no row for are filled with 0
  pivot_wider(
    names_from = Voter_History,
    values_from = value,
    values_fill = 0,
    names_sort = TRUE
  )
df1
# ID GE_04 GE_06 GE_08 GE_12 GE_14 GE_15 GE_16 PP_08 PR_14
#1 0001 1 0 1 1 0 0 1 1 0
#2 0002 0 0 0 0 1 0 1 0 0
#3 0003 0 1 1 1 0 0 0 0 0
#4 0004 0 0 0 0 0 1 0 0 0
#5 0005 0 0 0 0 0 0 1 0 1
示例数据: test.txt
包含
ID Voter_History
0001 GE 20161108;20121106 GE;20081104 GE;20080205 PP;General Election 2004
0002 2016 GENERAL ELECTION;2014 GENERAL ELECTION
0003 20121106 GE;20081104 GE;General Election 2006
0004 GE 20150910
0005 16 GENERAL ELECTION; 14 PRIMARY ELECTION
(更新:添加了一段逻辑来解决 Error: Duplicate identifiers for rows... 错误。该错误是由于传给 spread 调用的数据中存在重复的 ID 和 Voter_History 组合而产生的。)
end_result_tbl
此 end_result_tbl 是来自不同选民文件的理想格式示例。
ID GEN_16 GEN_14 GEN_08 PP_16 PR_16 PR_15 PR_14
0001 1 1 1 1 0 0 0
0002 0 0 0 0 1 0 1
0003 1 1 1 0 0 0 0
0004 1 0 1 0 0 0 1
0005 1 0 1 1 1 0 1
raw_data_tbl
ID Voter_History
0001 GE 20161108;20121106 GE;20081104 GE;20080205 PP;General Election 2004
0002 2016 GENERAL ELECTION;2014 GENERAL ELECTION
0003 20121106 GE;20081104 GE;General Election 2006
0004 GE 20150910
0005 16 GENERAL ELECTION; 14 PRIMARY ELECTION
希望根据每个文本字符串的条件字符串匹配为每次选举创建变量。
每次选举在数据中大约有 9 种不同的写法。只要其中任何一种写法出现在选民的投票记录中,就填入“1”,表示该选民在该次选举中投了票;如果一种都不匹配,则填入“0”,表示未投票。
以下是 2016 年 11 月大选的各种写法:
# Known spellings of the November 2016 general election as they appear in the
# raw voter-history strings. "16 GENERAL ELECTION" is listed twice, exactly as
# in the original list.
GEN_16 <- c(
  "20161108 GE",
  "16 GENERAL ELECTION",
  "GENERAL 2016",
  "GENERAL ELECTION 2016",
  "2016 GENERAL ELECTION",
  "GENERAL ELECTION, 2016",
  "16 GENERAL ELECTION",
  "GE 20161108"
)
这是我尝试过的方法(仅尝试 2016 年大选):
# Flag each voter with 1 when the history contains ANY of the GEN_16 spellings
# and 0 otherwise. The spellings are joined into a single regex alternation;
# this is safe because none of them contains a regex metacharacter.
raw_data_tbl$GEN_16 <- as.integer(
  grepl(paste(GEN_16, collapse = "|"), raw_data_tbl$Voter_History)
)
# Positions of the spellings that equal an ENTIRE history string. `%in%` is an
# exact match, so it does not find spellings embedded in a longer history.
which(GEN_16 %in% raw_data_tbl$Voter_History)
require(dplyr)
# Count how many voter histories contain each known spelling of the election.
Sequences <- GEN_16
Database <- raw_data_tbl$Voter_History
# One logical column per spelling, one row per voter. grepl() always returns a
# vector as long as Database, so the columns can be bound together safely
# (grep() returns match indices of varying length and cannot).
df <- as.data.frame(
  do.call(
    cbind,
    lapply(Sequences, function(x) grepl(x, Database, fixed = TRUE))
  )
)
names(df) <- Sequences
# Number of matching voters per spelling.
stats <- colSums(df)
cbind(Sequences, as.numeric(stats))
这实际上是 sql 中的一段非常简单但超长的代码,但很难找到它在 R 中的等效代码。
raw_data_tbl 大约有 1700 万条选民记录。
任何方向都非常感谢,提前致谢。
你可以试试这个 -
library(stringr)
library(tidyverse)
# Read the raw lines of the input file.
txt <- readLines("test.txt")
# Replace only the FIRST run of whitespace on each line with a comma, so the
# ID is separated from the history while spaces inside the history are kept.
txt <- sub("\\s+", ",", txt)
# Parse the delimited lines into a data frame; ID is read as character so the
# leading zeros are preserved.
# NOTE(review): assumes the history text itself contains no commas - confirm
# against the real data before using "," as the separator.
df <- read.table(
  text = txt,
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = ",",
  colClasses = c("ID" = "character")
)
初始数据框看起来像
> df
# ID Voter_History
#1 0001 GE 20161108;20121106 GE;20081104 GE;20080205 PP;General Election 2004
#2 0002 2016 GENERAL ELECTION;2014 GENERAL ELECTION
#3 0003 20121106 GE;20081104 GE;General Election 2006
#4 0004 GE 20150910
#5 0005 16 GENERAL ELECTION; 14 PRIMARY ELECTION
清理 Voter_History
列的数据以提取有用的信息
#' Normalise one voter-history string into sorted "TYPE_YY" election codes.
#'
#' Each ";"-separated entry is reduced to the first two letters of its first
#' word (upper-cased) and the two-digit year taken from its first run of
#' digits, e.g. "20161108 GE" -> "GE_16", "14 PRIMARY ELECTION" -> "PR_14".
#' Entries without a digit run or without letters are dropped.
#'
#' @param x A single character string of election entries separated by ";".
#' @return A single string with the codes sorted and joined by ";".
election_func <- function(x) {
  entries <- strsplit(x, split = ";")[[1]]

  # First regex match in each element; NA where the pattern does not occur.
  first_match <- function(pattern, text) {
    hit <- regexpr(pattern, text)
    found <- !is.na(hit) & hit > 0
    out <- rep(NA_character_, length(text))
    out[found] <- regmatches(text, hit)
    out
  }

  # extract year: keep the first four digits (the year of a yyyymmdd date or a
  # plain yyyy), then its last two digits. Taking the tail instead of deleting
  # every "20" keeps years such as 2020 intact.
  digits <- substr(first_match("[0-9]+", entries), 1, 4)
  yr <- ifelse(
    nchar(digits) > 2,
    substr(digits, nchar(digits) - 1, nchar(digits)),
    digits
  )

  # extract election type: first two letters of the first word
  elec_type <- toupper(substr(first_match("[A-Za-z]+", entries), 1, 2))

  # Skip entries that could not be parsed rather than emitting "NA" codes.
  usable <- !is.na(yr) & !is.na(elec_type)
  paste(sort(paste(elec_type[usable], yr[usable], sep = "_")), collapse = ";")
}
# Normalise every voter's history. vapply() guarantees a plain character
# vector with one string per row (also for a zero-row data frame), instead of
# the one-column matrix that do.call(rbind, ...) produces.
df$Voter_History <- vapply(
  df$Voter_History,
  election_func,
  character(1),
  USE.NAMES = FALSE
)
清理后的数据是
> df
# ID Voter_History
#1 0001 GE_04;GE_08;GE_12;GE_16;PP_08
#2 0002 GE_14;GE_16
#3 0003 GE_06;GE_08;GE_12
#4 0004 GE_15
#5 0005 GE_16;PR_14
最终将此数据转换为所需格式
# Reshape to one row per voter and one 0/1 column per election code.
df1 <- df %>%
  # one row per (ID, election code)
  separate_rows(Voter_History, sep = ";") %>%
  # drop repeated (ID, code) pairs so each output cell has a single value
  distinct(ID, Voter_History) %>%
  mutate(value = 1) %>%
  # names_sort = TRUE orders the election columns alphabetically; elections
  # a voter has no row for are filled with 0
  pivot_wider(
    names_from = Voter_History,
    values_from = value,
    values_fill = 0,
    names_sort = TRUE
  )
df1
# ID GE_04 GE_06 GE_08 GE_12 GE_14 GE_15 GE_16 PP_08 PR_14
#1 0001 1 0 1 1 0 0 1 1 0
#2 0002 0 0 0 0 1 0 1 0 0
#3 0003 0 1 1 1 0 0 0 0 0
#4 0004 0 0 0 0 0 1 0 0 0
#5 0005 0 0 0 0 0 0 1 0 1
示例数据: test.txt
包含
ID Voter_History
0001 GE 20161108;20121106 GE;20081104 GE;20080205 PP;General Election 2004
0002 2016 GENERAL ELECTION;2014 GENERAL ELECTION
0003 20121106 GE;20081104 GE;General Election 2006
0004 GE 20150910
0005 16 GENERAL ELECTION; 14 PRIMARY ELECTION
(更新:添加了一段逻辑来解决 Error: Duplicate identifiers for rows... 错误。该错误是由于传给 spread 调用的数据中存在重复的 ID 和 Voter_History 组合而产生的。)