根据单词是否出现在多列中创建虚拟变量
Creating a dummy variable based on whether words appear in multiple columns
我正在处理一个大型数据框(df),用来研究跨国时间序列的抗议模式。我想根据某个或某一组选定的词语是否出现在若干列中的任意一列,来创建一个虚拟变量。数据附在下面。用文字描述,我想做的是:
如果 protesterdemand1、protesterdemand2、protesterdemand3、protesterdemand4 中的任意一列出现以下短语之一:(1) political behavior(政治行为)、(2) police brutality(警察暴行)或 (3) removal of politician(罢免政客),则创建一个虚拟变量 sensitive_issue,其值为 1,否则为 0。
谢谢!
# Example data in dput() form: a 6-row data.frame of protest records
# (all Canada, 1990) with protesterdemand1-4 and stateresponse1-7 columns.
# NOTE(review): the code further down refers to an object named `dat`;
# presumably this expression is meant to be assigned to it
# (dat <- structure(...)) -- confirm.
structure(list(Country = c("Canada", "Canada", "Canada", "Canada",
"Canada", "Canada"), COWcode = c(20L, 20L, 20L, 20L, 20L, 20L
), Year = c(1990L, 1990L, 1990L, 1990L, 1990L, 1990L), Region = c("North America",
"North America", "North America", "North America", "North America",
"North America"), Protest = c(1L, 1L, 1L, 1L, 1L, 1L), protesterviolence = c(0L,
0L, 0L, 1L, 1L, 0L), protesterdemand1 = c("political behavior, process",
"political behavior, process", "political behavior, process",
"land farm issue", "political behavior, process", "police brutality"
), protesterdemand2 = c("labor wage dispute", "", "", "", "",
""), protesterdemand3 = c("", "", "", "", "", ""), protesterdemand4 = c("",
"", "", "", "", ""), stateresponse1 = c("ignore", "ignore", "ignore",
"accomodation", "crowd dispersal", "crowd dispersal"), stateresponse2 = c("",
"", "", "", "arrests", "shootings"), stateresponse3 = c("", "",
"", "", "accomodation", ""), stateresponse4 = c("", "", "", "",
"", ""), stateresponse5 = c("", "", "", "", "", ""), stateresponse6 = c("",
"", "", "", "", ""), stateresponse7 = c("", "", "", "", "", ""
), participants = c("1000s", "1000", "500", "100s", "950", "200"
), participants_category = c("", "", "", "", "", "")), row.names = c(NA,
6L), class = "data.frame")
基础 R
# Flag rows of `dat` whose protester-demand columns mention a sensitive issue.

# All four demand columns must be searched (protesterdemand1 .. 4).
demand_cols <- paste0("protesterdemand", 1:4)
# Phrases that mark a sensitive issue, written as regex alternatives.
sensitive_pattern <-
  "political behavior|police brutality|removal of politician"

# One logical column per demand column: TRUE where the cell matches any phrase.
# lapply() + cbind() always yields a matrix, even for a one-row data frame,
# whereas sapply() would collapse to a plain vector and break rowSums().
found <- do.call(
  cbind,
  lapply(
    dat[demand_cols],
    grepl,
    pattern = sensitive_pattern,
    ignore.case = TRUE # ignore.case is just-in-case, over to you
  )
)
found
# protesterdemand1 protesterdemand2 protesterdemand3 protesterdemand4
# [1,] TRUE FALSE FALSE FALSE
# [2,] TRUE FALSE FALSE FALSE
# [3,] TRUE FALSE FALSE FALSE
# [4,] FALSE FALSE FALSE FALSE
# [5,] TRUE FALSE FALSE FALSE
# [6,] TRUE FALSE FALSE FALSE

# A row is sensitive when at least one of its demand columns matched.
# Wrap the right-hand side in as.integer() for a 0/1 dummy instead of
# TRUE/FALSE.
dat$sensitive_issue <- rowSums(found) > 0
dat
# Country COWcode Year Region Protest protesterviolence protesterdemand1 protesterdemand2 protesterdemand3
# 1 Canada 20 1990 North America 1 0 political behavior, process labor wage dispute
# 2 Canada 20 1990 North America 1 0 political behavior, process
# 3 Canada 20 1990 North America 1 0 political behavior, process
# 4 Canada 20 1990 North America 1 1 land farm issue
# 5 Canada 20 1990 North America 1 1 political behavior, process
# 6 Canada 20 1990 North America 1 0 police brutality
# protesterdemand4 stateresponse1 stateresponse2 stateresponse3 stateresponse4 stateresponse5 stateresponse6 stateresponse7
# 1 ignore
# 2 ignore
# 3 ignore
# 4 accomodation
# 5 crowd dispersal arrests accomodation
# 6 crowd dispersal shootings
# participants participants_category sensitive_issue
# 1 1000s TRUE
# 2 1000 TRUE
# 3 500 TRUE
# 4 100s FALSE
# 5 950 TRUE
# 6 200 TRUE
我正在处理一个大型数据框(df),用来研究跨国时间序列的抗议模式。我想根据某个或某一组选定的词语是否出现在若干列中的任意一列,来创建一个虚拟变量。数据附在下面。用文字描述,我想做的是:
如果 protesterdemand1、protesterdemand2、protesterdemand3、protesterdemand4 中的任意一列出现以下短语之一:(1) political behavior(政治行为)、(2) police brutality(警察暴行)或 (3) removal of politician(罢免政客),则创建一个虚拟变量 sensitive_issue,其值为 1,否则为 0。
谢谢!
# Example data in dput() form: a 6-row data.frame of protest records
# (all Canada, 1990) with protesterdemand1-4 and stateresponse1-7 columns.
# NOTE(review): the code further down refers to an object named `dat`;
# presumably this expression is meant to be assigned to it
# (dat <- structure(...)) -- confirm.
structure(list(Country = c("Canada", "Canada", "Canada", "Canada",
"Canada", "Canada"), COWcode = c(20L, 20L, 20L, 20L, 20L, 20L
), Year = c(1990L, 1990L, 1990L, 1990L, 1990L, 1990L), Region = c("North America",
"North America", "North America", "North America", "North America",
"North America"), Protest = c(1L, 1L, 1L, 1L, 1L, 1L), protesterviolence = c(0L,
0L, 0L, 1L, 1L, 0L), protesterdemand1 = c("political behavior, process",
"political behavior, process", "political behavior, process",
"land farm issue", "political behavior, process", "police brutality"
), protesterdemand2 = c("labor wage dispute", "", "", "", "",
""), protesterdemand3 = c("", "", "", "", "", ""), protesterdemand4 = c("",
"", "", "", "", ""), stateresponse1 = c("ignore", "ignore", "ignore",
"accomodation", "crowd dispersal", "crowd dispersal"), stateresponse2 = c("",
"", "", "", "arrests", "shootings"), stateresponse3 = c("", "",
"", "", "accomodation", ""), stateresponse4 = c("", "", "", "",
"", ""), stateresponse5 = c("", "", "", "", "", ""), stateresponse6 = c("",
"", "", "", "", ""), stateresponse7 = c("", "", "", "", "", ""
), participants = c("1000s", "1000", "500", "100s", "950", "200"
), participants_category = c("", "", "", "", "", "")), row.names = c(NA,
6L), class = "data.frame")
基础 R
# Flag rows of `dat` whose protester-demand columns mention a sensitive issue.

# All four demand columns must be searched (protesterdemand1 .. 4).
demand_cols <- paste0("protesterdemand", 1:4)
# Phrases that mark a sensitive issue, written as regex alternatives.
sensitive_pattern <-
  "political behavior|police brutality|removal of politician"

# One logical column per demand column: TRUE where the cell matches any phrase.
# lapply() + cbind() always yields a matrix, even for a one-row data frame,
# whereas sapply() would collapse to a plain vector and break rowSums().
found <- do.call(
  cbind,
  lapply(
    dat[demand_cols],
    grepl,
    pattern = sensitive_pattern,
    ignore.case = TRUE # ignore.case is just-in-case, over to you
  )
)
found
# protesterdemand1 protesterdemand2 protesterdemand3 protesterdemand4
# [1,] TRUE FALSE FALSE FALSE
# [2,] TRUE FALSE FALSE FALSE
# [3,] TRUE FALSE FALSE FALSE
# [4,] FALSE FALSE FALSE FALSE
# [5,] TRUE FALSE FALSE FALSE
# [6,] TRUE FALSE FALSE FALSE

# A row is sensitive when at least one of its demand columns matched.
# Wrap the right-hand side in as.integer() for a 0/1 dummy instead of
# TRUE/FALSE.
dat$sensitive_issue <- rowSums(found) > 0
dat
# Country COWcode Year Region Protest protesterviolence protesterdemand1 protesterdemand2 protesterdemand3
# 1 Canada 20 1990 North America 1 0 political behavior, process labor wage dispute
# 2 Canada 20 1990 North America 1 0 political behavior, process
# 3 Canada 20 1990 North America 1 0 political behavior, process
# 4 Canada 20 1990 North America 1 1 land farm issue
# 5 Canada 20 1990 North America 1 1 political behavior, process
# 6 Canada 20 1990 North America 1 0 police brutality
# protesterdemand4 stateresponse1 stateresponse2 stateresponse3 stateresponse4 stateresponse5 stateresponse6 stateresponse7
# 1 ignore
# 2 ignore
# 3 ignore
# 4 accomodation
# 5 crowd dispersal arrests accomodation
# 6 crowd dispersal shootings
# participants participants_category sensitive_issue
# 1 1000s TRUE
# 2 1000 TRUE
# 3 500 TRUE
# 4 100s FALSE
# 5 950 TRUE
# 6 200 TRUE