将多值属性分离为单个属性 R

Separating multi-valued attributes into individual attributes R

我正在使用 Stack Overflow 开发人员调查数据集,并试图根据所使用的技术和所使用的协作工具来预测薪酬。这两个属性是多值的,用分号分隔各个值。

例如,在 CollabToolsWorkedWith 属性下的某一行中,值为 Confluence;Jira;Github;Slack;Microsoft Teams;Google Suite。我想为其中的每一个值单独建立一列,如果该行包含该值则为 1,否则为 0。

最终结果是:CollabToolsWorkedWith 下出现的每个取值都对应一列,每列的值为 0 或 1,具体取决于该行是否包含该取值。

如果您提供一些每个人都可以快速访问的示例数据,下次您可能会得到更快的答复。我在网上找到了2020年的数据。这是我的回答:

# Read the survey export. The relative path means the CSV must be in the
# current working directory.
# NOTE: the workspace is deliberately not cleared with rm(list = ls());
# wiping the caller's global environment is a side effect this script does
# not need.
df <- read.csv("survey_results_public.csv")

# List every column name to locate the collaboration-tools column, then
# tabulate its raw values.
data.frame(colnames(df))
table(df$NEWCollabToolsWorkedWith)

# Normalise the column to lower-case character so that the string matching
# further down can use lower-case patterns.
df$NEWCollabToolsWorkedWith <- tolower(
  as.character(df$NEWCollabToolsWorkedWith)
)

# Packages used to split the multi-valued column and tidy the result.
library(tidyverse)
library(splitstackshape)
library(janitor)

# Keep only the collaboration-tools column and split it on ";" into one
# column per position (wide layout).
namesdf <- df %>%
  select(NEWCollabToolsWorkedWith)
namesdf <- cSplit(
  namesdf,
  "NEWCollabToolsWorkedWith",
  sep = ";",
  direction = "wide",
  drop = TRUE,
  type.convert = TRUE
)

# Stack the split columns on top of each other and keep each distinct
# tool/platform once.
long_data_frame <- namesdf %>%
  pivot_longer(
    # every column produced by the split above
    cols = starts_with("NEWCollabToolsWorkedWith"),
    # holds the source column name; discarded by distinct() below
    names_to = "unique",
    values_drop_na = TRUE
  ) %>%
  distinct(value)

# Turn the tool names into syntactically clean identifiers.
long_data_frame$value <- janitor::make_clean_names(
  as.character(long_data_frame$value)
)

# Final unique list of tools.
table(long_data_frame$value)

> table(long_data_frame$value)

                confluence         facebook_workplace                     github                     gitlab 
                         1                          1                          1                          1 
google_suite_docs_meet_etc                       jira            microsoft_azure            microsoft_teams 
                         1                          1                          1                          1 
                     slack   stack_overflow_for_teams                     trello 
                         1                          1                          1 

# Build one 0/1 indicator column per collaboration tool.
# Names are the new column names; values are the lower-case substrings
# searched for in NEWCollabToolsWorkedWith (which was lower-cased earlier).
tool_patterns <- c(
  confluence = "confluence",
  jira = "jira",
  slack = "slack",
  microsoft_azure = "microsoft azure",
  trello = "trello",
  github = "github",
  gitlab = "gitlab",
  google_suite_docs_meet_etc = "google",
  microsoft_teams = "microsoft teams",
  stack_overflow_for_teams = "overflow",
  facebook_workplace = "facebook"
)

# grepl() returns FALSE for NA input, so rows with no answer get 0 in every
# indicator column. fixed = TRUE matches the patterns literally rather than
# as regular expressions.
for (tool in names(tool_patterns)) {
  df[[tool]] <- as.integer(
    grepl(tool_patterns[[tool]], df$NEWCollabToolsWorkedWith, fixed = TRUE)
  )
}

# Sanity check: distribution of one of the new indicator columns.
table(df$facebook_workplace)

> table(df$facebook_workplace)

    0     1 
62881  1580