将字符串列表中的每个字符串元素编码为 R 数据帧列中的数字列表
encode each string element in list of strings to list of numbers within R dataframe columns
我是 R 的新手,希望实现在这篇帖子中看到的内容。也就是说,对于我的 df 中的每一列,我想把每个字符串元素编码为一个虚拟变量(数字),然后把每个编码后的值存回列表中。我在 Python 中完成过此操作,但不确定在 R 中执行此操作的最佳方法。预期输出见下文。
目前
| Databases | Cryptocurrency | Visual
|:------------------------------------------------:|:------------------------------------------------------:|:----------------------------------------------------------:|
|c("PostgreSQL","MySQL","SQL") |c("Ethereum","Cryptocurrency","Bitcoin","Blockchain") |c("BrandDesign","GraphicDesign","LogoDesign","PackageDesign")
| MSSQLManagement |character(0) |c("BrandDesign", "GraphicDesign", "LogoDesign")
|c("MongoDB","Redis","MySQL") |c("Bitcoin", "Blockchain") |character(0)
|c("RedisManagement") | Cryptocurrency |c("BrandDesign", "GraphicDesign")
|c("MySQL","MemcachedManagement","MongoDB","Redis")|c("Cryptocurrency", "Ethereum", "Blockchain") | GraphicDesign
预期输出:
| Databases | Cryptocurrency | Visual |
|:---------:|:----------------:|:------------:|
|c(1,2,3) |c(1,2,3,4) |c(1,2,3,4) |
| 4 | 0 |c(1, 2, 3) |
|c(5,6,2) |c(3, 4) | 0 |
|c(6) | 2 |c(1, 2) |
|c(2,7,5,6) |c(2, 1, 4) | 2 |
最终目标是在 KNN 模型上训练数据。
编辑:
下面是 dput(df[1:25, c(11, 17)]) 的输出:
structure(list(Technical = list(c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), character(0), character(0),
c("Documentation", "TechnicalDesign", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), character(0), c("Architecture", "Documentation", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), c("Documentation", "TechnicalDesign", "RequirementsGathering"
), character(0), character(0), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
)), Tools = list(c("Photoshop", "Sketch", "InVision", "Illustrator",
"Zeplin"), c("GoogleAnalytics", "GoogleAdsense", "MailChimp",
"GoogleAdwords"), character(0), c("FacebookAds", "GoogleAnalytics",
"MailChimp"), c("FacebookAds", "GoogleAnalytics", "MailChimp"
), c("Kubernetes", "Vagrant"), c("Gulp", "Vagrant"), character(0),
c("GoogleAnalytics", "GoogleAdwords"), c("Hubspot", "Kissmetrics",
"InstagramAds", "FacebookAds", "LinkedInAds", "Optimizely",
"GoogleAnalytics"), "GoogleAnalytics", c("FacebookAds", "GoogleAnalytics",
"GoogleAdsense", "InstagramAds", "GoogleAdwords"), c("GoogleAnalytics",
"TwitterAds", "MailChimp"), c("GoogleAnalytics", "MailChimp",
"Mixpanel"), c("GoogleAnalytics", "MailChimp", "GoogleAdwords"
), c("Photoshop", "Sketch", "InVision", "Illustrator"), character(0),
character(0), character(0), c("FacebookAds", "GoogleAnalytics",
"Optimizely", "GoogleAdwords", "Mixpanel")), ProjectManagement = list(
c("AgileMethodology", "ClientManagement", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), "ClientManagement", c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement"), character(0), c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), character(0), c("ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "Scrum"
), "AgileMethodology", c("AgileMethodology", "Scrum"), c("AgileMethodology",
"Scrum")), Visual = list(c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign",
"LogoDesign"), character(0), character(0), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), character(0), character(0), character(0),
character(0), c("BrandDesign", "GraphicDesign", "LogoDesign"
), character(0), character(0), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), character(0), character(0),
character(0), character(0), character(0), c("BrandDesign",
"GraphicDesign", "LogoDesign")), ConfigurationManagement = list(
character(0), "Chef", character(0), character(0), character(0),
c("Ansible", "Terraform", "Puppet"), character(0), character(0),
character(0), c("Puppet", "Chef"), c("Terraform", "Chef"),
character(0), character(0), character(0), character(0), character(0),
character(0), character(0), "Chef", character(0)), Containers = list(
character(0), "Docker", character(0), character(0), character(0),
"Docker", "Docker", "Docker", "Docker", "Docker", "Docker",
"Docker", character(0), character(0), "Docker", "Docker",
character(0), character(0), "Docker", "Docker"), Cryptocurrency = list(
character(0), character(0), character(0), character(0), character(0),
c("Ethereum", "Cryptocurrency", "Bitcoin", "Blockchain"),
character(0), character(0), character(0), c("Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), character(0), character(0), character(0),
character(0), character(0), character(0), character(0), character(0))), row.names = c(NA,
20L), class = "data.frame")
编写一个重新编码值的函数。
#' Recode comma-separated labels as integer codes
#'
#' Splits each string of `x` on commas (plus any whitespace that follows) and
#' replaces every label with its position among the unique labels of the whole
#' vector, numbered in order of first appearance.
#'
#' @param x A character vector; each element holds zero or more labels
#'   separated by commas.
#' @return A list with one integer vector per element of `x`; an empty string
#'   yields `integer(0)`.
recode_values <- function(x) {
  # The backslash must be doubled: "\s" is not a valid escape in an R string
  # literal, so the regex \s has to be written as "\\s".
  labels <- strsplit(x, ",\\s*")
  flat <- unlist(labels)
  # relist() restores the per-element grouping produced by strsplit().
  relist(match(flat, unique(flat)), labels)
}
清理字符串并将其应用于多个列。
library(dplyr)
# Strip the characters [, ] and ' from each selected column, then recode the
# remaining comma-separated labels with recode_values().
# Presumably the columns hold strings such as "['a', 'b']" -- confirm on df.
# The backslashes are doubled because "\[" is not a valid escape in an R
# string literal; the regex \[ must be written as "\\[".
df %>%
  mutate(across(
    c(AutomatedTesting, Cryptocurrency),
    ~ recode_values(gsub("\\[|\\]|'", "", .))
  ))
# AutomatedTesting Cryptocurrency
#1
#2 1, 2, 3, 4
#3 1
#4 2
#5
#6 2, 1 1, 2, 3, 4
#7 1, 2, 5
#8 2, 1
#9 2, 3, 1
#10 1, 2, 6, 3, 5 1, 2, 3, 4
#11 2, 6, 1 5, 1, 2, 3, 4
#12 2, 1 5, 1, 2, 3, 4
#13
#14
#15 1, 2, 6, 3, 4
#16 1, 2, 6, 5
#17 2, 3, 1
#18
#19 2
#20 4, 5, 6, 3, 1, 2
#21 2, 4
#22 4, 5, 6, 3, 1, 2
#23 2, 1
#24 2, 6, 1
#25 2, 6, 3, 1
我是 R 的新手,希望实现在 post 中看到的内容:
| Databases | Cryptocurrency | Visual
|:------------------------------------------------:|:------------------------------------------------------:|:----------------------------------------------------------:|
|c("PostgreSQL","MySQL","SQL") |c("Ethereum","Cryptocurrency","Bitcoin","Blockchain") |c("BrandDesign","GraphicDesign","LogoDesign","PackageDesign")
| MSSQLManagement |character(0) |c("BrandDesign", "GraphicDesign", "LogoDesign")
|c("MongoDB","Redis","MySQL") |c("Bitcoin", "Blockchain") |character(0)
|c("RedisManagement") | Cryptocurrency |c("BrandDesign", "GraphicDesign")
|c("MySQL","MemcachedManagement","MongoDB","Redis")|c("Cryptocurrency", "Ethereum", "Blockchain") | GraphicDesign
预期输出:
| Databases | Cryptocurrency | Visual |
|:---------:|:----------------:|:------------:|
|c(1,2,3) |c(1,2,3,4) |c(1,2,3,4) |
| 4 | 0 |c(1, 2, 3) |
|c(5,6,2) |c(3, 4) | 0 |
|c(6) | 2 |c(1, 2) |
|c(2,7,5,6) |c(2, 1, 4) | 2 |
最终目标是在 KNN 模型上训练数据。
编辑: 下面是 dput(df[1:25, c(11, 17)]) 的输出:
structure(list(Technical = list(c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), character(0), character(0),
c("Documentation", "TechnicalDesign", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), character(0), c("Architecture", "Documentation", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), c("Architecture", "TechnicalDesign", "Documentation",
"RequirementsGathering"), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
), c("Documentation", "TechnicalDesign", "RequirementsGathering"
), character(0), character(0), c("Architecture", "TechnicalDesign",
"Documentation", "RequirementsGathering"), c("Architecture",
"TechnicalDesign", "Documentation", "RequirementsGathering"
)), Tools = list(c("Photoshop", "Sketch", "InVision", "Illustrator",
"Zeplin"), c("GoogleAnalytics", "GoogleAdsense", "MailChimp",
"GoogleAdwords"), character(0), c("FacebookAds", "GoogleAnalytics",
"MailChimp"), c("FacebookAds", "GoogleAnalytics", "MailChimp"
), c("Kubernetes", "Vagrant"), c("Gulp", "Vagrant"), character(0),
c("GoogleAnalytics", "GoogleAdwords"), c("Hubspot", "Kissmetrics",
"InstagramAds", "FacebookAds", "LinkedInAds", "Optimizely",
"GoogleAnalytics"), "GoogleAnalytics", c("FacebookAds", "GoogleAnalytics",
"GoogleAdsense", "InstagramAds", "GoogleAdwords"), c("GoogleAnalytics",
"TwitterAds", "MailChimp"), c("GoogleAnalytics", "MailChimp",
"Mixpanel"), c("GoogleAnalytics", "MailChimp", "GoogleAdwords"
), c("Photoshop", "Sketch", "InVision", "Illustrator"), character(0),
character(0), character(0), c("FacebookAds", "GoogleAnalytics",
"Optimizely", "GoogleAdwords", "Mixpanel")), ProjectManagement = list(
c("AgileMethodology", "ClientManagement", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), "ClientManagement", c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement"), character(0), c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), character(0), c("ClientManagement",
"Scrum"), c("AgileMethodology", "Scrum"), c("AgileMethodology",
"ClientManagement", "Scrum"), c("AgileMethodology", "Scrum"
), "AgileMethodology", c("AgileMethodology", "Scrum"), c("AgileMethodology",
"Scrum")), Visual = list(c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign",
"LogoDesign"), character(0), character(0), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), character(0), character(0), character(0),
character(0), c("BrandDesign", "GraphicDesign", "LogoDesign"
), character(0), character(0), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), c("BrandDesign", "GraphicDesign",
"LogoDesign", "PackageDesign"), character(0), character(0),
character(0), character(0), character(0), c("BrandDesign",
"GraphicDesign", "LogoDesign")), ConfigurationManagement = list(
character(0), "Chef", character(0), character(0), character(0),
c("Ansible", "Terraform", "Puppet"), character(0), character(0),
character(0), c("Puppet", "Chef"), c("Terraform", "Chef"),
character(0), character(0), character(0), character(0), character(0),
character(0), character(0), "Chef", character(0)), Containers = list(
character(0), "Docker", character(0), character(0), character(0),
"Docker", "Docker", "Docker", "Docker", "Docker", "Docker",
"Docker", character(0), character(0), "Docker", "Docker",
character(0), character(0), "Docker", "Docker"), Cryptocurrency = list(
character(0), character(0), character(0), character(0), character(0),
c("Ethereum", "Cryptocurrency", "Bitcoin", "Blockchain"),
character(0), character(0), character(0), c("Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), c("Solidity", "Ethereum", "Cryptocurrency",
"Bitcoin", "Blockchain"), character(0), character(0), character(0),
character(0), character(0), character(0), character(0), character(0))), row.names = c(NA,
20L), class = "data.frame")
编写一个重新编码值的函数。
#' Recode comma-separated labels as integer codes
#'
#' Splits each string of `x` on commas (plus any whitespace that follows) and
#' replaces every label with its position among the unique labels of the whole
#' vector, numbered in order of first appearance.
#'
#' @param x A character vector; each element holds zero or more labels
#'   separated by commas.
#' @return A list with one integer vector per element of `x`; an empty string
#'   yields `integer(0)`.
recode_values <- function(x) {
  # The backslash must be doubled: "\s" is not a valid escape in an R string
  # literal, so the regex \s has to be written as "\\s".
  labels <- strsplit(x, ",\\s*")
  flat <- unlist(labels)
  # relist() restores the per-element grouping produced by strsplit().
  relist(match(flat, unique(flat)), labels)
}
清理字符串并将其应用于多个列。
library(dplyr)
# Strip the characters [, ] and ' from each selected column, then recode the
# remaining comma-separated labels with recode_values().
# Presumably the columns hold strings such as "['a', 'b']" -- confirm on df.
# The backslashes are doubled because "\[" is not a valid escape in an R
# string literal; the regex \[ must be written as "\\[".
df %>%
  mutate(across(
    c(AutomatedTesting, Cryptocurrency),
    ~ recode_values(gsub("\\[|\\]|'", "", .))
  ))
# AutomatedTesting Cryptocurrency
#1
#2 1, 2, 3, 4
#3 1
#4 2
#5
#6 2, 1 1, 2, 3, 4
#7 1, 2, 5
#8 2, 1
#9 2, 3, 1
#10 1, 2, 6, 3, 5 1, 2, 3, 4
#11 2, 6, 1 5, 1, 2, 3, 4
#12 2, 1 5, 1, 2, 3, 4
#13
#14
#15 1, 2, 6, 3, 4
#16 1, 2, 6, 5
#17 2, 3, 1
#18
#19 2
#20 4, 5, 6, 3, 1, 2
#21 2, 4
#22 4, 5, 6, 3, 1, 2
#23 2, 1
#24 2, 6, 1
#25 2, 6, 3, 1