电子邮件的 R 数据整理
R Data Wrangling for Emails
需要帮助!这是一个与工作相关的项目。我需要清理 16,000 个电子邮件地址......否则就只能手动完成了 :( 我需要找到一种方法,从电子邮件地址中提取域名并将其放入新列中,同时把姓名也解析到新列中,并且仍保留原始电子邮件地址。现有数据只有一部分是完整的。
# Packages: tidyr supplies separate(); magrittr supplies the %>% pipe.
library(tidyr)
library(magrittr)

# Sample records. Missing entries are stored as the literal string "NA",
# not as a real NA value.
Company <- c("NA", "NA", "NA")
Last.Name <- c("Doe", "NA", "NA")
First.Name <- c("John", "JDoe", "NA")
Email.Address <- c(
  "john.doe@abccorp.com",
  "jdoe@cisco.com",
  "johnd@widgetco.com"
)

data <- data.frame(Email.Address, First.Name, Last.Name, Company)

# Attempt: split the address on "@". Only one output column ("Company") is
# named, although each address splits into two pieces.
separate_DF <- data %>%
  separate(col = Email.Address, into = "Company", sep = "@")
试试这个
# Fill the gaps in First.Name / Last.Name / Company with values parsed from
# the e-mail address. The address column and any existing values are kept.
df <- data.frame(Email.Address, First.Name, Last.Name, Company,
                 stringsAsFactors = FALSE)

# Return the i-th element of x, or NA when x is too short, so that a single
# malformed address (e.g. one without "@") yields NA instead of aborting the
# whole run with a "subscript out of bounds" error.
nth_or_na <- function(x, i) {
  if (length(x) >= i) x[[i]] else NA_character_
}

# Split every address once: local part before "@", domain after it.
address_parts <- strsplit(df$Email.Address, "@", fixed = TRUE)
local_part <- vapply(address_parts, nth_or_na, character(1), i = 1L)
domain <- vapply(address_parts, nth_or_na, character(1), i = 2L)

# Company = first label of the domain ("abccorp" in "abccorp.com").
Corp <- vapply(strsplit(domain, ".", fixed = TRUE), nth_or_na,
               character(1), i = 1L)

# First name = first dot-separated token of the local part.
name_tokens <- strsplit(local_part, ".", fixed = TRUE)
F.Name <- vapply(name_tokens, nth_or_na, character(1), i = 1L)

# Last name = last token, but only when the local part really has at least
# two tokens. Testing the token count (rather than L.Name == F.Name) keeps
# addresses such as "smith.smith@..." from losing their last name.
L.Name <- vapply(
  name_tokens,
  function(x) if (length(x) >= 2L) x[[length(x)]] else NA_character_,
  character(1)
)

# Same column order as `df`; cells are matched by position below, so the
# column names of OUT do not matter.
OUT <- data.frame(df$Email.Address, F.Name, L.Name, Corp,
                  stringsAsFactors = FALSE)

# A cell counts as missing when it is a real NA or the literal string "NA".
# is.na() comes first so that real NA cells evaluate to TRUE, never NA.
missing_cell <- is.na(df) | df == "NA"
df[missing_cell] <- OUT[missing_cell]
df
tidyr 包中的 separate 函数看起来也不错。
http://blog.rstudio.org/2014/07/22/introducing-tidyr/
根据您提供的信息,这也有效:
library(tidyr)

# Alternative using tidyr::separate(): derive name and company columns from
# the e-mail address.
df <- data.frame(Email.Address, First.Name, Last.Name, Company,
                 stringsAsFactors = FALSE)

# remove = FALSE keeps the original Email.Address column next to the new
# ones; separate() drops its input column by default.
df2 <- separate(df, Email.Address, into = c("Name", "Corp"), sep = "@",
                remove = FALSE)

# Local part -> first / last name. Addresses without a "." have one piece
# only; fill = "right" sets L.Name to NA for those rows without a warning.
df2 <- separate(df2, Name, into = c("F.Name", "L.Name"), sep = "[.]",
                extra = "drop", fill = "right")

# Domain -> company label plus the following label ("com"); any further
# labels are dropped.
df2 <- separate(df2, Corp, into = c("Corp", "com"), sep = "[.]",
                extra = "drop")
需要帮助!这是一个与工作相关的项目。我需要清理 16,000 个电子邮件地址......否则就只能手动完成了 :( 我需要找到一种方法,从电子邮件地址中提取域名并将其放入新列中,同时把姓名也解析到新列中,并且仍保留原始电子邮件地址。现有数据只有一部分是完整的。
# Packages: tidyr supplies separate(); magrittr supplies the %>% pipe.
library(tidyr)
library(magrittr)

# Sample records. Missing entries are stored as the literal string "NA",
# not as a real NA value.
Company <- c("NA", "NA", "NA")
Last.Name <- c("Doe", "NA", "NA")
First.Name <- c("John", "JDoe", "NA")
Email.Address <- c(
  "john.doe@abccorp.com",
  "jdoe@cisco.com",
  "johnd@widgetco.com"
)

data <- data.frame(Email.Address, First.Name, Last.Name, Company)

# Attempt: split the address on "@". Only one output column ("Company") is
# named, although each address splits into two pieces.
separate_DF <- data %>%
  separate(col = Email.Address, into = "Company", sep = "@")
试试这个
# Fill the gaps in First.Name / Last.Name / Company with values parsed from
# the e-mail address. The address column and any existing values are kept.
df <- data.frame(Email.Address, First.Name, Last.Name, Company,
                 stringsAsFactors = FALSE)

# Return the i-th element of x, or NA when x is too short, so that a single
# malformed address (e.g. one without "@") yields NA instead of aborting the
# whole run with a "subscript out of bounds" error.
nth_or_na <- function(x, i) {
  if (length(x) >= i) x[[i]] else NA_character_
}

# Split every address once: local part before "@", domain after it.
address_parts <- strsplit(df$Email.Address, "@", fixed = TRUE)
local_part <- vapply(address_parts, nth_or_na, character(1), i = 1L)
domain <- vapply(address_parts, nth_or_na, character(1), i = 2L)

# Company = first label of the domain ("abccorp" in "abccorp.com").
Corp <- vapply(strsplit(domain, ".", fixed = TRUE), nth_or_na,
               character(1), i = 1L)

# First name = first dot-separated token of the local part.
name_tokens <- strsplit(local_part, ".", fixed = TRUE)
F.Name <- vapply(name_tokens, nth_or_na, character(1), i = 1L)

# Last name = last token, but only when the local part really has at least
# two tokens. Testing the token count (rather than L.Name == F.Name) keeps
# addresses such as "smith.smith@..." from losing their last name.
L.Name <- vapply(
  name_tokens,
  function(x) if (length(x) >= 2L) x[[length(x)]] else NA_character_,
  character(1)
)

# Same column order as `df`; cells are matched by position below, so the
# column names of OUT do not matter.
OUT <- data.frame(df$Email.Address, F.Name, L.Name, Corp,
                  stringsAsFactors = FALSE)

# A cell counts as missing when it is a real NA or the literal string "NA".
# is.na() comes first so that real NA cells evaluate to TRUE, never NA.
missing_cell <- is.na(df) | df == "NA"
df[missing_cell] <- OUT[missing_cell]
df
tidyr 包中的 separate 函数看起来也不错。
http://blog.rstudio.org/2014/07/22/introducing-tidyr/
根据您提供的信息,这也有效:
library(tidyr)

# Alternative using tidyr::separate(): derive name and company columns from
# the e-mail address.
df <- data.frame(Email.Address, First.Name, Last.Name, Company,
                 stringsAsFactors = FALSE)

# remove = FALSE keeps the original Email.Address column next to the new
# ones; separate() drops its input column by default.
df2 <- separate(df, Email.Address, into = c("Name", "Corp"), sep = "@",
                remove = FALSE)

# Local part -> first / last name. Addresses without a "." have one piece
# only; fill = "right" sets L.Name to NA for those rows without a warning.
df2 <- separate(df2, Name, into = c("F.Name", "L.Name"), sep = "[.]",
                extra = "drop", fill = "right")

# Domain -> company label plus the following label ("com"); any further
# labels are dropped.
df2 <- separate(df2, Corp, into = c("Corp", "com"), sep = "[.]",
                extra = "drop")