R 中的函数(使用 dplyr)
function in R (with dplyr)
我写了一个能满足自己需求的 R 脚本,但我知道可以通过使用函数让它变得更好(更简洁)。遗憾的是,我的多次尝试都没有成功。谁能给我指明正确的方向?以下是我的原始脚本。
# Load dplyr for filter(), mutate() and the %>% pipe.
library(dplyr)
# Read the market data from a CSV file in the working directory.
apples <- read.csv("JoburgApples.csv")
# Step 1: for each variety keyword, keep the rows whose ProductName contains
# the keyword and whose tvaluesold exceeds 10000, and tag them with a
# display name in a new Variety column.
grs <- apples %>% filter(grepl("GRANNY", ProductName), tvaluesold >10000) %>% mutate(Variety = "Granny Smith")
cpp <- apples %>% filter(grepl("PINK", ProductName), tvaluesold >10000) %>% mutate(Variety = "Cripps Pink")
top <- apples %>% filter(grepl("TOP", ProductName), tvaluesold >10000) %>% mutate(Variety = "Top Red")
gld <- apples %>% filter(grepl("GOLDEN", ProductName), tvaluesold >10000) %>% mutate(Variety = "Golden Delicious")
ski <- apples %>% filter(grepl("STARKING", ProductName), tvaluesold >10000) %>% mutate(Variety = "Starking")
bra <- apples %>% filter(grepl("BRAEBURN", ProductName), tvaluesold >10000) %>% mutate(Variety = "Braeburn")
# Stack the per-variety subsets and overwrite apples; rows that matched none
# of the keywords are no longer present after this line.
apples <- rbind(grs, cpp, top, gld, ski, bra)
# Step 2: for each carton count, keep the rows whose ProductName ends with
# that number and record it in a numeric Count column.
# NOTE(review): the patterns are anchored only at the end of the string, so
# "80$" also matches a ProductName ending in "180", which would then be
# labelled Count = 80.
s70 <- apples %>% filter(grepl("70$", ProductName)) %>% mutate(Count = 70)
s80 <- apples %>% filter(grepl("80$", ProductName)) %>% mutate(Count = 80)
s90 <- apples %>% filter(grepl("90$", ProductName)) %>% mutate(Count = 90)
s100 <- apples %>% filter(grepl("100$", ProductName)) %>% mutate(Count = 100)
s110 <- apples %>% filter(grepl("110$", ProductName)) %>% mutate(Count = 110)
s120 <- apples %>% filter(grepl("120$", ProductName)) %>% mutate(Count = 120)
s135 <- apples %>% filter(grepl("135$", ProductName)) %>% mutate(Count = 135)
s150 <- apples %>% filter(grepl("150$", ProductName)) %>% mutate(Count = 150)
s165 <- apples %>% filter(grepl("165$", ProductName)) %>% mutate(Count = 165)
# Stack the per-count subsets, overwriting apples a second time.
apples <- rbind(s70, s80, s90, s100, s110, s120, s135, s150, s165)
编辑:.csv 文件的链接(https://github.com/fderyckel/showcases/blob/master/JoburgMarket/JoburgApples.csv)
> UnitMass ProductName tvaluesold tquantitysold tkgsold avgprice highestprice date
> 18.50KG CARTON CRIPPS PINK,CL 1,100 200 1 18.5 200 200 06/11/14
> 18.50KG CARTON CRIPPS RED,CL 1,70 200 1 18.5 200 200 06/11/14
> 18.50KG CARTON TOPRED,CL 1,180 1300 10 185 130 130 06/11/14
> 18.50KG CARTON GOLDEN DELICIOUS,CL 1,90 22700 108 1998 210.19 240 06/11/14
> 18.50KG CARTON STARKING,CL 1,80 17920 115 2127.5 155.83 230 06/11/14
> 18.50KG CARTON GRANNY SMITH,CL 1,135 1800 12 222 150 150 06/11/14
> 18.50KG CARTON TOPRED,CL 1,90 1730 12 222 144.17 190 06/11/14
> 18.50KG CARTON CRIPPS PINK,CL 1,90 2600 13 240.5 200 200 06/11/14
> 18.50KG CARTON GOLDEN DELICIOUS,CL 1,120 22800 136 2516 167.65 180 06/11/14
> 18.50KG CARTON GOLDEN DELICIOUS,CL 1,135 21810 136 2516 160.37 180 06/11/14
> 18.50KG CARTON GRANNY SMITH,CL 1,70 2380 14 259 170 220 06/11/14
> 18.50KG CARTON GRANNY SMITH,CL 1,165 1200 15 277.5 80 80 06/11/14
在此先感谢您的帮助。
弗朗索瓦
假设您想把 "ProductName" 的前缀部分替换为自定义名称,可以使用 qdap 包中的 mgsub。它会把 "ProductName" 中匹配到的元素替换为修改后的名称。先根据 tvaluesold 创建逻辑向量 indx,然后创建 Variety 列(初始值为 NA),再把 indx 为 TRUE 的行的 Variety 改为修改后的 ProductName。如果您想得到一个新的数据集,通过 !is.na(apples$Variety) 取子集会更容易。
library(qdap)

# Logical mask: TRUE for rows whose tvaluesold is above 10000.
indx <- apples$tvaluesold > 10000

# v1 holds the labels as spelled in ProductName; v2 holds the display name
# for the label at the same position.
v1 <- c(
  "GRANNY SMITH", "CRIPPS PINK", "TOPRED",
  "GOLDEN DELICIOUS", "STARKING", "BRAEBURN"
)
v2 <- c(
  "Granny Smith", "Cripps Pink", "Top Red",
  "Golden Delicious", "Starking", "Braeburn"
)

# Start Variety as ProductName with everything from the first comma removed.
apples$Variety <- sub(",.*", "", apples$ProductName)

# On the masked rows only, swap each v1 label for its v2 display name.
apples[indx, "Variety"] <- mgsub(v1, v2, apples[indx, "Variety"])

# Keep just the masked rows and preview the first three.
apples1 <- apples[indx, ]
head(apples1, 3)
# UnitMass ProductName tvaluesold tquantitysold tkgsold
#4 18.50KG CARTON GOLDEN DELICIOUS,CL 1,90 22700 108 1998.0
#5 18.50KG CARTON STARKING,CL 1,80 17920 115 2127.5
#9 18.50KG CARTON GOLDEN DELICIOUS,CL 1,120 22800 136 2516.0
# avgprice highestprice date Variety
#4 210.19 240 2014-11-06 Golden Delicious
#5 155.83 230 2014-11-06 Starking
#9 167.65 180 2014-11-06 Golden Delicious
或仅使用 base R
# Look up each ProductName prefix (the text before the first comma) in v1 and
# take the display name at the same position in v2; prefixes that are not in
# v1 become NA.
apples$Variety <- v2[match(sub(",.*", "", apples$ProductName), v1)]
# Keep only the rows selected by indx.
apples1 <- apples[indx, ]
对于第二种情况,可以先用 sub 提取最后一个逗号(,)之后的数字,再用 %in% 创建逻辑向量 indx2。
# Numeric value of whatever follows the last comma in ProductName
# (text that is not a number becomes NA).
val1 <- as.numeric(sub(".*,", "", apples$ProductName))

# TRUE where that value is one of the listed carton counts.
indx2 <- val1 %in% c(70, 80, 90, 100, 110, 120, 135, 150, 165)

# Count holds the value on the flagged rows and NA everywhere else.
apples$Count <- NA
apples[indx2, "Count"] <- val1[indx2]

# Drop the rows that did not get a Count and preview the first three.
apples2 <- apples[!is.na(apples$Count), ]
head(apples2, 3)
# UnitMass ProductName tvaluesold tquantitysold tkgsold
#1 18.50KG CARTON CRIPPS PINK,CL 1,100 200 1 18.5
#2 18.50KG CARTON CRIPPS RED,CL 1,70 200 1 18.5
#4 18.50KG CARTON GOLDEN DELICIOUS,CL 1,90 22700 108 1998.0
# avgprice highestprice date Variety Count
#1 200.00 200 2014-11-06 CRIPPS PINK 100
#2 200.00 200 2014-11-06 CRIPPS RED 70
#4 210.19 240 2014-11-06 Golden Delicious 90
更新
您也可以使用 dplyr
创建列
library(dplyr)

# Keep rows with tvaluesold above 10000, then map each ProductName prefix
# (the text before the first comma) from v1 to its v2 display name.
apples %>%
  filter(tvaluesold > 10000) %>%
  mutate(Variety = setNames(v2, v1)[sub(",.*", "", ProductName)])
创建Count
列
# Keep the rows flagged by indx2 and attach the matching entries of val1 as
# Count. indx2 and val1 must have been computed from this same apples data
# frame so that they line up with it row by row.
apples %>%
  filter(indx2) %>%
  mutate(Count = val1[indx2])
更新2
如果要提取"ProductName"的"first"和"last",另一种选择是
library(tidyr)

# Split ProductName into its first comma-separated field (Variety) and its
# third field (Count), replacing the ProductName column. Then keep rows with
# tvaluesold above 10000 whose Count parses as a number; as.numeric() yields
# NA (with a coercion warning) for any non-numeric Count.
res1 <- extract(apples, ProductName, c("Variety", "Count"),
                '([^,]+),[^,]+,([^,]+)') %>%
  filter(tvaluesold > 10000L & !is.na(as.numeric(Count)))
数据
url <- "https://raw.githubusercontent.com/fderyckel/showcases/master/JoburgMarket/JoburgApples.csv"
library(RCurl)
# Download the raw CSV text as a single character string.
x <- getURL(url)
# Parse it via text = so that read.csv() creates and closes its own text
# connection; passing an already-open textConnection(x) leaves that
# connection open afterwards.
apples <- read.csv(text = x, stringsAsFactors = FALSE)
也许您需要的是:
# Keep rows with tvaluesold above 10000 whose ProductName ends in digits, then
# derive Variety (the text before the first comma) and Count (the number after
# the last comma).
# The backslash is doubled because "\d" is not a valid escape in an R string
# literal; the regex engine receives \d.
apples %>%
  filter(tvaluesold > 10000L & grepl(".*\\d+$", ProductName)) %>%
  mutate(
    Variety = sub(",.*", "", ProductName),
    Count = as.numeric(sub(".*,", "", ProductName))
  )
我写了一个能满足自己需求的 R 脚本,但我知道可以通过使用函数让它变得更好(更简洁)。遗憾的是,我的多次尝试都没有成功。谁能给我指明正确的方向?以下是我的原始脚本。
# Load dplyr for filter(), mutate() and the %>% pipe.
library(dplyr)
# Read the market data from a CSV file in the working directory.
apples <- read.csv("JoburgApples.csv")
# Step 1: for each variety keyword, keep the rows whose ProductName contains
# the keyword and whose tvaluesold exceeds 10000, and tag them with a
# display name in a new Variety column.
grs <- apples %>% filter(grepl("GRANNY", ProductName), tvaluesold >10000) %>% mutate(Variety = "Granny Smith")
cpp <- apples %>% filter(grepl("PINK", ProductName), tvaluesold >10000) %>% mutate(Variety = "Cripps Pink")
top <- apples %>% filter(grepl("TOP", ProductName), tvaluesold >10000) %>% mutate(Variety = "Top Red")
gld <- apples %>% filter(grepl("GOLDEN", ProductName), tvaluesold >10000) %>% mutate(Variety = "Golden Delicious")
ski <- apples %>% filter(grepl("STARKING", ProductName), tvaluesold >10000) %>% mutate(Variety = "Starking")
bra <- apples %>% filter(grepl("BRAEBURN", ProductName), tvaluesold >10000) %>% mutate(Variety = "Braeburn")
# Stack the per-variety subsets and overwrite apples; rows that matched none
# of the keywords are no longer present after this line.
apples <- rbind(grs, cpp, top, gld, ski, bra)
# Step 2: for each carton count, keep the rows whose ProductName ends with
# that number and record it in a numeric Count column.
# NOTE(review): the patterns are anchored only at the end of the string, so
# "80$" also matches a ProductName ending in "180", which would then be
# labelled Count = 80.
s70 <- apples %>% filter(grepl("70$", ProductName)) %>% mutate(Count = 70)
s80 <- apples %>% filter(grepl("80$", ProductName)) %>% mutate(Count = 80)
s90 <- apples %>% filter(grepl("90$", ProductName)) %>% mutate(Count = 90)
s100 <- apples %>% filter(grepl("100$", ProductName)) %>% mutate(Count = 100)
s110 <- apples %>% filter(grepl("110$", ProductName)) %>% mutate(Count = 110)
s120 <- apples %>% filter(grepl("120$", ProductName)) %>% mutate(Count = 120)
s135 <- apples %>% filter(grepl("135$", ProductName)) %>% mutate(Count = 135)
s150 <- apples %>% filter(grepl("150$", ProductName)) %>% mutate(Count = 150)
s165 <- apples %>% filter(grepl("165$", ProductName)) %>% mutate(Count = 165)
# Stack the per-count subsets, overwriting apples a second time.
apples <- rbind(s70, s80, s90, s100, s110, s120, s135, s150, s165)
编辑:.csv 文件的链接(https://github.com/fderyckel/showcases/blob/master/JoburgMarket/JoburgApples.csv)
> UnitMass ProductName tvaluesold tquantitysold tkgsold avgprice highestprice date
> 18.50KG CARTON CRIPPS PINK,CL 1,100 200 1 18.5 200 200 06/11/14
> 18.50KG CARTON CRIPPS RED,CL 1,70 200 1 18.5 200 200 06/11/14
> 18.50KG CARTON TOPRED,CL 1,180 1300 10 185 130 130 06/11/14
> 18.50KG CARTON GOLDEN DELICIOUS,CL 1,90 22700 108 1998 210.19 240 06/11/14
> 18.50KG CARTON STARKING,CL 1,80 17920 115 2127.5 155.83 230 06/11/14
> 18.50KG CARTON GRANNY SMITH,CL 1,135 1800 12 222 150 150 06/11/14
> 18.50KG CARTON TOPRED,CL 1,90 1730 12 222 144.17 190 06/11/14
> 18.50KG CARTON CRIPPS PINK,CL 1,90 2600 13 240.5 200 200 06/11/14
> 18.50KG CARTON GOLDEN DELICIOUS,CL 1,120 22800 136 2516 167.65 180 06/11/14
> 18.50KG CARTON GOLDEN DELICIOUS,CL 1,135 21810 136 2516 160.37 180 06/11/14
> 18.50KG CARTON GRANNY SMITH,CL 1,70 2380 14 259 170 220 06/11/14
> 18.50KG CARTON GRANNY SMITH,CL 1,165 1200 15 277.5 80 80 06/11/14
在此先感谢您的帮助。
弗朗索瓦
假设您想把 "ProductName" 的前缀部分替换为自定义名称,可以使用 qdap 包中的 mgsub。它会把 "ProductName" 中匹配到的元素替换为修改后的名称。先根据 tvaluesold 创建逻辑向量 indx,然后创建 Variety 列(初始值为 NA),再把 indx 为 TRUE 的行的 Variety 改为修改后的 ProductName。如果您想得到一个新的数据集,通过 !is.na(apples$Variety) 取子集会更容易。
library(qdap)

# Logical mask: TRUE for rows whose tvaluesold is above 10000.
indx <- apples$tvaluesold > 10000

# v1 holds the labels as spelled in ProductName; v2 holds the display name
# for the label at the same position.
v1 <- c(
  "GRANNY SMITH", "CRIPPS PINK", "TOPRED",
  "GOLDEN DELICIOUS", "STARKING", "BRAEBURN"
)
v2 <- c(
  "Granny Smith", "Cripps Pink", "Top Red",
  "Golden Delicious", "Starking", "Braeburn"
)

# Start Variety as ProductName with everything from the first comma removed.
apples$Variety <- sub(",.*", "", apples$ProductName)

# On the masked rows only, swap each v1 label for its v2 display name.
apples[indx, "Variety"] <- mgsub(v1, v2, apples[indx, "Variety"])

# Keep just the masked rows and preview the first three.
apples1 <- apples[indx, ]
head(apples1, 3)
# UnitMass ProductName tvaluesold tquantitysold tkgsold
#4 18.50KG CARTON GOLDEN DELICIOUS,CL 1,90 22700 108 1998.0
#5 18.50KG CARTON STARKING,CL 1,80 17920 115 2127.5
#9 18.50KG CARTON GOLDEN DELICIOUS,CL 1,120 22800 136 2516.0
# avgprice highestprice date Variety
#4 210.19 240 2014-11-06 Golden Delicious
#5 155.83 230 2014-11-06 Starking
#9 167.65 180 2014-11-06 Golden Delicious
或仅使用 base R
# Look up each ProductName prefix (the text before the first comma) in v1 and
# take the display name at the same position in v2; prefixes that are not in
# v1 become NA.
apples$Variety <- v2[match(sub(",.*", "", apples$ProductName), v1)]
# Keep only the rows selected by indx.
apples1 <- apples[indx, ]
对于第二种情况,可以先用 sub 提取最后一个逗号(,)之后的数字,再用 %in% 创建逻辑向量 indx2。
# Numeric value of whatever follows the last comma in ProductName
# (text that is not a number becomes NA).
val1 <- as.numeric(sub(".*,", "", apples$ProductName))

# TRUE where that value is one of the listed carton counts.
indx2 <- val1 %in% c(70, 80, 90, 100, 110, 120, 135, 150, 165)

# Count holds the value on the flagged rows and NA everywhere else.
apples$Count <- NA
apples[indx2, "Count"] <- val1[indx2]

# Drop the rows that did not get a Count and preview the first three.
apples2 <- apples[!is.na(apples$Count), ]
head(apples2, 3)
# UnitMass ProductName tvaluesold tquantitysold tkgsold
#1 18.50KG CARTON CRIPPS PINK,CL 1,100 200 1 18.5
#2 18.50KG CARTON CRIPPS RED,CL 1,70 200 1 18.5
#4 18.50KG CARTON GOLDEN DELICIOUS,CL 1,90 22700 108 1998.0
# avgprice highestprice date Variety Count
#1 200.00 200 2014-11-06 CRIPPS PINK 100
#2 200.00 200 2014-11-06 CRIPPS RED 70
#4 210.19 240 2014-11-06 Golden Delicious 90
更新
您也可以使用 dplyr
创建列
library(dplyr)

# Keep rows with tvaluesold above 10000, then map each ProductName prefix
# (the text before the first comma) from v1 to its v2 display name.
apples %>%
  filter(tvaluesold > 10000) %>%
  mutate(Variety = setNames(v2, v1)[sub(",.*", "", ProductName)])
创建Count
列
# Keep the rows flagged by indx2 and attach the matching entries of val1 as
# Count. indx2 and val1 must have been computed from this same apples data
# frame so that they line up with it row by row.
apples %>%
  filter(indx2) %>%
  mutate(Count = val1[indx2])
更新2
如果要提取"ProductName"的"first"和"last",另一种选择是
library(tidyr)

# Split ProductName into its first comma-separated field (Variety) and its
# third field (Count), replacing the ProductName column. Then keep rows with
# tvaluesold above 10000 whose Count parses as a number; as.numeric() yields
# NA (with a coercion warning) for any non-numeric Count.
res1 <- extract(apples, ProductName, c("Variety", "Count"),
                '([^,]+),[^,]+,([^,]+)') %>%
  filter(tvaluesold > 10000L & !is.na(as.numeric(Count)))
数据
url <- "https://raw.githubusercontent.com/fderyckel/showcases/master/JoburgMarket/JoburgApples.csv"
library(RCurl)
# Download the raw CSV text as a single character string.
x <- getURL(url)
# Parse it via text = so that read.csv() creates and closes its own text
# connection; passing an already-open textConnection(x) leaves that
# connection open afterwards.
apples <- read.csv(text = x, stringsAsFactors = FALSE)
也许您需要的是:
# Keep rows with tvaluesold above 10000 whose ProductName ends in digits, then
# derive Variety (the text before the first comma) and Count (the number after
# the last comma).
# The backslash is doubled because "\d" is not a valid escape in an R string
# literal; the regex engine receives \d.
apples %>%
  filter(tvaluesold > 10000L & grepl(".*\\d+$", ProductName)) %>%
  mutate(
    Variety = sub(",.*", "", ProductName),
    Count = as.numeric(sub(".*,", "", ProductName))
  )