R 中的函数（使用 dplyr）

Question

我写了一个能满足我需求的 R 脚本，但我知道可以通过使用函数让它变得更好（更简洁、更漂亮）。不幸的是，我的各种尝试都没有成功。谁能给我指一条正确的路？以下是我的原始脚本。

library(dplyr)

# Raw market sales data. In the sample rows each ProductName has the form
# "<variety>,<class>,<count>", e.g. "GOLDEN DELICIOUS,CL 1,90".
apples <- read.csv("JoburgApples.csv")

# Step 1: keep rows above the sales-value threshold, one subset per variety
# keyword, and tag each subset with a readable Variety label.
grs <- apples %>% filter(grepl("GRANNY", ProductName), tvaluesold > 10000) %>% mutate(Variety = "Granny Smith")
cpp <- apples %>% filter(grepl("PINK", ProductName), tvaluesold > 10000) %>% mutate(Variety = "Cripps Pink")
top <- apples %>% filter(grepl("TOP", ProductName), tvaluesold > 10000) %>% mutate(Variety = "Top Red")
gld <- apples %>% filter(grepl("GOLDEN", ProductName), tvaluesold > 10000) %>% mutate(Variety = "Golden Delicious")
ski <- apples %>% filter(grepl("STARKING", ProductName), tvaluesold > 10000) %>% mutate(Variety = "Starking")
bra <- apples %>% filter(grepl("BRAEBURN", ProductName), tvaluesold > 10000) %>% mutate(Variety = "Braeburn")

# Stack the per-variety subsets; rows matching none of the keywords are dropped.
apples <- rbind(grs, cpp, top, gld, ski, bra)

# Step 2: tag each row with its fruit count, taken from the last field of
# ProductName. The pattern is anchored on the preceding comma so that, for
# example, a name ending in ",180" is not mistaken for count 80 (an unanchored
# "80$" would match it).
s70 <- apples %>% filter(grepl(",70$", ProductName)) %>% mutate(Count = 70)
s80 <- apples %>% filter(grepl(",80$", ProductName)) %>% mutate(Count = 80)
s90 <- apples %>% filter(grepl(",90$", ProductName)) %>% mutate(Count = 90)
s100 <- apples %>% filter(grepl(",100$", ProductName)) %>% mutate(Count = 100)
s110 <- apples %>% filter(grepl(",110$", ProductName)) %>% mutate(Count = 110)
s120 <- apples %>% filter(grepl(",120$", ProductName)) %>% mutate(Count = 120)
s135 <- apples %>% filter(grepl(",135$", ProductName)) %>% mutate(Count = 135)
s150 <- apples %>% filter(grepl(",150$", ProductName)) %>% mutate(Count = 150)
s165 <- apples %>% filter(grepl(",165$", ProductName)) %>% mutate(Count = 165)

# Stack the per-count subsets; rows with any other count are dropped.
apples <- rbind(s70, s80, s90, s100, s110, s120, s135, s150, s165)

编辑：.csv 文件的链接 (https://github.com/fderyckel/showcases/blob/master/JoburgMarket/JoburgApples.csv)

> UnitMass  ProductName tvaluesold  tquantitysold   tkgsold avgprice    highestprice    date
> 18.50KG CARTON    CRIPPS PINK,CL 1,100    200 1   18.5    200 200 06/11/14
> 18.50KG CARTON    CRIPPS RED,CL 1,70  200 1   18.5    200 200 06/11/14
> 18.50KG CARTON    TOPRED,CL 1,180 1300    10  185 130 130 06/11/14
> 18.50KG CARTON    GOLDEN DELICIOUS,CL 1,90    22700   108 1998    210.19  240 06/11/14
> 18.50KG CARTON    STARKING,CL 1,80    17920   115 2127.5  155.83  230 06/11/14
> 18.50KG CARTON    GRANNY SMITH,CL 1,135   1800    12  222 150 150 06/11/14
> 18.50KG CARTON    TOPRED,CL 1,90  1730    12  222 144.17  190 06/11/14
> 18.50KG CARTON    CRIPPS PINK,CL 1,90 2600    13  240.5   200 200 06/11/14
> 18.50KG CARTON    GOLDEN DELICIOUS,CL 1,120   22800   136 2516    167.65  180 06/11/14
> 18.50KG CARTON    GOLDEN DELICIOUS,CL 1,135   21810   136 2516    160.37  180 06/11/14
> 18.50KG CARTON    GRANNY SMITH,CL 1,70    2380    14  259 170 220 06/11/14
> 18.50KG CARTON    GRANNY SMITH,CL 1,165   1200    15  277.5   80  80  06/11/14

在此先感谢您的帮助。

弗朗索瓦

Answer 1

假设您想用自定义名称替换 "ProductName" 的前缀（"prefix"）部分，可以使用 qdap 包中的 mgsub，它会把 "ProductName" 中的各个元素替换为对应的修改后名称。先根据 tvaluesold 创建逻辑索引 indx，再用 NA 值创建 Variety 列，然后把 Variety 中 indx 为 TRUE 的行改为修改后的 ProductName。如果您想得到一个新的数据集，用 !is.na(apples$Variety) 进行子集化会更容易。

library(qdap)

# Logical mask: rows whose tvaluesold exceeds 10000.
indx <- apples[["tvaluesold"]] > 10000

# v1 holds the variety spellings to search for; v2 holds the replacement
# spellings, matched to v1 by position.
v1 <- c(
  "GRANNY SMITH", "CRIPPS PINK", "TOPRED",
  "GOLDEN DELICIOUS", "STARKING", "BRAEBURN"
)
v2 <- c(
  "Granny Smith", "Cripps Pink", "Top Red",
  "Golden Delicious", "Starking", "Braeburn"
)

# Variety starts as everything in ProductName before the first comma.
apples[["Variety"]] <- sub(",.*", "", apples[["ProductName"]])

# Only the rows selected by indx get their Variety rewritten via mgsub.
apples[indx, "Variety"] <- mgsub(v1, v2, apples[indx, "Variety"])
apples1 <- apples[indx, ]

head(apples1, 3)
#       UnitMass               ProductName tvaluesold tquantitysold tkgsold
#4 18.50KG CARTON  GOLDEN DELICIOUS,CL 1,90      22700           108  1998.0
#5 18.50KG CARTON          STARKING,CL 1,80      17920           115  2127.5
#9 18.50KG CARTON GOLDEN DELICIOUS,CL 1,120      22800           136  2516.0
#  avgprice highestprice       date          Variety
#4   210.19          240 2014-11-06 Golden Delicious
#5   155.83          230 2014-11-06         Starking
#9   167.65          180 2014-11-06 Golden Delicious

或仅使用 base R

 # Look up each ProductName prefix (text before the first comma) in v1 and
 # take the v2 entry at the same position; prefixes not in v1 become NA.
 apples$Variety <- v2[match(sub(",.*", "", apples$ProductName), v1)]
 apples1 <- apples[indx, ]

对于第二种情况，您可以使用 sub 提取最后一个 , 之后的数字，然后使用 %in% 创建逻辑 indx2.

 # Text after the last comma of ProductName, converted to a number
 # (non-numeric text becomes NA, with a coercion warning).
 val1 <- as.numeric(sub(".*,", "", apples$ProductName))
 # TRUE where that number is one of the listed counts.
 indx2 <- val1 %in% c(70, 80, 90, 100, 110, 120, 135, 150, 165)

 # Count is NA everywhere except the rows flagged by indx2.
 apples$Count <- NA
 apples$Count[indx2] <- val1[indx2]
 apples2 <- apples[!is.na(apples[["Count"]]), ]
 head(apples2, 3)
 #       UnitMass              ProductName tvaluesold tquantitysold tkgsold
 #1 18.50KG CARTON     CRIPPS PINK,CL 1,100        200             1    18.5
 #2 18.50KG CARTON       CRIPPS RED,CL 1,70        200             1    18.5
 #4 18.50KG CARTON GOLDEN DELICIOUS,CL 1,90      22700           108  1998.0
 #  avgprice highestprice       date          Variety Count
 #1   200.00          200 2014-11-06      CRIPPS PINK   100
 #2   200.00          200 2014-11-06       CRIPPS RED    70
 #4   210.19          240 2014-11-06 Golden Delicious    90

更新

您也可以使用 dplyr 创建列

library(dplyr)
# Keep rows with tvaluesold above 10000, then derive Variety by looking up the
# ProductName prefix (text before the first comma) in a v1 -> v2 named vector.
apples %>%
  filter(tvaluesold > 10000) %>%
  mutate(
    Variety = setNames(v2, v1)[sub(",.*", "", ProductName)]
  )

创建Count列

 # Keep the rows flagged by indx2 and attach the matching val1 values as Count.
 apples %>%
   filter(indx2) %>%
   mutate(Count = val1[indx2])

更新2

如果要提取 "ProductName" 的第一个（"first"）和最后一个（"last"）字段，另一种做法是：

 library(tidyr)
 # Split ProductName into its first and last comma-separated fields
 # (Variety and Count); the middle field is matched but not captured.
 # Then keep rows with tvaluesold above 10000 whose Count converts to a number.
 # The original snippet was missing the closing parenthesis of filter().
 res1 <- extract(
   apples, ProductName, c("Variety", "Count"),
   "([^,]+),[^,]+,([^,]+)"
 ) %>%
   filter(tvaluesold > 10000L & !is.na(as.numeric(Count)))

数据

 url <- "https://raw.githubusercontent.com/fderyckel/showcases/master/JoburgMarket/JoburgApples.csv"
 library(RCurl)

 # Download the CSV as a single string, then parse it.
 x <- getURL(url)
 # Pass the text via `text =` so read.csv creates and closes its own
 # connection; a textConnection() handed in directly is already open and is
 # never closed by read.csv.
 apples <- read.csv(text = x, stringsAsFactors = FALSE)

Answer 2

也许您需要的是：

# Keep rows with tvaluesold above 10000 whose ProductName ends in digits, then
# derive Variety (text before the first comma) and Count (text after the last
# comma, as a number). The backslash must be doubled inside an R string
# literal: "\d" is an unrecognized escape and fails to parse.
apples %>%
  filter(tvaluesold > 10000L & grepl("\\d+$", ProductName)) %>%
  mutate(
    Variety = sub(",.*", "", ProductName),
    Count = as.numeric(sub(".*,", "", ProductName))
  )

R 中的函数（使用 dplyr）

function in R (with dplyr)

r

function

dplyr

更新

更新2

数据