简单的 R 函数问题
Simple R function issue
我写了一个非常冗长的 R 函数,我确信它可以在 2 行或更少的代码中重新创建,但我现在还不知道如何处理。
我向我的函数传递了一个灰狗结果数据的数据帧和一个表示距离的整数 运行。
然后它 returns 在该距离内每个框数(总共 8 个框)的获胜百分比。
目前:
boxPercents <- function(dist, data) {
## get each box num of wins
x1 <- data$position == 1 & data$distance == dist & data$box == 1
x2 <- data$position == 1 & data$distance == dist & data$box == 2
x3 <- data$position == 1 & data$distance == dist & data$box == 3
...
x8 <- data$position == 1 & data$distance == dist & data$box == 8
## count the total num of races at that
numRaces <- data$position == 1 & data$distance == dist
## print out the winning percent for each box
print(sum(x1) / sum(numRaces))
print(sum(x2) / sum(numRaces))
print(sum(x3) / sum(numRaces))
...
print(sum(x8) / sum(numRaces))
}
我的输出如下所示,然后我将其转换为向量:
[1] 0.2452107
[1] 0.1340996
[1] 0.09961686
[1] 0.1034483
[1] 0.08045977
[1] 0.1034483
[1] 0.09961686
[1] 0.1340996
我很确定其中一个应用函数是我应该使用的,但所有的努力都没有结果。
编辑:这是数据的 header:
track date race position box name sp fave distance
Warrnambool 02 Jan 14 1 1 1 TOP SECRET 1.7 true 450
Warrnambool 02 Jan 14 1 2 4 FLASH WILSON 4.7 false 450
Warrnambool 02 Jan 14 1 3 8 HEAPS OF ABILITY 11.8 false 450
Warrnambool 02 Jan 14 1 4 7 OCCUPATION LAD 24.1 false 450
Warrnambool 02 Jan 14 1 5 2 HE'S A VILLIAN 19.3 false 450
Warrnambool 02 Jan 14 1 6 5 ZAC'S A SIXPENCE 9.7 false 450
你可以把它变成一个函数并使用合适的dist
dist <- 450
vapply(1:8, function(i) sum(with(data,
position==1 & distance==dist & box==i))/sum(with(data,
position==1 & distance==dist)), numeric(1L))
或
sapply(1:8, function(i) sum(with(data,
position==1 & distance==dist & box==i))/sum(with(data,
position==1 & distance==dist)))
因为 position
和 distance
在 numerator
和 denominator
中是相同的,所以我会做
sapply(1:8, function(i) {indx <- with(data, position==1 & distance==dist)
sum(indx & data$box==i)/sum(indx)} )
更新
big datasets
的更快选择是使用 data.table
library(data.table)
setDT(data)[position==1 & distance==dist, c(.SD,numRaces= .N)][,
list(percentage=unique(.N/numRaces)), by=box]
或者上面的内容可以缩短(如@Arun 所评论)
setDT(data)[position==1 & distance==dist, .N, by=box][, N := N/sum(N)]
或使用 prop.table
的选项
as.data.frame(prop.table(table(subset(data,
position==1 & distance==dist, select=c(position, box)))))
另一个使用 dplyr 的选项,对于大型数据集可能比 sapply
方法更快:
更新:
library(dplyr)
boxPercents <- function(dist, data) {
data <- data %>% filter(position == 1 & distance == dist) %>% select(box)
data %>% count(box) %>% transmute(percentage = n / sum(n))
}
原文:
boxPercents <- function(dist, data) {
data <- data %>% filter(position == 1 & distance == dist) %>% select(box)
numRaces <- nrow(data)
data %>%
group_by(box) %>%
summarise(percentage = n() / numRaces)
}
使用函数(注意我修改了输入数据 - 见下面的 dput):
boxPercents(450, data)
#Source: local data frame [2 x 2]
#
# box percentage
#1 1 0.6666667
#2 5 0.3333333
数据
data <- structure(list(track = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Warrnambool", class = "factor"),
date = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "02 Jan 14", class = "factor"),
race = c(1L, 1L, 1L, 1L, 1L, 1L), position = c(1, 2, 3, 4,
1, 1), box = c(1, 4, 8, 7, 1, 5), name = structure(c(5L,
1L, 2L, 4L, 3L, 6L), .Label = c("FLASH WILSON", "HEAPS OF ABILITY",
"HES A VILLIAN", "OCCUPATION LAD", "TOP SECRET", "ZACS A SIXPENCE"
), class = "factor"), sp = c(1.7, 4.7, 11.8, 24.1, 19.3,
9.7), fave = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("false",
"true"), class = "factor"), distance = c(450L, 450L, 450L,
450L, 450L, 450L)), .Names = c("track", "date", "race", "position",
"box", "name", "sp", "fave", "distance"), row.names = c(NA, -6L
), class = "data.frame")
我写了一个非常冗长的 R 函数,我确信它可以在 2 行或更少的代码中重新创建,但我现在还不知道如何处理。
我向我的函数传递了一个灰狗结果数据的数据帧和一个表示距离的整数 运行。 然后它 returns 在该距离内每个框数(总共 8 个框)的获胜百分比。
目前:
boxPercents <- function(dist, data) {
## get each box num of wins
x1 <- data$position == 1 & data$distance == dist & data$box == 1
x2 <- data$position == 1 & data$distance == dist & data$box == 2
x3 <- data$position == 1 & data$distance == dist & data$box == 3
...
x8 <- data$position == 1 & data$distance == dist & data$box == 8
## count the total num of races at that
numRaces <- data$position == 1 & data$distance == dist
## print out the winning percent for each box
print(sum(x1) / sum(numRaces))
print(sum(x2) / sum(numRaces))
print(sum(x3) / sum(numRaces))
...
print(sum(x8) / sum(numRaces))
}
我的输出如下所示,然后我将其转换为向量:
[1] 0.2452107
[1] 0.1340996
[1] 0.09961686
[1] 0.1034483
[1] 0.08045977
[1] 0.1034483
[1] 0.09961686
[1] 0.1340996
我很确定其中一个应用函数是我应该使用的,但所有的努力都没有结果。
编辑:这是数据的 header:
track date race position box name sp fave distance
Warrnambool 02 Jan 14 1 1 1 TOP SECRET 1.7 true 450
Warrnambool 02 Jan 14 1 2 4 FLASH WILSON 4.7 false 450
Warrnambool 02 Jan 14 1 3 8 HEAPS OF ABILITY 11.8 false 450
Warrnambool 02 Jan 14 1 4 7 OCCUPATION LAD 24.1 false 450
Warrnambool 02 Jan 14 1 5 2 HE'S A VILLIAN 19.3 false 450
Warrnambool 02 Jan 14 1 6 5 ZAC'S A SIXPENCE 9.7 false 450
你可以把它变成一个函数并使用合适的dist
dist <- 450
vapply(1:8, function(i) sum(with(data,
position==1 & distance==dist & box==i))/sum(with(data,
position==1 & distance==dist)), numeric(1L))
或
sapply(1:8, function(i) sum(with(data,
position==1 & distance==dist & box==i))/sum(with(data,
position==1 & distance==dist)))
因为 position
和 distance
在 numerator
和 denominator
中是相同的,所以我会做
sapply(1:8, function(i) {indx <- with(data, position==1 & distance==dist)
sum(indx & data$box==i)/sum(indx)} )
更新
big datasets
的更快选择是使用 data.table
library(data.table)
setDT(data)[position==1 & distance==dist, c(.SD,numRaces= .N)][,
list(percentage=unique(.N/numRaces)), by=box]
或者上面的内容可以缩短(如@Arun 所评论)
setDT(data)[position==1 & distance==dist, .N, by=box][, N := N/sum(N)]
或使用 prop.table
as.data.frame(prop.table(table(subset(data,
position==1 & distance==dist, select=c(position, box)))))
另一个使用 dplyr 的选项,对于大型数据集可能比 sapply
方法更快:
更新:
library(dplyr)
boxPercents <- function(dist, data) {
data <- data %>% filter(position == 1 & distance == dist) %>% select(box)
data %>% count(box) %>% transmute(percentage = n / sum(n))
}
原文:
boxPercents <- function(dist, data) {
data <- data %>% filter(position == 1 & distance == dist) %>% select(box)
numRaces <- nrow(data)
data %>%
group_by(box) %>%
summarise(percentage = n() / numRaces)
}
使用函数(注意我修改了输入数据 - 见下面的 dput):
boxPercents(450, data)
#Source: local data frame [2 x 2]
#
# box percentage
#1 1 0.6666667
#2 5 0.3333333
数据
data <- structure(list(track = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Warrnambool", class = "factor"),
date = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "02 Jan 14", class = "factor"),
race = c(1L, 1L, 1L, 1L, 1L, 1L), position = c(1, 2, 3, 4,
1, 1), box = c(1, 4, 8, 7, 1, 5), name = structure(c(5L,
1L, 2L, 4L, 3L, 6L), .Label = c("FLASH WILSON", "HEAPS OF ABILITY",
"HES A VILLIAN", "OCCUPATION LAD", "TOP SECRET", "ZACS A SIXPENCE"
), class = "factor"), sp = c(1.7, 4.7, 11.8, 24.1, 19.3,
9.7), fave = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("false",
"true"), class = "factor"), distance = c(450L, 450L, 450L,
450L, 450L, 450L)), .Names = c("track", "date", "race", "position",
"box", "name", "sp", "fave", "distance"), row.names = c(NA, -6L
), class = "data.frame")