改善循环的运行时间
Improve run-time of loop
我正在尝试提高以下过程的计算效率。我已经用数据创建了一个玩具示例以供审查。第一种方法的运行时间大约是第二种方法的一半。
我如何改进第一种方法的运行时间?
library(sqldf)
# Toy data: `id` is the grouping key, `qn` is a 0/1 flag.
id <- c(1, 1, 1, 1, 2, 2, 2, 5, 5, 5, 5, 5, 5)
qn <- c(0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0)
# data.frame() names the columns itself, so the cbind() + names() round trip
# is unnecessary.
d <- data.frame(id = id, qn = qn)

un <- unique(d$id)
x <- proc.time()
# Method 1: for each id, the within-group position of the first qn == 1.
# Splitting qn by id once avoids re-subsetting the whole data frame for every
# id. `[1L]` yields NA for an id that has no qn == 1; the loop version stopped
# with a zero-length replacement error in that case.
first_pos <- vapply(
  split(d$qn, factor(d$id, levels = un)),
  function(flags) which(flags == 1)[1L],
  integer(1)
)
# One-column numeric matrix, one row per unique id in order of first appearance.
holder <- matrix(as.numeric(first_pos), ncol = 1)
# Kept for compatibility with the loop version: number of ids processed.
counter <- length(un)
proc.time() - x
# Per-id count of non-NULL qn values, computed in SQLite, with the
# first-position matrix `holder` bound on as an extra column.
# NOTE(review): cbind() pairs rows by position, so this assumes the query's
# row order matches the id order used to fill `holder` — confirm.
f <- cbind(
  sqldf("select id, count(qn) from d group by id", drv = "SQLite"),
  holder
)
#################################
un <- unique(d$id)
holder <- matrix(0, length(un), 1)
counter <- 0
x <- proc.time()
# Method 2: one SQLite query per id. seq_along() (instead of 1:length(un))
# makes the loop a no-op when there are no ids, rather than iterating over
# c(1, 0).
for (i in seq_along(un)) {
  # Fetch this id's rows; paste() inserts its default separator before the value.
  query <- paste("select * from d where id = ", un[i])
  rows <- sqldf(query, drv = "SQLite")
  counter <- counter + 1
  # Within-group position of the first qn == 1.
  # NOTE(review): min() of an empty which() gives Inf (with a warning) when an
  # id has no qn == 1.
  holder[counter, ] <- min(which(rows$qn == 1))
}
proc.time() - x
# Per-id count of non-NULL qn values, computed in SQLite, with the
# first-position matrix `holder` bound on as an extra column.
# NOTE(review): cbind() pairs rows by position, so this assumes the query's
# row order matches the id order used to fill `holder` — confirm.
f <- cbind(
  sqldf("select id, count(qn) from d group by id", drv = "SQLite"),
  holder
)
我想为每个 id 找出 qn 中第一个 1 出现的位置(组内序号)。
预期输出:
# id first
# 1: 1 3
# 2: 2 2
# 3: 5 3
您可以不使用 sqldf,而改用 dplyr:
library(dplyr)
# For each id, the within-group index of the first row where qn equals 1
# (NA when the group has no such row).
d %>%
  group_by(id) %>%
  summarize(first = which(qn == 1)[1])
我们也可以使用 data.table:
library(data.table)
# Convert d to a data.table by reference, then take, per id, the within-group
# position of the first qn == 1. which(qn == 1)[1L] gives NA for an id with no
# qn == 1; which.max(qn) would report position 1 there (the first of the tied
# maxima), which cannot be told apart from a real hit in the first row.
setDT(d)[, list(first = which(qn == 1)[1L]), by = id]
1) 在 lapply 中使用 sqldf:
# Split d by id and, within each piece, ask SQLite for the smallest rowid
# among the rows with qn = 1; the one-row results are then row-bound together.
# Splitting on d$id (rather than a free-standing `id` vector) means the snippet
# no longer depends on a global `id` that happens to line up with d's rows.
do.call(
  rbind,
  lapply(split(d, d$id), function(i) {
    sqldf("SELECT id, min(rowid) AS first
FROM (SELECT rowid, *
FROM i) AS x
WHERE qn = 1")
  })
)
## id first
## 1 1 3
## 2 2 2
## 5 5 3
2) 或者使用纯 SQL 解决方案:用每个组中第一个 qn = 1 的行的 rowid 减去该组第一行的 rowid,再加 1:
# Pure-SQL version: the first subquery takes the smallest rowid per id, the
# second takes the smallest rowid per id among rows with qn = 1. Joined on id,
# their difference plus 1 is reported as `first`.
# NOTE(review): this equals the within-group position only if each id's rows
# are contiguous in d — confirm for real data.
sqldf("select id, min_row1 - min_row + 1 first
from (select id, min(rowid) min_row
from d
group by id)
join (select id, min(rowid) min_row1
from d where qn = 1
group by id) using (id)")
## id first
## 1 1 3
## 2 2 2
## 3 5 3
3) 或者另一种纯 SQL 解决方案:在内部 select 中为每个 id 组内的行生成序号 seq,
然后在每个 id 组中选取第一个 qn = 1 的行的序号:
# Alternative pure-SQL version: the self-join pairs each row x with every row y
# of the same id whose rowid is <= x's, so count() per x.rowid is x's running
# sequence number (seq) within its id. The outer query keeps rows with qn = 1
# and takes the smallest seq per id.
sqldf("select id, min(seq) first
from (select x.id, x.qn, count() seq
from d x
join d y on x.rowid >= y.rowid and x.id = y.id
group by x.rowid)
where qn = 1
group by id")
## id first
## 1 1 3
## 2 2 2
## 3 5 3
我正在尝试提高以下过程的计算效率。我已经用数据创建了一个玩具示例以供审查。第一种方法的运行时间大约是第二种方法的一半。
我如何改进第一种方法的运行时间?
library(sqldf)
# Toy data: `id` is the grouping key, `qn` is a 0/1 flag.
id <- c(1, 1, 1, 1, 2, 2, 2, 5, 5, 5, 5, 5, 5)
qn <- c(0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0)
# data.frame() names the columns itself, so the cbind() + names() round trip
# is unnecessary.
d <- data.frame(id = id, qn = qn)

un <- unique(d$id)
x <- proc.time()
# Method 1: for each id, the within-group position of the first qn == 1.
# Splitting qn by id once avoids re-subsetting the whole data frame for every
# id. `[1L]` yields NA for an id that has no qn == 1; the loop version stopped
# with a zero-length replacement error in that case.
first_pos <- vapply(
  split(d$qn, factor(d$id, levels = un)),
  function(flags) which(flags == 1)[1L],
  integer(1)
)
# One-column numeric matrix, one row per unique id in order of first appearance.
holder <- matrix(as.numeric(first_pos), ncol = 1)
# Kept for compatibility with the loop version: number of ids processed.
counter <- length(un)
proc.time() - x
# Per-id count of non-NULL qn values, computed in SQLite, with the
# first-position matrix `holder` bound on as an extra column.
# NOTE(review): cbind() pairs rows by position, so this assumes the query's
# row order matches the id order used to fill `holder` — confirm.
f <- cbind(
  sqldf("select id, count(qn) from d group by id", drv = "SQLite"),
  holder
)
#################################
un <- unique(d$id)
holder <- matrix(0, length(un), 1)
counter <- 0
x <- proc.time()
# Method 2: one SQLite query per id. seq_along() (instead of 1:length(un))
# makes the loop a no-op when there are no ids, rather than iterating over
# c(1, 0).
for (i in seq_along(un)) {
  # Fetch this id's rows; paste() inserts its default separator before the value.
  query <- paste("select * from d where id = ", un[i])
  rows <- sqldf(query, drv = "SQLite")
  counter <- counter + 1
  # Within-group position of the first qn == 1.
  # NOTE(review): min() of an empty which() gives Inf (with a warning) when an
  # id has no qn == 1.
  holder[counter, ] <- min(which(rows$qn == 1))
}
proc.time() - x
# Per-id count of non-NULL qn values, computed in SQLite, with the
# first-position matrix `holder` bound on as an extra column.
# NOTE(review): cbind() pairs rows by position, so this assumes the query's
# row order matches the id order used to fill `holder` — confirm.
f <- cbind(
  sqldf("select id, count(qn) from d group by id", drv = "SQLite"),
  holder
)
我想为每个 id 找出 qn 中第一个 1 出现的位置(组内序号)。
预期输出:
# id first
# 1: 1 3
# 2: 2 2
# 3: 5 3
您可以不使用 sqldf,而改用 dplyr:
library(dplyr)
# For each id, the within-group index of the first row where qn equals 1
# (NA when the group has no such row).
d %>%
  group_by(id) %>%
  summarize(first = which(qn == 1)[1])
我们也可以使用 data.table:
library(data.table)
# Convert d to a data.table by reference, then take, per id, the within-group
# position of the first qn == 1. which(qn == 1)[1L] gives NA for an id with no
# qn == 1; which.max(qn) would report position 1 there (the first of the tied
# maxima), which cannot be told apart from a real hit in the first row.
setDT(d)[, list(first = which(qn == 1)[1L]), by = id]
1) 在 lapply 中使用 sqldf:
do.call(rbind,
lapply(split(d, id), function(i)
sqldf("SELECT id, min(rowid) AS first
FROM (SELECT rowid, *
FROM i) AS x
WHERE qn = 1"))
)
## id first
## 1 1 3
## 2 2 2
## 5 5 3
2) 或者使用纯 SQL 解决方案:用每个组中第一个 qn = 1 的行的 rowid 减去该组第一行的 rowid,再加 1:
# Pure-SQL version: the first subquery takes the smallest rowid per id, the
# second takes the smallest rowid per id among rows with qn = 1. Joined on id,
# their difference plus 1 is reported as `first`.
# NOTE(review): this equals the within-group position only if each id's rows
# are contiguous in d — confirm for real data.
sqldf("select id, min_row1 - min_row + 1 first
from (select id, min(rowid) min_row
from d
group by id)
join (select id, min(rowid) min_row1
from d where qn = 1
group by id) using (id)")
## id first
## 1 1 3
## 2 2 2
## 3 5 3
3) 或者另一种纯 SQL 解决方案:在内部 select 中为每个 id 组内的行生成序号 seq,
然后在每个 id 组中选取第一个 qn = 1 的行的序号:
# Alternative pure-SQL version: the self-join pairs each row x with every row y
# of the same id whose rowid is <= x's, so count() per x.rowid is x's running
# sequence number (seq) within its id. The outer query keeps rows with qn = 1
# and takes the smallest seq per id.
sqldf("select id, min(seq) first
from (select x.id, x.qn, count() seq
from d x
join d y on x.rowid >= y.rowid and x.id = y.id
group by x.rowid)
where qn = 1
group by id")
## id first
## 1 1 3
## 2 2 2
## 3 5 3