Aggregating and averaging by two variables

I'm fairly new to R. I have the following dataset of over 5 million rows that I need to reshape: for each station_id, I need the mean of perc_full for each hour.

At this point the best I can manage is subsetting by hour and station, which would take forever. Is there a way to speed this up?

dim(data)
[1] 5116857      12
head(data, n = 10)
         id station_id status available_bike_count available_dock_count          created_at
1  21141047          1 Active                   12                   23 2014-10-01 00:00:05
2  21141048          2 Active                    1                   32 2014-10-01 00:00:05
3  21141049          3 Active                    8                   17 2014-10-01 00:00:05
4  21141050          4 Active                   23                   39 2014-10-01 00:00:05
5  21141051          5 Active                    6                   31 2014-10-01 00:00:05
6  21141052          6 Active                    5                   14 2014-10-01 00:00:05
7  21141053          7 Active                    2                   17 2014-10-01 00:00:05
8  21141054          8 Active                   20                    8 2014-10-01 00:00:05
9  21141055          9 Active                    3                   27 2014-10-01 00:00:05
10 21141056         10 Active                    0                   45 2014-10-01 00:00:05
   station_summary_id month year hour tot_docks  perc_full
1               64087    10 2014    0        35 0.34285714
2               64087    10 2014    0        33 0.03030303
3               64087    10 2014    0        25 0.32000000
4               64087    10 2014    0        62 0.37096774
5               64087    10 2014    0        37 0.16216216
6               64087    10 2014    0        19 0.26315789
7               64087    10 2014    0        19 0.10526316
8               64087    10 2014    0        28 0.71428571
9               64087    10 2014    0        30 0.10000000
10              64087    10 2014    0        45 0.00000000

In the end I should get a result with 25 columns: one for station_id and 24 for the hourly means. For example:

output
   id         1         2         3         4         5         6         7         8         9
1   1 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
2   2 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
3   3 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
4   4 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
5   5 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
6   6 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
7   7 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
8   8 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
9   9 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
10 10 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
          10        11        12        13        14        15        16        17        18
1  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
2  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
3  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
4  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
5  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
6  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
7  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
8  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
9  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
10 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
          19        20        21        22        23        24
1  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
2  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
3  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
4  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
5  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
6  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
7  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
8  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
9  0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
10 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362 0.5554362
sapply(data, class)
$id
[1] "integer"

$station_id
[1] "integer"

$status
[1] "factor"

$available_bike_count
[1] "integer"

$available_dock_count
[1] "integer"

$created_at
[1] "POSIXlt" "POSIXt" 

$station_summary_id
[1] "integer"

$month
[1] "integer"

$year
[1] "integer"

$hour
[1] "integer"

$tot_docks
[1] "integer"

$perc_full
[1] "numeric"

Here is a second dataset for which I want exactly the same kind of matrix, only this time by summing up the number of trips per start.station.id per hour:


> head(test, n = 10)
   bikeid end.station.id start.station.id diff.time hour
1   16052            244              322      6544   14
2   16052            284              432      3406   21
3   16052            461              519     33416    3
4   16052            228              519     26876   13
5   16052             72              435       388   17
6   16052            319              127     27702   11
7   16052            282             2002     33882    4
8   16052            524             2021      2525   10
9   16052            387              351      2397   12
10  16052            388              526     32507   13

Should I use something like this?

counts <- test %>%                       # "matrix" would mask base::matrix, so rename
  group_by(start.station.id, hour) %>%
  summarise(n = n()) %>%                 # n() counts rows per group; bare nrow is not valid here
  spread(hour, n)
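For what it's worth, a shorter equivalent sketch (assuming dplyr and tidyr are loaded): dplyr::count() collapses the group_by()/summarise() pair, and fill = 0 turns hours with no trips into zeros instead of NA:

test %>%
  count(start.station.id, hour) %>%   # adds a column n = number of rows per station/hour pair
  spread(hour, n, fill = 0)           # hours with no trips become 0 rather than NA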

Try this:

library(dplyr)
library(tidyr)

data %>%
  group_by(station_id, hour) %>%                    # one group per station/hour pair
  summarise(mean_perc_full = mean(perc_full)) %>%   # average fill level within each group
  spread(hour, mean_perc_full)                      # pivot: one row per station, one column per hour
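If the dplyr version is still too slow on 5 million rows, data.table is usually much faster for grouped aggregation. A minimal sketch, assuming the data fits in memory; note that data.table does not accept POSIXlt columns, so created_at (which sapply shows as POSIXlt) has to be converted first:

library(data.table)

data$created_at <- as.POSIXct(data$created_at)  # data.table does not support POSIXlt columns
setDT(data)                                     # convert to data.table by reference, no copy

# mean perc_full per station/hour, then cast hours into columns
hourly <- data[, .(mean_perc_full = mean(perc_full)), by = .(station_id, hour)]
dcast(hourly, station_id ~ hour, value.var = "mean_perc_full")

The same pattern handles the second dataset by replacing the mean with .N (the per-group row count) and test/start.station.id for the table and grouping column.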