将时间信息汇总到动态矩阵中
Summarize temporal information into a dynamic matrix
我有一个如下所示的数据框 df:
# Example panel data: one row per (id, year) observation.
id <- rep(c("A", "B", "C", "D", "E"), times = c(4, 3, 2, 2, 1))
year <- as.character(
  c(2005:2008, 2005:2007, 2005, 2007, 2006, 2007, 2008)
)
value <- seq_len(12)
df <- data.frame(id = id, year = year, value = value)
我想将 df 转换为矩阵 id_observed:每一列对应 id 首次被观察到的年份,每一行统计这些 id 中有多少个在对应年份仍被观察到(即“存活”到该年):
# Expected id counts: columns = year in which an id was first observed,
# rows = year in which those ids are observed.
id_observed <- matrix(
  c(3, 2, 3, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1),
  nrow = 4,
  ncol = 4,
  dimnames = list(as.character(2005:2008), as.character(2005:2008))
)
id_observed
同样的思路也适用于利用 value 列生成矩阵 value_observed:每一列对应 id 首次被观察到的年份,每一行汇总这些 id 在对应年份的 value 之和:
# Expected value totals: columns = year in which an id was first observed,
# rows = year in which the values of those ids are summed.
value_observed <- matrix(
  c(14, 8, 19, 4, 0, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 12),
  nrow = 4,
  ncol = 4,
  dimnames = list(as.character(2005:2008), as.character(2005:2008))
)
value_observed
有什么办法可以自动构建矩阵 id_observed 和 value_observed 吗?
您可以创建下面这个函数 get_matrix():它使用 tidyverse 遍历所有不重复的年份,为每个年份生成汇总数据,将各行合并,然后用 pivot_wider() 转换为宽表。
library(tidyverse)
#' Build a cohort table of first-observation year versus observation year
#'
#' @param df A data frame with columns `id`, `year` and `value`.
#' @param type Either "value" (sum `value` in each cell) or "id" (count
#'   the rows, i.e. id/year observations, in each cell). Defaults to
#'   "value".
#' @return A tibble with a `year` column (observation year) followed by one
#'   column per distinct year in `df` (first-observation year). Cells with
#'   no observations are filled with 0.
get_matrix <- function(df, type = c("value", "id")) {
  # Reduce the default choice vector to one validated string. Without this,
  # get_matrix(df) passes a length-2 condition to `if`, and an unknown type
  # silently falls through to the counting branch.
  type <- match.arg(type)

  res <- lapply(unique(df$year), function(y) {
    # Keep only the ids whose earliest year is `y`, then regroup by year.
    d <- df %>%
      group_by(id) %>%
      filter(min(year) == y) %>%
      group_by(year)

    if (type == "value") {
      d <- summarize(d, n = sum(value))
    } else {
      d <- summarize(d, n = n())
    }
    d <- mutate(d, y = y)

    # A year in which no id first appears still needs its own column, so
    # emit a single zero placeholder row for it.
    if (nrow(d) == 0) {
      return(tibble(year = y, n = 0, y = y))
    }
    d
  })

  bind_rows(res) %>%
    pivot_wider(
      id_cols = year,
      names_from = y,
      values_from = n,
      values_fill = 0
    )
}
用法
# Sum `value` per first-observation year (columns) and year (rows).
get_matrix(df, type = "value")
输出
year `2005` `2006` `2007` `2008`
<chr> <dbl> <dbl> <dbl> <dbl>
1 2005 14 0 0 0
2 2006 8 10 0 0
3 2007 19 11 0 0
4 2008 4 0 0 12
用法
# Count observations per first-observation year (columns) and year (rows).
get_matrix(df, type = "id")
输出
year `2005` `2006` `2007` `2008`
<chr> <dbl> <dbl> <dbl> <dbl>
1 2005 3 0 0 0
2 2006 2 1 0 0
3 2007 3 1 0 0
4 2008 1 0 0 1
更新:
data.table 选项
# data.table alternative (counts only): columns are first-observation years,
# rows are observation years.
library(data.table) # provides setDT(), data.table(), dcast() and `:=`

# Convert df to a data.table by reference and store year as an integer.
setDT(df)[, year := as.integer(year)]
# Remember every year so that a year starting no cohort still gets a column.
syears <- unique(df$year)
# y = first year of each id; then count the rows per (y, year) pair.
df <- df[, y := min(year), by = id][, .SD[, .N, year], by = y]
# Append a zero row for each year in which no id first appears, then reshape
# to wide form. (y, year) pairs with no rows come out as NA; pass `fill = 0`
# to dcast() if zeros are wanted instead.
dcast(
  rbind(
    df,
    data.table(y = setdiff(syears, unique(df$y)))[, `:=`(year = y, N = 0)]
  ),
  year ~ y,
  value.var = "N"
)
输出:
year 2005 2006 2007 2008
<int> <num> <num> <num> <num>
1: 2005 3 NA NA NA
2: 2006 2 1 NA NA
3: 2007 3 1 0 NA
4: 2008 1 NA NA 1
我有一个如下所示的数据框 df:
# Example panel data: one row per (id, year) observation.
id <- rep(c("A", "B", "C", "D", "E"), times = c(4, 3, 2, 2, 1))
year <- as.character(
  c(2005:2008, 2005:2007, 2005, 2007, 2006, 2007, 2008)
)
value <- seq_len(12)
df <- data.frame(id = id, year = year, value = value)
我想将 df 转换为矩阵 id_observed:每一列对应 id 首次被观察到的年份,每一行统计这些 id 中有多少个在对应年份仍被观察到(即“存活”到该年):
# Expected id counts: columns = year in which an id was first observed,
# rows = year in which those ids are observed.
id_observed <- matrix(
  c(3, 2, 3, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1),
  nrow = 4,
  ncol = 4,
  dimnames = list(as.character(2005:2008), as.character(2005:2008))
)
id_observed
同样的思路也适用于利用 value 列生成矩阵 value_observed:每一列对应 id 首次被观察到的年份,每一行汇总这些 id 在对应年份的 value 之和:
# Expected value totals: columns = year in which an id was first observed,
# rows = year in which the values of those ids are summed.
value_observed <- matrix(
  c(14, 8, 19, 4, 0, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 12),
  nrow = 4,
  ncol = 4,
  dimnames = list(as.character(2005:2008), as.character(2005:2008))
)
value_observed
有什么办法可以自动构建矩阵 id_observed 和 value_observed 吗?
您可以创建下面这个函数 get_matrix():它使用 tidyverse 遍历所有不重复的年份,为每个年份生成汇总数据,将各行合并,然后用 pivot_wider() 转换为宽表。
library(tidyverse)
#' Build a cohort table of first-observation year versus observation year
#'
#' @param df A data frame with columns `id`, `year` and `value`.
#' @param type Either "value" (sum `value` in each cell) or "id" (count
#'   the rows, i.e. id/year observations, in each cell). Defaults to
#'   "value".
#' @return A tibble with a `year` column (observation year) followed by one
#'   column per distinct year in `df` (first-observation year). Cells with
#'   no observations are filled with 0.
get_matrix <- function(df, type = c("value", "id")) {
  # Reduce the default choice vector to one validated string. Without this,
  # get_matrix(df) passes a length-2 condition to `if`, and an unknown type
  # silently falls through to the counting branch.
  type <- match.arg(type)

  res <- lapply(unique(df$year), function(y) {
    # Keep only the ids whose earliest year is `y`, then regroup by year.
    d <- df %>%
      group_by(id) %>%
      filter(min(year) == y) %>%
      group_by(year)

    if (type == "value") {
      d <- summarize(d, n = sum(value))
    } else {
      d <- summarize(d, n = n())
    }
    d <- mutate(d, y = y)

    # A year in which no id first appears still needs its own column, so
    # emit a single zero placeholder row for it.
    if (nrow(d) == 0) {
      return(tibble(year = y, n = 0, y = y))
    }
    d
  })

  bind_rows(res) %>%
    pivot_wider(
      id_cols = year,
      names_from = y,
      values_from = n,
      values_fill = 0
    )
}
用法
# Sum `value` per first-observation year (columns) and year (rows).
get_matrix(df, type = "value")
输出
year `2005` `2006` `2007` `2008`
<chr> <dbl> <dbl> <dbl> <dbl>
1 2005 14 0 0 0
2 2006 8 10 0 0
3 2007 19 11 0 0
4 2008 4 0 0 12
用法
# Count observations per first-observation year (columns) and year (rows).
get_matrix(df, type = "id")
输出
year `2005` `2006` `2007` `2008`
<chr> <dbl> <dbl> <dbl> <dbl>
1 2005 3 0 0 0
2 2006 2 1 0 0
3 2007 3 1 0 0
4 2008 1 0 0 1
更新:
data.table 选项
# data.table alternative (counts only): columns are first-observation years,
# rows are observation years.
library(data.table) # provides setDT(), data.table(), dcast() and `:=`

# Convert df to a data.table by reference and store year as an integer.
setDT(df)[, year := as.integer(year)]
# Remember every year so that a year starting no cohort still gets a column.
syears <- unique(df$year)
# y = first year of each id; then count the rows per (y, year) pair.
df <- df[, y := min(year), by = id][, .SD[, .N, year], by = y]
# Append a zero row for each year in which no id first appears, then reshape
# to wide form. (y, year) pairs with no rows come out as NA; pass `fill = 0`
# to dcast() if zeros are wanted instead.
dcast(
  rbind(
    df,
    data.table(y = setdiff(syears, unique(df$y)))[, `:=`(year = y, N = 0)]
  ),
  year ~ y,
  value.var = "N"
)
输出:
year 2005 2006 2007 2008
<int> <num> <num> <num> <num>
1: 2005 3 NA NA NA
2: 2006 2 1 NA NA
3: 2007 3 1 0 NA
4: 2008 1 NA NA 1