重新排列纵向数据
Rearranging longitudinal data
我有一个大致结构如下的数据集:
case Year 2001 2002 2003 2004
1 2003 0 0 0 3
2 2002 0 5 3 2
3 2001 3 3 2 2
我正在尝试重组它,以便每一列代表从 "Year" 变量开始计算的第一年、第二年(等等),即:
case Year yr1 yr2 yr3 yr4
1 2003 0 3 0 0
2 2002 5 3 2 0
3 2001 3 3 2 2
此代码下载数据集并尝试@akrun 建议的解决方案,但失败了。
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
df1[-(1:2)] <- do.call(rbind,lapply(seq_len(nrow(df1)), function(i) {x <- df1[i, ]; x1 <- unlist(x[-(1:2)]); indx <- which(!is.na(x1))[1]; i <- as.numeric(names(indx))-x[,2]+1; x2 <- x1[!is.na(x1)]; x3 <- rep(NA, length(x1)); x3[i:(i+length(x2)-1)]<- x2; x3}))
这会生成:
Error in i:(i + length(x2) - 1) : NA/NaN argument
In addition: Warning message:
In FUN(1:234[[1L]], ...) : NAs introduced by coercion
如何转换数据,使每一列代表第一年、第二年(等等),从每一行的 "Year" 变量中的值开始计算?
这里有一个可能性:
library(dplyr)
library(reshape2)
df %>%
melt(id.vars = c("case", "Year")) %>%
mutate(variable = as.numeric(as.character(variable)),
yr = variable - Year + 1) %>%
filter(variable >= Year) %>%
dcast(case + Year ~ yr, fill = 0)
# case Year 1 2 3 4
# 1 1 2003 0 3 0 0
# 2 2 2002 5 3 2 0
# 3 3 2001 3 3 2 2
数据:
df <- structure(list(case = 1:3, Year = c(2003L, 2002L, 2001L), `2001` = c(0L,
0L, 3L), `2002` = c(0L, 5L, 3L), `2003` = c(0L, 3L, 2L), `2004` = c(3L,
2L, 2L)), .Names = c("case", "Year", "2001", "2002", "2003",
"2004"), class = "data.frame", row.names = c(NA, -3L))
这应该会创建您正在寻找的操作。
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
temp <- df1[[1]]
library(dplyr); library(tidyr); library(stringi)
temp <- temp %>%
gather(new.Years, X, -Year) %>% # convert rows to one column
mutate(Year.temp=paste0(rownames(temp), "-", Year)) %>% # concatenate the Year with row number to make them unique
mutate(new.Years = as.numeric(gsub("X", "", new.Years)), diff = new.Years-Year+1) %>% # calculate the difference to get the yr0 yr1 and so on
mutate(diff=paste0("yr", stri_sub(paste0("0", (ifelse(diff>0, diff, 0))), -2, -1))) %>% # convert the differences in Yr01 ...
select(-new.Years) %>% filter(diff != "yr00") %>% # drop new.Years column
spread(diff, X) %>% # convert column to rows
select(-Year.temp) # Drop Year.temp column
temp[is.na(temp)] <- 0 # replace NA with 0
temp %>% View
请注意,这最多可以使用 99 年。
这是一个data.table
解决方案:
require(data.table)
require(reshape2)
dt.m = melt(dt, id = 1:2, variable.factor = FALSE)
dt.m[, variable := as.integer(variable)-Year+1L]
dcast.data.table(dt.m, case + Year ~ variable, fill=0L,
value.var = "value", subset = (variable > 0L))
# case Year 1 2 3 4
# 1: 1 2003 0 3 0 0
# 2: 2 2002 5 3 2 0
# 3: 3 2001 3 3 2 2
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")$value
我在列名中有一个 X 并将其删除:
colnames(df1) <- gsub("X", "", colnames(df1))
我有一个没有任何附加包的解决方案:
startYear <- as.numeric(colnames(df1)[2])
shifts <- df1$Year - startYear
n <- ncol(df1)
df2 <- df1
colnames(df2)[-1] <- 1:(n-1)
df2[,2:n] <- NA
for(row in 1:nrow(df1)){
if(shifts[row]>=0){
df2[row,2:(n-shifts[row])] <- df1[row, (shifts[row]+2):n]
#df2[row,2:(n-shifts[row])] <- colnames(df1)[(shifts[row]+2):n]
}else{
df2[row, (-shifts[row]+2):n] <- df1[row, 2:(n+shifts[row])]
#df2[row, (-shifts[row]+2):n] <- colnames(df1)[2:(n+shifts[row])]
}
}
您可以用 0
代替 NA
预填充 df2
。在 ifelse 条件中取消注释第二行并注释第一行以验证排列。
希望它能如您所愿。
我有一个大致结构如下的数据集:
case Year 2001 2002 2003 2004
1 2003 0 0 0 3
2 2002 0 5 3 2
3 2001 3 3 2 2
我正在尝试重组它,以便每一列代表从 "Year" 变量开始计算的第一年、第二年(等等),即:
case Year yr1 yr2 yr3 yr4
1 2003 0 3 0 0
2 2002 5 3 2 0
3 2001 3 3 2 2
此代码下载数据集并尝试@akrun 建议的解决方案,但失败了。
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
df1[-(1:2)] <- do.call(rbind,lapply(seq_len(nrow(df1)), function(i) {x <- df1[i, ]; x1 <- unlist(x[-(1:2)]); indx <- which(!is.na(x1))[1]; i <- as.numeric(names(indx))-x[,2]+1; x2 <- x1[!is.na(x1)]; x3 <- rep(NA, length(x1)); x3[i:(i+length(x2)-1)]<- x2; x3}))
这会生成:
Error in i:(i + length(x2) - 1) : NA/NaN argument
In addition: Warning message:
In FUN(1:234[[1L]], ...) : NAs introduced by coercion
如何转换数据,使每一列代表第一年、第二年(等等),从每一行的 "Year" 变量中的值开始计算?
这里有一个可能性:
library(dplyr)
library(reshape2)
df %>%
melt(id.vars = c("case", "Year")) %>%
mutate(variable = as.numeric(as.character(variable)),
yr = variable - Year + 1) %>%
filter(variable >= Year) %>%
dcast(case + Year ~ yr, fill = 0)
# case Year 1 2 3 4
# 1 1 2003 0 3 0 0
# 2 2 2002 5 3 2 0
# 3 3 2001 3 3 2 2
数据:
df <- structure(list(case = 1:3, Year = c(2003L, 2002L, 2001L), `2001` = c(0L,
0L, 3L), `2002` = c(0L, 5L, 3L), `2003` = c(0L, 3L, 2L), `2004` = c(3L,
2L, 2L)), .Names = c("case", "Year", "2001", "2002", "2003",
"2004"), class = "data.frame", row.names = c(NA, -3L))
这应该会创建您正在寻找的操作。
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")
temp <- df1[[1]]
library(dplyr); library(tidyr); library(stringi)
temp <- temp %>%
gather(new.Years, X, -Year) %>% # convert rows to one column
mutate(Year.temp=paste0(rownames(temp), "-", Year)) %>% # concatenate the Year with row number to make them unique
mutate(new.Years = as.numeric(gsub("X", "", new.Years)), diff = new.Years-Year+1) %>% # calculate the difference to get the yr0 yr1 and so on
mutate(diff=paste0("yr", stri_sub(paste0("0", (ifelse(diff>0, diff, 0))), -2, -1))) %>% # convert the differences in Yr01 ...
select(-new.Years) %>% filter(diff != "yr00") %>% # drop new.Years column
spread(diff, X) %>% # convert column to rows
select(-Year.temp) # Drop Year.temp column
temp[is.na(temp)] <- 0 # replace NA with 0
temp %>% View
请注意,这最多可以使用 99 年。
这是一个data.table
解决方案:
require(data.table)
require(reshape2)
dt.m = melt(dt, id = 1:2, variable.factor = FALSE)
dt.m[, variable := as.integer(variable)-Year+1L]
dcast.data.table(dt.m, case + Year ~ variable, fill=0L,
value.var = "value", subset = (variable > 0L))
# case Year 1 2 3 4
# 1: 1 2003 0 3 0 0
# 2: 2 2002 5 3 2 0
# 3: 3 2001 3 3 2 2
library("devtools")
df1 <- source_gist("b4c44aa67bfbcd6b72b9")$value
我在列名中有一个 X 并将其删除:
colnames(df1) <- gsub("X", "", colnames(df1))
我有一个没有任何附加包的解决方案:
startYear <- as.numeric(colnames(df1)[2])
shifts <- df1$Year - startYear
n <- ncol(df1)
df2 <- df1
colnames(df2)[-1] <- 1:(n-1)
df2[,2:n] <- NA
for(row in 1:nrow(df1)){
if(shifts[row]>=0){
df2[row,2:(n-shifts[row])] <- df1[row, (shifts[row]+2):n]
#df2[row,2:(n-shifts[row])] <- colnames(df1)[(shifts[row]+2):n]
}else{
df2[row, (-shifts[row]+2):n] <- df1[row, 2:(n+shifts[row])]
#df2[row, (-shifts[row]+2):n] <- colnames(df1)[2:(n+shifts[row])]
}
}
您可以用 0
代替 NA
预填充 df2
。在 ifelse 条件中取消注释第二行并注释第一行以验证排列。
希望它能如您所愿。