使用列内的固定效果信息重塑 R 中的数据
Reshape data in R with fixed effect information within column
我在 excel 中获得了一些格式非常笨拙的数据,我需要对其进行重塑以使其适合 运行 R 中的生存分析。
我将数据的摘录上传到 Google 驱动器:https://drive.google.com/open?id=1ret3bCDCYPDALQ16YBloaeopfl2-qVbp
原始数据框有大约 2100 个观察值和 950 个变量
基本数据框如下:
my.data<-data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1)
)
my.data
# ID LR LR.1 LR.2
# 1 2012Y 2012Y 2011Y
# 2 State:FL State:AZ State:FL
# 3 C8477 5 5 7
# 4 C5273 6 8 2
# 5 C5566 8 10 1
所有列都具有相同的名称 "LR"。不知道以后会不会出问题...
第 1 行给出年份,第 2 行给出观察发生的相应状态。
作为输出,我需要一些面板数据,以便在以后的生存分析中使用。
my.data<-data.frame(
ID=c("C8477","C5273","C5566"),
Year=c("2012","2012","2011"),
State=c("FL","AZ","FL"),LR=c(5,8,1)
)
my.data
# ID Year State LR
# 1 C8477 2012 FL 5
# 2 C5273 2012 AZ 8
# 3 C5566 2011 FL 1
我试过 reshape 函数和 seq 函数,但没有一个能帮助我朝着正确的方向前进,因为数据框的排列非常奇怪。
这是实现此目的的 reshape2
和 tidyr
版本:
library(tidyr)
library(reshape2)
my.data <- data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1)
)
# Combine first two rows as column names
colnames(my.data) <- paste(unlist(my.data[2, ]), unlist(my.data[1, ]), sep = "|")
# Remove first two rows from data
my.data <- my.data[-c(1:2), ] # negative index removes rows
# Melt data
my.data.long <- melt(
my.data,
id.vars = 1L, # would be better to have explicit col name
value.name = "LR"
)
colnames(my.data.long) <- c("ID", "state_year", "LR")
# Split state_year column into two columns:
my.data.long <- separate(
my.data.long,
state_year,
into = c("State", "Year"),
sep = "\|" # note this is a regex
)
想法是借来的here。
这是一种 tidyverse
方法:
my.data <- data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1)
)
我的代码:
library(tidyverse)
year <- as.matrix(my.data[1, -1])
year <- str_split(year, "Y", simplify = T)[,1]
state <-as.matrix(my.data[2, -1])
both<-paste(state, year, sep = "_")
mydata1<-my.data[-c(1, 2), ]
colnames(mydata1) <-c("ID", both)
long <-pivot_longer(mydata1,
cols = starts_with("state"),
names_to = "State_year",
values_to = "LR")
long %>%
transmute(
ID, LR,
state = str_split(State_year, "_", simplify = T)[, 1],
state = str_split(state, ":", simplify = T)[, 2],
year = str_split(State_year, "_", simplify = T)[, 2]
)
我们得到:
ID LR state year
1 C8477 5 FL 2012
2 C8477 5 AZ 2012
3 C8477 7 FL 2011
4 C5273 6 FL 2012
5 C5273 8 AZ 2012
6 C5273 2 FL 2011
7 C5566 8 FL 2012
8 C5566 10 AZ 2012
9 C5566 1 FL 2011
一种可能的 base-R 方法使用 reshape
:
## 1) extract the State and Year rows in a separate data.frame
dat.meta <- data.frame(
Year = unlist(dat[1, -1]),
State = sub("State:", "", dat[2, -1]),
LR.id = colnames(dat)[-1]
)
## 2) reshape the data without State and Year rows into long format
dat.long <- reshape(
data = dat[-c(1, 2), ],
varying = 2:4,
direction = "long",
v.names = "LR",
timevar = "LR.id",
times = colnames(dat)[-1]
)
dat.long$id <- NULL ## remove id-column
## 3) merge the reshaped data with the extracted State and Year data
merge(dat.long, dat.meta, by = "LR.id")
#> LR.id ID LR Year State
#> 1 LR C8477 5 2012Y FL
#> 2 LR C5273 6 2012Y FL
#> 3 LR C5566 8 2012Y FL
#> 4 LR.1 C8477 5 2012Y AZ
#> 5 LR.1 C5273 8 2012Y AZ
#> 6 LR.1 C5566 10 2012Y AZ
#> 7 LR.2 C8477 7 2011Y FL
#> 8 LR.2 C5273 2 2011Y FL
#> 9 LR.2 C5566 1 2011Y FL
数据
dat <- data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1),
stringsAsFactors = FALSE
)
我在 excel 中获得了一些格式非常笨拙的数据,我需要对其进行重塑以使其适合 运行 R 中的生存分析。
我将数据的摘录上传到 Google 驱动器:https://drive.google.com/open?id=1ret3bCDCYPDALQ16YBloaeopfl2-qVbp
原始数据框有大约 2100 个观察值和 950 个变量
基本数据框如下:
my.data<-data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1)
)
my.data
# ID LR LR.1 LR.2
# 1 2012Y 2012Y 2011Y
# 2 State:FL State:AZ State:FL
# 3 C8477 5 5 7
# 4 C5273 6 8 2
# 5 C5566 8 10 1
所有列都具有相同的名称 "LR"。不知道以后会不会出问题...
第 1 行给出年份,第 2 行给出观察发生的相应状态。
作为输出,我需要一些面板数据,以便在以后的生存分析中使用。
my.data<-data.frame(
ID=c("C8477","C5273","C5566"),
Year=c("2012","2012","2011"),
State=c("FL","AZ","FL"),LR=c(5,8,1)
)
my.data
# ID Year State LR
# 1 C8477 2012 FL 5
# 2 C5273 2012 AZ 8
# 3 C5566 2011 FL 1
我试过 reshape 函数和 seq 函数,但没有一个能帮助我朝着正确的方向前进,因为数据框的排列非常奇怪。
这是实现此目的的 reshape2
和 tidyr
版本:
library(tidyr)
library(reshape2)
my.data <- data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1)
)
# Combine first two rows as column names
colnames(my.data) <- paste(unlist(my.data[2, ]), unlist(my.data[1, ]), sep = "|")
# Remove first two rows from data
my.data <- my.data[-c(1:2), ] # negative index removes rows
# Melt data
my.data.long <- melt(
my.data,
id.vars = 1L, # would be better to have explicit col name
value.name = "LR"
)
colnames(my.data.long) <- c("ID", "state_year", "LR")
# Split state_year column into two columns:
my.data.long <- separate(
my.data.long,
state_year,
into = c("State", "Year"),
sep = "\|" # note this is a regex
)
想法是借来的here。
这是一种 tidyverse
方法:
my.data <- data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1)
)
我的代码:
library(tidyverse)
year <- as.matrix(my.data[1, -1])
year <- str_split(year, "Y", simplify = T)[,1]
state <-as.matrix(my.data[2, -1])
both<-paste(state, year, sep = "_")
mydata1<-my.data[-c(1, 2), ]
colnames(mydata1) <-c("ID", both)
long <-pivot_longer(mydata1,
cols = starts_with("state"),
names_to = "State_year",
values_to = "LR")
long %>%
transmute(
ID, LR,
state = str_split(State_year, "_", simplify = T)[, 1],
state = str_split(state, ":", simplify = T)[, 2],
year = str_split(State_year, "_", simplify = T)[, 2]
)
我们得到:
ID LR state year
1 C8477 5 FL 2012
2 C8477 5 AZ 2012
3 C8477 7 FL 2011
4 C5273 6 FL 2012
5 C5273 8 AZ 2012
6 C5273 2 FL 2011
7 C5566 8 FL 2012
8 C5566 10 AZ 2012
9 C5566 1 FL 2011
一种可能的 base-R 方法使用 reshape
:
## 1) extract the State and Year rows in a separate data.frame
dat.meta <- data.frame(
Year = unlist(dat[1, -1]),
State = sub("State:", "", dat[2, -1]),
LR.id = colnames(dat)[-1]
)
## 2) reshape the data without State and Year rows into long format
dat.long <- reshape(
data = dat[-c(1, 2), ],
varying = 2:4,
direction = "long",
v.names = "LR",
timevar = "LR.id",
times = colnames(dat)[-1]
)
dat.long$id <- NULL ## remove id-column
## 3) merge the reshaped data with the extracted State and Year data
merge(dat.long, dat.meta, by = "LR.id")
#> LR.id ID LR Year State
#> 1 LR C8477 5 2012Y FL
#> 2 LR C5273 6 2012Y FL
#> 3 LR C5566 8 2012Y FL
#> 4 LR.1 C8477 5 2012Y AZ
#> 5 LR.1 C5273 8 2012Y AZ
#> 6 LR.1 C5566 10 2012Y AZ
#> 7 LR.2 C8477 7 2011Y FL
#> 8 LR.2 C5273 2 2011Y FL
#> 9 LR.2 C5566 1 2011Y FL
数据
dat <- data.frame(
ID=c( "", "","C8477","C5273","C5566"),
LR=c("2012Y","State:FL",5,6,8),
LR=c("2012Y","State:AZ",5,8,10),
LR=c("2011Y","State:FL",7,2,1),
stringsAsFactors = FALSE
)