如何重新整理变量/观测值混杂在行和列中的数据框?(使用 dplyr 和 tidyr)
How to rearrange data frame with variables/observation in row and column? (using dplyr and tidyr)
在R中,如何处理混合了行和列作为变量的凌乱数据框?
# Reproducible example data: ten consecutive days and nine pseudo-random
# readings per day (three streets x three molecules).
days <- as.Date("2011-07-01") + 0:9
set.seed(10)
d <- data.frame(days, replicate(9, round(runif(10, 0, 10), 3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
              "x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
              "x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")
# The columns are grouped by street (three consecutive columns per street), so
# every street label must be repeated three times in a row (`each = 3`).
# Cycling the labels column by column would tag x.astreet.2 as "Bstr.".
streetnames <- c(NA, rep(c("Astr.", "Bstr.", "Cstr."), each = 3))
# Within each street the molecules cycle through SO, CO, O3.
molecule <- c(NA, rep(c("SO", "CO", "O3"), 3))
# Prepend both label vectors as extra rows; rows 1-2 of `d` now hold metadata
# and the actual observations start at row 3.
d <- rbind(streetnames, molecule, d)
see df as tbl in this printscreen
在这种情况下,理想情况下应该只有 5 列(日期、SO、NO、O3、站点)
使用 base R 的方法可能如下所示。
# Stack the three per-street column triplets on top of each other.
# seq(2, ncol(d), by = 3) yields the first column of every triplet; the two
# label rows at the top of `d` are dropped before the columns are extracted.
res <- do.call(rbind, lapply(seq(2, ncol(d), by = 3), function(first_col) {
  observations <- d[-(1:2), ]
  data.frame(
    Date = observations[["Date"]],
    SO = observations[[first_col]],
    CO = observations[[first_col + 1]],
    O3 = observations[[first_col + 2]]
  )
}))
res$Date <- as.Date(res$Date)
# Replace the row names produced by rbind() with the default 1..n sequence.
row.names(res) <- NULL
head(res)
# Date SO CO O3
#1 2011-07-01 5.075 6.517 8.647
#2 2011-07-02 3.068 5.677 6.154
#3 2011-07-03 4.269 1.135 7.751
#4 2011-07-04 6.931 5.959 3.556
#5 2011-07-05 0.851 3.58 4.058
#6 2011-07-06 2.254 4.288 7.066
我认为这就是您想要达到的目的。可能有更优雅的解决方案,但这会起作用。
我假设后缀1、2、3分别对应SO、CO、O3
此解决方案不使用您创建的 streetnames 或 molecule 向量,因此您可以省略 rbind() 调用。
library(dplyr)
library(tidyr)

# Long format: one row per (Date, original column name) pair.
e <- gather(d, key = "station", value = "val", x.astreet.1:x.Cstreet.3)

# The digit in the original column name selects the molecule.
SO <- filter(e, grepl("1", station))
CO <- filter(e, grepl("2", station))
O3 <- filter(e, grepl("3", station))

# Put the three value columns side by side; the two extra "val" columns are
# referred to as val.1 and val.2 below.
f <- data.frame(SO, select(CO, val), select(O3, val))

# `station` still carries the SO column name, so map it to a short label and
# give the value columns their molecule names.
g <- f %>%
  mutate(
    Station = case_when(
      station == "x.astreet.1" ~ "Astr",
      station == "x.Bstreet.1" ~ "Bstr",
      station == "x.Cstreet.1" ~ "Cstr"
    ),
    SO = val,
    CO = val.1,
    O3 = val.2
  ) %>%
  select(Date, SO, CO, O3, Station)
我保留了每一步的中间数据框(e、f、g),这样您可以在每一步之后查看结果。
从代码示例的开头开始,省略了 rbind
调用:
library(dplyr)
library(tidyr)

# Example data without the two label rows (no rbind() call).
days <- as.Date("2011-07-01") + 0:9
set.seed(10)
d <- data.frame(days, replicate(9, round(runif(10, 0, 10), 3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
              "x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
              "x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")

# Explicit reassignment instead of `%<>%`, which would require attaching
# magrittr in addition to dplyr/tidyr.
d <- d %>%
  # Wide -> long: one row per (Date, original column name).
  gather(col_name, value, -Date) %>%
  # Split "x.<street>.<number>" on the literal dot. `sep` is a regular
  # expression, so the dot is escaped, and the backslash itself has to be
  # doubled inside an R string ("\." is not a valid R escape sequence).
  separate(col_name, c("x", "street_name", "molecule_number"),
           sep = "\\.", convert = TRUE) %>%
  # The constant "x" prefix carries no information.
  select(-x) %>%
  # Long -> wide again: one column per molecule number.
  spread(molecule_number, value) %>%
  rename(SO = `1`, NO = `2`, O3 = `3`)
这是我的方法。这样做的好处是它完全是程序化的。如果数据集是完整的,有一个手动重命名变量的解决方案很好,但如果您仍在添加新的站和气体,这种方法可以扩展到数据集。
# The question's 'streetnames' vector was later corrected by its author; the
# corrected version (each street label three times in a row) is used here.
days <- as.Date("2011-07-01") + 0:9
set.seed(10)
d <- data.frame(days, replicate(9, round(runif(10, 0, 10), 3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
              "x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
              "x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")
streetnames <- c(NA, rep(c("Astr.", "Bstr.", "Cstr."), each = 3))
molecule <- c(NA, rep(c("SO", "CO", "O3"), 3))
d <- rbind(streetnames, molecule, d)

# ---------------
library(tidyr)
library(dplyr)
library(janitor)

# Build the column names from the two label rows ("<street> <molecule>").
# Done before the pipeline because it is awkward to express inside one.
names(d) <- paste(d[1, ], d[2, ])

d2 <- d %>%
  slice(-(1:2)) %>%                         # drop the two label rows
  clean_names() %>%                         # janitor: standardised column names
  rename(date = na_na) %>%                  # the date column had NA labels in both rows
  gather(measure, value, -date) %>%         # wide -> long
  separate(measure, into = c("station", "gas")) %>%  # split the combined name
  mutate(value = as.numeric(value)) %>%
  # Stopping here gives a long table; the next step makes it wide again.
  spread(gas, value) %>%
  identity()

head(d2)
#> date station co o3 so
#> 1 2011-07-01 astr 6.517 8.647 5.075
#> 2 2011-07-01 bstr 2.755 3.543 5.356
#> 3 2011-07-01 cstr 0.756 8.614 0.319
#> 4 2011-07-02 astr 5.677 6.154 3.068
#> 5 2011-07-02 bstr 2.289 9.364 0.931
#> 6 2011-07-02 cstr 5.344 4.644 1.145
str(d2)
#> 'data.frame': 30 obs. of 5 variables:
#> $ date : Date, format: "2011-07-01" "2011-07-01" "2011-07-01" ...
#> $ station: chr "astr" "bstr" "cstr" "astr" ...
#> $ co : num 6.517 2.755 0.756 5.677 2.289 ...
#> $ o3 : num 8.65 3.54 8.61 6.15 9.36 ...
#> $ so : num 5.075 5.356 0.319 3.068 0.931 ...
注意:出于调试目的,我总是在管道末尾加上一个 identity()。这样您就可以注释掉管道中的任意一行,而不必担心末尾多余的 %>% 引发错误。
在R中,如何处理混合了行和列作为变量的凌乱数据框?
# Reproducible example data: ten consecutive days and nine pseudo-random
# readings per day (three streets x three molecules).
days <- as.Date("2011-07-01") + 0:9
set.seed(10)
d <- data.frame(days, replicate(9, round(runif(10, 0, 10), 3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
              "x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
              "x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")
# The columns are grouped by street (three consecutive columns per street), so
# every street label must be repeated three times in a row (`each = 3`).
# Cycling the labels column by column would tag x.astreet.2 as "Bstr.".
streetnames <- c(NA, rep(c("Astr.", "Bstr.", "Cstr."), each = 3))
# Within each street the molecules cycle through SO, CO, O3.
molecule <- c(NA, rep(c("SO", "CO", "O3"), 3))
# Prepend both label vectors as extra rows; rows 1-2 of `d` now hold metadata
# and the actual observations start at row 3.
d <- rbind(streetnames, molecule, d)
see df as tbl in this printscreen
在这种情况下,理想情况下应该只有 5 列(日期、SO、NO、O3、站点)
使用 base R 的方法可能如下所示。
# Stack the three per-street column triplets on top of each other.
# seq(2, ncol(d), by = 3) yields the first column of every triplet; the two
# label rows at the top of `d` are dropped before the columns are extracted.
res <- do.call(rbind, lapply(seq(2, ncol(d), by = 3), function(first_col) {
  observations <- d[-(1:2), ]
  data.frame(
    Date = observations[["Date"]],
    SO = observations[[first_col]],
    CO = observations[[first_col + 1]],
    O3 = observations[[first_col + 2]]
  )
}))
res$Date <- as.Date(res$Date)
# Replace the row names produced by rbind() with the default 1..n sequence.
row.names(res) <- NULL
head(res)
# Date SO CO O3
#1 2011-07-01 5.075 6.517 8.647
#2 2011-07-02 3.068 5.677 6.154
#3 2011-07-03 4.269 1.135 7.751
#4 2011-07-04 6.931 5.959 3.556
#5 2011-07-05 0.851 3.58 4.058
#6 2011-07-06 2.254 4.288 7.066
我认为这就是您想要达到的目的。可能有更优雅的解决方案,但这会起作用。
我假设后缀1、2、3分别对应SO、CO、O3
此解决方案不使用您创建的 streetnames 或 molecule 向量,因此您可以省略 rbind() 调用。
library(dplyr)
library(tidyr)

# Long format: one row per (Date, original column name) pair.
e <- gather(d, key = "station", value = "val", x.astreet.1:x.Cstreet.3)

# The digit in the original column name selects the molecule.
SO <- filter(e, grepl("1", station))
CO <- filter(e, grepl("2", station))
O3 <- filter(e, grepl("3", station))

# Put the three value columns side by side; the two extra "val" columns are
# referred to as val.1 and val.2 below.
f <- data.frame(SO, select(CO, val), select(O3, val))

# `station` still carries the SO column name, so map it to a short label and
# give the value columns their molecule names.
g <- f %>%
  mutate(
    Station = case_when(
      station == "x.astreet.1" ~ "Astr",
      station == "x.Bstreet.1" ~ "Bstr",
      station == "x.Cstreet.1" ~ "Cstr"
    ),
    SO = val,
    CO = val.1,
    O3 = val.2
  ) %>%
  select(Date, SO, CO, O3, Station)
我保留了每一步的中间数据框(e、f、g),这样您可以在每一步之后查看结果。
从代码示例的开头开始,省略了 rbind
调用:
library(dplyr)
library(tidyr)

# Example data without the two label rows (no rbind() call).
days <- as.Date("2011-07-01") + 0:9
set.seed(10)
d <- data.frame(days, replicate(9, round(runif(10, 0, 10), 3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
              "x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
              "x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")

# Explicit reassignment instead of `%<>%`, which would require attaching
# magrittr in addition to dplyr/tidyr.
d <- d %>%
  # Wide -> long: one row per (Date, original column name).
  gather(col_name, value, -Date) %>%
  # Split "x.<street>.<number>" on the literal dot. `sep` is a regular
  # expression, so the dot is escaped, and the backslash itself has to be
  # doubled inside an R string ("\." is not a valid R escape sequence).
  separate(col_name, c("x", "street_name", "molecule_number"),
           sep = "\\.", convert = TRUE) %>%
  # The constant "x" prefix carries no information.
  select(-x) %>%
  # Long -> wide again: one column per molecule number.
  spread(molecule_number, value) %>%
  rename(SO = `1`, NO = `2`, O3 = `3`)
这是我的方法。这样做的好处是它完全是程序化的。如果数据集是完整的,有一个手动重命名变量的解决方案很好,但如果您仍在添加新的站和气体,这种方法可以扩展到数据集。
# The question's 'streetnames' vector was later corrected by its author; the
# corrected version (each street label three times in a row) is used here.
days <- as.Date("2011-07-01") + 0:9
set.seed(10)
d <- data.frame(days, replicate(9, round(runif(10, 0, 10), 3)))
names(d) <- c("Date", "x.astreet.1", "x.astreet.2", "x.astreet.3",
              "x.Bstreet.1", "x.Bstreet.2", "x.Bstreet.3",
              "x.Cstreet.1", "x.Cstreet.2", "x.Cstreet.3")
streetnames <- c(NA, rep(c("Astr.", "Bstr.", "Cstr."), each = 3))
molecule <- c(NA, rep(c("SO", "CO", "O3"), 3))
d <- rbind(streetnames, molecule, d)

# ---------------
library(tidyr)
library(dplyr)
library(janitor)

# Build the column names from the two label rows ("<street> <molecule>").
# Done before the pipeline because it is awkward to express inside one.
names(d) <- paste(d[1, ], d[2, ])

d2 <- d %>%
  slice(-(1:2)) %>%                         # drop the two label rows
  clean_names() %>%                         # janitor: standardised column names
  rename(date = na_na) %>%                  # the date column had NA labels in both rows
  gather(measure, value, -date) %>%         # wide -> long
  separate(measure, into = c("station", "gas")) %>%  # split the combined name
  mutate(value = as.numeric(value)) %>%
  # Stopping here gives a long table; the next step makes it wide again.
  spread(gas, value) %>%
  identity()

head(d2)
#> date station co o3 so
#> 1 2011-07-01 astr 6.517 8.647 5.075
#> 2 2011-07-01 bstr 2.755 3.543 5.356
#> 3 2011-07-01 cstr 0.756 8.614 0.319
#> 4 2011-07-02 astr 5.677 6.154 3.068
#> 5 2011-07-02 bstr 2.289 9.364 0.931
#> 6 2011-07-02 cstr 5.344 4.644 1.145
str(d2)
#> 'data.frame': 30 obs. of 5 variables:
#> $ date : Date, format: "2011-07-01" "2011-07-01" "2011-07-01" ...
#> $ station: chr "astr" "bstr" "cstr" "astr" ...
#> $ co : num 6.517 2.755 0.756 5.677 2.289 ...
#> $ o3 : num 8.65 3.54 8.61 6.15 9.36 ...
#> $ so : num 5.075 5.356 0.319 3.068 0.931 ...
注意:出于调试目的,我总是在管道末尾加上一个 identity()。这样您就可以注释掉管道中的任意一行,而不必担心末尾多余的 %>% 引发错误。