pivot_longer:在 R 中将多行转换为列
Pivot longer: Multiple rows to columns in R
我目前正在尝试弄清楚如何重塑(pivot)我的数据框(下面附有一小段 dput 输出)。目前,有关国家、ISO 代码、行业和部门的信息都混在同一列中。我需要把这些信息拆分成 4 列,并附带一个对应的值列。我以前用过 melt 和 pivot_longer 函数,但不确定如何同时生成这 4 个新列和值列。
# Minimal example input, as produced by dput().
# Rows named "V1".."V4" are header rows: for each value column (V3..V6) they
# hold the country, ISO code, industry and sector labels, and V1/V2 are NA.
# Rows named "X".."X.3" are the data rows: V1/V2 hold the row labels and
# V3..V6 hold the measurements.
# Every column is a factor, so the numeric values are stored as text levels.
DI_SMALL <- structure(list(V1 = structure(c(NA, NA, NA, NA, 1L, 1L, 1L, 1L
), .Label = "Energy Usage (TJ)", class = "factor"), V2 = structure(c(NA,
NA, NA, NA, 2L, 1L, 4L, 3L), .Label = c("Coal", "Natural Gas",
"Nuclear Electricity", "Petroleum"), class = "factor"), V3 = structure(c(5L,
4L, 7L, 6L, 3L, 2L, 1L, 1L), .Label = c("0", "1.29327085460648e-05",
"1.59504500372979e-05", "AFG", "Afghanistan", "Agriculture",
"Industries"), class = "factor"), V4 = structure(c(5L, 4L, 7L,
6L, 3L, 2L, 1L, 1L), .Label = c("0", "6.53466630114587e-06",
"8.05944706428482e-06", "AFG", "Afghanistan", "Fishing", "Industries"
), class = "factor"), V5 = structure(c(5L, 4L, 6L, 7L, 3L, 2L,
1L, 1L), .Label = c("0", "1.88562621206664e-05", "2.32557880912235e-05",
"AFG", "Afghanistan", "Industries", "Mining and Quarrying"), class = "factor"),
V6 = structure(c(5L, 4L, 7L, 6L, 3L, 2L, 1L, 1L), .Label = c("0",
"2.00284547443433e-05", "2.47018365704401e-05", "AFG", "Afghanistan",
"Food & Beverages", "Industries"), class = "factor")), row.names = c("V1",
"V2", "V3", "V4", "X", "X.1", "X.2", "X.3"), class = "data.frame")
理想情况下,输出将包含 7 列:现有的前两列,接着是国家(Country)、ISO、行业(Industry)和部门(Sector),最后是 Value。如下所示:
# Desired result for the first value column, as produced by dput():
# the two existing label columns, then Country, ISO, Industry, Sector, Value.
# Note: the first NA..1 level is "Coal " with a trailing space, and the Value
# entries (1.595..., 1.293...) are on a different scale from the e-05 values
# stored in DI_SMALL.
Output <- structure(list(NA. = structure(c(1L, 1L, 1L, 1L), .Label = "Energy Usage (TJ)", class = "factor"),
NA..1 = structure(c(2L, 1L, 4L, 3L), .Label = c("Coal ",
"Natural Gas", "Nuclear Electricity", "Petroleum"), class = "factor"),
Country = structure(c(1L, 1L, 1L, 1L), .Label = "Afghanistan", class = "factor"),
ISO = structure(c(1L, 1L, 1L, 1L), .Label = "AFG", class = "factor"),
Industry = structure(c(1L, 1L, 1L, 1L), .Label = "Industries", class = "factor"),
Sector = structure(c(1L, 1L, 1L, 1L), .Label = "Agriculture", class = "factor"),
Value = c(1.595045004, 1.2932706, 0, 0)), class = "data.frame", row.names = c(NA,
-4L))
希望这是有道理的,任何想法将不胜感激!
谢谢
我会先对数据取子集,然后在此基础上处理,如下所示。不过我仍然不确定您期望的 Output 中的 Value 是如何得到的:下面输出中的值与您在 MWE 中给出的期望结果并不一致。希望这能给您一些线索。
# Header rows (row names containing "V") hold Country / ISO / Industry /
# Sector for each column. Transposing them gives one row per original column;
# the first two transposed rows belong to the label columns V1 and V2 and are
# dropped.
subV <- as.data.frame(t(DI_SMALL[grep("V", rownames(DI_SMALL)), ]))[-c(1:2), ]

# Data rows (row names containing "X"): the two label columns plus V3.
subX <- DI_SMALL[grep("X", rownames(DI_SMALL)), 1:3]

# NOTE(review): cbind() pairs data row i with transposed header row i purely
# by position, and the value column is always V3; the values stored in
# V4..V6 are not used. Confirm this pairing is what is wanted.
Output <- cbind(subX[, 1:2], subV, subX[, 3])
colnames(Output) <- c(
  "NA.", "NA..1", "Country", "ISO", "Industry", "Sector", "Value"
)

# seq_len() stays correct for a zero-row result, unlike seq(1:nrow(Output)).
rownames(Output) <- seq_len(nrow(Output))
> Output
NA. NA..1 Country ISO Industry Sector Value
1 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.59504500372979e-05
2 Energy Usage (TJ) Coal Afghanistan AFG Industries Fishing 1.29327085460648e-05
3 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Mining and Quarrying 0
4 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Food & Beverages 0
这种情况并不适合使用 pivot_longer,因为您的变量同时分布在行和列中,而且它们并不是行名或列名。相反,您必须先从数据中提取这些属性,然后"手动"构建 data.frame。下面是一个示例;建议检查每一步中各变量的值,以便更好地理解整个过程:
library(dplyr)

# Every column is a factor mixing header labels and numbers, so work on
# character copies.
df <- DI_SMALL %>%
  mutate_all(as.character)

# One "V1/V2" label per data row. Header rows have NA in both columns and
# paste to "NA/NA", so they are dropped.
row_attr <- paste0(df$V1, "/", df$V2)
row_attr <- row_attr[row_attr != "NA/NA"]

# One "Country/ISO/Industry/Sector" label per value column, built from the
# first four (header) rows.
col_attr <- df[1:4, -(1:2)] %>%
  apply(MARGIN = 2, function(x) paste0(x, collapse = "/"))

# Numeric values flattened column by column. This matches expand.grid()
# below, whose first factor (row_attr) varies fastest.
values <- df[-(1:4), -(1:2)] %>%
  mutate_all(as.numeric) %>%
  as.matrix() %>%
  c()

out <- expand.grid(row_attr, col_attr)
out <- cbind(out, values)

# Split the combined labels back into their individual columns.
out <- out %>%
  tidyr::separate(col = "Var1", into = c("NA.", "NA..1"), sep = "/") %>%
  tidyr::separate(
    col = "Var2",
    into = c("Country", "ISO", "Industry", "Sector"),
    sep = "/"
  )

# Show the first four rows with all columns. `out[1:4]` would instead select
# the first four columns and drop Industry, Sector and values.
out[1:4, ]
我认为 Output 与 DI_SMALL 中的值数量级不同,但除此之外,这似乎就是所需的输出。
NA. NA..1 Country ISO Industry Sector values
1 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.595045e-05
2 Energy Usage (TJ) Coal Afghanistan AFG Industries Agriculture 1.293271e-05
3 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Agriculture 0.000000e+00
4 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Agriculture 0.000000e+00
在这里,您可以先用数据框前 4 行拼接成的字符串重命名 V3 到 V6 这几列,然后删除这 4 行,将数据框转换为更长的格式,最后拆分重塑时创建的、包含新列名的 "var" 列,从而得到全部四列:
library(tidyr)
library(dplyr)

# Build one "Country_ISO_Industry_Sector" name per value column from the four
# header rows, and use it as that column's name.
colNAMES <- vapply(
  DI_SMALL[, 3:6],
  function(x) paste(x[1:4], collapse = "_"),
  character(1)
)
colnames(DI_SMALL)[3:6] <- colNAMES

# The header rows are now encoded in the column names, so drop them.
DI_SMALL <- DI_SMALL[-c(1:4), ]

# Reshape to long format, then split each row's own `var` string into the four
# attribute columns. Indexing unlist(strsplit(var, "_")) with [1]..[4] would
# pick the pieces of the first row only and recycle them to every row, which
# labels every sector as "Agriculture".
# Value is still a factor here; use as.numeric(as.character(Value)) if numbers
# are needed.
DI_SMALL %>%
  pivot_longer(-c(V1, V2), names_to = "var", values_to = "Value") %>%
  separate(
    col = var,
    into = c("Country", "ISO", "Industry", "Sector"),
    sep = "_"
  ) %>%
  select(V1, V2, Country, ISO, Industry, Sector, Value)
# # A tibble: 16 x 7
# V1 V2 Country ISO Industry Sector Value
# <fct> <fct> <chr> <chr> <chr> <chr> <fct>
# 1 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.59504500372979e-05
# 2 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Fishing 8.05944706428482e-06
# 3 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Mining and Quarrying 2.32557880912235e-05
# 4 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Food & Beverages 2.47018365704401e-05
# 5 Energy Usage (TJ) Coal Afghanistan AFG Industries Agriculture 1.29327085460648e-05
# 6 Energy Usage (TJ) Coal Afghanistan AFG Industries Fishing 6.53466630114587e-06
# 7 Energy Usage (TJ) Coal Afghanistan AFG Industries Mining and Quarrying 1.88562621206664e-05
# 8 Energy Usage (TJ) Coal Afghanistan AFG Industries Food & Beverages 2.00284547443433e-05
# 9 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Agriculture 0
# 10 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Fishing 0
# 11 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Mining and Quarrying 0
# 12 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Food & Beverages 0
# 13 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Agriculture 0
# 14 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Fishing 0
# 15 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Mining and Quarrying 0
# 16 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Food & Beverages 0
使用 data.table 包,您可以按以下方式进行:
library(data.table)

# Convert DI_SMALL to a data.table by reference, then turn V3 from a factor
# into text so both its header labels and its numeric strings can be read.
setDT(DI_SMALL)
DI_SMALL[, V3 := as.character(V3)]

cols <- c("Country", "ISO", "Industry", "Sector")

# Header rows are the ones where V1 is NA; their V3 entries become four
# constant columns named by `cols`. The remaining V3 entries become the
# numeric `value` column, and the header rows are filtered out at the end.
# Only column V3 is used.
Output <- DI_SMALL[, c(
  list(NA. = V1),
  list(NA..1 = V2),
  setNames(V3[is.na(V1)], cols),
  list(value = as.numeric(V3))
)][!is.na(NA.)]
# NA. NA..1 Country ISO Industry Sector value
# 1: Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.595045e-05
# 2: Energy Usage (TJ) Coal Afghanistan AFG Industries Agriculture 1.293271e-05
# 3: Energy Usage (TJ) Petroleum Afghanistan AFG Industries Agriculture 0.000000e+00
# 4: Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Agriculture 0.000000e+00
我目前正在尝试弄清楚如何重塑(pivot)我的数据框(下面附有一小段 dput 输出)。目前,有关国家、ISO 代码、行业和部门的信息都混在同一列中。我需要把这些信息拆分成 4 列,并附带一个对应的值列。我以前用过 melt 和 pivot_longer 函数,但不确定如何同时生成这 4 个新列和值列。
# Minimal example input, as produced by dput().
# Rows named "V1".."V4" are header rows: for each value column (V3..V6) they
# hold the country, ISO code, industry and sector labels, and V1/V2 are NA.
# Rows named "X".."X.3" are the data rows: V1/V2 hold the row labels and
# V3..V6 hold the measurements.
# Every column is a factor, so the numeric values are stored as text levels.
DI_SMALL <- structure(list(V1 = structure(c(NA, NA, NA, NA, 1L, 1L, 1L, 1L
), .Label = "Energy Usage (TJ)", class = "factor"), V2 = structure(c(NA,
NA, NA, NA, 2L, 1L, 4L, 3L), .Label = c("Coal", "Natural Gas",
"Nuclear Electricity", "Petroleum"), class = "factor"), V3 = structure(c(5L,
4L, 7L, 6L, 3L, 2L, 1L, 1L), .Label = c("0", "1.29327085460648e-05",
"1.59504500372979e-05", "AFG", "Afghanistan", "Agriculture",
"Industries"), class = "factor"), V4 = structure(c(5L, 4L, 7L,
6L, 3L, 2L, 1L, 1L), .Label = c("0", "6.53466630114587e-06",
"8.05944706428482e-06", "AFG", "Afghanistan", "Fishing", "Industries"
), class = "factor"), V5 = structure(c(5L, 4L, 6L, 7L, 3L, 2L,
1L, 1L), .Label = c("0", "1.88562621206664e-05", "2.32557880912235e-05",
"AFG", "Afghanistan", "Industries", "Mining and Quarrying"), class = "factor"),
V6 = structure(c(5L, 4L, 7L, 6L, 3L, 2L, 1L, 1L), .Label = c("0",
"2.00284547443433e-05", "2.47018365704401e-05", "AFG", "Afghanistan",
"Food & Beverages", "Industries"), class = "factor")), row.names = c("V1",
"V2", "V3", "V4", "X", "X.1", "X.2", "X.3"), class = "data.frame")
理想情况下,输出将包含 7 列:现有的前两列,接着是国家(Country)、ISO、行业(Industry)和部门(Sector),最后是 Value。如下所示:
# Desired result for the first value column, as produced by dput():
# the two existing label columns, then Country, ISO, Industry, Sector, Value.
# Note: the first NA..1 level is "Coal " with a trailing space, and the Value
# entries (1.595..., 1.293...) are on a different scale from the e-05 values
# stored in DI_SMALL.
Output <- structure(list(NA. = structure(c(1L, 1L, 1L, 1L), .Label = "Energy Usage (TJ)", class = "factor"),
NA..1 = structure(c(2L, 1L, 4L, 3L), .Label = c("Coal ",
"Natural Gas", "Nuclear Electricity", "Petroleum"), class = "factor"),
Country = structure(c(1L, 1L, 1L, 1L), .Label = "Afghanistan", class = "factor"),
ISO = structure(c(1L, 1L, 1L, 1L), .Label = "AFG", class = "factor"),
Industry = structure(c(1L, 1L, 1L, 1L), .Label = "Industries", class = "factor"),
Sector = structure(c(1L, 1L, 1L, 1L), .Label = "Agriculture", class = "factor"),
Value = c(1.595045004, 1.2932706, 0, 0)), class = "data.frame", row.names = c(NA,
-4L))
希望这是有道理的,任何想法将不胜感激!
谢谢
我会先对数据取子集,然后在此基础上处理,如下所示。不过我仍然不确定您期望的 Output 中的 Value 是如何得到的:下面输出中的值与您在 MWE 中给出的期望结果并不一致。希望这能给您一些线索。
# Header rows (row names containing "V") hold Country / ISO / Industry /
# Sector for each column. Transposing them gives one row per original column;
# the first two transposed rows belong to the label columns V1 and V2 and are
# dropped.
subV <- as.data.frame(t(DI_SMALL[grep("V", rownames(DI_SMALL)), ]))[-c(1:2), ]

# Data rows (row names containing "X"): the two label columns plus V3.
subX <- DI_SMALL[grep("X", rownames(DI_SMALL)), 1:3]

# NOTE(review): cbind() pairs data row i with transposed header row i purely
# by position, and the value column is always V3; the values stored in
# V4..V6 are not used. Confirm this pairing is what is wanted.
Output <- cbind(subX[, 1:2], subV, subX[, 3])
colnames(Output) <- c(
  "NA.", "NA..1", "Country", "ISO", "Industry", "Sector", "Value"
)

# seq_len() stays correct for a zero-row result, unlike seq(1:nrow(Output)).
rownames(Output) <- seq_len(nrow(Output))
> Output
NA. NA..1 Country ISO Industry Sector Value
1 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.59504500372979e-05
2 Energy Usage (TJ) Coal Afghanistan AFG Industries Fishing 1.29327085460648e-05
3 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Mining and Quarrying 0
4 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Food & Beverages 0
这种情况并不适合使用 pivot_longer,因为您的变量同时分布在行和列中,而且它们并不是行名或列名。相反,您必须先从数据中提取这些属性,然后"手动"构建 data.frame。下面是一个示例;建议检查每一步中各变量的值,以便更好地理解整个过程:
library(dplyr)

# Every column is a factor mixing header labels and numbers, so work on
# character copies.
df <- DI_SMALL %>%
  mutate_all(as.character)

# One "V1/V2" label per data row. Header rows have NA in both columns and
# paste to "NA/NA", so they are dropped.
row_attr <- paste0(df$V1, "/", df$V2)
row_attr <- row_attr[row_attr != "NA/NA"]

# One "Country/ISO/Industry/Sector" label per value column, built from the
# first four (header) rows.
col_attr <- df[1:4, -(1:2)] %>%
  apply(MARGIN = 2, function(x) paste0(x, collapse = "/"))

# Numeric values flattened column by column. This matches expand.grid()
# below, whose first factor (row_attr) varies fastest.
values <- df[-(1:4), -(1:2)] %>%
  mutate_all(as.numeric) %>%
  as.matrix() %>%
  c()

out <- expand.grid(row_attr, col_attr)
out <- cbind(out, values)

# Split the combined labels back into their individual columns.
out <- out %>%
  tidyr::separate(col = "Var1", into = c("NA.", "NA..1"), sep = "/") %>%
  tidyr::separate(
    col = "Var2",
    into = c("Country", "ISO", "Industry", "Sector"),
    sep = "/"
  )

# Show the first four rows with all columns. `out[1:4]` would instead select
# the first four columns and drop Industry, Sector and values.
out[1:4, ]
我认为 Output 与 DI_SMALL 中的值数量级不同,但除此之外,这似乎就是所需的输出。
NA. NA..1 Country ISO Industry Sector values
1 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.595045e-05
2 Energy Usage (TJ) Coal Afghanistan AFG Industries Agriculture 1.293271e-05
3 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Agriculture 0.000000e+00
4 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Agriculture 0.000000e+00
在这里,您可以先用数据框前 4 行拼接成的字符串重命名 V3 到 V6 这几列,然后删除这 4 行,将数据框转换为更长的格式,最后拆分重塑时创建的、包含新列名的 "var" 列,从而得到全部四列:
library(tidyr)
library(dplyr)

# Build one "Country_ISO_Industry_Sector" name per value column from the four
# header rows, and use it as that column's name.
colNAMES <- vapply(
  DI_SMALL[, 3:6],
  function(x) paste(x[1:4], collapse = "_"),
  character(1)
)
colnames(DI_SMALL)[3:6] <- colNAMES

# The header rows are now encoded in the column names, so drop them.
DI_SMALL <- DI_SMALL[-c(1:4), ]

# Reshape to long format, then split each row's own `var` string into the four
# attribute columns. Indexing unlist(strsplit(var, "_")) with [1]..[4] would
# pick the pieces of the first row only and recycle them to every row, which
# labels every sector as "Agriculture".
# Value is still a factor here; use as.numeric(as.character(Value)) if numbers
# are needed.
DI_SMALL %>%
  pivot_longer(-c(V1, V2), names_to = "var", values_to = "Value") %>%
  separate(
    col = var,
    into = c("Country", "ISO", "Industry", "Sector"),
    sep = "_"
  ) %>%
  select(V1, V2, Country, ISO, Industry, Sector, Value)
# # A tibble: 16 x 7
# V1 V2 Country ISO Industry Sector Value
# <fct> <fct> <chr> <chr> <chr> <chr> <fct>
# 1 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.59504500372979e-05
# 2 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Fishing 8.05944706428482e-06
# 3 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Mining and Quarrying 2.32557880912235e-05
# 4 Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Food & Beverages 2.47018365704401e-05
# 5 Energy Usage (TJ) Coal Afghanistan AFG Industries Agriculture 1.29327085460648e-05
# 6 Energy Usage (TJ) Coal Afghanistan AFG Industries Fishing 6.53466630114587e-06
# 7 Energy Usage (TJ) Coal Afghanistan AFG Industries Mining and Quarrying 1.88562621206664e-05
# 8 Energy Usage (TJ) Coal Afghanistan AFG Industries Food & Beverages 2.00284547443433e-05
# 9 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Agriculture 0
# 10 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Fishing 0
# 11 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Mining and Quarrying 0
# 12 Energy Usage (TJ) Petroleum Afghanistan AFG Industries Food & Beverages 0
# 13 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Agriculture 0
# 14 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Fishing 0
# 15 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Mining and Quarrying 0
# 16 Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Food & Beverages 0
使用 data.table 包,您可以按以下方式进行:
library(data.table)

# Convert DI_SMALL to a data.table by reference, then turn V3 from a factor
# into text so both its header labels and its numeric strings can be read.
setDT(DI_SMALL)
DI_SMALL[, V3 := as.character(V3)]

cols <- c("Country", "ISO", "Industry", "Sector")

# Header rows are the ones where V1 is NA; their V3 entries become four
# constant columns named by `cols`. The remaining V3 entries become the
# numeric `value` column, and the header rows are filtered out at the end.
# Only column V3 is used.
Output <- DI_SMALL[, c(
  list(NA. = V1),
  list(NA..1 = V2),
  setNames(V3[is.na(V1)], cols),
  list(value = as.numeric(V3))
)][!is.na(NA.)]
# NA. NA..1 Country ISO Industry Sector value
# 1: Energy Usage (TJ) Natural Gas Afghanistan AFG Industries Agriculture 1.595045e-05
# 2: Energy Usage (TJ) Coal Afghanistan AFG Industries Agriculture 1.293271e-05
# 3: Energy Usage (TJ) Petroleum Afghanistan AFG Industries Agriculture 0.000000e+00
# 4: Energy Usage (TJ) Nuclear Electricity Afghanistan AFG Industries Agriculture 0.000000e+00