将不相等数组的列转换为 R 中的单值列
Transforming columns of unequal arrays to column of single values in R
作为此问题之后的下一步,假设有多个长度不同的数组列。例如:
Col_A | Col_B | Col_C |
---|---|---|
[0.1,0.5,0.7] | [1.54E12, 1.54E12, 1.54E12] | [1, 3, 4, 5} |
我如何采用这种格式并将其重新格式化为以下格式,在适当的地方为 Col_A 和 Col_B 填充 NA:
Col_A | Col_B | Col_C |
---|---|---|
0.1 | 1.54E12 | 1 |
0.5 | 1.54E12 | 3 |
0.7 | 1.54E12 | 4 |
NA | NA | 5 |
此代码适用于所有数组都相等的情况,但如果数组不相等则会抛出错误:
library(dplyr)
library(stringr)
library(tidyr)
# Question's attempt: extract the text between "[" and "]" from every column,
# unnest the resulting list-columns, then split each value on commas.
# Per the surrounding text this only works when all arrays have equal length.
# NOTE(review): `NDVIs` and `dates` are not columns of the example data shown
# in this document (Col_A/Col_B/Col_C) -- presumably they are the column names
# of the asker's real data; confirm before running.
df %>%
  # Backslashes are doubled: "\[" and "\s" are not valid escapes inside an R
  # string literal and would stop the script at parse time.
  mutate(across(
    everything(),
    function(x) str_extract_all(x, "(?<=\\[)[^]]+")
  )) %>%
  unnest(c(NDVIs, dates)) %>%
  separate_rows(c(NDVIs, dates), sep = ",\\s+", convert = TRUE)
我对 tidyverse 的经验不足,所以这里给出我的 data.table 解决方案。我保留了所有中间步骤及其结果,以显示每一步发生了什么...
library( data.table )
# Create sample data. The columns are built directly instead of being parsed
# from whitespace-separated text: the bracketed values themselves contain
# spaces, so such text is ambiguous about where one column ends.
DT <- data.table(
  Col_A = "[0.1,0.5,0.7]",
  Col_B = "[1.54E12, 1.54E12, 1.54E12]",
  Col_C = "[1, 3, 4, 5]"
)
# Col_A Col_B Col_C
# 1: [0.1,0.5,0.7] [1.54E12, 1.54E12, 1.54E12] [1, 3, 4, 5]

# Melt to long format: one row per original column.
ans <- melt(DT, measure.vars = names(DT), variable.factor = FALSE)
# variable value
# 1: Col_A [0.1,0.5,0.7]
# 2: Col_B [1.54E12, 1.54E12, 1.54E12]
# 3: Col_C [1, 3, 4, 5]

# Remove the square brackets. The backslashes are doubled because "\[" is not
# a valid escape inside an R string literal (it is a parse error).
ans[, value := gsub("\\[|\\]", "", value)]

# Split the value column on "," into one vector per array position;
# tstrsplit() pads the shorter arrays with NA. The split is computed once and
# reused for both the new column names and the numeric values.
parts <- tstrsplit(ans$value, ",")
ans[, paste0("v", seq_along(parts)) := lapply(parts, as.numeric)][]
# variable value v1 v2 v3 v4
# 1: Col_A 0.1,0.5,0.7 1.00e-01 5.00e-01 7.00e-01 NA
# 2: Col_B 1.54E12, 1.54E12, 1.54E12 1.54e+12 1.54e+12 1.54e+12 NA
# 3: Col_C 1, 3, 4, 5 1.00e+00 3.00e+00 4.00e+00 5

# Transpose (without the raw "value" column) to get the wide format again;
# the "variable" column supplies the new column names.
transpose(ans[, -"value"], make.names = "variable")
# Col_A Col_B Col_C
# 1: 0.1 1.54e+12 1
# 2: 0.5 1.54e+12 3
# 3: 0.7 1.54e+12 4
# 4: NA NA 5
我们可以使用 splitstackshape 包中的 cSplit:
library(splitstackshape)
library(data.table)
# Strip every "[", "]" and "}" from all columns, then have cSplit() split each
# column on "," and stack the pieces in long format.
# setDT() converts `df` to a data.table by reference.
cSplit(
  indt = setDT(df)[, lapply(.SD, function(col) gsub("[][}]", "", col))],
  splitCols = names(df),
  sep = ",",
  direction = "long",
  fixed = FALSE
)
# Col_A Col_B Col_C
#1: 0.1 1.54e+12 1
#2: 0.5 1.54e+12 3
#3: 0.7 1.54e+12 4
#4: NA NA 5
数据
# Sample data: a one-row data frame whose cells hold bracketed, comma-separated
# arrays as strings. Col_C deliberately keeps the stray closing "}" from the
# question's input.
df <- data.frame(
  Col_A = "[0.1,0.5,0.7]",
  Col_B = "[1.54E12, 1.54E12, 1.54E12]",
  Col_C = "[1, 3, 4, 5}",
  stringsAsFactors = FALSE
)
作为此问题之后的下一步,假设有多个长度不同的数组列。例如:
Col_A | Col_B | Col_C |
---|---|---|
[0.1,0.5,0.7] | [1.54E12, 1.54E12, 1.54E12] | [1, 3, 4, 5} |
我如何采用这种格式并将其重新格式化为以下格式,在适当的地方为 Col_A 和 Col_B 填充 NA:
Col_A | Col_B | Col_C |
---|---|---|
0.1 | 1.54E12 | 1 |
0.5 | 1.54E12 | 3 |
0.7 | 1.54E12 | 4 |
NA | NA | 5 |
此代码适用于所有数组都相等的情况,但如果数组不相等则会抛出错误:
library(dplyr)
library(stringr)
library(tidyr)
# Question's attempt: extract the text between "[" and "]" from every column,
# unnest the resulting list-columns, then split each value on commas.
# Per the surrounding text this only works when all arrays have equal length.
# NOTE(review): `NDVIs` and `dates` are not columns of the example data shown
# in this document (Col_A/Col_B/Col_C) -- presumably they are the column names
# of the asker's real data; confirm before running.
df %>%
  # Backslashes are doubled: "\[" and "\s" are not valid escapes inside an R
  # string literal and would stop the script at parse time.
  mutate(across(
    everything(),
    function(x) str_extract_all(x, "(?<=\\[)[^]]+")
  )) %>%
  unnest(c(NDVIs, dates)) %>%
  separate_rows(c(NDVIs, dates), sep = ",\\s+", convert = TRUE)
我对 tidyverse 的经验不足,所以这里给出我的 data.table 解决方案。我保留了所有中间步骤及其结果,以显示每一步发生了什么...
library( data.table )
# Create sample data. The columns are built directly instead of being parsed
# from whitespace-separated text: the bracketed values themselves contain
# spaces, so such text is ambiguous about where one column ends.
DT <- data.table(
  Col_A = "[0.1,0.5,0.7]",
  Col_B = "[1.54E12, 1.54E12, 1.54E12]",
  Col_C = "[1, 3, 4, 5]"
)
# Col_A Col_B Col_C
# 1: [0.1,0.5,0.7] [1.54E12, 1.54E12, 1.54E12] [1, 3, 4, 5]

# Melt to long format: one row per original column.
ans <- melt(DT, measure.vars = names(DT), variable.factor = FALSE)
# variable value
# 1: Col_A [0.1,0.5,0.7]
# 2: Col_B [1.54E12, 1.54E12, 1.54E12]
# 3: Col_C [1, 3, 4, 5]

# Remove the square brackets. The backslashes are doubled because "\[" is not
# a valid escape inside an R string literal (it is a parse error).
ans[, value := gsub("\\[|\\]", "", value)]

# Split the value column on "," into one vector per array position;
# tstrsplit() pads the shorter arrays with NA. The split is computed once and
# reused for both the new column names and the numeric values.
parts <- tstrsplit(ans$value, ",")
ans[, paste0("v", seq_along(parts)) := lapply(parts, as.numeric)][]
# variable value v1 v2 v3 v4
# 1: Col_A 0.1,0.5,0.7 1.00e-01 5.00e-01 7.00e-01 NA
# 2: Col_B 1.54E12, 1.54E12, 1.54E12 1.54e+12 1.54e+12 1.54e+12 NA
# 3: Col_C 1, 3, 4, 5 1.00e+00 3.00e+00 4.00e+00 5

# Transpose (without the raw "value" column) to get the wide format again;
# the "variable" column supplies the new column names.
transpose(ans[, -"value"], make.names = "variable")
# Col_A Col_B Col_C
# 1: 0.1 1.54e+12 1
# 2: 0.5 1.54e+12 3
# 3: 0.7 1.54e+12 4
# 4: NA NA 5
我们可以使用 splitstackshape 包中的 cSplit:
library(splitstackshape)
library(data.table)
# Strip every "[", "]" and "}" from all columns, then have cSplit() split each
# column on "," and stack the pieces in long format.
# setDT() converts `df` to a data.table by reference.
cSplit(
  indt = setDT(df)[, lapply(.SD, function(col) gsub("[][}]", "", col))],
  splitCols = names(df),
  sep = ",",
  direction = "long",
  fixed = FALSE
)
# Col_A Col_B Col_C
#1: 0.1 1.54e+12 1
#2: 0.5 1.54e+12 3
#3: 0.7 1.54e+12 4
#4: NA NA 5
数据
# Sample data: a one-row data frame whose cells hold bracketed, comma-separated
# arrays as strings. Col_C deliberately keeps the stray closing "}" from the
# question's input.
df <- data.frame(
  Col_A = "[0.1,0.5,0.7]",
  Col_B = "[1.54E12, 1.54E12, 1.54E12]",
  Col_C = "[1, 3, 4, 5}",
  stringsAsFactors = FALSE
)