过滤变量以连接不同维度的两个数据框
filter variables to connect two dataframes over different dimensions
我想连接两个数据帧。
所以因为我有 2 个维度来从第二个 table 的列中过滤一个满足第一个 table 的某些条件的值。
第一个数据框如下所示:
letter year value
A 2001
B 2002
C 2003
D 2004
第二个:
letter 2001 2002 2003 2004
A 4 9 9 9
B 6 7 6 6
C 2 3 5 8
D 1 1 1 1
这给了我这样的东西
letter year value
A 2001 4
B 2002 7
C 2003 5
D 2004 1
thank all of you
一个选项是 row/column
索引。在这里,行索引可以是行序列,而列索引是我们从 match
将第一个数据的 'year' 列与第二个列名称相结合得到的, cbind
要创建的索引a matrix
('m1') 并使用它从第二个数据集中提取值并将这些值分配给第一个数据
中的'value'列
i1 <- seq_len(nrow(df1))
j1 <- match(df1$year, names(df2)[-1])
m1 <- cbind(i1, j1)
df1$value <- df2[-1][m1]
df1
# letter year value
#1 A 2001 4
#2 B 2002 7
#3 C 2003 5
#4 D 2004 1
具体的例子,提取的pattern好像是diag
onal元素,那我们也可以用
df1$value <- diag(as.matrix(df2[-1]))
数据
df1 <- structure(list(letter = c("A", "B", "C", "D"), year = 2001:2004),
class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(letter = c("A", "B", "C", "D"), `2001` = c(4L,
6L, 2L, 1L), `2002` = c(9L, 7L, 3L, 1L), `2003` = c(9L, 6L, 5L,
1L), `2004` = c(9L, 6L, 8L, 1L)), class = "data.frame",
row.names = c(NA,
-4L))
tidyverse 中的另一个选项是首先将您的值数据转换为更长的数据框(数据来自@akrun 的回答):
df2.long <- df2 %>%
pivot_longer(`2001`:`2004`, names_to = 'year', values_to = 'value')
# A tibble: 16 x 3
letter year value
<chr> <chr> <int>
1 A 2001 4
2 A 2002 9
3 A 2003 9
4 A 2004 9
5 B 2001 6
6 B 2002 7
7 B 2003 6
8 B 2004 6
9 C 2001 2
10 C 2002 3
...
然后对包含所需 letter/year 组合的数据框执行 inner_join
:
df.final <- df2.long %>%
mutate(year = as.numeric(year)) %>%
inner_join(df1)
letter year value
<chr> <dbl> <int>
1 A 2001 4
2 B 2002 7
3 C 2003 5
4 D 2004 1
基础 R 解决方案:
# Reshape your dataframe from wide to long:
df3 <- reshape(df2,
direction = "long",
idvar = "letter",
varying = c(names(df2)[names(df2) != "letter"]),
v.names = "Value",
timevar = "Year",
times = names(df2)[names(df2) != "letter"],
new.row.names = 1:(nrow(df2) * length(names(df2)[names(df2) != "letter"]))
)
# Inner join the long_df with the first dataframe:
df_final <- merge(df1[,c(names(df1) != "Value")], df3, by = intersect(colnames(df1), colnames(df3)))
Tidyverse 解决方案(略微扩展下面@jdobres 的解决方案):
lapply(c("dplyr", "tidyr"), require, character.only = TRUE)
df3_long <-
df2 %>%
pivot_longer(`2001`:`2004`, names_to = 'year', values_to = 'value') %>%
mutate(year = as.numeric(year)) %>%
inner_join(., df1, by = intersect(colnames(df1, df2)))
数据:
df1 <-
structure(list(letter = c("A", "B", "C", "D"), year = 2001:2004),
class = "data.frame",
row.names = c(NA,-4L))
df2 <-
structure(
list(
letter = c("A", "B", "C", "D"),
`2001` = c(4L,
6L, 2L, 1L),
`2002` = c(9L, 7L, 3L, 1L),
`2003` = c(9L, 6L, 5L,
1L),
`2004` = c(9L, 6L, 8L, 1L)
),
class = "data.frame",
row.names = c(NA,-4L)
)
我想连接两个数据帧。
所以因为我有 2 个维度来从第二个 table 的列中过滤一个满足第一个 table 的某些条件的值。 第一个数据框如下所示:
letter year value
A 2001
B 2002
C 2003
D 2004
第二个:
letter 2001 2002 2003 2004
A 4 9 9 9
B 6 7 6 6
C 2 3 5 8
D 1 1 1 1
这给了我这样的东西
letter year value
A 2001 4
B 2002 7
C 2003 5
D 2004 1
thank all of you
一个选项是 row/column
索引。在这里,行索引可以是行序列,而列索引是我们从 match
将第一个数据的 'year' 列与第二个列名称相结合得到的, cbind
要创建的索引a matrix
('m1') 并使用它从第二个数据集中提取值并将这些值分配给第一个数据
i1 <- seq_len(nrow(df1))
j1 <- match(df1$year, names(df2)[-1])
m1 <- cbind(i1, j1)
df1$value <- df2[-1][m1]
df1
# letter year value
#1 A 2001 4
#2 B 2002 7
#3 C 2003 5
#4 D 2004 1
具体的例子,提取的pattern好像是diag
onal元素,那我们也可以用
df1$value <- diag(as.matrix(df2[-1]))
数据
df1 <- structure(list(letter = c("A", "B", "C", "D"), year = 2001:2004),
class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(letter = c("A", "B", "C", "D"), `2001` = c(4L,
6L, 2L, 1L), `2002` = c(9L, 7L, 3L, 1L), `2003` = c(9L, 6L, 5L,
1L), `2004` = c(9L, 6L, 8L, 1L)), class = "data.frame",
row.names = c(NA,
-4L))
tidyverse 中的另一个选项是首先将您的值数据转换为更长的数据框(数据来自@akrun 的回答):
df2.long <- df2 %>%
pivot_longer(`2001`:`2004`, names_to = 'year', values_to = 'value')
# A tibble: 16 x 3
letter year value
<chr> <chr> <int>
1 A 2001 4
2 A 2002 9
3 A 2003 9
4 A 2004 9
5 B 2001 6
6 B 2002 7
7 B 2003 6
8 B 2004 6
9 C 2001 2
10 C 2002 3
...
然后对包含所需 letter/year 组合的数据框执行 inner_join
:
df.final <- df2.long %>%
mutate(year = as.numeric(year)) %>%
inner_join(df1)
letter year value
<chr> <dbl> <int>
1 A 2001 4
2 B 2002 7
3 C 2003 5
4 D 2004 1
基础 R 解决方案:
# Reshape your dataframe from wide to long:
df3 <- reshape(df2,
direction = "long",
idvar = "letter",
varying = c(names(df2)[names(df2) != "letter"]),
v.names = "Value",
timevar = "Year",
times = names(df2)[names(df2) != "letter"],
new.row.names = 1:(nrow(df2) * length(names(df2)[names(df2) != "letter"]))
)
# Inner join the long_df with the first dataframe:
df_final <- merge(df1[,c(names(df1) != "Value")], df3, by = intersect(colnames(df1), colnames(df3)))
Tidyverse 解决方案(略微扩展下面@jdobres 的解决方案):
lapply(c("dplyr", "tidyr"), require, character.only = TRUE)
df3_long <-
df2 %>%
pivot_longer(`2001`:`2004`, names_to = 'year', values_to = 'value') %>%
mutate(year = as.numeric(year)) %>%
inner_join(., df1, by = intersect(colnames(df1, df2)))
数据:
df1 <-
structure(list(letter = c("A", "B", "C", "D"), year = 2001:2004),
class = "data.frame",
row.names = c(NA,-4L))
df2 <-
structure(
list(
letter = c("A", "B", "C", "D"),
`2001` = c(4L,
6L, 2L, 1L),
`2002` = c(9L, 7L, 3L, 1L),
`2003` = c(9L, 6L, 5L,
1L),
`2004` = c(9L, 6L, 8L, 1L)
),
class = "data.frame",
row.names = c(NA,-4L)
)