如何使用 dplyr 对复杂的整洁数据框进行透视(pivot)
How to pivot a complex tidy data frame using dplyr
我有一个整洁的数据集,它有几个数字和分类列,例如 id、categorical_1、categorical_2 等。另外,我得到的列遵循以下模式:
# Hyper-parameter values that are encoded in the column names.
n_neighbors <- c(15, 30, 50, 100, 150)
min_distance <- c(0.001, 0.003, 0.009, 0.03, 0.09)
metrics <- c("euclidean", "cosine", "hamming")
在上面的示例中,我只显示了数据框的 4 列。列名的含义如下:
- nn_.15.md_.0.001.metric.euclidean.1
- n_neighbors = 15
- min_distance = 0.001
- metric = euclidean.
- 维度 = 1
- nn_.15.md_.0.001.metric.euclidean.2
- n_neighbors = 15
- min_distance = 0.001
- metric = euclidean.
- 维度 = 2
我想通过创建新列的方式重塑这张表:dimension_1、dimension_2、n_neighbors、min_distance、metric。如果可能,请使用 dplyr 和整洁(tidy)数据框。
dimension_1 和 dimension_2 应该包含上面示例中各行的值,而 n_neighbors、min_distance、metric 应该包含写在列名中的值。我的目标输出是:
row_1 -> dimension_1=0.5677311, dimension_2=-11.70898, n_neighbors=15,
min_distance=0.001, metric = euclidean
row_2 -> dimension_1=0.5682809, dimension_2=-11.71056, n_neighbors=15,
min_distance=0.001, metric = euclidean
row_3 -> dimension_1=0.5674967, dimension_2=-11.70665, n_neighbors=15,
min_distance=0.001, metric = euclidean
row_4 -> dimension_1=0.5687164, dimension_2=-11.70848, n_neighbors=15,
min_distance=0.001, metric = euclidean
row_5 -> dimension_1=-1.674230, dimension_2=16.693973, n_neighbors=15,
min_distance=0.001, metric = cosine
..
..
row_8 -> dimension_1=-1.674726, dimension_2=16.69405,
n_neighbors=15, min_distance=0.001, metric = cosine
您可以在此处找到数据子集的代码示例:
# Sample subset of the data: 4 rows and 4 of the embedding columns.
# NOTE(review): the original dput() output was cut off after the column list,
# leaving structure( unclosed. The closing attributes below reconstruct it as a
# plain data.frame -- confirm against the real data. It is bound to `df`
# because the pipelines further down operate on `df`.
df <- structure(list(
  nn_.15.md_.0.001.metric.euclidean.1 = c(
    0.567731082439423, 0.568280875682831,
    0.567496657371521, 0.568716406822205
  ),
  nn_.15.md_.0.001.metric.euclidean.2 = c(
    -11.7089824676514, -11.7105579376221,
    -11.7066516876221, -11.7084884643555
  ),
  nn_.15.md_.0.001.metric.cosine.1 = c(
    -1.67423057556152, -1.67501986026764,
    -1.69689452648163, -1.67472624778748
  ),
  nn_.15.md_.0.001.metric.cosine.2 = c(
    16.6939735412598, 16.6941356658936,
    16.7890815734863, 16.6940479278564
  )
), class = "data.frame", row.names = c(NA, -4L))
您可以使用以下解决方案:
library(dplyr)
library(tidyr)
# Reshape the wide embedding columns into one row per observation and metric.
# Column names look like "nn_.15.md_.0.001.metric.euclidean.1".
df %>%
  # Step 1: pull n_neighbour and min_distance out of the column names. The
  # remainder ("euclidean.1", ...) stays as column names via ".value"; the NA
  # entries in names_to discard the literal prefix groups.
  pivot_longer(
    everything(),
    names_to = c(NA, "n_neighbour", NA, "min_distance", NA, ".value"),
    # Backslashes must be doubled inside an R string literal; a single "\."
    # or "\d" is an unrecognised escape and fails to parse.
    names_pattern = "(nn_\\.)(\\d+)\\.(md_\\.)(\\d\\.\\d+)\\.(metric\\.)(.*)"
  ) %>%
  # Step 2: split "<metric>.<dimension>" into a metric column plus one value
  # column per dimension ("1" and "2").
  pivot_longer(
    !c(min_distance, n_neighbour),
    names_to = c("metric", ".value"),
    names_pattern = "(.*)\\.(\\d)"
  ) %>%
  # Columns 4 and 5 are the dimension columns "1" and "2" at this point.
  rename_with(~ paste("Dimension", .), c(4, 5)) %>%
  relocate(`Dimension 1`, `Dimension 2`) %>%
  arrange(desc(metric))
# A tibble: 8 x 5
`Dimension 1` `Dimension 2` min_distance n_neighbour metric
<dbl> <dbl> <chr> <chr> <chr>
1 0.568 -11.7 0.001 15 euclidean
2 0.568 -11.7 0.001 15 euclidean
3 0.567 -11.7 0.001 15 euclidean
4 0.569 -11.7 0.001 15 euclidean
5 -1.67 16.7 0.001 15 cosine
6 -1.68 16.7 0.001 15 cosine
7 -1.70 16.8 0.001 15 cosine
8 -1.67 16.7 0.001 15 cosine
您可以使用 pivot_longer 获取长格式的数据。
library(tidyverse)
# Split every column name into its four components and stack the values into
# long format (one row per original cell).
df %>%
  pivot_longer(
    cols = everything(),
    # "row" receives the trailing index of the column name (the dimension).
    names_to = c("n_neighbors", "min_distance", "metric", "row"),
    # Backslashes must be doubled inside an R string literal; a single "\."
    # or "\d" is an unrecognised escape and fails to parse.
    names_pattern = "nn_\\.(\\d+)\\.md_\\.(\\d+\\.\\d+)\\.metric\\.([a-z]+)\\.(\\d+)"
  )
# n_neighbors min_distance metric row value
# <chr> <chr> <chr> <chr> <dbl>
# 1 15 0.001 euclidean 1 0.568
# 2 15 0.001 euclidean 2 -11.7
# 3 15 0.001 cosine 1 -1.67
# 4 15 0.001 cosine 2 16.7
# 5 15 0.001 euclidean 1 0.568
# 6 15 0.001 euclidean 2 -11.7
# 7 15 0.001 cosine 1 -1.68
# 8 15 0.001 cosine 2 16.7
# 9 15 0.001 euclidean 1 0.567
#10 15 0.001 euclidean 2 -11.7
#11 15 0.001 cosine 1 -1.70
#12 15 0.001 cosine 2 16.8
#13 15 0.001 euclidean 1 0.569
#14 15 0.001 euclidean 2 -11.7
#15 15 0.001 cosine 1 -1.67
#16 15 0.001 cosine 2 16.7
如果您希望得到与问题中所示完全相同的数据,可以再添加一个 pivot_wider 步骤。
# Long format first, then spread the trailing index back out into
# dimension_1 / dimension_2 columns.
df %>%
  pivot_longer(
    cols = everything(),
    # "row" receives the trailing index of the column name (the dimension).
    names_to = c("n_neighbors", "min_distance", "metric", "row"),
    # Backslashes must be doubled inside an R string literal; a single "\."
    # or "\d" is an unrecognised escape and fails to parse.
    names_pattern = "nn_\\.(\\d+)\\.md_\\.(\\d+\\.\\d+)\\.metric\\.([a-z]+)\\.(\\d+)"
  ) %>%
  # Each (n_neighbors, min_distance, metric) group holds several values per
  # dimension, so collect them into list-columns ...
  pivot_wider(
    names_from = row,
    values_from = value,
    names_prefix = "dimension_",
    values_fn = list
  ) %>%
  # ... and expand the list-columns back into one row per observation.
  unnest(starts_with("dimension"))
# n_neighbors min_distance metric dimension_1 dimension_2
# <chr> <chr> <chr> <dbl> <dbl>
#1 15 0.001 euclidean 0.568 -11.7
#2 15 0.001 euclidean 0.568 -11.7
#3 15 0.001 euclidean 0.567 -11.7
#4 15 0.001 euclidean 0.569 -11.7
#5 15 0.001 cosine -1.67 16.7
#6 15 0.001 cosine -1.68 16.7
#7 15 0.001 cosine -1.70 16.8
#8 15 0.001 cosine -1.67 16.7
我有一个整洁的数据集,它有几个数字和分类列,例如 id、categorical_1、categorical_2 等。另外,我得到的列遵循以下模式:
# Hyper-parameter values that are encoded in the column names.
n_neighbors <- c(15, 30, 50, 100, 150)
min_distance <- c(0.001, 0.003, 0.009, 0.03, 0.09)
metrics <- c("euclidean", "cosine", "hamming")
在上面的示例中,我只显示了数据框的 4 列。列名的含义如下:
- nn_.15.md_.0.001.metric.euclidean.1
- n_neighbors = 15
- min_distance = 0.001
- metric = euclidean.
- 维度 = 1
- nn_.15.md_.0.001.metric.euclidean.2
- n_neighbors = 15
- min_distance = 0.001
- metric = euclidean.
- 维度 = 2
我想通过创建新列的方式重塑这张表:dimension_1、dimension_2、n_neighbors、min_distance、metric。如果可能,请使用 dplyr 和整洁(tidy)数据框。
dimension_1 和 dimension_2 应该包含上面示例中各行的值,而 n_neighbors、min_distance、metric 应该包含写在列名中的值。我的目标输出是:
row_1 -> dimension_1=0.5677311, dimension_2=-11.70898, n_neighbors=15, min_distance=0.001, metric = euclidean
row_2 -> dimension_1=0.5682809, dimension_2=-11.71056, n_neighbors=15, min_distance=0.001, metric = euclidean
row_3 -> dimension_1=0.5674967, dimension_2=-11.70665, n_neighbors=15, min_distance=0.001, metric = euclidean
row_4 -> dimension_1=0.5687164, dimension_2=-11.70848, n_neighbors=15, min_distance=0.001, metric = euclidean
row_5 -> dimension_1=-1.674230, dimension_2=16.693973, n_neighbors=15, min_distance=0.001, metric = cosine
..
..
row_8 -> dimension_1=-1.674726, dimension_2=16.69405, n_neighbors=15, min_distance=0.001, metric = cosine
您可以在此处找到数据子集的代码示例:
# Sample subset of the data: 4 rows and 4 of the embedding columns.
# NOTE(review): the original dput() output was cut off after the column list,
# leaving structure( unclosed. The closing attributes below reconstruct it as a
# plain data.frame -- confirm against the real data. It is bound to `df`
# because the pipelines further down operate on `df`.
df <- structure(list(
  nn_.15.md_.0.001.metric.euclidean.1 = c(
    0.567731082439423, 0.568280875682831,
    0.567496657371521, 0.568716406822205
  ),
  nn_.15.md_.0.001.metric.euclidean.2 = c(
    -11.7089824676514, -11.7105579376221,
    -11.7066516876221, -11.7084884643555
  ),
  nn_.15.md_.0.001.metric.cosine.1 = c(
    -1.67423057556152, -1.67501986026764,
    -1.69689452648163, -1.67472624778748
  ),
  nn_.15.md_.0.001.metric.cosine.2 = c(
    16.6939735412598, 16.6941356658936,
    16.7890815734863, 16.6940479278564
  )
), class = "data.frame", row.names = c(NA, -4L))
您可以使用以下解决方案:
library(dplyr)
library(tidyr)
# Reshape the wide embedding columns into one row per observation and metric.
# Column names look like "nn_.15.md_.0.001.metric.euclidean.1".
df %>%
  # Step 1: pull n_neighbour and min_distance out of the column names. The
  # remainder ("euclidean.1", ...) stays as column names via ".value"; the NA
  # entries in names_to discard the literal prefix groups.
  pivot_longer(
    everything(),
    names_to = c(NA, "n_neighbour", NA, "min_distance", NA, ".value"),
    # Backslashes must be doubled inside an R string literal; a single "\."
    # or "\d" is an unrecognised escape and fails to parse.
    names_pattern = "(nn_\\.)(\\d+)\\.(md_\\.)(\\d\\.\\d+)\\.(metric\\.)(.*)"
  ) %>%
  # Step 2: split "<metric>.<dimension>" into a metric column plus one value
  # column per dimension ("1" and "2").
  pivot_longer(
    !c(min_distance, n_neighbour),
    names_to = c("metric", ".value"),
    names_pattern = "(.*)\\.(\\d)"
  ) %>%
  # Columns 4 and 5 are the dimension columns "1" and "2" at this point.
  rename_with(~ paste("Dimension", .), c(4, 5)) %>%
  relocate(`Dimension 1`, `Dimension 2`) %>%
  arrange(desc(metric))
# A tibble: 8 x 5
`Dimension 1` `Dimension 2` min_distance n_neighbour metric
<dbl> <dbl> <chr> <chr> <chr>
1 0.568 -11.7 0.001 15 euclidean
2 0.568 -11.7 0.001 15 euclidean
3 0.567 -11.7 0.001 15 euclidean
4 0.569 -11.7 0.001 15 euclidean
5 -1.67 16.7 0.001 15 cosine
6 -1.68 16.7 0.001 15 cosine
7 -1.70 16.8 0.001 15 cosine
8 -1.67 16.7 0.001 15 cosine
您可以使用 pivot_longer 获取长格式的数据。
library(tidyverse)
# Split every column name into its four components and stack the values into
# long format (one row per original cell).
df %>%
  pivot_longer(
    cols = everything(),
    # "row" receives the trailing index of the column name (the dimension).
    names_to = c("n_neighbors", "min_distance", "metric", "row"),
    # Backslashes must be doubled inside an R string literal; a single "\."
    # or "\d" is an unrecognised escape and fails to parse.
    names_pattern = "nn_\\.(\\d+)\\.md_\\.(\\d+\\.\\d+)\\.metric\\.([a-z]+)\\.(\\d+)"
  )
# n_neighbors min_distance metric row value
# <chr> <chr> <chr> <chr> <dbl>
# 1 15 0.001 euclidean 1 0.568
# 2 15 0.001 euclidean 2 -11.7
# 3 15 0.001 cosine 1 -1.67
# 4 15 0.001 cosine 2 16.7
# 5 15 0.001 euclidean 1 0.568
# 6 15 0.001 euclidean 2 -11.7
# 7 15 0.001 cosine 1 -1.68
# 8 15 0.001 cosine 2 16.7
# 9 15 0.001 euclidean 1 0.567
#10 15 0.001 euclidean 2 -11.7
#11 15 0.001 cosine 1 -1.70
#12 15 0.001 cosine 2 16.8
#13 15 0.001 euclidean 1 0.569
#14 15 0.001 euclidean 2 -11.7
#15 15 0.001 cosine 1 -1.67
#16 15 0.001 cosine 2 16.7
如果您希望得到与问题中所示完全相同的数据,可以再添加一个 pivot_wider 步骤。
# Long format first, then spread the trailing index back out into
# dimension_1 / dimension_2 columns.
df %>%
  pivot_longer(
    cols = everything(),
    # "row" receives the trailing index of the column name (the dimension).
    names_to = c("n_neighbors", "min_distance", "metric", "row"),
    # Backslashes must be doubled inside an R string literal; a single "\."
    # or "\d" is an unrecognised escape and fails to parse.
    names_pattern = "nn_\\.(\\d+)\\.md_\\.(\\d+\\.\\d+)\\.metric\\.([a-z]+)\\.(\\d+)"
  ) %>%
  # Each (n_neighbors, min_distance, metric) group holds several values per
  # dimension, so collect them into list-columns ...
  pivot_wider(
    names_from = row,
    values_from = value,
    names_prefix = "dimension_",
    values_fn = list
  ) %>%
  # ... and expand the list-columns back into one row per observation.
  unnest(starts_with("dimension"))
# n_neighbors min_distance metric dimension_1 dimension_2
# <chr> <chr> <chr> <dbl> <dbl>
#1 15 0.001 euclidean 0.568 -11.7
#2 15 0.001 euclidean 0.568 -11.7
#3 15 0.001 euclidean 0.567 -11.7
#4 15 0.001 euclidean 0.569 -11.7
#5 15 0.001 cosine -1.67 16.7
#6 15 0.001 cosine -1.68 16.7
#7 15 0.001 cosine -1.70 16.8
#8 15 0.001 cosine -1.67 16.7