如何合并列名部分相同但不完全一致的 tibble?
How to combine tibbles with common but non-identical columns?
我有两个 tibble,每个最多有 4 列。每个列名要么为两者共有,要么在其中一个中缺失。我需要将它们合并成一个两行的 tibble,并在缺失的列中填入 NA。
我需要以通用的方式实现这一点,以便能处理缺失列更多或更少的情况。下面是从两个示例网页生成 tibble 的代码:
library(tidyverse)
library(htmltab)
# Summarise the `note` column of a saved results page.
#
# Args:
#   filename: path to an HTML file containing a table with id "results".
#             The table is expected to have the nine columns named below.
#
# Returns:
#   A 1-row character matrix with one column per distinct 12-character
#   prefix of `note`; each cell is the number of rows with that prefix.
#   A missing (NA) prefix yields a column whose name is NA.
read_results <- function(filename) {
  doc <- read_file(filename)
  df <- as_tibble(htmltab(doc = doc, which = "//table[@id='results']"))
  colnames(df) <- c(
    "pos", "name", "time", "age_cat", "age_grade",
    "gender", "gender_pos", "note", "total_runs"
  )
  # Transposing the summary gives a 2-row character matrix: row 1 holds the
  # note prefixes, row 2 the counts.
  tib <- t(df %>% group_by(substr(note, 1, 12)) %>% summarise(number = n()))
  labels <- as.character(unlist(tib[1, ]))
  # Attach the labels explicitly. Relying on `tib[-1, ]` to carry the column
  # names loses them when there is only one category, because dropping a
  # matrix with both dimnames down to a single value discards the names.
  tib <- setNames(as.vector(tib[-1, ]), labels)
  t(tib)
}
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=2
r2 <- read_results("results _ henleyonthames parkrun_2.html")
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=4
r4 <- read_results("results _ henleyonthames parkrun_4.html")
现在 r2
和 r4
包含
> r2
First Timer! New PB! PB stays at <NA>
[1,] "58" "11" " 3" " 4"
> r4
First Timer! New PB! PB stays at
[1,] "62" "16" "11"
我想将 t_all
构建为
First Timer! New PB! PB stays at <NA>
58 11 3 4
62 16 11 0
您的问题是 r2 的其中一列的名称为 NA。因此,大多数根据列名将类似矩阵的对象配对的函数都会失败。
要解决它,请将此行添加到您的函数中:names(tib)[is.na(names(tib))] <- "Blank"
library(tidyverse)
library(htmltab)
# Summarise the `note` column of a saved results page.
#
# Args:
#   filename: path to an HTML file containing a table with id "results".
#             The table is expected to have the nine columns named below.
#
# Returns:
#   A 1-row character matrix with one column per distinct 12-character
#   prefix of `note`; each cell is the number of rows with that prefix.
#   The column for a missing (NA) prefix is named "Blank" so that the
#   result can be matched by column name.
read_results <- function(filename) {
  doc <- read_file(filename)
  df <- as_tibble(htmltab(doc = doc, which = "//table[@id='results']"))
  colnames(df) <- c(
    "pos", "name", "time", "age_cat", "age_grade",
    "gender", "gender_pos", "note", "total_runs"
  )
  # Transposing the summary gives a 2-row character matrix: row 1 holds the
  # note prefixes, row 2 the counts.
  tib <- t(df %>% group_by(substr(note, 1, 12)) %>% summarise(number = n()))
  labels <- as.character(unlist(tib[1, ]))
  # Attach the labels explicitly. Relying on `tib[-1, ]` to carry the column
  # names loses them when there is only one category, because dropping a
  # matrix with both dimnames down to a single value discards the names.
  tib <- setNames(as.vector(tib[-1, ]), labels)
  names(tib)[is.na(names(tib))] <- "Blank" ## New Line
  t(tib)
}
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=2
r2 <- read_results("results _ henleyonthames parkrun_2.html")
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=4
r4 <- read_results("results _ henleyonthames parkrun_4.html")
# Stack the two one-row summaries. bind_rows() matches columns by name and
# fills a column that is absent from one input with NA. as_tibble() is used
# because as_data_frame() is deprecated in tibble.
dplyr::bind_rows(as_tibble(r2), as_tibble(r4))
# A tibble: 2 × 4
`First Timer!` `New PB!` `PB stays at ` Blank
<chr> <chr> <chr> <chr>
1 58 11 3 4
2 62 16 11 <NA>
我有两个 tibble,每个最多有 4 列。每个列名要么为两者共有,要么在其中一个中缺失。我需要将它们合并成一个两行的 tibble,并在缺失的列中填入 NA。
我需要以通用的方式实现这一点,以便能处理缺失列更多或更少的情况。下面是从两个示例网页生成 tibble 的代码:
library(tidyverse)
library(htmltab)
# Summarise the `note` column of a saved results page.
#
# Args:
#   filename: path to an HTML file containing a table with id "results".
#             The table is expected to have the nine columns named below.
#
# Returns:
#   A 1-row character matrix with one column per distinct 12-character
#   prefix of `note`; each cell is the number of rows with that prefix.
#   A missing (NA) prefix yields a column whose name is NA.
read_results <- function(filename) {
  doc <- read_file(filename)
  df <- as_tibble(htmltab(doc = doc, which = "//table[@id='results']"))
  colnames(df) <- c(
    "pos", "name", "time", "age_cat", "age_grade",
    "gender", "gender_pos", "note", "total_runs"
  )
  # Transposing the summary gives a 2-row character matrix: row 1 holds the
  # note prefixes, row 2 the counts.
  tib <- t(df %>% group_by(substr(note, 1, 12)) %>% summarise(number = n()))
  labels <- as.character(unlist(tib[1, ]))
  # Attach the labels explicitly. Relying on `tib[-1, ]` to carry the column
  # names loses them when there is only one category, because dropping a
  # matrix with both dimnames down to a single value discards the names.
  tib <- setNames(as.vector(tib[-1, ]), labels)
  t(tib)
}
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=2
r2 <- read_results("results _ henleyonthames parkrun_2.html")
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=4
r4 <- read_results("results _ henleyonthames parkrun_4.html")
现在 r2
和 r4
包含
> r2
First Timer! New PB! PB stays at <NA>
[1,] "58" "11" " 3" " 4"
> r4
First Timer! New PB! PB stays at
[1,] "62" "16" "11"
我想将 t_all
构建为
First Timer! New PB! PB stays at <NA>
58 11 3 4
62 16 11 0
您的问题是 r2 的其中一列的名称为 NA。因此,大多数根据列名将类似矩阵的对象配对的函数都会失败。 要解决它,请将此行添加到您的函数中:names(tib)[is.na(names(tib))] <- "Blank"
library(tidyverse)
library(htmltab)
# Summarise the `note` column of a saved results page.
#
# Args:
#   filename: path to an HTML file containing a table with id "results".
#             The table is expected to have the nine columns named below.
#
# Returns:
#   A 1-row character matrix with one column per distinct 12-character
#   prefix of `note`; each cell is the number of rows with that prefix.
#   The column for a missing (NA) prefix is named "Blank" so that the
#   result can be matched by column name.
read_results <- function(filename) {
  doc <- read_file(filename)
  df <- as_tibble(htmltab(doc = doc, which = "//table[@id='results']"))
  colnames(df) <- c(
    "pos", "name", "time", "age_cat", "age_grade",
    "gender", "gender_pos", "note", "total_runs"
  )
  # Transposing the summary gives a 2-row character matrix: row 1 holds the
  # note prefixes, row 2 the counts.
  tib <- t(df %>% group_by(substr(note, 1, 12)) %>% summarise(number = n()))
  labels <- as.character(unlist(tib[1, ]))
  # Attach the labels explicitly. Relying on `tib[-1, ]` to carry the column
  # names loses them when there is only one category, because dropping a
  # matrix with both dimnames down to a single value discards the names.
  tib <- setNames(as.vector(tib[-1, ]), labels)
  names(tib)[is.na(names(tib))] <- "Blank" ## New Line
  t(tib)
}
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=2
r2 <- read_results("results _ henleyonthames parkrun_2.html")
# Page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=4
r4 <- read_results("results _ henleyonthames parkrun_4.html")
# Stack the two one-row summaries. bind_rows() matches columns by name and
# fills a column that is absent from one input with NA. as_tibble() is used
# because as_data_frame() is deprecated in tibble.
dplyr::bind_rows(as_tibble(r2), as_tibble(r4))
# A tibble: 2 × 4
`First Timer!` `New PB!` `PB stays at ` Blank
<chr> <chr> <chr> <chr>
1 58 11 3 4
2 62 16 11 <NA>