将数据框中的值附加到在 for 循环中创建的列表
Append values from a data frame to a list created in for loop
*编辑:多亏了 Martin 的帮助,再加上一些时间和耐心,我终于让代码达到了我需要的效果。它丑吗?是的,但目前它能满足我的需要。如果有任何关于如何清理代码并提高其效率的建议,我将不胜感激。
使用数据框 `trace_list`,我试图将 `Title` 和 `Year` 中的值附加到 for 循环中每个列表的输出。以下代码打开每个州的 PDF 链接的第 10 页,提取城市数据(1 到 12 个城市不等),清理并整理数据,然后将其存储在列表中,以便在收集完每个 PDF 的数据后进行合并。目前它只提取城市名称和一个数值。
# Sample input: one row per ATF state report.
#   Link  - download URL of the report PDF
#   Title - state / territory name
#   Year  - report year (kept as character)
# Assigned to `trace_list` because the script below reads that name.
trace_list <- data.frame(
  Link = c(
    "https://www.atf.gov/file/146951/download",
    "https://www.atf.gov/file/146966/download",
    "https://www.atf.gov/file/146976/download",
    "https://www.atf.gov/file/137041/download",
    "https://www.atf.gov/file/137231/download",
    "https://www.atf.gov/file/137301/download",
    "https://www.atf.gov/docs/undefined/flwebsite17183911pdf/download",
    "https://www.atf.gov/docs/undefined/kywebsite17183876pdf/download",
    "https://www.atf.gov/docs/undefined/prwebsite17183917pdf/download"
  ),
  Title = c(
    "Alabama", "California", "District of Columbia",
    "Alaska", "Pennsylvania", "Wyoming",
    "Florida", "Kentucky", "Puerto Rico"
  ),
  Year = c(
    "2019", "2019", "2019",
    "2018", "2018", "2018",
    "2017", "2017", "2017"
  )
)
library(pdftools)
library(dplyr)
library(tabulizer)
library(english)
library(gsubfn)
library(rebus)
library(htmlwidgets)
# Scrape page 10 of every report listed in `trace_list` and stack the results
# into one table with the columns city / count / state / year.
#
# Input : `trace_list` - column 1 = PDF link, column 2 = state, column 3 = year.
# Output: `citytrace`  - one data frame, printed at the end.
#
# tidyr and stringr functions are namespace-qualified because neither package
# is attached by the library() calls at the top of the script.

trace_list <- as.data.frame(trace_list)

# Loop-invariant helpers, built once instead of on every iteration.
# A run of one to five digits.
trcount <- DGT %R% optional(DGT) %R% optional(DGT) %R% optional(DGT) %R%
  optional(DGT)
# Lookup that maps spelled-out numbers ("one" .. "ten") to 1..10.
word_to_digit <- setNames(as.list(1:10), as.english(1:10))

# Iterate over the rows that actually exist (the old bound was a hard-coded
# 1:159, which yields NA rows whenever trace_list is shorter than that).
citytrace <- vector("list", nrow(trace_list))

for (i in seq_len(nrow(trace_list))) {
  pdf_link <- as.character(trace_list[i, 1])

  # Two fixed regions of page 10: the first is parsed as city names below,
  # the second as the matching counts.
  gpi_table <- tabulizer::extract_tables(
    pdf_link,
    output = "data.frame",
    pages = c(10, 10),
    area = list(c(230, 0, 280, 717), c(275, 0, 321, 725)),
    guess = FALSE
  )

  # ---- city names ----------------------------------------------------------
  city <- gpi_table[[1]]
  city[city == ""] <- NA
  # Copy the column names into the first data row so they are treated as
  # values (presumably the first table row was consumed as a header - confirm).
  city <- setNames(rbind(names(city), city), names(city))
  colnames(city) <- paste("V", seq(1, length(city), 1), sep = "")

  if (length(city) > 4) {
    # More than four columns: transpose and glue the first four pieces of
    # each name back together, then strip the X / .<digit> name artefacts.
    a <- data.frame(t(city))
    colnames(a) <- paste("X", seq(1, length(a), 1), sep = "")
    a[, 1] <- factor(paste(a$X1, a$X2, a$X3, a$X4, sep = " "))
    a[, 1] <- trimws(gsub("X|X\\.[[:digit:]]|\\.[[:digit:]]", "", a$X1))
    city <- a[, -c(2:4)]
  } else {
    # NOTE(review): the str_replace() below swaps a single space for a single
    # space, i.e. it is a no-op as written; a double-space pattern may have
    # been intended - confirm.
    city <- city %>%
      tidyr::unite(city, 1:length(city), sep = " ", remove = FALSE) %>%
      mutate_all(na_if, "") %>%
      tidyr::drop_na() %>%
      mutate(city = trimws(city), city = stringr::str_replace(city, " ", " ")) %>%
      select(city)
  }

  # Entries that contain an "X" or a digit get the aggressive clean-up;
  # everything else only has its dots turned into spaces.
  city <- ifelse(
    grepl(c("X|[[:digit:]]"), city),
    sapply(city, function(x) gsub(c('"*"|[[:digit:]]+|X|,|\\.|^c\\(|\\)$|'), "", x)),
    sapply(city, function(x) gsub("\\.", " ", x))
  )
  city <- unique(data.frame(matrix(unlist(city), nrow = length(city), byrow = TRUE)))
  city[which(city == "" | city == "NA"), ] <- NA
  city <- city[complete.cases(city), , drop = FALSE]
  colnames(city) <- "city"

  # ---- counts ----------------------------------------------------------------
  count <- gpi_table[[2]]
  count <- setNames(rbind(names(count), count), names(count))
  colnames(count) <- paste("V", seq(1, length(count), 1), sep = "")
  # Values that look like "X123" or "X123.4" lose the X prefix and a trailing
  # ".<digit>" suffix.
  count <- ifelse(
    grepl("^X[[:digit:]]+$|^X[[:digit:]]+\\.[[:digit:]]+$", count),
    sapply(count, function(x) gsub("X|\\.[[:digit:]]$", "", x)),
    count
  )
  count <- cbind(city, count)

  # ---- footnote lines of page 10 ---------------------------------------------
  # Keep only the text lines that mention "NOTE:" or "determined".
  result <- pdf_text(pdf_link)[10] %>%
    stringr::str_split("\n") %>%
    first() %>%
    as_tibble() %>%
    mutate_all(list(~ na_if(., ""))) %>%
    filter(grepl("NOTE:|determined", value))

  x3 <- ifelse(
    is.na(result[2, ]),
    paste(result[1, ], result[2, ], sep = ". "),
    paste(result[1, ], result[2, ], sep = " ")
  )
  x3 <- dplyr::tibble(line = 1, text = x3)
  # One row per sentence.
  sv <- strsplit(x3$text, split = "\\. ")
  x3 <- data.frame(V1 = rep(x3$line, sapply(sv, length)), V2 = unlist(sv))
  # Replace spelled-out numbers with digits in the first two sentences.
  x3[1, 2] <- gsubfn("\\w+", word_to_digit, x3[1, 2])
  x3[2, 2] <- gsubfn("\\w+", word_to_digit, x3[2, 2])
  x3$V2 <- gsub(",", "", x3$V2)
  x3$V2 <- gsub("NA", 0, x3$V2)
  x3$city <- ifelse(grepl("additional", x3$V2), "Other", "None")

  # Pull the digit runs out of each sentence; the first match of the first
  # sentence is dropped before the values are flattened into `count`.
  a0 <- stringr::str_match_all(x3$V2, pattern = trcount)
  a0[[1]] <- a0[[1]][-1, 1]
  x3$count <- unlist(a0)
  x4 <- as.data.frame(x3[, -c(1:2)])

  # ---- combine and tag with state / year -------------------------------------
  x5 <- rbind(count, x4)
  x5 <- x5 %>%
    mutate(
      state = trace_list[i, 2],
      year = trace_list[i, 3]
    )
  citytrace[[i]] <- x5
}

citytrace <- do.call(rbind, citytrace)
# NOTE(review): as written this pattern removes " NA" and also every single
# space inside city names; a double-space alternative may have been intended
# - confirm.
citytrace$city <- gsub(c(' NA| '), '', citytrace$city)
citytrace$count <- gsub(c('\\.'), '', citytrace$count)
print(citytrace)
我遇到的问题是将 'Title' 和 'Year' 中的值从 'trace_list' 分配给循环输出。预期结果如下:
city | count | state | year |
---|---|---|---|
Birmingham | 100 | Alabama | 2019 |
Fairbanks | 10 | Alaska | 2018 |
我不太确定该如何着手,因此来这里寻求帮助。如果有任何关于如何清理代码的建议,我将非常感谢。
因为我无法运行你的代码,这里只能针对你的代码给出一个小建议:
library(dplyr)

# Skeleton: build one result frame per row of trace_list and tag it with the
# state (column 2) and year (column 3) of that row.
results <- vector("list", nrow(trace_list))
for (i in seq_len(nrow(trace_list))) {
  pdf_link <- trace_list[i, 1]
  # Do stuff with the URL
  # probably you don't need the inner for-loop
  # create the data.frame x5

  # Store the tagged frame: a bare pipeline inside a for loop is evaluated
  # and then discarded (nothing is printed or kept).
  results[[i]] <- x5 %>%
    mutate(
      state = trace_list[i, 2],
      year = trace_list[i, 3]
    )
}
一些备注:
- 我不理解你对 `for` 循环的定义:遍历 `list(trace_list[c(1:2),])` 在我看来没有意义。
- 提取 URL 并将其存储到 `pdf_link` 中,最好通过 `pdf_link <- trace_list[i, 1]` 来完成。
- 遍历 `unlist(pdf_link)` 在我看来同样没有意义。我认为你可以改用 `trace_list[i, 1]`、`trace_list[i, 2]` 或 `trace_list[i, 3]`。
*编辑:多亏了 Martin 的帮助,再加上一些时间和耐心,我终于让代码达到了我需要的效果。它丑吗?是的,但目前它能满足我的需要。如果有任何关于如何清理代码并提高其效率的建议,我将不胜感激。
使用数据框 `trace_list`,我试图将 `Title` 和 `Year` 中的值附加到 for 循环中每个列表的输出。以下代码打开每个州的 PDF 链接的第 10 页,提取城市数据(1 到 12 个城市不等),清理并整理数据,然后将其存储在列表中,以便在收集完每个 PDF 的数据后进行合并。目前它只提取城市名称和一个数值。
# Sample input: one row per ATF state report.
#   Link  - download URL of the report PDF
#   Title - state / territory name
#   Year  - report year (kept as character)
# Assigned to `trace_list` because the script below reads that name.
trace_list <- data.frame(
  Link = c(
    "https://www.atf.gov/file/146951/download",
    "https://www.atf.gov/file/146966/download",
    "https://www.atf.gov/file/146976/download",
    "https://www.atf.gov/file/137041/download",
    "https://www.atf.gov/file/137231/download",
    "https://www.atf.gov/file/137301/download",
    "https://www.atf.gov/docs/undefined/flwebsite17183911pdf/download",
    "https://www.atf.gov/docs/undefined/kywebsite17183876pdf/download",
    "https://www.atf.gov/docs/undefined/prwebsite17183917pdf/download"
  ),
  Title = c(
    "Alabama", "California", "District of Columbia",
    "Alaska", "Pennsylvania", "Wyoming",
    "Florida", "Kentucky", "Puerto Rico"
  ),
  Year = c(
    "2019", "2019", "2019",
    "2018", "2018", "2018",
    "2017", "2017", "2017"
  )
)
library(pdftools)
library(dplyr)
library(tabulizer)
library(english)
library(gsubfn)
library(rebus)
library(htmlwidgets)
# Scrape page 10 of every report listed in `trace_list` and stack the results
# into one table with the columns city / count / state / year.
#
# Input : `trace_list` - column 1 = PDF link, column 2 = state, column 3 = year.
# Output: `citytrace`  - one data frame, printed at the end.
#
# tidyr and stringr functions are namespace-qualified because neither package
# is attached by the library() calls at the top of the script.

trace_list <- as.data.frame(trace_list)

# Loop-invariant helpers, built once instead of on every iteration.
# A run of one to five digits.
trcount <- DGT %R% optional(DGT) %R% optional(DGT) %R% optional(DGT) %R%
  optional(DGT)
# Lookup that maps spelled-out numbers ("one" .. "ten") to 1..10.
word_to_digit <- setNames(as.list(1:10), as.english(1:10))

# Iterate over the rows that actually exist (the old bound was a hard-coded
# 1:159, which yields NA rows whenever trace_list is shorter than that).
citytrace <- vector("list", nrow(trace_list))

for (i in seq_len(nrow(trace_list))) {
  pdf_link <- as.character(trace_list[i, 1])

  # Two fixed regions of page 10: the first is parsed as city names below,
  # the second as the matching counts.
  gpi_table <- tabulizer::extract_tables(
    pdf_link,
    output = "data.frame",
    pages = c(10, 10),
    area = list(c(230, 0, 280, 717), c(275, 0, 321, 725)),
    guess = FALSE
  )

  # ---- city names ----------------------------------------------------------
  city <- gpi_table[[1]]
  city[city == ""] <- NA
  # Copy the column names into the first data row so they are treated as
  # values (presumably the first table row was consumed as a header - confirm).
  city <- setNames(rbind(names(city), city), names(city))
  colnames(city) <- paste("V", seq(1, length(city), 1), sep = "")

  if (length(city) > 4) {
    # More than four columns: transpose and glue the first four pieces of
    # each name back together, then strip the X / .<digit> name artefacts.
    a <- data.frame(t(city))
    colnames(a) <- paste("X", seq(1, length(a), 1), sep = "")
    a[, 1] <- factor(paste(a$X1, a$X2, a$X3, a$X4, sep = " "))
    a[, 1] <- trimws(gsub("X|X\\.[[:digit:]]|\\.[[:digit:]]", "", a$X1))
    city <- a[, -c(2:4)]
  } else {
    # NOTE(review): the str_replace() below swaps a single space for a single
    # space, i.e. it is a no-op as written; a double-space pattern may have
    # been intended - confirm.
    city <- city %>%
      tidyr::unite(city, 1:length(city), sep = " ", remove = FALSE) %>%
      mutate_all(na_if, "") %>%
      tidyr::drop_na() %>%
      mutate(city = trimws(city), city = stringr::str_replace(city, " ", " ")) %>%
      select(city)
  }

  # Entries that contain an "X" or a digit get the aggressive clean-up;
  # everything else only has its dots turned into spaces.
  city <- ifelse(
    grepl(c("X|[[:digit:]]"), city),
    sapply(city, function(x) gsub(c('"*"|[[:digit:]]+|X|,|\\.|^c\\(|\\)$|'), "", x)),
    sapply(city, function(x) gsub("\\.", " ", x))
  )
  city <- unique(data.frame(matrix(unlist(city), nrow = length(city), byrow = TRUE)))
  city[which(city == "" | city == "NA"), ] <- NA
  city <- city[complete.cases(city), , drop = FALSE]
  colnames(city) <- "city"

  # ---- counts ----------------------------------------------------------------
  count <- gpi_table[[2]]
  count <- setNames(rbind(names(count), count), names(count))
  colnames(count) <- paste("V", seq(1, length(count), 1), sep = "")
  # Values that look like "X123" or "X123.4" lose the X prefix and a trailing
  # ".<digit>" suffix.
  count <- ifelse(
    grepl("^X[[:digit:]]+$|^X[[:digit:]]+\\.[[:digit:]]+$", count),
    sapply(count, function(x) gsub("X|\\.[[:digit:]]$", "", x)),
    count
  )
  count <- cbind(city, count)

  # ---- footnote lines of page 10 ---------------------------------------------
  # Keep only the text lines that mention "NOTE:" or "determined".
  result <- pdf_text(pdf_link)[10] %>%
    stringr::str_split("\n") %>%
    first() %>%
    as_tibble() %>%
    mutate_all(list(~ na_if(., ""))) %>%
    filter(grepl("NOTE:|determined", value))

  x3 <- ifelse(
    is.na(result[2, ]),
    paste(result[1, ], result[2, ], sep = ". "),
    paste(result[1, ], result[2, ], sep = " ")
  )
  x3 <- dplyr::tibble(line = 1, text = x3)
  # One row per sentence.
  sv <- strsplit(x3$text, split = "\\. ")
  x3 <- data.frame(V1 = rep(x3$line, sapply(sv, length)), V2 = unlist(sv))
  # Replace spelled-out numbers with digits in the first two sentences.
  x3[1, 2] <- gsubfn("\\w+", word_to_digit, x3[1, 2])
  x3[2, 2] <- gsubfn("\\w+", word_to_digit, x3[2, 2])
  x3$V2 <- gsub(",", "", x3$V2)
  x3$V2 <- gsub("NA", 0, x3$V2)
  x3$city <- ifelse(grepl("additional", x3$V2), "Other", "None")

  # Pull the digit runs out of each sentence; the first match of the first
  # sentence is dropped before the values are flattened into `count`.
  a0 <- stringr::str_match_all(x3$V2, pattern = trcount)
  a0[[1]] <- a0[[1]][-1, 1]
  x3$count <- unlist(a0)
  x4 <- as.data.frame(x3[, -c(1:2)])

  # ---- combine and tag with state / year -------------------------------------
  x5 <- rbind(count, x4)
  x5 <- x5 %>%
    mutate(
      state = trace_list[i, 2],
      year = trace_list[i, 3]
    )
  citytrace[[i]] <- x5
}

citytrace <- do.call(rbind, citytrace)
# NOTE(review): as written this pattern removes " NA" and also every single
# space inside city names; a double-space alternative may have been intended
# - confirm.
citytrace$city <- gsub(c(' NA| '), '', citytrace$city)
citytrace$count <- gsub(c('\\.'), '', citytrace$count)
print(citytrace)
我遇到的问题是将 'Title' 和 'Year' 中的值从 'trace_list' 分配给循环输出。预期结果如下:
city | count | state | year |
---|---|---|---|
Birmingham | 100 | Alabama | 2019 |
Fairbanks | 10 | Alaska | 2018 |
我不太确定该如何着手,因此来这里寻求帮助。如果有任何关于如何清理代码的建议,我将非常感谢。
因为我无法运行你的代码,这里只能针对你的代码给出一个小建议:
library(dplyr)

# Skeleton: build one result frame per row of trace_list and tag it with the
# state (column 2) and year (column 3) of that row.
results <- vector("list", nrow(trace_list))
for (i in seq_len(nrow(trace_list))) {
  pdf_link <- trace_list[i, 1]
  # Do stuff with the URL
  # probably you don't need the inner for-loop
  # create the data.frame x5

  # Store the tagged frame: a bare pipeline inside a for loop is evaluated
  # and then discarded (nothing is printed or kept).
  results[[i]] <- x5 %>%
    mutate(
      state = trace_list[i, 2],
      year = trace_list[i, 3]
    )
}
一些备注:
- 我不理解你对 `for` 循环的定义:遍历 `list(trace_list[c(1:2),])` 在我看来没有意义。
- 提取 URL 并将其存储到 `pdf_link` 中,最好通过 `pdf_link <- trace_list[i, 1]` 来完成。
- 遍历 `unlist(pdf_link)` 在我看来同样没有意义。我认为你可以改用 `trace_list[i, 1]`、`trace_list[i, 2]` 或 `trace_list[i, 3]`。