在 R 中的列表中组合不同的表
Combine different tables in a list in R
更新:下面的代码似乎有效
我不太确定这个问题是怎么回事,所以如果措辞不当,我深表歉意。我试着寻找 "combine different elements of a list using apply" 但这似乎不起作用。
无论如何,作为抓取网站的结果,我有两个向量提供识别信息和一个包含许多不同 table 的列表。简化版本看起来像这样:
respondents <- c("A", "B")
questions <- c("question1", "question2")
df1 <- data.frame(
option = c("yes", "no"),
percentage = c(70, 30), stringsAsFactors = FALSE)
df2 <- data.frame(
option= c("today", "yesterday"),
percentage =c(30, 70), stringsAsFactors = FALSE)
df3 <- data.frame(
option = c("yes", "no"),
percentage = c(60, 40), stringsAsFactors = FALSE)
df4 <- data.frame(
option= c("today", "yesterday"),
percentage =c(20, 80), stringsAsFactors = FALSE)
lst <- list(df1, df2, df3, df4)
前两个 table 是第一位参与者的问题和回答,后两个 table 是第二位参与者的问题。我想做的是创建两个 tables,其中包含两个参与者的问题答案。所以我想要这样的东西:
question1 <- data.frame(
option = c("yes", "no"),
A = c(70, 30),
B = c(60, 40), stringsAsFactors = FALSE)
question2 <- data.frame(
option = c("today", "yesterday"),
A = c(30, 70),
B = c(20, 80), stringsAsFactors = FALSE)
在我的例子中,我有 51 名参与者的 122 条回复,它的顺序是 tables 1-122 来自第一位参与者,接下来的 122 tables 来自第二位参与者参与者等。最终,我想要 122 tables(每个问题一个 table),每个 table 包含与每个参与者对应的 51 列。对于如何执行此操作,我或多或少不知所措,因此我将不胜感激任何建议。
现在应该可以了:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
##Remove empty strings
Questions <- Questions[Questions!= '']
# Create objects to populate later
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Remove pictures in the data representing IPA symbols with their names (e.g., names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\s+src=./images/" with _
tmp <- gsub("<img\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\(','',x[,2] )
x [,2 ] = gsub('%\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
#get rid of extra tables
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to clean data sets
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }; res <- lapply(1:122, function(i) {indx <- seq(i, length(tb), by=122); f1(tb[indx])})
## Function to merge datasets together
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("option", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[122]]
多亏了 akrun(见评论),我才开始工作。完整代码在这里:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
url
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Remove first column
Questions <- tb[[1]][,1]
##Remove empty strings
Questions <- Questions[Questions!= '']
# Create objects to populate later
survey <- vector(length(subURL), mode = "list")
i <- 1
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Remove pictures in the data representing IPA symbols with their names (e.g., names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\s+src=./images/" with _
tmp <- gsub("<img\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
#tb <- tb[-1]
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\(','',x[,2] )
x [,2 ] = gsub('%\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
## Remove unneeded dataframes in list
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to clean data sets
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }; res <- lapply(1:122, function(i) {indx <- seq(i, length(tb), by=122); f1(tb[indx])})
## Function to merge datasets together
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("Options", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[1]]
更新:下面的代码似乎有效
我不太确定这个问题是怎么回事,所以如果措辞不当,我深表歉意。我试着寻找 "combine different elements of a list using apply" 但这似乎不起作用。
无论如何,作为抓取网站的结果,我有两个向量提供识别信息和一个包含许多不同 table 的列表。简化版本看起来像这样:
respondents <- c("A", "B")
questions <- c("question1", "question2")
df1 <- data.frame(
option = c("yes", "no"),
percentage = c(70, 30), stringsAsFactors = FALSE)
df2 <- data.frame(
option= c("today", "yesterday"),
percentage =c(30, 70), stringsAsFactors = FALSE)
df3 <- data.frame(
option = c("yes", "no"),
percentage = c(60, 40), stringsAsFactors = FALSE)
df4 <- data.frame(
option= c("today", "yesterday"),
percentage =c(20, 80), stringsAsFactors = FALSE)
lst <- list(df1, df2, df3, df4)
前两个 table 是第一位参与者的问题和回答,后两个 table 是第二位参与者的问题。我想做的是创建两个 tables,其中包含两个参与者的问题答案。所以我想要这样的东西:
question1 <- data.frame(
option = c("yes", "no"),
A = c(70, 30),
B = c(60, 40), stringsAsFactors = FALSE)
question2 <- data.frame(
option = c("today", "yesterday"),
A = c(30, 70),
B = c(20, 80), stringsAsFactors = FALSE)
在我的例子中,我有 51 名参与者的 122 条回复,它的顺序是 tables 1-122 来自第一位参与者,接下来的 122 tables 来自第二位参与者参与者等。最终,我想要 122 tables(每个问题一个 table),每个 table 包含与每个参与者对应的 51 列。对于如何执行此操作,我或多或少不知所措,因此我将不胜感激任何建议。
现在应该可以了:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
##Remove empty strings
Questions <- Questions[Questions!= '']
# Create objects to populate later
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Remove pictures in the data representing IPA symbols with their names (e.g., names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\s+src=./images/" with _
tmp <- gsub("<img\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\(','',x[,2] )
x [,2 ] = gsub('%\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
#get rid of extra tables
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to clean data sets
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }; res <- lapply(1:122, function(i) {indx <- seq(i, length(tb), by=122); f1(tb[indx])})
## Function to merge datasets together
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("option", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[122]]
多亏了 akrun(见评论),我才开始工作。完整代码在这里:
library("RCurl")
library("XML")
# Get the data
## Create URL address
mainURL <- 'http://www4.uwm.edu/FLL/linguistics/dialect/staticmaps/'
stateURL <- 'states.html'
url <- paste0(mainURL, stateURL)
url
## Download URL
tmp <- getURL(url)
## Parse
tmp <- htmlTreeParse(tmp, useInternalNodes = TRUE)
## Extract page addresses and save to subURL
subURL <- unlist(xpathSApply(tmp, '//a[@href]', xmlAttrs))
## Remove pages that aren't state's names
subURL <- subURL[-(1:4)]
## Show first four states
head(subURL, 4)
# Get questions
## Select first state
suburl <- subURL[1]
## Paste it at the end of the main URL
url <- paste0(mainURL, suburl)
## Download URL
tmp <- getURL(url)
## Read data from html
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
## Remove first column
Questions <- tb[[1]][,1]
##Remove empty strings
Questions <- Questions[Questions!= '']
# Create objects to populate later
survey <- vector(length(subURL), mode = "list")
i <- 1
stateNames <- rep('', length(subURL))
## Populate stateNames
### Remove state_ from stateNames
stateNames <- gsub('state_','',subURL)
### Remove .html from stateNames
stateNames <- gsub('.html','',stateNames)
# Remove pictures in the data representing IPA symbols with their names (e.g., names of the pictures)
## Get url
url <- paste0(mainURL, subURL)
tmp <- getURL(url)
## Replace .gif with _
tmp <- gsub(".gif>", '_', tmp)
## Replace "<img\s+src=./images/" with _
tmp <- gsub("<img\s+src=./images/", '_', tmp)
# Read in data
tb <- readHTMLTable(tmp, stringsAsFactors = FALSE)
#tb <- tb[-1]
## Subset 2nd and 4th columns and apply to every item on list
tb <- lapply(tb, function(x) x[,c(2,4)])
## Remove quotation marks, percent sign and convert to number; apply to every item
tb <- lapply(tb, function(x) {
x [,2 ] = gsub('\(','',x[,2] )
x [,2 ] = gsub('%\)','',x[,2])
x [,2 ] = as.numeric(x[,2])
x
}
)
## Assign column names to all dataframes
tb <- lapply(tb, setNames , nm = c("option", "percentage"))
## Remove unneeded dataframes in list
tb1 <- tb[-seq(1, length(tb), by=123)]
## Function to clean data sets
f1 <- function(list1){ Reduce(function(...) merge(..., by= 'option', all=TRUE), list1) }; res <- lapply(1:122, function(i) {indx <- seq(i, length(tb), by=122); f1(tb[indx])})
## Function to merge datasets together
res1 <- lapply(1:122, function(i) f1(tb1[seq(i, length(tb1), by=122)]))
## Create names for the states
stateNames2 <- c("Options", stateNames)
# Rename columns in the new dataframes
res2 <- lapply(res1, setNames , nm = stateNames2)
# Test to see whether it works
test <- res2[[1]]