使用 R 对数据集进行子集化的两种方法均失败,请求帮助
Failed two methods to subset dataset with R, requesting assistance
我正在尝试用 R(开源统计脚本语言)制作一些数据的子集。我尝试了两种方法,但都没有成功。一个 returns 一个 table 没有数据,另一个 returns 一个 table 所有 "NA" 单元格,但尺寸显然正确。
我把代码排得很清楚,注释了--
首先,我创建了将用于对数据进行子集化的邮政编码列表。邮政编码列表来自我将要使用的数据集。
邮政编码列表称为 "zipCodesOfData"
接下来,我下载我将要进行子集化的犯罪数据。我基本上只是将其子集化为我需要的数据集。
最后一部分,即第三节,表明我尝试了 %in% 和 filter 方法来根据邮政编码数据过滤犯罪数据。
不幸的是,这两种方法都行不通。我希望有人能够指出我的错误或为第三部分推荐不同的子集方法。
(顺便说一句,在第二部分中,我试图将列表转换为数据框,但它不起作用。我很好奇为什么,如果有人能为我阐明这一点。)
感谢您的宝贵时间和协助!
####
#### Section zero: references and dependencies
####
# r's "choroplethr" library creator's blog for reference:
# http://www.arilamstein.com/blog/2015/06/25/learn-to-map-census-data-in-r/
#
#
# library(choroplethr)
# library(choroplethrMaps)
# library(ggplot2)
# # use the devtools package from CRAN to install choroplethrZip from github
# # install.packages("devtools")
# library(devtools)
# install_github('arilamstein/choroplethrZip')
# library(choroplethrZip)
# library(data.table)
#
####
#### Section one: the data set providing the zipcode we'll use to subset the crime set
####
austin2014_data_raw <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv')
names(austin2014_data_raw)
nrow(austin2014_data_raw)
## clean up: make any blank cells in column ZipCode say "NA" instead -> source:
austin2014_data_raw[austin2014_data_raw$ZipCode==""] <- NA
# keep only rows that do not have "NA"
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# selecting the first column, which is ZipCode
zipCodesOfData <- austin2014_data[,1]
View(zipCodesOfData)
# Now we have the zipcodes we need: zipCodesOfData
####
#### Section two: Crime data
####
# Crime by zipcode: https://data.austintexas.gov/dataset/Annual-Crime-2014/7g8v-xxja
# (visualized: https://data.austintexas.gov/dataset/Annual-Crime-2014/8mst-ed5t )
# https://data.austintexas.gov/resource/<insertResourceNameHere>.csv w/ resource "7g8v-xxja"
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv')
View(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# First, let's remove the data we don't need
names(austinCrime2014_data_raw)
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime2014_data_selected_columns <- subset(austinCrime2014_data_raw, select=columnSelection_Crime)
names(austinCrime2014_data_selected_columns)
nrow(austinCrime2014_data_selected_columns)
####
#### Section Three: The problem: I am unable to make subsets with the two following methods.
####
# Neither of these methods work:
# Attempt 1:
austinCrime2014_data_selected_columns <- austinCrime2014_data_selected_columns[austinCrime2014_data_selected_columns$`GO Location Zip` %in% zipCodesOfData , ]
View(austinCrime2014_data_selected_columns) # No data in the table
# Attempt 2:
# This initially told me an error:
# Then, I installed dplyr and the error went away.
library(dplyr)
# However, it still doesn't create anything-- just an empty set w/ headers
austinCrime2014_data_selected_zips <- filter(austinCrime2014_data_selected_columns, `GO Location Zip` %in% zipCodesOfData)
View(austinCrime2014_data_selected_zips)
在意识到没有必要后,我删除了这一部分。
####
#### Bad section
####
nrow(austinCrime2014_data_selected_columns)
# Then, let's keep only the zipcodes we need
# doesnt work: austinCrime2014_data_selected_columns_df <- data.frame(austinCrime2014_data_selected_columns)
# typeof(austinCrime2014_data_selected_columns_df)
austinCrime<-do.call("rbind", austinCrime2014_data_selected_columns)
austinCrime_needsTranspose <-as.data.frame(austinCrime)
austinCrime <- t(austinCrime_needsTranspose)
typeof(austinCrime)
View(austinCrime)
names(austinCrime)
####
#### Bad section
####
我不确定你为什么要 do.call
ing 和 t
ransposing 你的数据。您可以只使用 dplyr
的 semi_join
之类的东西来仅获取您想要的邮政编码:
library(data.table)
library(dplyr)
#> -------------------------------------------------------------------------
#> data.table + dplyr code now lives in dtplyr.
#> Please library(dtplyr)!
#> -------------------------------------------------------------------------
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#> between, first, last
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
zipCodesOfData <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv') %>%
mutate(`Zip Code` = ifelse(`Zip Code` == "", NA, `Zip Code`)) %>%
na.omit() %>%
select(`Zip Code`)
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv') %>%
select(`GO Location Zip`, `GO Highest Offense Desc`, `Highest NIBRS/UCR Offense Description`) %>%
semi_join(zipCodesOfData, by = c("GO Location Zip" = "Zip Code")) %>%
rename(zipcode = `GO Location Zip`,
highestOffenseDesc = `GO Highest Offense Desc`,
NIBRS_OffenseDesc = `Highest NIBRS/UCR Offense Description`)
我觉得readr
和dplyr
可以解决你的问题。很简单:
library(readr)
library(dplyr)
### SECTION 1
# Import data
austin2014_data_raw <- read_csv('https://data.austintexas.gov/resource/hcnj-rei3.csv', na = '')
glimpse(austin2014_data_raw)
nrow(austin2014_data_raw)
# Remove NAs
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# Get zip codes
zipCodesOfData <- austin2014_data$`Zip Code`
### SECTION 2
# Import data
austinCrime2014_data_raw <- read_csv('https://data.austintexas.gov/resource/7g8v-xxja.csv', na = '')
glimpse(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# Select and rename required columns
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime_df <- select(austinCrime2014_data_raw, one_of(columnSelection_Crime))
names(austinCrime_df) <- c("zipcode", "highestOffenseDesc", "NIBRS_OffenseDesc")
glimpse(austinCrime_df)
nrow(austinCrime_df)
### SECTION 3
# Filter by zipcode
austinCrime2014_data_selected_zips <- filter(austinCrime_df, zipcode %in% zipCodesOfData)
glimpse(austinCrime2014_data_selected_zips)
nrow(austinCrime2014_data_selected_zips)
这里我使用readr
包中的read_csv()
导入数据,使用dplyr
包中的子集方法select()
和filter()
获取所需的列和行。
我正在尝试用 R(开源统计脚本语言)制作一些数据的子集。我尝试了两种方法,但都没有成功。一个 returns 一个 table 没有数据,另一个 returns 一个 table 所有 "NA" 单元格,但尺寸显然正确。
我把代码排得很清楚,注释了--
首先,我创建了将用于对数据进行子集化的邮政编码列表。邮政编码列表来自我将要使用的数据集。 邮政编码列表称为 "zipCodesOfData"
接下来,我下载我将要进行子集化的犯罪数据。我基本上只是将其子集化为我需要的数据集。
最后一部分,即第三节,表明我尝试了 %in% 和 filter 方法来根据邮政编码数据过滤犯罪数据。
不幸的是,这两种方法都行不通。我希望有人能够指出我的错误或为第三部分推荐不同的子集方法。
(顺便说一句,在第二部分中,我试图将列表转换为数据框,但它不起作用。我很好奇为什么,如果有人能为我阐明这一点。)
感谢您的宝贵时间和协助!
####
#### Section zero: references and dependencies
####
# r's "choroplethr" library creator's blog for reference:
# http://www.arilamstein.com/blog/2015/06/25/learn-to-map-census-data-in-r/
#
#
# library(choroplethr)
# library(choroplethrMaps)
# library(ggplot2)
# # use the devtools package from CRAN to install choroplethrZip from github
# # install.packages("devtools")
# library(devtools)
# install_github('arilamstein/choroplethrZip')
# library(choroplethrZip)
# library(data.table)
#
####
#### Section one: the data set providing the zipcode we'll use to subset the crime set
####
austin2014_data_raw <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv')
names(austin2014_data_raw)
nrow(austin2014_data_raw)
## clean up: make any blank cells in column ZipCode say "NA" instead -> source:
austin2014_data_raw[austin2014_data_raw$ZipCode==""] <- NA
# keep only rows that do not have "NA"
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# selecting the first column, which is ZipCode
zipCodesOfData <- austin2014_data[,1]
View(zipCodesOfData)
# Now we have the zipcodes we need: zipCodesOfData
####
#### Section two: Crime data
####
# Crime by zipcode: https://data.austintexas.gov/dataset/Annual-Crime-2014/7g8v-xxja
# (visualized: https://data.austintexas.gov/dataset/Annual-Crime-2014/8mst-ed5t )
# https://data.austintexas.gov/resource/<insertResourceNameHere>.csv w/ resource "7g8v-xxja"
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv')
View(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# First, let's remove the data we don't need
names(austinCrime2014_data_raw)
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime2014_data_selected_columns <- subset(austinCrime2014_data_raw, select=columnSelection_Crime)
names(austinCrime2014_data_selected_columns)
nrow(austinCrime2014_data_selected_columns)
####
#### Section Three: The problem: I am unable to make subsets with the two following methods.
####
# Neither of these methods work:
# Attempt 1:
austinCrime2014_data_selected_columns <- austinCrime2014_data_selected_columns[austinCrime2014_data_selected_columns$`GO Location Zip` %in% zipCodesOfData , ]
View(austinCrime2014_data_selected_columns) # No data in the table
# Attempt 2:
# This initially told me an error:
# Then, I installed dplyr and the error went away.
library(dplyr)
# However, it still doesn't create anything-- just an empty set w/ headers
austinCrime2014_data_selected_zips <- filter(austinCrime2014_data_selected_columns, `GO Location Zip` %in% zipCodesOfData)
View(austinCrime2014_data_selected_zips)
在意识到没有必要后,我删除了这一部分。
####
#### Bad section
####
nrow(austinCrime2014_data_selected_columns)
# Then, let's keep only the zipcodes we need
# doesnt work: austinCrime2014_data_selected_columns_df <- data.frame(austinCrime2014_data_selected_columns)
# typeof(austinCrime2014_data_selected_columns_df)
austinCrime<-do.call("rbind", austinCrime2014_data_selected_columns)
austinCrime_needsTranspose <-as.data.frame(austinCrime)
austinCrime <- t(austinCrime_needsTranspose)
typeof(austinCrime)
View(austinCrime)
names(austinCrime)
####
#### Bad section
####
我不确定你为什么要 do.call
ing 和 t
ransposing 你的数据。您可以只使用 dplyr
的 semi_join
之类的东西来仅获取您想要的邮政编码:
library(data.table)
library(dplyr)
#> -------------------------------------------------------------------------
#> data.table + dplyr code now lives in dtplyr.
#> Please library(dtplyr)!
#> -------------------------------------------------------------------------
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#> between, first, last
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
zipCodesOfData <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv') %>%
mutate(`Zip Code` = ifelse(`Zip Code` == "", NA, `Zip Code`)) %>%
na.omit() %>%
select(`Zip Code`)
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv') %>%
select(`GO Location Zip`, `GO Highest Offense Desc`, `Highest NIBRS/UCR Offense Description`) %>%
semi_join(zipCodesOfData, by = c("GO Location Zip" = "Zip Code")) %>%
rename(zipcode = `GO Location Zip`,
highestOffenseDesc = `GO Highest Offense Desc`,
NIBRS_OffenseDesc = `Highest NIBRS/UCR Offense Description`)
我觉得readr
和dplyr
可以解决你的问题。很简单:
library(readr)
library(dplyr)
### SECTION 1
# Import data
austin2014_data_raw <- read_csv('https://data.austintexas.gov/resource/hcnj-rei3.csv', na = '')
glimpse(austin2014_data_raw)
nrow(austin2014_data_raw)
# Remove NAs
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# Get zip codes
zipCodesOfData <- austin2014_data$`Zip Code`
### SECTION 2
# Import data
austinCrime2014_data_raw <- read_csv('https://data.austintexas.gov/resource/7g8v-xxja.csv', na = '')
glimpse(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# Select and rename required columns
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime_df <- select(austinCrime2014_data_raw, one_of(columnSelection_Crime))
names(austinCrime_df) <- c("zipcode", "highestOffenseDesc", "NIBRS_OffenseDesc")
glimpse(austinCrime_df)
nrow(austinCrime_df)
### SECTION 3
# Filter by zipcode
austinCrime2014_data_selected_zips <- filter(austinCrime_df, zipcode %in% zipCodesOfData)
glimpse(austinCrime2014_data_selected_zips)
nrow(austinCrime2014_data_selected_zips)
这里我使用readr
包中的read_csv()
导入数据,使用dplyr
包中的子集方法select()
和filter()
获取所需的列和行。