使用 R 对数据集进行子集化的两种方法均失败,请求帮助

Failed two methods to subset dataset with R, requesting assistance

我正在尝试用 R(开源统计脚本语言)对一些数据取子集。我尝试了两种方法,但都没有成功:一种方法返回一个没有任何数据的表,另一种返回一个所有单元格都是 "NA" 的表,不过其维度看起来是正确的。

我已经把代码整理清楚并加上了注释,见下文。

不幸的是,这两种方法都行不通。我希望有人能够指出我的错误或为第三部分推荐不同的子集方法。

(顺便说一句,在第二部分中,我试图将列表转换为数据框,但它不起作用。我很好奇为什么,如果有人能为我阐明这一点。)

感谢您的宝贵时间和协助!

####
#### Section zero: references and dependencies
####
# r's "choroplethr" library creator's blog for reference:
# http://www.arilamstein.com/blog/2015/06/25/learn-to-map-census-data-in-r/
# 
# 
# library(choroplethr)
# library(choroplethrMaps)
# library(ggplot2)
# # use the devtools package from CRAN to install choroplethrZip from github
# # install.packages("devtools")
# library(devtools)
# install_github('arilamstein/choroplethrZip')
# library(choroplethrZip)
# library(data.table)
# 
####
#### Section one: the data set providing the zipcode we'll use to subset the crime set
####
# Download the data set that supplies the zip codes of interest, then print its
# column names and row count. fread() is data.table's reader (see the
# commented-out library() calls in section zero).
austin2014_data_raw <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv')
names(austin2014_data_raw)
nrow(austin2014_data_raw)
## Clean up: turn blank zip-code cells into NA.
# NOTE(review): other code in this file refers to this column as `Zip Code`
# (with a space). If that is the real name, `$ZipCode` is NULL and nothing is
# replaced here -- confirm against the names() output above.
# NOTE(review): the index has no comma, so on a data.table it presumably
# addresses whole rows rather than one column -- TODO confirm.
austin2014_data_raw[austin2014_data_raw$ZipCode==""] <- NA
# Drop every row that contains an NA in any column.
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.

# Select the first column, which is expected to hold the zip codes.
# NOTE(review): `[, 1]` on a data.table probably does not return a plain vector
# (it yields a one-column table, or in older data.table versions the number 1).
# `%in%` against that would match nothing, which fits the empty results
# reported in section three. `austin2014_data[[1]]` extracts the column as a
# vector -- confirm.
zipCodesOfData <- austin2014_data[,1]
View(zipCodesOfData)
# zipCodesOfData is what section three matches against with %in%.

####
#### Section two: Crime data
####
# Crime by zipcode: https://data.austintexas.gov/dataset/Annual-Crime-2014/7g8v-xxja
#   (visualized: https://data.austintexas.gov/dataset/Annual-Crime-2014/8mst-ed5t )
# https://data.austintexas.gov/resource/<insertResourceNameHere>.csv  w/ resource "7g8v-xxja"

# Download the annual crime data set (resource 7g8v-xxja), open it in the data
# viewer and print its row count.
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv')
View(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)

# First, let's remove the data we don't need
# Print the available column names, then list the three columns to keep.
names(austinCrime2014_data_raw)
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
# subset(select = ...) keeps only the named columns; no row filter is applied.
austinCrime2014_data_selected_columns <- subset(austinCrime2014_data_raw, select=columnSelection_Crime)
# Check the resulting column names and row count.
names(austinCrime2014_data_selected_columns)
nrow(austinCrime2014_data_selected_columns)


####
#### Section Three: The problem: I am unable to make subsets with the two following methods.
####
# Neither of these methods work: 

# Attempt 1:

# Keep only the crime rows whose zip code appears in zipCodesOfData.
# NOTE(review): the result overwrites austinCrime2014_data_selected_columns, so
# if this comes back empty, Attempt 2 below starts from an already-empty table.
# NOTE(review): %in% compares against the elements of its right-hand side;
# zipCodesOfData was built with `[, 1]` and is presumably a table rather than a
# plain vector of zip codes -- confirm.
austinCrime2014_data_selected_columns <- austinCrime2014_data_selected_columns[austinCrime2014_data_selected_columns$`GO Location Zip` %in% zipCodesOfData , ]
View(austinCrime2014_data_selected_columns) # No data in the table

# Attempt 2:

# The filter() call below initially raised an error (message not recorded
# here); after installing and loading dplyr the error went away.
library(dplyr)
# However, it still doesn't create anything-- just an empty set w/ headers
# Same membership test as Attempt 1, expressed with filter().
austinCrime2014_data_selected_zips <- filter(austinCrime2014_data_selected_columns, `GO Location Zip` %in% zipCodesOfData)
View(austinCrime2014_data_selected_zips)

在意识到没有必要后,我删除了这一部分。

####
####  Bad section
####
# Row count of the column-selected crime table before the conversion attempt.
nrow(austinCrime2014_data_selected_columns)

# Then, let's keep only the zipcodes we need
# doesnt work: austinCrime2014_data_selected_columns_df <- data.frame(austinCrime2014_data_selected_columns)
# typeof(austinCrime2014_data_selected_columns_df)
# NOTE(review): typeof() reports "list" for any data frame, so the commented-out
# conversion above may in fact have worked; class() or is.data.frame() is the
# usual check -- confirm.

# Pass every column of the table to rbind() as a separate argument, which
# stacks the columns as rows of a matrix.
austinCrime<-do.call("rbind", austinCrime2014_data_selected_columns)
# Convert that matrix to a data frame, then transpose it back so that the
# original columns are columns again. t() returns a matrix, not a data frame.
austinCrime_needsTranspose <-as.data.frame(austinCrime)
austinCrime <- t(austinCrime_needsTranspose)
# Inspect the result: storage type, viewer, and names.
# NOTE(review): names() on a matrix is normally NULL; colnames() is probably
# what was intended -- confirm.
typeof(austinCrime)
View(austinCrime)
names(austinCrime)

####
####  Bad section
####

我不确定你为什么要对数据使用 do.call 并进行转置。你可以直接使用 dplyr 的 semi_join 之类的函数,只保留你想要的邮政编码:


library(data.table)
library(dplyr)
#> -------------------------------------------------------------------------
#> data.table + dplyr code now lives in dtplyr.
#> Please library(dtplyr)!
#> -------------------------------------------------------------------------
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#> 
#>     between, first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
# Zip codes of interest: read the housing data, turn blank zip codes into NA,
# drop every row that contains an NA, and keep only the `Zip Code` column.
zipCodesOfData <- fread("https://data.austintexas.gov/resource/hcnj-rei3.csv")
zipCodesOfData <- mutate(
  zipCodesOfData,
  `Zip Code` = ifelse(`Zip Code` == "", NA, `Zip Code`)
)
zipCodesOfData <- na.omit(zipCodesOfData)
zipCodesOfData <- select(zipCodesOfData, `Zip Code`)

# Crime records: read the crime data and keep the three columns of interest.
austinCrime2014_data_raw <- fread("https://data.austintexas.gov/resource/7g8v-xxja.csv")
austinCrime2014_data_raw <- select(
  austinCrime2014_data_raw,
  `GO Location Zip`,
  `GO Highest Offense Desc`,
  `Highest NIBRS/UCR Offense Description`
)
# semi_join() keeps the crime rows whose zip code has a match in
# zipCodesOfData, without adding any columns from it.
austinCrime2014_data_raw <- semi_join(
  austinCrime2014_data_raw,
  zipCodesOfData,
  by = c("GO Location Zip" = "Zip Code")
)
# Give the remaining columns short, syntactic names.
austinCrime2014_data_raw <- rename(
  austinCrime2014_data_raw,
  zipcode = `GO Location Zip`,
  highestOffenseDesc = `GO Highest Offense Desc`,
  NIBRS_OffenseDesc = `Highest NIBRS/UCR Offense Description`
)

我觉得 readr 和 dplyr 可以解决你的问题。很简单:

library(readr)
library(dplyr)

### SECTION 1

# Import data
# Load the housing data set; empty strings are read as NA.
austin2014_data_raw <- read_csv(
  "https://data.austintexas.gov/resource/hcnj-rei3.csv",
  na = ""
)
glimpse(austin2014_data_raw)
nrow(austin2014_data_raw)

# Keep only the rows that contain no NA at all.
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # one row fewer, according to the original author

# Pull the zip codes out as a plain vector.
zipCodesOfData <- austin2014_data[["Zip Code"]]

### SECTION 2

# Import data
# Load the crime data set; empty strings are read as NA.
austinCrime2014_data_raw <- read_csv(
  "https://data.austintexas.gov/resource/7g8v-xxja.csv",
  na = ""
)
glimpse(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)

# Keep the three columns of interest and give them short names.
columnSelection_Crime <- c(
  "GO Location Zip",
  "GO Highest Offense Desc",
  "Highest NIBRS/UCR Offense Description"
)
austinCrime_df <- austinCrime2014_data_raw %>%
  select(one_of(columnSelection_Crime)) %>%
  setNames(c("zipcode", "highestOffenseDesc", "NIBRS_OffenseDesc"))
glimpse(austinCrime_df)
nrow(austinCrime_df)

### SECTION 3

# Filter by zipcode
# Keep only the crime rows whose zip code is one of zipCodesOfData.
austinCrime2014_data_selected_zips <- austinCrime_df %>%
  filter(zipcode %in% zipCodesOfData)
glimpse(austinCrime2014_data_selected_zips)
nrow(austinCrime2014_data_selected_zips)

这里我使用 readr 包中的 read_csv() 导入数据,并使用 dplyr 包中的子集函数 select() 和 filter() 获取所需的列和行。