如何将扩展名为 .DUSMCPUB 的文件导入 R?

How do I import a file with the extension .DUSMCPUB into R?

我正在尝试导入国家卫生统计中心(NCHS)的多死因死亡率(Mortality Multiple Cause)文件,文件位于以下链接:

https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm#Downloadable

link to image of where to find file on NCHS website

文件的扩展名为 .DUSMCPUB(例如,2020 年的文件名为“VS20MORT.DUSMCPUB_r20220105”)。如何导入这样的文件?我不熟悉这种扩展名。

我尝试使用以下代码导入,但它导致我的 R 程序终止。您能否就如何导入这些类型的文件向我提供建议?

# Attempted import from the question: read_delim() is called with only a file
# path, so no delimiter is specified.
# NOTE(review): the answer text describes this file as fixed-width, so a
# delimited-text reader is presumably the wrong tool here -- confirm.
VS20MORT <- read_delim("VS20MORT.DUSMCPUB_r20220105")

数据是固定宽度(fixed-width)格式的文件。国家卫生统计中心数据的用户指南中给出了各字段的宽度。下面的答案改编自 @Hack-R 在另一个论坛上发布的答案。

https://opendata.stackexchange.com/questions/18375/how-can-one-interpret-the-nvss-mortality-multiple-cause-of-death-data-sets

# Fixed-width layout of the mortality file: one row per field, in file order.
# `widths` is the field width in characters and `cn` is the column name.
# The position ranges in the comments are cumulative sums of the widths.
map <- data.frame(widths = c(
  19, 1, 40,         # blank, res_status, blank2                  (1-60)
  2, 1, 1,           # ed_v89, ed_v03, ed_flag                    (61-64)
  2, 2, 1,           # death_month, blank3, sex                   (65-69)
  1, 1, 1, 1, 1,     # age_years, age_months, age_3, age_4,
                     # age_sub_flag                               (70-74)
  2, 2, 2, 2,        # age_recode_52/27/12, infant_age_recode_22  (75-82)
  1, 1, 1,           # place_of_death, marital_status, death_day  (83-85)
  16, 4,             # blank4, current_year                       (86-105)
  1, 1, 1, 1,        # work_injury, death_manner, disposition,
                     # autopsy                                    (106-109)
  34, 1, 1,          # blank5, activity_code, place_injured       (110-145)
  4, 3, 1, 3, 3, 2,  # icd_cause_of_death, cause_recode358, blank6,
                     # cause_recode113, infant_cause_recode130,
                     # cause_recode39                             (146-161)
  1, 2,              # blank7, num_entity_axis                    (162-164)
  rep(7, 20),        # cond1 .. cond20                            (165-304)
  36, 2, 1,          # blank8, num_rec_axis_cond, blank9          (305-343)
  rep(5, 20),        # acond1 .. acond20                          (344-443)
  1, 2, 1, 1, 1, 1,  # blank10, race, bridged_race_flag,
                     # race_imp_flag, race_recode3, race_recode5  (444-450)
  33, 3, 1, 1        # blank11, hisp, blank12, hisp_recode        (451-488)
))

# Set column names. Every name is unique: the filler fields are numbered
# consecutively so that no two columns share the name "blank7".
# NOTE(review): age_years, age_months, age_3 and age_4 are four one-character
# fields covering positions 70-73 -- confirm their meaning in the user guide.
map$cn <- c(
  "blank",                   # cols 1-19
  "res_status",              # 20
  "blank2",                  # 21-60
  "ed_v89",                  # 61-62
  "ed_v03",                  # 63
  "ed_flag",                 # 64
  "death_month",             # 65-66
  "blank3",                  # 67-68
  "sex",                     # 69
  "age_years",               # 70
  "age_months",              # 71
  "age_3",                   # 72
  "age_4",                   # 73
  "age_sub_flag",            # 74
  "age_recode_52",           # 75-76
  "age_recode_27",           # 77-78
  "age_recode_12",           # 79-80
  "infant_age_recode_22",    # 81-82
  "place_of_death",          # 83
  "marital_status",          # 84
  "death_day",               # 85
  "blank4",                  # 86-101
  "current_year",            # 102-105
  "work_injury",             # 106
  "death_manner",            # 107
  "disposition",             # 108
  "autopsy",                 # 109
  "blank5",                  # 110-143
  "activity_code",           # 144
  "place_injured",           # 145
  "icd_cause_of_death",      # 146-149
  "cause_recode358",         # 150-152
  "blank6",                  # 153
  "cause_recode113",         # 154-156
  "infant_cause_recode130",  # 157-159
  "cause_recode39",          # 160-161
  "blank7",                  # 162
  "num_entity_axis",         # 163-164
  paste0("cond", 1:20),      # 165-304, seven characters each
  "blank8",                  # 305-340
  "num_rec_axis_cond",       # 341-342
  "blank9",                  # 343
  paste0("acond", 1:20),     # 344-443, five characters each
  "blank10",                 # 444
  "race",                    # 445-446
  "bridged_race_flag",       # 447
  "race_imp_flag",           # 448
  "race_recode3",            # 449
  "race_recode5",            # 450
  "blank11",                 # 451-483
  "hisp",                    # 484-486
  "blank12",                 # 487
  "hisp_recode"              # 488
)


# Import the file.
# read_fwf() and fwf_widths() come from readr, so attach it here to keep the
# snippet runnable on its own. fwf_widths() pairs each field width with its
# column name from `map`.
library(readr)
mort2020 <- read_fwf(
  "./data/original/VS20MORT.DUSMCPUB_r20220105",
  col_positions = fwf_widths(map$widths, col_names = map$cn)
)

感谢 @Mel G 分享这种方法。当我尝试运行时,我发现死亡率文件自 2020 年起新增了一些变量(即死者的职业和行业)。下面是包含这些新变量的稍作修改的版本。

# Install and load necessary packages
# install.packages("sqldf") # NOTE(review): no sqldf function is called in this snippet; read.fwf() is part of base R (utils)
# install.packages("dplyr") # Used for tidy data management
library(sqldf)
library(dplyr)

# Increase the memory limit to make space for the large file.
# memory.limit() only has an effect on Windows builds of R older than 4.2.0;
# on other platforms, and from R 4.2.0 on, it is a stub that only warns, so
# the call is skipped there.
# memory.limit()  # with no arguments, reports the current limit
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 20000)
}

# Fixed-width layout for the 2020 file: a data frame with one row per field,
# holding the field width (`widths`) and the column name (`names`).
# Position ranges in the comments are cumulative sums of the widths.
columns <- data.frame(
  widths = c(
    19, 1, 40,         # blank1, Resident_Status_US, blank2        (1-60)
    2, 1, 1,           # Education_1989/2003, Education_flag       (61-64)
    2, 2, 1,           # Month_of_Death, blank3, Sex               (65-69)
    4, 1,              # DetailAge, Age_Substitution_Flag          (70-74)
    2, 2, 2, 2,        # Age_Recode_52/27/12, Infant_Age_Recode_22 (75-82)
    1, 1, 1,           # Place_of_Death_and_Status, Marital_Status,
                       # Day_of_Week_of_Death                      (83-85)
    16, 4,             # blank4, Current_Data_Year                 (86-105)
    1, 1, 1, 1,        # Injury_at_Work, Manner_of_Death,
                       # Method_of_Disposition, Autopsy            (106-109)
    34, 1, 1,          # blank5, Activity_Code, Place_of_Injury    (110-145)
    4, 3, 1, 3, 3, 2,  # ICD_Code_10, Cause_Recode_358, blank6,
                       # Cause_Recode_113, Infant_Cause_Recode_130,
                       # Cause_Recode_39                           (146-161)
    1, 2,              # blank7, Number_Entity_Axis_Conditions     (162-164)
    rep(7, 20),        # Condition_1EA .. Condition_20EA           (165-304)
    36, 2, 1,          # blank8, Number_Record_Axis_Conditions,
                       # blank9                                    (305-343)
    rep(5, 20),        # Condition_1RA .. Condition_20RA           (344-443)
    1, 2, 1, 1, 1, 1,  # blank10, Race, Bridged_Race_Flag,
                       # Race_Imputation_Flag, Race_Recode_3/5     (444-450)
    33, 3, 1, 1, 2,    # blank11, Hispanic_Origin, blank12,
                       # Hispanic_Origin_9_Race_Recode,
                       # Race_Recode_40                            (451-490)
    315, 4, 2, 4, 2    # blank13, CensusOcc, Occ_26, CensusInd,
                       # Ind_23                                    (491-817)
  )
)

# Column names, in the same order as the widths above.
columns$names <- c(
  "blank1", "Resident_Status_US", "blank2",
  "Education_1989", "Education_2003", "Education_flag",
  "Month_of_Death", "blank3", "Sex",
  "DetailAge", "Age_Substitution_Flag",
  "Age_Recode_52", "Age_Recode_27", "Age_Recode_12", "Infant_Age_Recode_22",
  "Place_of_Death_and_Status", "Marital_Status", "Day_of_Week_of_Death",
  "blank4", "Current_Data_Year",
  "Injury_at_Work", "Manner_of_Death", "Method_of_Disposition", "Autopsy",
  "blank5", "Activity_Code", "Place_of_Injury",
  "ICD_Code_10", "Cause_Recode_358", "blank6",
  "Cause_Recode_113", "Infant_Cause_Recode_130", "Cause_Recode_39",
  "blank7", "Number_Entity_Axis_Conditions",
  paste0("Condition_", 1:20, "EA"),
  "blank8", "Number_Record_Axis_Conditions", "blank9",
  paste0("Condition_", 1:20, "RA"),
  "blank10", "Race", "Bridged_Race_Flag", "Race_Imputation_Flag",
  "Race_Recode_3", "Race_Recode_5",
  "blank11", "Hispanic_Origin", "blank12",
  "Hispanic_Origin_9_Race_Recode", "Race_Recode_40",
  "blank13", "CensusOcc", "Occ_26", "CensusInd", "Ind_23"
)

# Read in the fixed-width file using the widths from the 'columns' data frame.
# read.fwf() is part of base R (utils). TRUE/FALSE are spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
# NOTE(review): column types are inferred on import; consider passing
# colClasses = "character" if codes with leading zeros must be kept verbatim.
mort2020 <- read.fwf(
  "VS20MORT.DUSMCPUB_r20220105",
  widths = columns$widths,
  stringsAsFactors = FALSE
)
# Attach column names to variables
colnames(mort2020) <- columns$names

# Remove the filler columns (every column whose name starts with "blank")
mort2020x <- mort2020 %>%
  dplyr::select(-starts_with("blank"))

或者,这些文件的大部分年份似乎已在此处以 CSV 格式发布:https://www.nber.org/research/data/mortality-data-vital-statistics-nchs-multiple-cause-death-data 。2020 年的数据尚未提供,但对于其他年份,将 CSV 读入 R 比使用 read.fwf 要快得多。