将非定界文本文件读入 R

Reading a non-delimited text file into R

我有一大组非定界格式的数据,正尝试将其导入 R。该数据集附带一个列位置(Column Locations)文件,其中包括行号、列名和变量的字符位置。是否有用于处理此类数据的现有包?如果没有,该怎么办?

我用来解析列位置文件的宽度并使用它们读取数据文件的最终代码:

#' Load the ANES column-locations table, downloading it first if needed.
#'
#' Checks the working directory for "ANES_ColumnNames.txt". When the file is
#' absent it is fetched in binary mode from `ANES_ColNamesURL`, a variable
#' that must already exist in the calling environment. The local file is then
#' parsed with `read_delim()` using a single space as the delimiter.
#'
#' @return The object returned by `read_delim()` for the local file.
ColumnNames_Download <- function(){
  local_copy <- "ANES_ColumnNames.txt"

  # Only hit the network when no local copy is present.
  if (!file.exists(local_copy)) {
    download.file(ANES_ColNamesURL, local_copy, mode = "wb")
  }

  read_delim(local_copy, delim = " ")
}

#' Compute fixed-width column widths from a column-locations table.
#'
#' @param columnNames A data frame (or list) whose `File` element holds the
#'   character positions of each variable, either as a range such as
#'   "29-32" (or "29:32") or as a single position such as "135".
#' @return A numeric vector with one width per entry of `columnNames$File`:
#'   `end - start + 1` for ranges and 1 for single positions. An empty
#'   `File` yields `numeric(0)`.
ColumnWidths <- function(columnNames){
  # as.character() keeps factor input working with strsplit().
  positions <- strsplit(as.character(columnNames$File), split = "[-:]")

  # The vapply() result is the function's value. Ending the function on a
  # `for` loop would return NULL and silently drop the computed widths.
  vapply(positions, function(bounds) {
    if (length(bounds) > 1) {
      as.numeric(bounds[2]) - as.numeric(bounds[1]) + 1
    } else {
      1
    }
  }, numeric(1))
}

#' Read the fixed-width ANES data file.
#'
#' @param Width Column widths, passed unchanged to `fwf_widths()`.
#' @return The object returned by `read_fwf()` for "ANES_Data.txt" in the
#'   working directory.
Data_Download <- function(Width){
  layout <- fwf_widths(Width)
  read_fwf("ANES_Data.txt", layout)
}

如果您的文件保存为文本文件(.txt),那么您可以使用read.csv()并设置sep = " "如下:

# Read the column-locations file, treating a single space as the separator.
col.loc <- read.csv(file = "filename.txt", sep = " ")

# > head(col.loc)
#   Writing       COL  File
# 1      1)  VERSION:  1-28
# 2      2)  VCF0004: 29-32
# 3      3)  VCF0006: 33-36
# 4      4) VCF0006A: 37-44
# 5      5)  VCF0009: 45-59
# 6      6) VCF0009A: 60-74

可以通过 $ 选择任意一列,并将其数据保存到向量中:

# Pull the File column (the character-position ranges) out as a vector.
file <- col.loc$File

# > file
# [1] 1-28      29-32     33-36     37-44     45-59     60-74     75-89     90-104    105-119   120-134   135       136       137      
# [14] 138       139       140       141       142       143       144       145       146       147       148       149       150      
# [27] 151       152       153       154       155       156       157-158   159       160       161       162       163       164      
# [40] 165       166       167-169   170       171       172       173       174       175       176       177       178       179      
# [53] 180-182   183-185   186-188   189-191   192-193   194-195   196       197       198-199   200-201   202       203-204   205      
# [66] 206       207       208       209-211   212       213       214       215-217   218-220   221       222       223       224      
# [79] 225       226       227       228       229       230       231       232       233       234       235-237   238       239-241  
# [92] 242-244   245       246       247       248       249       250-251   252       253-255   256       257-258   259-260   261-262  
# [105] 263-264   265       266       267       268-271   272-274   275-279   280-284   285-286   287-288   289-290   291-292   293-294
# [118] 295-296   297-298   299-300   301-302   303-304   305-306   307-308   309-310   311-312   313-314   315-316   317-318   319-320  
# [131] 321-322   323-324   325-326   327-328   329-330   331-332   333-334   335-336   337-338   339-340   341-342   343-344   345-346  

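在 R 4.0 之前的版本中(read.csv() 默认 stringsAsFactors = TRUE),这将创建一个新的因子向量,可以将其转换为字符向量,如下所示;在 R 4.0 及以上版本中,该列本身已是字符向量,此时 as.character() 不会改变它: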

# Coerce to a character vector (leaves the values unchanged if `file` is
# already character).
file.char <- as.character(file)

由于新向量是字符向量,您可以使用 gsub() 将所有破折号 (-) 替换为您想要的任何其他标点符号。我选择用冒号 (:) 替换。gsub() 的用法如下:

# Replace every dash with a colon, e.g. "29-32" becomes "29:32".
file.char.gsub <- gsub("-", ":", file.char)

# > file.char.gsub
# [1] "1:28"      "29:32"     "33:36"     "37:44"     "45:59"     "60:74"     "75:89"     "90:104"    "105:119"   "120:134"   "135"      
# [12] "136"       "137"       "138"       "139"       "140"       "141"       "142"       "143"       "144"       "145"       "146"      
# [23] "147"       "148"       "149"       "150"       "151"       "152"       "153"       "154"       "155"       "156"       "157:158"  
# [34] "159"       "160"       "161"       "162"       "163"       "164"       "165"       "166"       "167:169"   "170"       "171"      
# [45] "172"       "173"       "174"       "175"       "176"       "177"       "178"       "179"       "180:182"   "183:185"   "186:188"  
# [56] "189:191"   "192:193"   "194:195"   "196"       "197"       "198:199"   "200:201"   "202"       "203:204"   "205"       "206"      
# [67] "207"       "208"       "209:211"   "212"       "213"       "214"       "215:217"   "218:220"   "221"       "222"       "223"      
# [78] "224"       "225"       "226"       "227"       "228"       "229"       "230"       "231"       "232"       "233"       "234"      
# [89] "235:237"   "238"       "239:241"   "242:244"   "245"       "246"       "247"       "248"       "249"       "250:251"   "252"      
# [100] "253:255"   "256"       "257:258"   "259:260"   "261:262"   "263:264"   "265"       "266"       "267"       "268:271"   "272:274"  
# [111] "275:279"   "280:284"   "285:286"   "287:288"   "289:290"   "291:292"   "293:294"   "295:296"   "297:298"   "299:300"   "301:302"  
# [122] "303:304"   "305:306"   "307:308"   "309:310"   "311:312"   "313:314"   "315:316"   "317:318"   "319:320"   "321:322"   "323:324"