将非定界文本文件读入 R
Reading a non-delimited text file into R
我有一大组非定界(non-delimited)格式的数据,想把它导入 R。该数据集附带一个“列位置”(Column Locations)文件,其中包括行号、列名和每个变量的字符位置。是否有用于处理此类数据的现有包?如果没有,该怎么办?
我用来解析列位置文件的宽度并使用它们读取数据文件的最终代码:
# Load the column-locations table, downloading the file first when no local
# copy exists.
#
# The file "ANES_ColumnNames.txt" is looked up in the working directory. If it
# is missing, it is fetched in binary mode from ANES_ColNamesURL (a variable
# defined outside this function). The file is then parsed with read_delim()
# using a single space as the delimiter.
#
# Returns: the object produced by read_delim().
ColumnNames_Download <- function(){
  local_copy <- "ANES_ColumnNames.txt"
  # Guard clause: only hit the network when the file is not already on disk.
  if (!file.exists(local_copy)) {
    download.file(ANES_ColNamesURL, local_copy, mode = "wb")
  }
  read_delim(local_copy, delim = " ")
}
# Compute fixed-width column widths from a column-locations table.
#
# columnNames: a data frame (or list) whose `File` element holds the character
#   positions of each variable, either as a "start-end" range such as "1-28"
#   or as a single position such as "135". Ranges written with a colon
#   ("1:28") are accepted as well.
#
# Returns: a numeric vector with one width per entry of `columnNames$File`.
#   A range contributes end - start + 1; a single position contributes 1.
#
# The previous version ended with the for loop, so the function returned NULL
# and the computed widths were lost; the result is now returned explicitly.
ColumnWidths <- function(columnNames){
  # as.character() keeps this working when the column was read as a factor.
  position_text <- as.character(columnNames$File)
  # Split each entry into its start/end parts on either separator.
  position_parts <- strsplit(position_text, split = "[-:]")
  vapply(
    position_parts,
    function(parts) {
      if (length(parts) > 1) {
        as.numeric(parts[2]) - as.numeric(parts[1]) + 1
      } else {
        # A single position occupies exactly one character.
        1
      }
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}
# Read the fixed-width data file "ANES_Data.txt".
#
# Width: column widths, handed to fwf_widths() to build the column
#   specification for read_fwf().
#
# Returns: the object produced by read_fwf().
Data_Download <- function(Width){
  column_layout <- fwf_widths(Width)
  read_fwf("ANES_Data.txt", column_layout)
}
如果您的文件保存为文本文件(`.txt`),那么您可以使用 `read.csv()` 并设置 `sep = " "`,如下:
# Read the space-separated column-locations file into a data frame.
col.loc <- read.csv("filename.txt", sep = " ")
# > head(col.loc)
# Writing COL File
# 1 1) VERSION: 1-28
# 2 2) VCF0004: 29-32
# 3 3) VCF0006: 33-36
# 4 4) VCF0006A: 37-44
# 5 5) VCF0009: 45-59
# 6 6) VCF0009A: 60-74
可以通过 `$` 选择任意列,并将该列的数据保存到向量中:
# Extract the File column (position entries such as "1-28") as a vector.
file <- col.loc$File
# > file
# [1] 1-28 29-32 33-36 37-44 45-59 60-74 75-89 90-104 105-119 120-134 135 136 137
# [14] 138 139 140 141 142 143 144 145 146 147 148 149 150
# [27] 151 152 153 154 155 156 157-158 159 160 161 162 163 164
# [40] 165 166 167-169 170 171 172 173 174 175 176 177 178 179
# [53] 180-182 183-185 186-188 189-191 192-193 194-195 196 197 198-199 200-201 202 203-204 205
# [66] 206 207 208 209-211 212 213 214 215-217 218-220 221 222 223 224
# [79] 225 226 227 228 229 230 231 232 233 234 235-237 238 239-241
# [92] 242-244 245 246 247 248 249 250-251 252 253-255 256 257-258 259-260 261-262
# [105] 263-264 265 266 267 268-271 272-274 275-279 280-284 285-286 287-288 289-290 291-292 293-294
# [118] 295-296 297-298 299-300 301-302 303-304 305-306 307-308 309-310 311-312 313-314 315-316 317-318 319-320
# [131] 321-322 323-324 325-326 327-328 329-330 331-332 333-334 335-336 337-338 339-340 341-342 343-344 345-346
这将创建一个新的因子向量(仅限 R 4.0.0 之前的版本;自 R 4.0.0 起 `read.csv()` 默认 `stringsAsFactors = FALSE`,得到的已经是字符向量),可以将其转换为字符向量,如下所示:
# Coerce to character so the entries can be edited with string functions.
file.char <- as.character(file)
由于新向量是字符向量,您可以使用 `gsub()` 将所有破折号(`-`)替换为您想要的任何其他标点符号。我选择用冒号(`:`)替换。`gsub()` 的用法如下:
# Replace every dash with a colon, e.g. "1-28" becomes "1:28".
file.char.gsub <- gsub(pattern = "-", replacement = ":", x = file.char)
# > file.char.gsub
# [1] "1:28" "29:32" "33:36" "37:44" "45:59" "60:74" "75:89" "90:104" "105:119" "120:134" "135"
# [12] "136" "137" "138" "139" "140" "141" "142" "143" "144" "145" "146"
# [23] "147" "148" "149" "150" "151" "152" "153" "154" "155" "156" "157:158"
# [34] "159" "160" "161" "162" "163" "164" "165" "166" "167:169" "170" "171"
# [45] "172" "173" "174" "175" "176" "177" "178" "179" "180:182" "183:185" "186:188"
# [56] "189:191" "192:193" "194:195" "196" "197" "198:199" "200:201" "202" "203:204" "205" "206"
# [67] "207" "208" "209:211" "212" "213" "214" "215:217" "218:220" "221" "222" "223"
# [78] "224" "225" "226" "227" "228" "229" "230" "231" "232" "233" "234"
# [89] "235:237" "238" "239:241" "242:244" "245" "246" "247" "248" "249" "250:251" "252"
# [100] "253:255" "256" "257:258" "259:260" "261:262" "263:264" "265" "266" "267" "268:271" "272:274"
# [111] "275:279" "280:284" "285:286" "287:288" "289:290" "291:292" "293:294" "295:296" "297:298" "299:300" "301:302"
# [122] "303:304" "305:306" "307:308" "309:310" "311:312" "313:314" "315:316" "317:318" "319:320" "321:322" "323:324"
我有一大组非定界(non-delimited)格式的数据,想把它导入 R。该数据集附带一个“列位置”(Column Locations)文件,其中包括行号、列名和每个变量的字符位置。是否有用于处理此类数据的现有包?如果没有,该怎么办?
我用来解析列位置文件的宽度并使用它们读取数据文件的最终代码:
# Load the column-locations table, downloading the file first when no local
# copy exists.
#
# The file "ANES_ColumnNames.txt" is looked up in the working directory. If it
# is missing, it is fetched in binary mode from ANES_ColNamesURL (a variable
# defined outside this function). The file is then parsed with read_delim()
# using a single space as the delimiter.
#
# Returns: the object produced by read_delim().
ColumnNames_Download <- function(){
  local_copy <- "ANES_ColumnNames.txt"
  # Guard clause: only hit the network when the file is not already on disk.
  if (!file.exists(local_copy)) {
    download.file(ANES_ColNamesURL, local_copy, mode = "wb")
  }
  read_delim(local_copy, delim = " ")
}
# Compute fixed-width column widths from a column-locations table.
#
# columnNames: a data frame (or list) whose `File` element holds the character
#   positions of each variable, either as a "start-end" range such as "1-28"
#   or as a single position such as "135". Ranges written with a colon
#   ("1:28") are accepted as well.
#
# Returns: a numeric vector with one width per entry of `columnNames$File`.
#   A range contributes end - start + 1; a single position contributes 1.
#
# The previous version ended with the for loop, so the function returned NULL
# and the computed widths were lost; the result is now returned explicitly.
ColumnWidths <- function(columnNames){
  # as.character() keeps this working when the column was read as a factor.
  position_text <- as.character(columnNames$File)
  # Split each entry into its start/end parts on either separator.
  position_parts <- strsplit(position_text, split = "[-:]")
  vapply(
    position_parts,
    function(parts) {
      if (length(parts) > 1) {
        as.numeric(parts[2]) - as.numeric(parts[1]) + 1
      } else {
        # A single position occupies exactly one character.
        1
      }
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}
# Read the fixed-width data file "ANES_Data.txt".
#
# Width: column widths, handed to fwf_widths() to build the column
#   specification for read_fwf().
#
# Returns: the object produced by read_fwf().
Data_Download <- function(Width){
  column_layout <- fwf_widths(Width)
  read_fwf("ANES_Data.txt", column_layout)
}
如果您的文件保存为文本文件(`.txt`),那么您可以使用 `read.csv()` 并设置 `sep = " "`,如下:
# Read the space-separated column-locations file into a data frame.
col.loc <- read.csv("filename.txt", sep = " ")
# > head(col.loc)
# Writing COL File
# 1 1) VERSION: 1-28
# 2 2) VCF0004: 29-32
# 3 3) VCF0006: 33-36
# 4 4) VCF0006A: 37-44
# 5 5) VCF0009: 45-59
# 6 6) VCF0009A: 60-74
可以通过 `$` 选择任意列,并将该列的数据保存到向量中:
# Extract the File column (position entries such as "1-28") as a vector.
file <- col.loc$File
# > file
# [1] 1-28 29-32 33-36 37-44 45-59 60-74 75-89 90-104 105-119 120-134 135 136 137
# [14] 138 139 140 141 142 143 144 145 146 147 148 149 150
# [27] 151 152 153 154 155 156 157-158 159 160 161 162 163 164
# [40] 165 166 167-169 170 171 172 173 174 175 176 177 178 179
# [53] 180-182 183-185 186-188 189-191 192-193 194-195 196 197 198-199 200-201 202 203-204 205
# [66] 206 207 208 209-211 212 213 214 215-217 218-220 221 222 223 224
# [79] 225 226 227 228 229 230 231 232 233 234 235-237 238 239-241
# [92] 242-244 245 246 247 248 249 250-251 252 253-255 256 257-258 259-260 261-262
# [105] 263-264 265 266 267 268-271 272-274 275-279 280-284 285-286 287-288 289-290 291-292 293-294
# [118] 295-296 297-298 299-300 301-302 303-304 305-306 307-308 309-310 311-312 313-314 315-316 317-318 319-320
# [131] 321-322 323-324 325-326 327-328 329-330 331-332 333-334 335-336 337-338 339-340 341-342 343-344 345-346
这将创建一个新的因子向量(仅限 R 4.0.0 之前的版本;自 R 4.0.0 起 `read.csv()` 默认 `stringsAsFactors = FALSE`,得到的已经是字符向量),可以将其转换为字符向量,如下所示:
# Coerce to character so the entries can be edited with string functions.
file.char <- as.character(file)
由于新向量是字符向量,您可以使用 `gsub()` 将所有破折号(`-`)替换为您想要的任何其他标点符号。我选择用冒号(`:`)替换。`gsub()` 的用法如下:
# Replace every dash with a colon, e.g. "1-28" becomes "1:28".
file.char.gsub <- gsub(pattern = "-", replacement = ":", x = file.char)
# > file.char.gsub
# [1] "1:28" "29:32" "33:36" "37:44" "45:59" "60:74" "75:89" "90:104" "105:119" "120:134" "135"
# [12] "136" "137" "138" "139" "140" "141" "142" "143" "144" "145" "146"
# [23] "147" "148" "149" "150" "151" "152" "153" "154" "155" "156" "157:158"
# [34] "159" "160" "161" "162" "163" "164" "165" "166" "167:169" "170" "171"
# [45] "172" "173" "174" "175" "176" "177" "178" "179" "180:182" "183:185" "186:188"
# [56] "189:191" "192:193" "194:195" "196" "197" "198:199" "200:201" "202" "203:204" "205" "206"
# [67] "207" "208" "209:211" "212" "213" "214" "215:217" "218:220" "221" "222" "223"
# [78] "224" "225" "226" "227" "228" "229" "230" "231" "232" "233" "234"
# [89] "235:237" "238" "239:241" "242:244" "245" "246" "247" "248" "249" "250:251" "252"
# [100] "253:255" "256" "257:258" "259:260" "261:262" "263:264" "265" "266" "267" "268:271" "272:274"
# [111] "275:279" "280:284" "285:286" "287:288" "289:290" "291:292" "293:294" "295:296" "297:298" "299:300" "301:302"
# [122] "303:304" "305:306" "307:308" "309:310" "311:312" "313:314" "315:316" "317:318" "319:320" "321:322" "323:324"