读取 CSV 文件直到符合唯一标记

Question

我有很多数据集，它们在某一特定行之后还包含额外的信息。这些文件都是 CSV。我可以循环遍历它们，并使用 read.csv 的 "skip" 参数来清理数据的顶部，但各个数据框的长度都不同。唯一的共同点是 Total（总计）列中的“---------------- ----------------- ----- -----”这一行，它把有意义的数据与其下方的摘要和无关信息分隔开。

这是我在不使用 skip = 14 的情况下读取数据的方式（skip = 14 对所有文件都适用）。

before<-read.csv("Example.csv", header = FALSE,
             col.names = c("CountryID","Name","Type","Symbol","Code","Unit", 
"Total", "Measurement", "Value", "Percent", "CO2" ))

然而，----- 标记可能出现在不同的行，但它总是读取数据时最先遇到的那个标记。这是处理之前的数据：

structure(list(CountryID = structure(c(26L, 19L, 21L, 23L, 21L, 
7L, 1L, 1L, 1L, 22L, 3L, 1L, 19L, 2L, 8L, 14L, 15L, 13L, 9L, 
12L, 18L, 17L, 8L, 13L, 15L, 10L, 8L, 8L, 11L, 16L, 1L, 1L, 1L, 
20L, 4L, 6L, 1L, 25L, 5L, 1L, 1L, 1L, 24L, 1L), .Label = c("", 
"------------", "-------------", "---------------", "------------------", 
" ", "08.15.1997", "10000", "15000", "200", "2000", "2500", "3000", 
"45000", "5000", "7000", "8000", "8300", "Country", "Output", 
"Production", "Quantity", "Serial Output", "TOTAL SUM", "Unaccounted", 
"United Nations Data"), class = "factor"), Name = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 20L, 2L, 1L, 1L, 1L, 21L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 19L, 1L, 1L, 1L, 1L), .Label = c("", 
"--------------------", " ", "Bahrain", "Bangladesh", "Barbados", 
"Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", 
"Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", 
"Burkina Faso", "Chad", "Name", "The Bahamas"), class = "factor"), 
    Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 4L, 
    2L, 1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
    6L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("", "----", " ", "Code", "Type", 
    "Unit"), class = "factor"), Symbol = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 20L, 22L, 2L, 1L, 1L, 1L, 4L, 5L, 
    6L, 7L, 9L, 8L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 
    19L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 21L, 1L, 1L, 1L, 
    1L), .Label = c("", "------------", " ", "BAHM", "BAHR", 
    "BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL", 
    "BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF", "Country", 
    "private", "Symbol"), class = "factor"), Code = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 19L, 2L, 1L, 1L, 1L, 12L, 
    15L, 11L, 17L, 4L, 13L, 14L, 9L, 18L, 10L, 5L, 16L, 3L, 7L, 
    8L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), .Label = c("", "------------", "1504944270", "2287368539", 
    "2388991307", "2453202442", "2561470743", "3205402223", "3221488867", 
    "3230369605", "3247578406", "3712013344", "4307638090", "462793263", 
    "4835205752", "4854959101", "5842098895", "5932776587", "Code"
    ), class = "factor"), Unit = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 17L, 16L, 2L, 1L, 1L, 1L, 7L, 9L, 10L, 14L, 
    12L, 15L, 15L, 11L, 13L, 3L, 8L, 13L, 15L, 6L, 5L, 9L, 1L, 
    1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "-------------", "100", "1109", "27", "35", "40", "45", "58", 
    "70", "74", "77", "79", "82", "95", "Output", "Per Unit"), class = "factor"), 
    Total = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 25L, 
    24L, 2L, 1L, 1L, 1L, 18L, 5L, 17L, 8L, 23L, 20L, 6L, 9L, 
    7L, 11L, 12L, 13L, 19L, 15L, 14L, 10L, 3L, 16L, 1L, 1L, 1L, 
    16L, 1L, 1L, 1L, 21L, 1L, 3L, 22L, 4L), .Label = c("", "---------------", 
    "---------------            ----------------  ------  -----", 
    "===============            ================  ======  =====", 
    "126912", "147431", "170553", "175973", "203728", "230761", 
    "293789", "304471", "376281", "386526", "399160", "4417002", 
    "476025", "478030", "502999", "51012", "5610654", "56406056", 
    "93351", "Output", "Total"), class = "factor"), Measurement = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 2L, 1L, 1L, 1L, 3L, 
    9L, 3L, 4L, 10L, 9L, 6L, 4L, 5L, 10L, 7L, 9L, 4L, 8L, 10L, 
    9L, 1L, 1L, 1L, 1L, 1L, 11L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), .Label = c("", "--------", "20", "23", "24", "26", "27", 
    "28", "29", "30", "420", "Measurement"), class = "factor"), 
    Value = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 22L, 
    23L, 2L, 1L, 1L, 1L, 5L, 19L, 11L, 8L, 3L, 18L, 13L, 6L, 
    4L, 9L, 14L, 17L, 7L, 10L, 12L, 15L, 1L, 16L, 1L, 1L, 1L, 
    16L, 1L, 1L, 1L, 20L, 1L, 1L, 21L, 1L), .Label = c("", "----------------", 
    "15150240", "15891735", "16083459", "16959919", "20350968", 
    "20909501", "21770264", "25121096", "27726279", "30024743", 
    "34069742", "34841369", "38498281", "468004111", "49524999", 
    "50512814", "50568702", "540650", "64506", "Country", "Value"
    ), class = "factor"), Percent = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 23L, 2L, 1L, 1L, 1L, 11L, 12L, 8L, 3L, 
    17L, 16L, 5L, 10L, 20L, 9L, 6L, 7L, 4L, 15L, 14L, 22L, 1L, 
    13L, 1L, 1L, 1L, 21L, 1L, 1L, 1L, 19L, 1L, 1L, 18L, 1L), .Label = c("", 
    "------", "102", "104", "106", "112", "126", "129", "142", 
    "15", "160", "177", "1775", "180", "191", "24", "25", "5640645", 
    "650163", "87", "887.5", "95", "Production Percent"), class = "factor"), 
    CO2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 15L, 14L, 
    2L, 1L, 1L, 1L, 9L, 4L, 9L, 7L, 4L, 5L, 4L, 7L, 4L, 9L, 4L, 
    11L, 4L, 12L, 10L, 4L, 1L, 6L, 1L, 1L, 1L, 8L, 1L, 1L, 1L, 
    3L, 1L, 1L, 13L, 1L), .Label = c("", "-----", "?", "0", "0.2", 
    "0.6", "1", "19.4", "2", "2.2", "4", "5", "564065", "CO2", 
    "Cur."), class = "factor")), class = "data.frame", row.names = c(NA, 
-44L))

下面是我希望的样子：

structure(list(CountryID = c(10000L, 45000L, 5000L, 3000L, 15000L, 
2500L, 8300L, 8000L, 10000L, 3000L, 5000L, 200L, 10000L, 10000L, 
2000L, 7000L), Name = structure(c(16L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), .Label = c("Bahrain", 
"Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", 
"Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", 
"Brunei", "Bulgaria", "Burkina Faso", "The Bahamas"), class = "factor"), 
    Type = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L), .Label = "Unit", class = "factor"), 
    Symbol = structure(c(1L, 2L, 3L, 4L, 6L, 5L, 7L, 8L, 9L, 
    10L, 11L, 12L, 13L, 14L, 15L, 16L), .Label = c("BAHM", "BAHR", 
    "BANG", "BARB", "BELGM", "BELS", "BELZ", "BEN", "BHUT", "BOL", 
    "BOSHER", "BOTS", "BRAZ", "BRUN", "BULG", "BURKF"), class = "factor"), 
    Code = c(3712013344, 4835205752, 3247578406, 5842098895, 
    2287368539, 4307638090, 462793263, 3221488867, 5932776587, 
    3230369605, 2388991307, 4854959101, 1504944270, 2561470743, 
    3205402223, 2453202442), Unit = c(40L, 58L, 70L, 82L, 77L, 
    95L, 95L, 74L, 79L, 100L, 45L, 79L, 95L, 35L, 27L, 58L), 
    Total = c(478030L, 126912L, 476025L, 175973L, 93351L, 51012L, 
    147431L, 203728L, 170553L, 293789L, 304471L, 376281L, 502999L, 
    399160L, 386526L, 230761L), Measurement = c(20L, 29L, 20L, 
    23L, 30L, 29L, 26L, 23L, 24L, 30L, 27L, 29L, 23L, 28L, 30L, 
    29L), Value = c(16083459L, 50568702L, 27726279L, 20909501L, 
    15150240L, 50512814L, 34069742L, 16959919L, 15891735L, 21770264L, 
    34841369L, 49524999L, 20350968L, 25121096L, 30024743L, 38498281L
    ), Percent = c(160L, 177L, 129L, 102L, 25L, 24L, 106L, 15L, 
    87L, 142L, 112L, 126L, 104L, 191L, 180L, 95L), CO2 = c(2, 
    0, 2, 1, 0, 0.2, 0, 1, 0, 2, 0, 4, 0, 5, 2.2, 0)), class = "data.frame", row.names = c(NA, 
-16L))

这可以整合到 read.csv 的参数中吗？或者是否有其他方式可以更轻松地清理数据的底部？

Answer 1

读两遍。第一次使用 readLines("Example.csv") 读入，逐行查找数据结束标记，假设它在第 n 行。然后在第二次读取时，使用

read.csv("Example.csv", header = FALSE,
         col.names = c("CountryID","Name","Type","Symbol","Code","Unit", 
         "Total", "Measurement", "Value", "Percent", "CO2" ), nrows = n - 1)

（如果你用 skip 跳过了一些行，nrows 可能需要取一个不同的值）。

Answer 2

三个想法：

使用 readLines（如 @user2554330 所建议），找到并移除该特定行及其之后的内容，然后用 read.csv(text = ...) 解析过滤后的文本向量。
before[seq_len(min(head(which(!grepl("^[^- ]+$", before$Total)),1)-1L,nrow(before))),]；诚然有点复杂，但它可以满足您的需求（假设您已经使用了 skip=）。
在 pipe(...) 之类的调用中使用外部脚本，例如 sed -e '1,14d;/^[ -]\+$/{g;q;}'。

读取 CSV 文件直到符合唯一标记

Read CSV file up to line with unique marker

r

gsub

read.csv