从 r 中的非结构化文本文件中提取 table
Extract table from unstructured text file in r
我有一个文本文件 data.txt 包含以下格式的多个表格。
//
// TABLE ET_ARCMAT
// ARCID MATID VALTO VALFR MINLO MAXLO MAXFL TRVOL TCONA TCONB TCONC TCOST SUBCO TPRIO HORTO RECTI
// STRING STRING INT INT REAL REAL STRING REAL REAL REAL REAL REAL STRING REAL INT INT
01KNkQkk7jAziGNzdcbHDW 051MjWuW7jkNeBn4hOHngG 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
01KNkQkk7jAziGNzdcbHDW Ww7Em5Fm7jk1dsFo2QZ130 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
01KNkQkk7jES}VQys4BYhm Ww7Em5Fm7jk1dsFo2QZ130 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
01KNkQkk7jES}WTyY4T2hm 051MjWuW7jkNeBn4hOHngG 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
// TABLE ET_ARC
// ARCID FLEID LOCFR LOCTO TTYPE DURAT RNDTR TCTYP DISCR RLDUR
// STRING STRING STRING STRING STRING INT REAL REAL STRING REAL
01KNkQfB7jA{yV0dDRB2r0 01KNkQfB7jA{yV0dDRB2r0 01KNkQUr7jAkokBn7kGH7W 01KNkQUr7jAkokBn7kEn7W 0003 0 1.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00
01KNkQfB7kE5kmrh1M5ImW 01KNkQfB7kE5kmrh1M5ImW 01KNkQUr7jAkokBn7kGH7W 01KNkQUr7jAkokBn7kFn7W 0003 0 1.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00
01KNkQkk7jAIb2l2O8wQ2G 01KNkQkk7jAIb2l2O8wQ2G ULkf}cY{L4Y{P1HCP6yexG avYtOUGQB4MYMCUioDjT9W 0003 0 1.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00
//
// TABLE ET_DEMCLTIM
// DEMCL LOCID MATID BCKTO LAPEN MAXLA NDPEN PENGR
// INT STRING STRING INT REAL INT REAL INT
1 01KNkQUr7jAkokBn7kEn7W 051Meqlv7jgqwRJvGAyX1m 69 4.500000000000000E+08 21 9.999999999000000E+09 0
2 01KNkQUr7jAkokBn7kEn7W 051Meqlv7jgqwRJvGAyX1m 69 4.500000000000000E+08 21 9.999999999000000E+09 0
3 01KNkQUr7jAkokBn7kEn7W 051Meqlv7jgqwRJvGAyX1m 69 4.500000000000000E+08 21 9.999999999000000E+09 0
我需要提取所有表格并将它们存储在 excel 文件中。我试图搜索代码,但找不到正确的代码。请在这方面帮助我。
你可以试试这个代码,它对我分享的示例很有效。
#read the data line wise
tmp <- readLines('data.txt')
#Get the position of table names
table_name_index <- grep('TABLE', tmp)
#Get the name of the tab;e
table_names <- sub('.*TABLE\s+', '', tmp[table_name_index])
#Drop datatypes STRING, INT etc and the tablename
tmp1 <- tmp[-c(table_name_index + 2, table_name_index)]
#Drop empty rows
tmp1 <- tmp1[!(tmp1 == "//" | tmp1 == "")]
#split each table separately
setNames(by(tmp1, cumsum(grepl('//', tmp1)), function(x) {
#Remove '//' from the column names lines
x[1] <- sub('//\s+', '', x[1])
#Read the data as dataframe
read.table(text = paste0(x, collapse = '\n'), header = TRUE)
}), table_names) -> list_of_tables
#Write to excel
writexl::write_xlsx(list_of_tables, 'result.xlsx')
数据存储在 excel 中,分为 3 个单独的工作表。
我有一个文本文件 data.txt 包含以下格式的多个表格。
//
// TABLE ET_ARCMAT
// ARCID MATID VALTO VALFR MINLO MAXLO MAXFL TRVOL TCONA TCONB TCONC TCOST SUBCO TPRIO HORTO RECTI
// STRING STRING INT INT REAL REAL STRING REAL REAL REAL REAL REAL STRING REAL INT INT
01KNkQkk7jAziGNzdcbHDW 051MjWuW7jkNeBn4hOHngG 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
01KNkQkk7jAziGNzdcbHDW Ww7Em5Fm7jk1dsFo2QZ130 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
01KNkQkk7jES}VQys4BYhm Ww7Em5Fm7jk1dsFo2QZ130 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
01KNkQkk7jES}WTyY4T2hm 051MjWuW7jkNeBn4hOHngG 738487 738068 0.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00 1.000000000000000E+00 0.000000000000000E+00 0.000000000000000E+00 1.000000000000000E-02 o 0.000000000000000E+00 0 0
// TABLE ET_ARC
// ARCID FLEID LOCFR LOCTO TTYPE DURAT RNDTR TCTYP DISCR RLDUR
// STRING STRING STRING STRING STRING INT REAL REAL STRING REAL
01KNkQfB7jA{yV0dDRB2r0 01KNkQfB7jA{yV0dDRB2r0 01KNkQUr7jAkokBn7kGH7W 01KNkQUr7jAkokBn7kEn7W 0003 0 1.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00
01KNkQfB7kE5kmrh1M5ImW 01KNkQfB7kE5kmrh1M5ImW 01KNkQUr7jAkokBn7kGH7W 01KNkQUr7jAkokBn7kFn7W 0003 0 1.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00
01KNkQkk7jAIb2l2O8wQ2G 01KNkQkk7jAIb2l2O8wQ2G ULkf}cY{L4Y{P1HCP6yexG avYtOUGQB4MYMCUioDjT9W 0003 0 1.000000000000000E+00 0.000000000000000E+00 o 0.000000000000000E+00
//
// TABLE ET_DEMCLTIM
// DEMCL LOCID MATID BCKTO LAPEN MAXLA NDPEN PENGR
// INT STRING STRING INT REAL INT REAL INT
1 01KNkQUr7jAkokBn7kEn7W 051Meqlv7jgqwRJvGAyX1m 69 4.500000000000000E+08 21 9.999999999000000E+09 0
2 01KNkQUr7jAkokBn7kEn7W 051Meqlv7jgqwRJvGAyX1m 69 4.500000000000000E+08 21 9.999999999000000E+09 0
3 01KNkQUr7jAkokBn7kEn7W 051Meqlv7jgqwRJvGAyX1m 69 4.500000000000000E+08 21 9.999999999000000E+09 0
我需要提取所有表格并将它们存储在 excel 文件中。我试图搜索代码,但找不到正确的代码。请在这方面帮助我。
你可以试试这个代码,它对我分享的示例很有效。
#read the data line wise
tmp <- readLines('data.txt')
#Get the position of table names
table_name_index <- grep('TABLE', tmp)
#Get the name of the tab;e
table_names <- sub('.*TABLE\s+', '', tmp[table_name_index])
#Drop datatypes STRING, INT etc and the tablename
tmp1 <- tmp[-c(table_name_index + 2, table_name_index)]
#Drop empty rows
tmp1 <- tmp1[!(tmp1 == "//" | tmp1 == "")]
#split each table separately
setNames(by(tmp1, cumsum(grepl('//', tmp1)), function(x) {
#Remove '//' from the column names lines
x[1] <- sub('//\s+', '', x[1])
#Read the data as dataframe
read.table(text = paste0(x, collapse = '\n'), header = TRUE)
}), table_names) -> list_of_tables
#Write to excel
writexl::write_xlsx(list_of_tables, 'result.xlsx')
数据存储在 excel 中,分为 3 个单独的工作表。