使用 R 从 pdf 文件中提取数据
Extract data from pdf files with R
我正在尝试从 pdf 文件中提取数据(表格)并将它们存储为数据框。
library(pdftools)
library(tabulizerjars)
library(tabulizer)
library(tidyverse)
# Location of the one-page USD bank statement used by both attempts below.
f <- file.path("D:/Araratbank/Statement USD-pages-1.pdf")
# Attempt 1 (pdftools): read the PDF's text into a character vector and
# auto-print it to see what was extracted.
text <- pdftools::pdf_text(pdf = f)
text
#> [1] " ´ ³ÝϳÛÇÝ ·³Õï ÝÇù +\r\n γï³ñáÕ`\r\n îå»ó` سñ·³ñÛ³Ý ²Ýݳ èáµ»ñïÇF226 17/12/19 13:45:39\r\n ø²Ôì²Ìø ´²ÜβÚÆÜ Ð²ÞìÆò\r\n ïñ³Ù³¹ñÙ³Ý ³Ùë³ÃÇíÁ 17/12/19 13:46:16\r\n ´ ³ÝÏ AM24149, ÚáõÝǵ³ÝÏ äñÇí» Ù³ëݳ×ÛáõÕ\r\n Ð³×³Ë áñ¹Ç ³Ýáõ ÝÁ/³Ýí ³Ýáõ ÙÁ §²¸²ØÆàôئ êäÀ\r\n гë ó» вڲêî²Ü ºñ¨³Ý èáõµÇÝÛ³Ýó ÷áÕ. 21/3-19\r\n г׳Õáñ¹Ç Ñ ³ßí Ç Ñ ³Ù³ñÁ/² ñÅáõ ÛÃÁ 24149000206001 USD\r\n ø³Õí ³ÍùÇ Ñ ³Ù³ñ\r\n Ü ³Ë áñ¹ ù³Õí ³ÍùÇ Ó¨³í áñÙ³Ý ³Ùë ³ÃÇí 01/09/19\r\n êϽµÝ³Ï³Ý Ùݳóáñ¹ 01/09/19 CR USD 358,048.19\r\n F226 --1\r\n²Ùë ³ÃÇí ö ³ë ï ³ÃÕÃÇ ö ³ë ï ³ÃÕÃÇ ¶ áõ Ù³ñ DB/ êï ³óáÕÇ/ ì׳ñáÕÇ êï ³óáÕÇ/ ì׳ñáÕÇ êï ³óáÕÇ/ í ׳ñáÕÇ Ü å³ï ³ÏÁ\r\n Ñ ³Ù³ñ Ñ ÕÙ³Ý Ñ ³Ù³ñÁ CR Ñ ³ßí Ç Ñ ³Ù³ñ ³Ýáõ ÝÁ/³Ýí ³Ýáõ ÙÁ µ³ÝÏ\r\n PEPSICO HOLDINGS LLC BLICRUMM / HSBC BANK INVOICE 03/00362660-19 DD 07.08.19A CC. TO\r\n 02/09/19 190902021464049 190902049382049 7,336.83 DB 38410000000213 141580,RU SSIA,MOSCOW (RR) OOO CONTRACT N PS/AD 001/02-18D D 14.02.18\r\n SANDORA LTD 57262, CITIUAUK / CITIBANK INV 32015 DD 06.08.19 ACC. TO CONT RACT N\r\n 02/09/19 190902021461049 190902049391049 12,260.20 DB 38410000000213 UKRAINA, N IKOLAEVSKAYA (UKRAINE) S-19-3972 DD 01.06.2019 FOR NATURAL\r\n JSC PERMALKO, AVTBRUMMXXX / URALSIB INVOICE 255 DD 03.09.19 ACC. TO C\r\n 03/09/19 190903041599049 190903047747049 20,082.24 DB 38410000000213 RUSSIA,614990,G.PERM, BANK OAO ONTRACT N282-15 DTD. 16.09.2015 FO R\r\n OOO RODNIK I K AVTBRUMMXXX / URALSIB INVOICES 184-190 DD 20.08.19 ACC . TO\r\n 03/09/19 190903041597049 190903047761049 93,139.20 DB 38410000000213 RUSSIA,MOSKOVSKA YA BANK OAO CONTRACT N62-M DD 10.05.2016F OR\r\n GLOBAL SPIRITS GROUP MUNIUA22 / TASCOMBANK INVOICES 18,19 DD 23.08.19 ACC. TOC\r\n 03/09/19 190903041591049 190903047819049 41,015.88 DB 38410000000213 LLC 12 VYACHESLAV JSC (FORMERLY BANK ONTRACT N 06/2019-A DD 13.07.19 FOR\r\n ABRAHAM JACOBI- THE RZBAATWW RAIFFEISEN\r\n 04/09/19 ASW07394/040919 190904088136000 14,307.58 CR 38410000000197 BEER STORE 3-22 S.Y. BANK INTERNATIONAL AG\r\n M.D. 
AVIATION SERVICES RZBAATWW RAIFFEISEN INV:03092019 DATE 03/09/19\r\n 04/09/19 ASW97492/030919 190904088137000 14,371.58 CR 38410000000197 LTD 30 SHD. GOSHEN BANK INTERNATIONAL AG\r\n GLOBAL SPIRITS GROUP MUNIUA22 / TASCOMBANK INVOICE 12 DD 09.08.19 ACC. TO CONT RACT\r\n 05/09/19 190905032684049 190905035088049 300.00 DB 38410000000213 LLC 12 VYACHESLAV JSC (FORMERLY BANK N 06/2019-A DD 13.07.19 FOR AD VERTISING\r\n LLC WORLD TRADE BAGAGE22 / BANK OF INVOICE 809 DD 27.08.19 ACC TO CON TRACT\r\n 05/09/19 190905032676049 190905035147049 6,160.00 DB 38410000000213 COMPANY GEORGI GEORGIA N 071218 DD 07/12/18 FOR TRAN SPORTATION\r\n´³ÝϳÛÇÝ ·³ÕïÝÇù*\r\n 1\r\n"
# Attempt 2 (tabulizer): method = "decide" asks tabulizer to pick the
# extraction method itself; then show the structure of the returned object.
statement <- tabulizer::extract_tables(file = f, method = "decide")
utils::str(statement)
#> List of 1
#> $ : chr [1:20, 1:9] "2Ã1ë3ÃÇÃ" "" "" "02/09/19" ...
# Auto-print the full extraction result to inspect the recovered cells.
statement
#> [[1]]
#> [,1] [,2] [,3]
#> [1,] "2Ã1ë3ÃÇÃ" "ö 3ëï3ÃÕÃÇ" "ö 3ëï3ÃÕÃÇ"
#> [2,] "" "Ñ3Ã13ñ" "ÑÕÃ13Ã5 Ñ3Ã13ñÃ1"
#> [3,] "" "" ""
#> [4,] "02/09/19" "190902021464049" "190902049382049"
#> [5,] "" "" ""
#> [6,] "02/09/19" "190902021461049" "190902049391049"
#> [7,] "" "" ""
#> [8,] "03/09/19" "190903041599049" "190903047747049"
#> [9,] "" "" ""
#> [10,] "03/09/19" "190903041597049" "190903047761049"
#> [11,] "" "" ""
#> [12,] "03/09/19" "190903041591049" "190903047819049"
#> [13,] "" "" ""
#> [14,] "04/09/19" "ASW07394/040919" "190904088136000"
#> [15,] "" "" ""
#> [16,] "04/09/19" "ASW97492/030919" "190904088137000"
#> [17,] "" "" ""
#> [18,] "05/09/19" "190905032684049" "190905035088049"
#> [19,] "" "" ""
#> [20,] "05/09/19" "190905032676049" "190905035147049"
#> [,4] [,5] [,6]
#> [1,] "¶ áõÃ13ñ DB/" "" "êï3óáÕÇ/ì×3ñáÕÇ"
#> [2,] "" "CR" "Ñ3ßÃÇ Ñ3Ã13ñ"
#> [3,] "" "" ""
#> [4,] "7,336.83" "DB" "38410000000213"
#> [5,] "" "" ""
#> [6,] "12,260.20" "DB" "38410000000213"
#> [7,] "" "" ""
#> [8,] "20,082.24" "DB" "38410000000213"
#> [9,] "" "" ""
#> [10,] "93,139.20" "DB" "38410000000213"
#> [11,] "" "" ""
#> [12,] "41,015.88" "DB" "38410000000213"
#> [13,] "" "" ""
#> [14,] "14,307.58" "CR" "38410000000197"
#> [15,] "" "" ""
#> [16,] "14,371.58" "CR" "38410000000197"
#> [17,] "" "" ""
#> [18,] "300.00" "DB" "38410000000213"
#> [19,] "" "" ""
#> [20,] "6,160.00" "DB" "38410000000213"
#> [,7] [,8]
#> [1,] "êï3óáÕÇ/ì×3ñáÕÇ" "êï3óáÕÇ/Ã×3ñáÕÇ"
#> [2,] "3Ã5áõÃ5Ã1/3Ã5Ã3Ã5áõÃ1Ã1" "μ3Ã5Ã7"
#> [3,] "PEPSICO HOLDINGS LLC" "BLICRUMM / HSBC BANK"
#> [4,] "141580,RU SSIA,MOSCOW" "(RR) OOO"
#> [5,] "SANDORA LTD57262," "CITIUAUK / CITIBANK"
#> [6,] "UKRAINA, N IKOLAEVSKAYA" "(UKRAINE)"
#> [7,] "JSC PERMALKO," "AVTBRUMMXXX / URALSIB"
#> [8,] "RUSSIA,614990,G.PERM," "BANK OAO"
#> [9,] "OOO RODNIK I K" "AVTBRUMMXXX / URALSIB"
#> [10,] "RUSSIA,MOSKOVSKA YA" "BANK OAO"
#> [11,] "GLOBAL SPIRITS GROUP" "MUNIUA22 / TASCOMBANK"
#> [12,] "LLC12 VYACHESLAV" "JSC (FORMERLY BANK"
#> [13,] "ABRAHAM JACOBI- THE" "RZBAATWW RAIFFEISEN"
#> [14,] "BEER STORE 3-22 S.Y." "BANK INTERNATIONAL AG"
#> [15,] "M.D. AVIATION SERVICES" "RZBAATWW RAIFFEISEN"
#> [16,] "LTD 30 SHD. GOSHEN" "BANK INTERNATIONAL AG"
#> [17,] "GLOBAL SPIRITS GROUP" "MUNIUA22 / TASCOMBANK"
#> [18,] "LLC12 VYACHESLAV" "JSC (FORMERLY BANK"
#> [19,] "LLC WORLD TRADE" "BAGAGE22 / BANK OF"
#> [20,] "COMPANYGEORGI" "GEORGIA"
#> [,9]
#> [1,] "Üå3ï3Ã7Ã1"
#> [2,] ""
#> [3,] "INVOICE 03/00362660-19 DD 07.08.19A CC. TO"
#> [4,] "CONTRACT N PS/AD 001/02-18D D 14.02.18"
#> [5,] "INV 32015 DD 06.08.19 ACC. TO CONT RACT N"
#> [6,] "S-19-3972 DD 01.06.2019 FOR NATURAL"
#> [7,] "INVOICE 255 DD 03.09.19 ACC. TO C"
#> [8,] "ONTRACT N282-15 DTD. 16.09.2015 FO R"
#> [9,] "INVOICES 184-190 DD 20.08.19 ACC . TO"
#> [10,] "CONTRACT N62-M DD 10.05.2016F OR"
#> [11,] "INVOICES 18,19 DD 23.08.19 ACC. TOC"
#> [12,] "ONTRACT N 06/2019-A DD 13.07.19 FOR"
#> [13,] ""
#> [14,] ""
#> [15,] "INV:03092019DATE 03/09/19"
#> [16,] ""
#> [17,] "INVOICE 12 DD 09.08.19 ACC. TO CONT RACT"
#> [18,] "N 06/2019-A DD 13.07.19 FOR AD VERTISING"
#> [19,] "INVOICE 809 DD 27.08.19 ACC TO CON TRACT"
#> [20,] "N 071218 DD 07/12/18 FOR TRAN SPORTATION"
由 reprex 包 (v0.3.0) 于 2020-01-07 创建
这两种方法返回的都是冗长、非结构化且混乱的数据。有没有其他方法可以从 pdf 文件中提取这类数据(即把表格提取为数据框),还是说我必须自己清理和整理这些数据?您可以在此处找到该文件:statement USD
您有两个选择:自己整理文本,这可行但很困难;或者尝试 pdftables 包,这需要您注册其 API。使用 pdftables 可以更快地获得结果,但可转换的 PDF 数量会受到限制。如果您有大量文档要处理,使用 pdftools::pdf_data 获取页面上所有元素的位置再自行整理,可能会更容易。
我正在尝试从 pdf 文件中提取数据(表格)并将它们存储为数据框。
library(pdftools)
library(tabulizerjars)
library(tabulizer)
library(tidyverse)
# Location of the one-page USD bank statement used by both attempts below.
f <- file.path("D:/Araratbank/Statement USD-pages-1.pdf")
# Attempt 1 (pdftools): read the PDF's text into a character vector and
# auto-print it to see what was extracted.
text <- pdftools::pdf_text(pdf = f)
text
#> [1] " ´ ³ÝϳÛÇÝ ·³Õï ÝÇù +\r\n γï³ñáÕ`\r\n îå»ó` سñ·³ñÛ³Ý ²Ýݳ èáµ»ñïÇF226 17/12/19 13:45:39\r\n ø²Ôì²Ìø ´²ÜβÚÆÜ Ð²ÞìÆò\r\n ïñ³Ù³¹ñÙ³Ý ³Ùë³ÃÇíÁ 17/12/19 13:46:16\r\n ´ ³ÝÏ AM24149, ÚáõÝǵ³ÝÏ äñÇí» Ù³ëݳ×ÛáõÕ\r\n Ð³×³Ë áñ¹Ç ³Ýáõ ÝÁ/³Ýí ³Ýáõ ÙÁ §²¸²ØÆàôئ êäÀ\r\n гë ó» вڲêî²Ü ºñ¨³Ý èáõµÇÝÛ³Ýó ÷áÕ. 21/3-19\r\n г׳Õáñ¹Ç Ñ ³ßí Ç Ñ ³Ù³ñÁ/² ñÅáõ ÛÃÁ 24149000206001 USD\r\n ø³Õí ³ÍùÇ Ñ ³Ù³ñ\r\n Ü ³Ë áñ¹ ù³Õí ³ÍùÇ Ó¨³í áñÙ³Ý ³Ùë ³ÃÇí 01/09/19\r\n êϽµÝ³Ï³Ý Ùݳóáñ¹ 01/09/19 CR USD 358,048.19\r\n F226 --1\r\n²Ùë ³ÃÇí ö ³ë ï ³ÃÕÃÇ ö ³ë ï ³ÃÕÃÇ ¶ áõ Ù³ñ DB/ êï ³óáÕÇ/ ì׳ñáÕÇ êï ³óáÕÇ/ ì׳ñáÕÇ êï ³óáÕÇ/ í ׳ñáÕÇ Ü å³ï ³ÏÁ\r\n Ñ ³Ù³ñ Ñ ÕÙ³Ý Ñ ³Ù³ñÁ CR Ñ ³ßí Ç Ñ ³Ù³ñ ³Ýáõ ÝÁ/³Ýí ³Ýáõ ÙÁ µ³ÝÏ\r\n PEPSICO HOLDINGS LLC BLICRUMM / HSBC BANK INVOICE 03/00362660-19 DD 07.08.19A CC. TO\r\n 02/09/19 190902021464049 190902049382049 7,336.83 DB 38410000000213 141580,RU SSIA,MOSCOW (RR) OOO CONTRACT N PS/AD 001/02-18D D 14.02.18\r\n SANDORA LTD 57262, CITIUAUK / CITIBANK INV 32015 DD 06.08.19 ACC. TO CONT RACT N\r\n 02/09/19 190902021461049 190902049391049 12,260.20 DB 38410000000213 UKRAINA, N IKOLAEVSKAYA (UKRAINE) S-19-3972 DD 01.06.2019 FOR NATURAL\r\n JSC PERMALKO, AVTBRUMMXXX / URALSIB INVOICE 255 DD 03.09.19 ACC. TO C\r\n 03/09/19 190903041599049 190903047747049 20,082.24 DB 38410000000213 RUSSIA,614990,G.PERM, BANK OAO ONTRACT N282-15 DTD. 16.09.2015 FO R\r\n OOO RODNIK I K AVTBRUMMXXX / URALSIB INVOICES 184-190 DD 20.08.19 ACC . TO\r\n 03/09/19 190903041597049 190903047761049 93,139.20 DB 38410000000213 RUSSIA,MOSKOVSKA YA BANK OAO CONTRACT N62-M DD 10.05.2016F OR\r\n GLOBAL SPIRITS GROUP MUNIUA22 / TASCOMBANK INVOICES 18,19 DD 23.08.19 ACC. TOC\r\n 03/09/19 190903041591049 190903047819049 41,015.88 DB 38410000000213 LLC 12 VYACHESLAV JSC (FORMERLY BANK ONTRACT N 06/2019-A DD 13.07.19 FOR\r\n ABRAHAM JACOBI- THE RZBAATWW RAIFFEISEN\r\n 04/09/19 ASW07394/040919 190904088136000 14,307.58 CR 38410000000197 BEER STORE 3-22 S.Y. BANK INTERNATIONAL AG\r\n M.D. 
AVIATION SERVICES RZBAATWW RAIFFEISEN INV:03092019 DATE 03/09/19\r\n 04/09/19 ASW97492/030919 190904088137000 14,371.58 CR 38410000000197 LTD 30 SHD. GOSHEN BANK INTERNATIONAL AG\r\n GLOBAL SPIRITS GROUP MUNIUA22 / TASCOMBANK INVOICE 12 DD 09.08.19 ACC. TO CONT RACT\r\n 05/09/19 190905032684049 190905035088049 300.00 DB 38410000000213 LLC 12 VYACHESLAV JSC (FORMERLY BANK N 06/2019-A DD 13.07.19 FOR AD VERTISING\r\n LLC WORLD TRADE BAGAGE22 / BANK OF INVOICE 809 DD 27.08.19 ACC TO CON TRACT\r\n 05/09/19 190905032676049 190905035147049 6,160.00 DB 38410000000213 COMPANY GEORGI GEORGIA N 071218 DD 07/12/18 FOR TRAN SPORTATION\r\n´³ÝϳÛÇÝ ·³ÕïÝÇù*\r\n 1\r\n"
# Attempt 2 (tabulizer): method = "decide" asks tabulizer to pick the
# extraction method itself; then show the structure of the returned object.
statement <- tabulizer::extract_tables(file = f, method = "decide")
utils::str(statement)
#> List of 1
#> $ : chr [1:20, 1:9] "2Ã1ë3ÃÇÃ" "" "" "02/09/19" ...
# Auto-print the full extraction result to inspect the recovered cells.
statement
#> [[1]]
#> [,1] [,2] [,3]
#> [1,] "2Ã1ë3ÃÇÃ" "ö 3ëï3ÃÕÃÇ" "ö 3ëï3ÃÕÃÇ"
#> [2,] "" "Ñ3Ã13ñ" "ÑÕÃ13Ã5 Ñ3Ã13ñÃ1"
#> [3,] "" "" ""
#> [4,] "02/09/19" "190902021464049" "190902049382049"
#> [5,] "" "" ""
#> [6,] "02/09/19" "190902021461049" "190902049391049"
#> [7,] "" "" ""
#> [8,] "03/09/19" "190903041599049" "190903047747049"
#> [9,] "" "" ""
#> [10,] "03/09/19" "190903041597049" "190903047761049"
#> [11,] "" "" ""
#> [12,] "03/09/19" "190903041591049" "190903047819049"
#> [13,] "" "" ""
#> [14,] "04/09/19" "ASW07394/040919" "190904088136000"
#> [15,] "" "" ""
#> [16,] "04/09/19" "ASW97492/030919" "190904088137000"
#> [17,] "" "" ""
#> [18,] "05/09/19" "190905032684049" "190905035088049"
#> [19,] "" "" ""
#> [20,] "05/09/19" "190905032676049" "190905035147049"
#> [,4] [,5] [,6]
#> [1,] "¶ áõÃ13ñ DB/" "" "êï3óáÕÇ/ì×3ñáÕÇ"
#> [2,] "" "CR" "Ñ3ßÃÇ Ñ3Ã13ñ"
#> [3,] "" "" ""
#> [4,] "7,336.83" "DB" "38410000000213"
#> [5,] "" "" ""
#> [6,] "12,260.20" "DB" "38410000000213"
#> [7,] "" "" ""
#> [8,] "20,082.24" "DB" "38410000000213"
#> [9,] "" "" ""
#> [10,] "93,139.20" "DB" "38410000000213"
#> [11,] "" "" ""
#> [12,] "41,015.88" "DB" "38410000000213"
#> [13,] "" "" ""
#> [14,] "14,307.58" "CR" "38410000000197"
#> [15,] "" "" ""
#> [16,] "14,371.58" "CR" "38410000000197"
#> [17,] "" "" ""
#> [18,] "300.00" "DB" "38410000000213"
#> [19,] "" "" ""
#> [20,] "6,160.00" "DB" "38410000000213"
#> [,7] [,8]
#> [1,] "êï3óáÕÇ/ì×3ñáÕÇ" "êï3óáÕÇ/Ã×3ñáÕÇ"
#> [2,] "3Ã5áõÃ5Ã1/3Ã5Ã3Ã5áõÃ1Ã1" "μ3Ã5Ã7"
#> [3,] "PEPSICO HOLDINGS LLC" "BLICRUMM / HSBC BANK"
#> [4,] "141580,RU SSIA,MOSCOW" "(RR) OOO"
#> [5,] "SANDORA LTD57262," "CITIUAUK / CITIBANK"
#> [6,] "UKRAINA, N IKOLAEVSKAYA" "(UKRAINE)"
#> [7,] "JSC PERMALKO," "AVTBRUMMXXX / URALSIB"
#> [8,] "RUSSIA,614990,G.PERM," "BANK OAO"
#> [9,] "OOO RODNIK I K" "AVTBRUMMXXX / URALSIB"
#> [10,] "RUSSIA,MOSKOVSKA YA" "BANK OAO"
#> [11,] "GLOBAL SPIRITS GROUP" "MUNIUA22 / TASCOMBANK"
#> [12,] "LLC12 VYACHESLAV" "JSC (FORMERLY BANK"
#> [13,] "ABRAHAM JACOBI- THE" "RZBAATWW RAIFFEISEN"
#> [14,] "BEER STORE 3-22 S.Y." "BANK INTERNATIONAL AG"
#> [15,] "M.D. AVIATION SERVICES" "RZBAATWW RAIFFEISEN"
#> [16,] "LTD 30 SHD. GOSHEN" "BANK INTERNATIONAL AG"
#> [17,] "GLOBAL SPIRITS GROUP" "MUNIUA22 / TASCOMBANK"
#> [18,] "LLC12 VYACHESLAV" "JSC (FORMERLY BANK"
#> [19,] "LLC WORLD TRADE" "BAGAGE22 / BANK OF"
#> [20,] "COMPANYGEORGI" "GEORGIA"
#> [,9]
#> [1,] "Üå3ï3Ã7Ã1"
#> [2,] ""
#> [3,] "INVOICE 03/00362660-19 DD 07.08.19A CC. TO"
#> [4,] "CONTRACT N PS/AD 001/02-18D D 14.02.18"
#> [5,] "INV 32015 DD 06.08.19 ACC. TO CONT RACT N"
#> [6,] "S-19-3972 DD 01.06.2019 FOR NATURAL"
#> [7,] "INVOICE 255 DD 03.09.19 ACC. TO C"
#> [8,] "ONTRACT N282-15 DTD. 16.09.2015 FO R"
#> [9,] "INVOICES 184-190 DD 20.08.19 ACC . TO"
#> [10,] "CONTRACT N62-M DD 10.05.2016F OR"
#> [11,] "INVOICES 18,19 DD 23.08.19 ACC. TOC"
#> [12,] "ONTRACT N 06/2019-A DD 13.07.19 FOR"
#> [13,] ""
#> [14,] ""
#> [15,] "INV:03092019DATE 03/09/19"
#> [16,] ""
#> [17,] "INVOICE 12 DD 09.08.19 ACC. TO CONT RACT"
#> [18,] "N 06/2019-A DD 13.07.19 FOR AD VERTISING"
#> [19,] "INVOICE 809 DD 27.08.19 ACC TO CON TRACT"
#> [20,] "N 071218 DD 07/12/18 FOR TRAN SPORTATION"
由 reprex 包 (v0.3.0) 于 2020-01-07 创建
这两种方法返回的都是冗长、非结构化且混乱的数据。有没有其他方法可以从 pdf 文件中提取这类数据(即把表格提取为数据框),还是说我必须自己清理和整理这些数据?您可以在此处找到该文件:statement USD
您有两个选择:自己整理文本,这可行但很困难;或者尝试 pdftables 包,这需要您注册其 API。使用 pdftables 可以更快地获得结果,但可转换的 PDF 数量会受到限制。如果您有大量文档要处理,使用 pdftools::pdf_data
.