解析银行对帐单 PDF
Parsing a bank statement PDF
我手头有几份用户提供的银行对帐单,想找到一种解析其中交易行的方法。我以前用过 PDFBox 的 TextArea、TextStripper,但不确定该如何处理银行对帐单,因为它们的行数不确定,每行的长度也可能固定,也可能不固定。
我借助开源项目 Apache Tika 写过这样一个解析器,用来解析我们的 Chase 信用卡 PDF 对帐单,以缩短报税准备(tax preparation)的时间。
只需要在您的 pom.xml 依赖项中包含 tika 和 pdf 解析器:
<!-- Tika core API: the extractor below imports Metadata, ParseContext and BodyContentHandler from org.apache.tika. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.17</version>
</dependency>
<!-- Tika parser bundle: presumably the artifact that supplies org.apache.tika.parser.pdf.PDFParser (confirm).
     Both artifacts are pinned to the same version here; keep them in step when upgrading. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.17</version>
</dependency>
PDF 提取器也相当简单:
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Command-line driver: extracts the text of each statement PDF with Apache Tika, hands it to
 * {@link ChasePdfParser#parse(String)} and prints every resulting record to {@code System.err}
 * as {@code date|desc|amt}.
 *
 * <p>NOTE(review): {@code StopWatch} and {@code PrettyPrinter} are not imported in the visible
 * source, so they are presumably same-package helpers — confirm. If {@code StopWatch} is a
 * library class that has to be started explicitly, the reported time will be wrong.
 */
public class PdfExtractor {
    private static final Logger logger = LoggerFactory.getLogger(PdfExtractor.class);

    /** Statements processed when no paths are supplied on the command line. */
    private static final String[] DEFAULT_FILES = {
        "C:/Users/m/Downloads/20170115.pdf",
        "C:/Users/m/Downloads/20170215.pdf",
        "C:/Users/m/Downloads/20170315.pdf",
        "C:/Users/m/Downloads/20170415.pdf",
        "C:/Users/m/Downloads/20170515.pdf",
        "C:/Users/m/Downloads/20170615.pdf",
        "C:/Users/m/Downloads/20170715.pdf",
        "C:/Users/m/Downloads/20170815.pdf",
        "C:/Users/m/Downloads/20170915.pdf",
        "C:/Users/m/Downloads/20171015.pdf",
        "C:/Users/m/Downloads/20171115.pdf",
        "C:/Users/m/Downloads/20171215.pdf",
        "C:/Users/m/Downloads/20180115.pdf",
    };

    /**
     * Parses every statement and prints the combined transaction list.
     *
     * @param args optional PDF paths; when empty, {@link #DEFAULT_FILES} is used
     * @throws Exception if a file cannot be read or Tika fails to parse it
     */
    public static void main(String[] args) throws Exception {
        StopWatch sw = new StopWatch();
        String[] files = (args != null && args.length > 0) ? args : DEFAULT_FILES;

        List<ChasePdfParser.ChaseRecord> full = new ArrayList<>();
        for (String fileName : files) {
            logger.info("Now processing {}", fileName);
            full.addAll(ChasePdfParser.parse(extractText(fileName)));
        }

        logger.info("Total processing time: {}", PrettyPrinter.toMsSoundsGood(sw.getTime()));
        full.forEach(cr -> System.err.println(cr.date + "|" + cr.desc + "|" + cr.amt));
    }

    /**
     * Returns the body text Tika extracts from one PDF.
     *
     * @param fileName path of the PDF to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String extractText(String fileName) throws Exception {
        // try-with-resources closes the stream even when parse() throws.
        try (InputStream is = new FileInputStream(fileName)) {
            // -1 disables BodyContentHandler's default 100,000-character write limit, which
            // would otherwise abort long statements with a SAXException.
            ContentHandler contentHandler = new BodyContentHandler(-1);
            new PDFParser().parse(is, contentHandler, new Metadata(), new ParseContext());
            return contentHandler.toString();
        }
    }
}
行解析器也相当简单直接:每一行都包含所有必要的信息,因此很容易解析:
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Extracts transaction rows from the plain text of a Chase credit-card statement.
 *
 * <p>A line is treated as a transaction when it has at least four whitespace-separated tokens,
 * its first token is an {@code MM/dd} date, it contains none of the excluded phrases, and its
 * last token parses as a number. Everything between the first and last token becomes the
 * description. Dates are completed with the fixed year {@code FOR_TAX_YEAR}.
 */
public class ChasePdfParser {
    private static final Logger logger = LoggerFactory.getLogger(ChasePdfParser.class);
    private static final int FOR_TAX_YEAR = 2017;
    private static final String YEAR_EXTENSION = "/" + FOR_TAX_YEAR;
    private static final DateTimeFormatter check = DateTimeFormatter.ofPattern("MM/dd/uuuu");
    /** Lines containing any of these phrases are ignored. */
    private static final List<String> exclusions =
            Arrays.asList("Payment Thank You", "AUTOMATIC PAYMENT");

    /**
     * Parses all transaction rows found in the extracted statement text.
     *
     * @param data statement text, one row per {@code \n}-separated line; may be {@code null}
     * @return the parsed records in input order; empty when nothing matches
     */
    public static List<ChaseRecord> parse(String data) {
        List<ChaseRecord> records = new ArrayList<>();
        if (data == null) {
            return records;
        }
        for (String line : data.split("\n")) {
            if (line.isEmpty()) {
                continue;
            }
            // "\\s" is the regex whitespace class. A bare "\s" in a Java string literal is an
            // illegal escape before Java 15 and a plain space afterwards.
            String[] tokens = line.split("\\s");
            if (tokens.length < 4) {
                continue;
            }
            LocalDate date = extractDate(tokens[0]);
            if (date == null || skip(line)) {
                continue;
            }

            double amount;
            try {
                amount = Double.parseDouble(tokens[tokens.length - 1].replace(",", ""));
            } catch (NumberFormatException e) {
                // A row that does not end in a number is not a transaction; skip it rather
                // than emit a record with a null amount.
                logger.warn("Skipping line with unparseable amount: {}", line);
                continue;
            }

            ChaseRecord cr = new ChaseRecord();
            cr.date = date;
            cr.amt = amount;
            // Runs of whitespace in the line produce empty tokens; collapse the resulting
            // repeated spaces and drop any leading/trailing one.
            cr.desc = String.join(" ", Arrays.copyOfRange(tokens, 1, tokens.length - 1))
                    .replaceAll("\\s\\s+", " ")
                    .trim();
            records.add(cr);
        }
        return records;
    }

    /**
     * Tells whether a line must be ignored.
     *
     * @param s the full statement line
     * @return {@code true} for a null or empty line, or one containing an excluded phrase
     */
    private static boolean skip(String s) {
        if (s == null || s.isEmpty()) {
            return true;
        }
        for (String e : exclusions) {
            if (s.contains(e)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts a five-character {@code MM/dd} token into a date in {@code FOR_TAX_YEAR}.
     *
     * @param s candidate token; may be {@code null}
     * @return the parsed date, or {@code null} when the token is not an {@code MM/dd} date
     */
    protected static LocalDate extractDate(String s) {
        if (s == null || s.length() != 5) {
            return null;
        }
        try {
            return LocalDate.parse(s + YEAR_EXTENSION, check);
        } catch (java.time.format.DateTimeParseException e) {
            return null;
        }
    }

    /**
     * Tells whether a token is an {@code MM/dd} date.
     *
     * @param s candidate token; may be {@code null}
     * @return {@code true} when {@link #extractDate(String)} can parse it
     */
    public static boolean isMMDD(String s) {
        return extractDate(s) != null;
    }

    /** One statement transaction: posting date, description and amount. */
    public static class ChaseRecord {
        public LocalDate date;
        public String desc;
        public Double amt;

        @Override
        public String toString() {
            return "ChaseRecord{" +
                    "date=" + date +
                    ", desc='" + desc + '\'' +
                    ", amt=" + amt +
                    '}';
        }
    }
}
我手头有几份用户提供的银行对帐单,想找到一种解析其中交易行的方法。我以前用过 PDFBox 的 TextArea、TextStripper,但不确定该如何处理银行对帐单,因为它们的行数不确定,每行的长度也可能固定,也可能不固定。
我借助开源项目 Apache Tika 写过这样一个解析器,用来解析我们的 Chase 信用卡 PDF 对帐单,以缩短报税准备(tax preparation)的时间。
只需要在您的 pom.xml 依赖项中包含 tika 和 pdf 解析器:
<!-- Tika core API: the extractor below imports Metadata, ParseContext and BodyContentHandler from org.apache.tika. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.17</version>
</dependency>
<!-- Tika parser bundle: presumably the artifact that supplies org.apache.tika.parser.pdf.PDFParser (confirm).
     Both artifacts are pinned to the same version here; keep them in step when upgrading. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.17</version>
</dependency>
PDF 提取器也相当简单:
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Command-line driver: extracts the text of each statement PDF with Apache Tika, hands it to
 * {@link ChasePdfParser#parse(String)} and prints every resulting record to {@code System.err}
 * as {@code date|desc|amt}.
 *
 * <p>NOTE(review): {@code StopWatch} and {@code PrettyPrinter} are not imported in the visible
 * source, so they are presumably same-package helpers — confirm. If {@code StopWatch} is a
 * library class that has to be started explicitly, the reported time will be wrong.
 */
public class PdfExtractor {
    private static final Logger logger = LoggerFactory.getLogger(PdfExtractor.class);

    /** Statements processed when no paths are supplied on the command line. */
    private static final String[] DEFAULT_FILES = {
        "C:/Users/m/Downloads/20170115.pdf",
        "C:/Users/m/Downloads/20170215.pdf",
        "C:/Users/m/Downloads/20170315.pdf",
        "C:/Users/m/Downloads/20170415.pdf",
        "C:/Users/m/Downloads/20170515.pdf",
        "C:/Users/m/Downloads/20170615.pdf",
        "C:/Users/m/Downloads/20170715.pdf",
        "C:/Users/m/Downloads/20170815.pdf",
        "C:/Users/m/Downloads/20170915.pdf",
        "C:/Users/m/Downloads/20171015.pdf",
        "C:/Users/m/Downloads/20171115.pdf",
        "C:/Users/m/Downloads/20171215.pdf",
        "C:/Users/m/Downloads/20180115.pdf",
    };

    /**
     * Parses every statement and prints the combined transaction list.
     *
     * @param args optional PDF paths; when empty, {@link #DEFAULT_FILES} is used
     * @throws Exception if a file cannot be read or Tika fails to parse it
     */
    public static void main(String[] args) throws Exception {
        StopWatch sw = new StopWatch();
        String[] files = (args != null && args.length > 0) ? args : DEFAULT_FILES;

        List<ChasePdfParser.ChaseRecord> full = new ArrayList<>();
        for (String fileName : files) {
            logger.info("Now processing {}", fileName);
            full.addAll(ChasePdfParser.parse(extractText(fileName)));
        }

        logger.info("Total processing time: {}", PrettyPrinter.toMsSoundsGood(sw.getTime()));
        full.forEach(cr -> System.err.println(cr.date + "|" + cr.desc + "|" + cr.amt));
    }

    /**
     * Returns the body text Tika extracts from one PDF.
     *
     * @param fileName path of the PDF to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened or parsed
     */
    private static String extractText(String fileName) throws Exception {
        // try-with-resources closes the stream even when parse() throws.
        try (InputStream is = new FileInputStream(fileName)) {
            // -1 disables BodyContentHandler's default 100,000-character write limit, which
            // would otherwise abort long statements with a SAXException.
            ContentHandler contentHandler = new BodyContentHandler(-1);
            new PDFParser().parse(is, contentHandler, new Metadata(), new ParseContext());
            return contentHandler.toString();
        }
    }
}
行解析器也相当简单直接:每一行都包含所有必要的信息,因此很容易解析:
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Extracts transaction rows from the plain text of a Chase credit-card statement.
 *
 * <p>A line is treated as a transaction when it has at least four whitespace-separated tokens,
 * its first token is an {@code MM/dd} date, it contains none of the excluded phrases, and its
 * last token parses as a number. Everything between the first and last token becomes the
 * description. Dates are completed with the fixed year {@code FOR_TAX_YEAR}.
 */
public class ChasePdfParser {
    private static final Logger logger = LoggerFactory.getLogger(ChasePdfParser.class);
    private static final int FOR_TAX_YEAR = 2017;
    private static final String YEAR_EXTENSION = "/" + FOR_TAX_YEAR;
    private static final DateTimeFormatter check = DateTimeFormatter.ofPattern("MM/dd/uuuu");
    /** Lines containing any of these phrases are ignored. */
    private static final List<String> exclusions =
            Arrays.asList("Payment Thank You", "AUTOMATIC PAYMENT");

    /**
     * Parses all transaction rows found in the extracted statement text.
     *
     * @param data statement text, one row per {@code \n}-separated line; may be {@code null}
     * @return the parsed records in input order; empty when nothing matches
     */
    public static List<ChaseRecord> parse(String data) {
        List<ChaseRecord> records = new ArrayList<>();
        if (data == null) {
            return records;
        }
        for (String line : data.split("\n")) {
            if (line.isEmpty()) {
                continue;
            }
            // "\\s" is the regex whitespace class. A bare "\s" in a Java string literal is an
            // illegal escape before Java 15 and a plain space afterwards.
            String[] tokens = line.split("\\s");
            if (tokens.length < 4) {
                continue;
            }
            LocalDate date = extractDate(tokens[0]);
            if (date == null || skip(line)) {
                continue;
            }

            double amount;
            try {
                amount = Double.parseDouble(tokens[tokens.length - 1].replace(",", ""));
            } catch (NumberFormatException e) {
                // A row that does not end in a number is not a transaction; skip it rather
                // than emit a record with a null amount.
                logger.warn("Skipping line with unparseable amount: {}", line);
                continue;
            }

            ChaseRecord cr = new ChaseRecord();
            cr.date = date;
            cr.amt = amount;
            // Runs of whitespace in the line produce empty tokens; collapse the resulting
            // repeated spaces and drop any leading/trailing one.
            cr.desc = String.join(" ", Arrays.copyOfRange(tokens, 1, tokens.length - 1))
                    .replaceAll("\\s\\s+", " ")
                    .trim();
            records.add(cr);
        }
        return records;
    }

    /**
     * Tells whether a line must be ignored.
     *
     * @param s the full statement line
     * @return {@code true} for a null or empty line, or one containing an excluded phrase
     */
    private static boolean skip(String s) {
        if (s == null || s.isEmpty()) {
            return true;
        }
        for (String e : exclusions) {
            if (s.contains(e)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Converts a five-character {@code MM/dd} token into a date in {@code FOR_TAX_YEAR}.
     *
     * @param s candidate token; may be {@code null}
     * @return the parsed date, or {@code null} when the token is not an {@code MM/dd} date
     */
    protected static LocalDate extractDate(String s) {
        if (s == null || s.length() != 5) {
            return null;
        }
        try {
            return LocalDate.parse(s + YEAR_EXTENSION, check);
        } catch (java.time.format.DateTimeParseException e) {
            return null;
        }
    }

    /**
     * Tells whether a token is an {@code MM/dd} date.
     *
     * @param s candidate token; may be {@code null}
     * @return {@code true} when {@link #extractDate(String)} can parse it
     */
    public static boolean isMMDD(String s) {
        return extractDate(s) != null;
    }

    /** One statement transaction: posting date, description and amount. */
    public static class ChaseRecord {
        public LocalDate date;
        public String desc;
        public Double amt;

        @Override
        public String toString() {
            return "ChaseRecord{" +
                    "date=" + date +
                    ", desc='" + desc + '\'' +
                    ", amt=" + amt +
                    '}';
        }
    }
}