如何编写一个 Python 程序，将一批 PDF 转换为 HTML？

Question

我想写一个程序，读取文件夹中的所有 PDF 并把它们全部转换为 HTML。例如文件夹中有 file1.pdf、file2.pdf、file3.pdf，运行程序后会生成 file1.html、file2.html、file3.html，当然原始的 PDF 文件要保留。到目前为止我只能对单个文件执行此操作，不知道如何用循环处理文件夹中的每个文件。这是我的代码：

import shlex
import subprocess
import os
import platform

def run(command):
    """Run an external command and capture its output.

    Args:
        command: Either a command-line string or a list/tuple of
            arguments. A string is split with ``shlex.split`` on
            non-Windows platforms and handed to ``Popen`` unchanged on
            Windows. A list/tuple is used as the argument vector as-is,
            so arguments containing spaces need no quoting.

    Returns:
        A ``(success, output, errors)`` tuple. ``success`` is True when
        the process exited with return code 0; ``output`` and ``errors``
        are the captured stdout and stderr of the process.
    """
    if isinstance(command, (list, tuple)):
        # Already an argument vector: splitting it again would fail.
        args = list(command)
    elif platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for the process to exit.
    output, errors = process.communicate()
    return process.returncode == 0, output, errors

# Change this to your PDF file base directory.
# Raw string, so the backslashes of the Windows path are never read as
# escape sequences.
base_directory = r'C:\PROJECT\pdfs'
if not os.path.isdir(base_directory):
    print("%s is not a directory" % base_directory)
    # SystemExit does not depend on the interactive-only exit() helper.
    raise SystemExit(1)
# Change this to your pdf2htmlEX executable location
#bin_path = 'C:\Python27\pdf2htmlEX\pdf2htmlEX.exe'
#if not os.path.isfile(bin_path):
#    print "Could not find %s" % bin_path
#    exit(1)
# Walk the base directory and all of its sub-directories.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF file (".PDF" also counts).
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # file1.pdf -> file1.html, written next to the source PDF so that
        # same-named PDFs in different folders do not overwrite each other.
        html_path = os.path.join(dir_path,
                                 os.path.splitext(file_name)[0] + '.html')
        # Convert the PDF to HTML; the original PDF is left untouched.
        # The paths are quoted so that spaces do not split them.
        # NOTE(review): on Windows, confirm that pdf2txt.py can be started
        # directly as a command without going through the interpreter.
        args = (html_path, file_path)
        success, output, errors = run('pdf2txt.py -o "%s" "%s"' % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)

Answer 1

先编译 pdf2htmlEX 项目（https://github.com/coolwanglu/pdf2htmlEX），然后在 Python 中通过系统调用（命令行）执行 pdf2htmlEX。

Answer 2

这是一个使用 os.walk 和 pdf2htmlEX 的完整解决方案：

import shlex
import subprocess
import os
import platform

def run(command):
    """Execute an external command and collect what it printed.

    Args:
        command: A command-line string, or a list/tuple holding the
            program and its arguments. Strings are tokenized with
            ``shlex.split`` except on Windows, where the string is given
            to ``Popen`` as-is. A list/tuple is taken as the ready-made
            argument vector, which keeps arguments with spaces intact.

    Returns:
        ``(success, output, errors)``: ``success`` is True only for a
        zero return code; ``output`` and ``errors`` hold the captured
        stdout and stderr.
    """
    if isinstance(command, (list, tuple)):
        # A sequence is already split into arguments; use it directly.
        args = list(command)
    elif platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() reads both pipes to the end and waits for exit.
    output, errors = process.communicate()
    return process.returncode == 0, output, errors

# Change this to your PDF file base directory.
# Raw string: in a normal literal the "\U" of "C:\Users" is a syntax
# error on Python 3, and other backslash pairs can silently turn into
# escape sequences.
base_directory = r'C:\Users\Admin\Desktop\learningpython\PROJECT'
if not os.path.isdir(base_directory):
    print("%s is not a directory" % base_directory)
    # SystemExit does not depend on the interactive-only exit() helper.
    raise SystemExit(1)
# Change this to your pdf2htmlEX executable location
bin_path = r'C:\Python27\pdf2htmlEX-master\pdf2htmlEX.exe'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    raise SystemExit(1)
# Walk the base directory and all of its sub-directories.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF file (".PDF" also counts).
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert the PDF to HTML; --dest-dir is the PDF's own folder.
        # Every path is quoted so that spaces do not split it into
        # several arguments.
        args = (bin_path, dir_path, file_path)
        success, output, errors = run('"%s" --dest-dir "%s" "%s"' % args)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)

如何编写一个 Python 程序，将一批 PDF 转换为 HTML？

How to make a python program to convert a bunch of pdfs to html?

html

windows

pdf

converter

python-2.7