如何制作一个 python 程序将一堆 pdf 转换为 html?
How to make a python program to convert a bunch of pdfs to html?
我想写一个程序，读取文件夹中的所有 pdf 并把它们全部转换为 html：例如文件夹里有 file1.pdf、file2.pdf、file3.pdf，运行程序后生成 file1.html、file2.html、file3.html，同时保留原始的 pdf 文件。到目前为止，我只能对单个文件执行此操作，不知道如何用循环处理文件夹中的每一个文件。
这是我的代码:
import shlex
import subprocess
import os
import platform
def run(command):
    """Run an external command and capture what it prints.

    Args:
        command: The command line to execute. A string is tokenized with
            ``shlex.split`` on non-Windows platforms and handed to
            ``subprocess.Popen`` unchanged on Windows. A list or tuple of
            arguments is also accepted and is passed through without any
            further splitting.

    Returns:
        A ``(success, output, errors)`` tuple. ``success`` is True when the
        process exit code is 0; ``output`` and ``errors`` are the captured
        stdout and stderr of the process.
    """
    if isinstance(command, (list, tuple)):
        # Already split into arguments: nothing to tokenize.
        args = list(command)
    elif platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() reads both pipes to the end and waits for the exit code.
    output, errors = process.communicate()
    return process.returncode == 0, output, errors
# Change this to your PDF file base directory.
# Raw string so the backslashes of the Windows path are kept literally.
base_directory = r'C:\PROJECT\pdfs'
if not os.path.isdir(base_directory):
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("%s is not a directory" % base_directory)
    raise SystemExit(1)

# Change this to your pdf2htmlEX executable location
#bin_path = 'C:\Python27\pdf2htmlEX\pdf2htmlEX.exe'
#if not os.path.isfile(bin_path):
# print "Could not find %s" % bin_path
# exit(1)

# Visit the base directory and every sub-directory below it.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF (extension compared case-insensitively).
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # file1.pdf -> file1.html, written next to the source PDF, which is
        # left untouched.
        html_path = os.path.splitext(file_path)[0] + '.html'
        # Quote both paths so names containing spaces stay single arguments.
        # NOTE(review): assumes pdf2txt.py can be launched directly by name
        # on this machine - confirm, or prefix it with the Python interpreter.
        command = 'pdf2txt.py -o "%s" "%s"' % (html_path, file_path)
        success, output, errors = run(command)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)
从 https://github.com/coolwanglu/pdf2htmlEX 获取并编译 pdf2htmlEX 项目，然后在 Python 中通过系统调用（命令行）执行 pdf2htmlEX。
这是一个完整的解决方案，使用 os.walk 和 pdf2htmlEX：
import shlex
import subprocess
import os
import platform
def run(command):
    """Run an external command and capture what it prints.

    Args:
        command: The command line to execute. A string is tokenized with
            ``shlex.split`` on non-Windows platforms and handed to
            ``subprocess.Popen`` unchanged on Windows. A list or tuple of
            arguments is also accepted and is passed through without any
            further splitting.

    Returns:
        A ``(success, output, errors)`` tuple. ``success`` is True when the
        process exit code is 0; ``output`` and ``errors`` are the captured
        stdout and stderr of the process.
    """
    if isinstance(command, (list, tuple)):
        # Already split into arguments: nothing to tokenize.
        args = list(command)
    elif platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() reads both pipes to the end and waits for the exit code.
    output, errors = process.communicate()
    return process.returncode == 0, output, errors
# Change this to your PDF file base directory.
# Raw string: without it, "\U" in this path is an invalid escape on Python 3.
base_directory = r'C:\Users\Admin\Desktop\learningpython\PROJECT'
if not os.path.isdir(base_directory):
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("%s is not a directory" % base_directory)
    raise SystemExit(1)

# Change this to your pdf2htmlEX executable location
bin_path = r'C:\Python27\pdf2htmlEX-master\pdf2htmlEX.exe'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    raise SystemExit(1)

# Visit the base directory and every sub-directory below it.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF (extension compared case-insensitively).
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # The PDF's own directory is passed as --dest-dir. Every path is
        # quoted so names containing spaces stay single arguments.
        command = '"%s" --dest-dir "%s" "%s"' % (bin_path, dir_path, file_path)
        success, output, errors = run(command)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)
我想写一个程序，读取文件夹中的所有 pdf 并把它们全部转换为 html：例如文件夹里有 file1.pdf、file2.pdf、file3.pdf，运行程序后生成 file1.html、file2.html、file3.html，同时保留原始的 pdf 文件。到目前为止，我只能对单个文件执行此操作，不知道如何用循环处理文件夹中的每一个文件。这是我的代码：
import shlex
import subprocess
import os
import platform
def run(command):
    """Run an external command and capture what it prints.

    Args:
        command: The command line to execute. A string is tokenized with
            ``shlex.split`` on non-Windows platforms and handed to
            ``subprocess.Popen`` unchanged on Windows. A list or tuple of
            arguments is also accepted and is passed through without any
            further splitting.

    Returns:
        A ``(success, output, errors)`` tuple. ``success`` is True when the
        process exit code is 0; ``output`` and ``errors`` are the captured
        stdout and stderr of the process.
    """
    if isinstance(command, (list, tuple)):
        # Already split into arguments: nothing to tokenize.
        args = list(command)
    elif platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() reads both pipes to the end and waits for the exit code.
    output, errors = process.communicate()
    return process.returncode == 0, output, errors
# Change this to your PDF file base directory.
# Raw string so the backslashes of the Windows path are kept literally.
base_directory = r'C:\PROJECT\pdfs'
if not os.path.isdir(base_directory):
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("%s is not a directory" % base_directory)
    raise SystemExit(1)

# Change this to your pdf2htmlEX executable location
#bin_path = 'C:\Python27\pdf2htmlEX\pdf2htmlEX.exe'
#if not os.path.isfile(bin_path):
# print "Could not find %s" % bin_path
# exit(1)

# Visit the base directory and every sub-directory below it.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF (extension compared case-insensitively).
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # file1.pdf -> file1.html, written next to the source PDF, which is
        # left untouched.
        html_path = os.path.splitext(file_path)[0] + '.html'
        # Quote both paths so names containing spaces stay single arguments.
        # NOTE(review): assumes pdf2txt.py can be launched directly by name
        # on this machine - confirm, or prefix it with the Python interpreter.
        command = 'pdf2txt.py -o "%s" "%s"' % (html_path, file_path)
        success, output, errors = run(command)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)
从 https://github.com/coolwanglu/pdf2htmlEX 获取并编译 pdf2htmlEX 项目，然后在 Python 中通过系统调用（命令行）执行 pdf2htmlEX。
这是一个完整的解决方案，使用 os.walk 和 pdf2htmlEX：
import shlex
import subprocess
import os
import platform
def run(command):
    """Run an external command and capture what it prints.

    Args:
        command: The command line to execute. A string is tokenized with
            ``shlex.split`` on non-Windows platforms and handed to
            ``subprocess.Popen`` unchanged on Windows. A list or tuple of
            arguments is also accepted and is passed through without any
            further splitting.

    Returns:
        A ``(success, output, errors)`` tuple. ``success`` is True when the
        process exit code is 0; ``output`` and ``errors`` are the captured
        stdout and stderr of the process.
    """
    if isinstance(command, (list, tuple)):
        # Already split into arguments: nothing to tokenize.
        args = list(command)
    elif platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() reads both pipes to the end and waits for the exit code.
    output, errors = process.communicate()
    return process.returncode == 0, output, errors
# Change this to your PDF file base directory.
# Raw string: without it, "\U" in this path is an invalid escape on Python 3.
base_directory = r'C:\Users\Admin\Desktop\learningpython\PROJECT'
if not os.path.isdir(base_directory):
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("%s is not a directory" % base_directory)
    raise SystemExit(1)

# Change this to your pdf2htmlEX executable location
bin_path = r'C:\Python27\pdf2htmlEX-master\pdf2htmlEX.exe'
if not os.path.isfile(bin_path):
    print("Could not find %s" % bin_path)
    raise SystemExit(1)

# Visit the base directory and every sub-directory below it.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # Skip anything that is not a PDF (extension compared case-insensitively).
        if not file_name.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(dir_path, file_name)
        # The PDF's own directory is passed as --dest-dir. Every path is
        # quoted so names containing spaces stay single arguments.
        command = '"%s" --dest-dir "%s" "%s"' % (bin_path, dir_path, file_path)
        success, output, errors = run(command)
        if not success:
            print("Could not convert %s to HTML" % file_path)
            print("%s" % errors)