使用 Tesseract OCR 从扫描的 pdf 文件夹中提取文本

Question

我有一段使用 Tesseract OCR 从扫描版 PDF 或普通 PDF 文件中提取（转换）文本的代码。但我希望这段代码能够处理整个文件夹中的 PDF 文件，而不是单个 PDF 文件，并把提取出的文本文件保存到我指定的文件夹中。

查看下面我的代码：

# OCR a single scanned PDF: render each page to a JPEG, run Tesseract on
# every JPEG and append the recognized text to one output text file.
filePath = '/Users/CodingStark/scanned/scanned-file.pdf'

# Presumably pdf2image's convert_from_path, whose second positional
# argument is the rendering resolution in DPI -- confirm against the import.
pages = convert_from_path(filePath, 500)

# Save every page as page_<n>.jpg (numbered from 1) in the working directory.
for image_counter, page in enumerate(pages, start=1):
    filename = "page_" + str(image_counter) + ".jpg"
    page.save(filename, 'JPEG')

# Total number of page images written above.
filelimit = len(pages)

# Text file that receives the OCR output.
outfile = "scanned-file.txt"

# Append mode is kept on purpose (re-running adds to the existing file).
# The with-blocks guarantee that the text file and every page image are
# closed even if OCR fails part-way through.
with open(outfile, "a") as f:
    # Iterate from 1 to the total number of pages.
    for i in range(1, filelimit + 1):
        filename = "page_" + str(i) + ".jpg"

        # Recognize the text in the page image using pytesseract.
        with Image.open(filename) as page_image:
            text = str(pytesseract.image_to_string(page_image))

        # Drop "hyphen + line break" so words split across lines are rejoined.
        text = text.replace('-\n', '')

        f.write(text)

我想让代码自动处理扫描文件夹中的所有 PDF 文件，并把提取出的文本文件保存到我指定的文件夹中。另外，请问有什么办法可以在代码运行结束后删除所有生成的 jpg 文件吗？因为这些文件会占用大量存储空间。非常感谢！！

更新了答案

def tesseractOCR_pdf(pdf):
    """Run Tesseract OCR over every page of a PDF and return the text.

    Each page is rendered to an image by ``convert_from_path`` (called with
    500 as its second argument -- presumably pdf2image's DPI setting) and
    passed straight to ``pytesseract.image_to_string``. No intermediate
    ``page_<n>.jpg`` files are created in, or removed from, the current
    working directory.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The recognized text of all pages, concatenated in page order, with
        every "hyphen + line break" sequence removed so that words split
        across lines are rejoined. An empty string if the PDF has no pages.
    """
    pages = convert_from_path(pdf, 500)

    # OCR the in-memory page images directly. Saving them as page_<n>.jpg and
    # afterwards deleting every "*.jpg" in the working directory also removes
    # unrelated images there (including input documents when the documents
    # live in the working directory), so no temporary files are used at all.
    page_texts = [str(pytesseract.image_to_string(page)) for page in pages]

    # Join once and clean up once instead of re-scanning the growing string
    # after every page.
    return "".join(page_texts).replace('-\n', '')

def tesseractOCR_img(img):
    """Return the text Tesseract recognizes in a single image.

    Args:
        img: The image to read; it is forwarded unchanged to
            ``pytesseract.image_to_string``.

    Returns:
        The recognized text with every "hyphen + line break" sequence
        removed.
    """
    recognized = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    return str(recognized).replace('-\n', '')

def _text_filename(doc):
    """Return the name of the text file that holds the OCR output of ``doc``."""
    stem, ext = os.path.splitext(doc)
    # These extensions are replaced by ".txt"; any other document keeps its
    # full name and gets ".txt" appended (e.g. "scan.png" -> "scan.png.txt").
    if ext in (".pdf", ".tif", ".tiff", ".jpg"):
        return stem + ".txt"
    return doc + ".txt"


def Tesseract_ALL(docDir, txtDir):
    """OCR every document in ``docDir`` and write the text files to ``txtDir``.

    Files whose extension is ``pdf`` are processed with ``tesseractOCR_pdf``;
    every other file is treated as an image and processed with
    ``tesseractOCR_img``. Sub-directories are skipped. A document that cannot
    be processed is reported on stdout and does not stop the batch.

    Args:
        docDir: Directory containing the documents. An empty string means
            the current working directory.
        txtDir: Directory that receives one ``.txt`` file per document. It is
            created if it does not exist yet.

    Returns:
        None.
    """
    if docDir == "":
        docDir = os.getcwd()  # if no docDir passed in
    if txtDir:
        os.makedirs(txtDir, exist_ok=True)

    for doc in os.listdir(docDir):  # iterate through docs in doc directory
        # os.path.join inserts the separator that plain string concatenation
        # loses when the directory is given without a trailing slash.
        docPath = os.path.join(docDir, doc)
        if not os.path.isfile(docPath):
            continue

        try:
            fileExtension = doc.split(".")[-1]

            if fileExtension == "pdf":
                text = tesseractOCR_pdf(docPath)  # text content of the pdf
            else:
                text = tesseractOCR_img(docPath)  # text content of the image

            # The final name is built directly, so no rename pass over
            # txtDir is needed and files already present there stay untouched.
            textFilename = os.path.join(txtDir, _text_filename(doc))

            # UTF-8 so that non-ASCII characters in the OCR output cannot
            # fail on platforms whose default encoding is narrower.
            with open(textFilename, "w", encoding="utf-8") as textFile:
                textFile.write(text)
        except Exception as exc:
            # Best effort: report the failing document and its cause, then
            # continue with the next one.
            print("Error in file: " + str(doc) + " (" + str(exc) + ")")

# Run the batch OCR.
# docDir is the folder that contains the documents; txtDir is the folder
# that receives the generated .txt files.
# A trailing separator keeps these paths valid even where a file name is
# appended to them by plain string concatenation.
docDir = "pdf_folder/"
txtDir = "text_folder/"

Tesseract_ALL(docDir, txtDir)

Answer 1

下面是一个遍历指定目录中所有 PDF 文件的循环：

import glob,os
import os, subprocess

# Resolve to an absolute path *before* changing directory: with a relative
# path the pattern below would be looked up inside pdf_dir a second time
# (dir/dir/*.PDF) after the chdir and match nothing.
pdf_dir = os.path.abspath("dir")
os.chdir(pdf_dir)
# NOTE: the pattern is used as written, so on a case-sensitive file system
# files ending in lower-case ".pdf" are not matched.
for pdf_file in glob.glob(os.path.join(pdf_dir, "*.PDF")):
    # put here what you want to do for each pdf file
    pass

使用 Tesseract OCR 从扫描的 pdf 文件夹中提取文本

Use Tesseract OCR to extract text from a scanned pdf folders

python

pdf

text

tesseract

python-tesseract

更新了答案