在 Python 中对 base64 字符串使用 pytesseract

pytesseract on base64 string python

我有一大批 PNG 图像的 base64 格式图像字符串。它们是电话号码(我使用的示例请参阅 http://www.trulia.com/profile/gerald-drexler-broker-neillsville-wi-10703037/overview,取自电话号码图片的 src 属性)。我想用 pytesseract 处理它们,以提取其中的号码。

我从这里的答案中得到了一些指导:Loading Base64 String into Python Image Library

我尝试了几种写法,但始终弄不清楚如何把字符串正确加载到 PIL 中,以便运行 pytesseract。下面是其中一次尝试:

from PIL import Image
import base64
import pytesseract
import cStringIO

imgstring = 'data: image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGcAAAAVCAYAAABbq/AzAAAACXBIWXMAAA7EAAAOxAGVKw4bAAADiUlEQVRoge3YTWgeVRQG4IcQJJQSQihBNIQiXYUSpJSgIF1IkSKllCJFQggutBQRQRfFH3RTRFwVERdBRHcuREVEupASRIP4C7VoFSmWSq2gtRGjNm21Ls79+k0mM3cmLrpxXhjm++aec973/sy55w4dOnTo0KFDhw4FLOIe3IWrmWtbwWcfLuDhinj7avzvTjyLDXp24F2cxx84gQMVdgfxHVbwFe5tqaEObXxynDCAQ/g2af+8hjM3ftcwIzoPQ5iouObEQG1INvP4PvlVBT+IYxVxhlJ7Vad6GMICHsQkNqWO/I7Zgt1s6twejGEvfhMLrI2GKjT5NHHCs6l/dyTtM8nnzkL/msbvGhbFTOdwDIfT7514GyN4ryb403glE+8JvN/AWcY8Xi/8P45HK+J+0FJDFZp8mjgHxRu1o2TzkH62aDN+YFS8tlMZQdP4S6yCMhZqgr8oVlAdtuFvDGdsyngHL6Xfw0L31pq4Qy00VCHn04bzlmSzsWQznmw2lJ7XjZ8BbMclfJ0R/CRexS8ZmzJGcb/+nnFcrJ6B1P4lroiJb8I4nhGD8FzhGZwu2Z5OHJtbaFiv7jacvTG6qWSzKdmMZbjX4D78kGnfisvYUtNeN/PDYr8Yxo2iCDiH5ws2ZxN/HY7ob8insKvQtj09Hyz5DOkXLm00rEd3G04iZR3Vn8xpkdKuWjtptW+O1HAyI/Y1vJFpzwYvYUakx17nTjb4Dor0MIH9YgPtpZxJ1emjl6YnGzTMWF2NzbTQ3ZZzRKTfc6KIOSoKiMu4oeRbO36DWK4g62GLGJTbM8LXg2/EKtuIpXRfzthfSe3LOJPub+EpkUr+ERNXTMkT6X6mQcObuLnw/NcWuttyLuGBUoxZkcovZXhWYQA/6efDMh7Hh/ikbcAGTCe+Jf38++M6/Iur7k98ZnWqk/5/oX7SexouJu7edbGF7v/KOYBH8HKGoxJjqqu1cVES5g5s1L+Wh3GbyNvjYm+5oH+QnEq8oxW+t4rSfkqkiDHsFmltvmC3R2zcu5LdbnHm2N9SQxWafJo4iYwwIhbTtCiXF1S/AI3bwqfWnnNeEAepJuRK6VOi4vlZnGmKK+4QPq6JuVmcZ86KBXI+2R6wtoNz4iS+Ik7txYFv0lCFNj45TuJguqL/ZeMxa/eaHhonZ07/C8H1wonE26EFPhKfSK4H9ia+Dh06dOjw/8G/sXcmUir28IcAAAAASUVORK5CYII='
# Drop the "data:...;base64," prefix so only the base64 payload remains.
imgstring = imgstring.split('base64,')[-1].strip()
# NOTE: `pic` is created here but not used by any later line of this snippet.
pic = cStringIO.StringIO()
# Decode the payload and wrap the raw bytes in an in-memory file-like object.
image_string = cStringIO.StringIO(base64.b64decode(imgstring))
image = Image.open(image_string)
# Round-trip through a file on disk: save as pic.png, then reopen it.
image.save('pic.png', image.format, quality = 100)
picture = Image.open('pic.png', mode='r')
picture.load()
picture.seek(0)

# `picture` is already the object returned by Image.open above; passing it to
# Image.open a second time is the call that raises "AttributeError: read" in
# the traceback shown below.
print pytesseract.image_to_string(Image.open(picture))

我觉得自己一定是把事情弄复杂了,但即使经过保存、重新加载等步骤,我仍然得到 AttributeError: read

将这些加载到内存中以供 pytesseract 消化它们的最有效方法是什么?我什至还没有进入 tesseract 阶段,我不知道它有多快或多慢,但我有数百万个要处理。

Traceback (most recent call last):
  File "C:\Users\Jeff\Desktop\QS2\tess.py", line 16, in <module>
    print pytesseract.image_to_string(Image.open(picture))
  File "C:\Python27\lib\site-packages\PIL\Image.py", line 2223, in open
    prefix = fp.read(16)
  File "C:\Python27\lib\site-packages\PIL\Image.py", line 605, in __getattr__
    raise AttributeError(name)
AttributeError: read

PNG 透明度似乎引起了问题。在白色背景上叠加可以解决此问题。

from PIL import Image
import base64
import pytesseract
import cStringIO

imgstring = 'data: image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGcAAAAVCAYAAABbq/AzAAAACXBIWXMAAA7EAAAOxAGVKw4bAAADiUlEQVRoge3YTWgeVRQG4IcQJJQSQihBNIQiXYUSpJSgIF1IkSKllCJFQggutBQRQRfFH3RTRFwVERdBRHcuREVEupASRIP4C7VoFSmWSq2gtRGjNm21Ls79+k0mM3cmLrpxXhjm++aec973/sy55w4dOnTo0KFDhw4FLOIe3IWrmWtbwWcfLuDhinj7avzvTjyLDXp24F2cxx84gQMVdgfxHVbwFe5tqaEObXxynDCAQ/g2af+8hjM3ftcwIzoPQ5iouObEQG1INvP4PvlVBT+IYxVxhlJ7Vad6GMICHsQkNqWO/I7Zgt1s6twejGEvfhMLrI2GKjT5NHHCs6l/dyTtM8nnzkL/msbvGhbFTOdwDIfT7514GyN4ryb403glE+8JvN/AWcY8Xi/8P45HK+J+0FJDFZp8mjgHxRu1o2TzkH62aDN+YFS8tlMZQdP4S6yCMhZqgr8oVlAdtuFvDGdsyngHL6Xfw0L31pq4Qy00VCHn04bzlmSzsWQznmw2lJ7XjZ8BbMclfJ0R/CRexS8ZmzJGcb/+nnFcrJ6B1P4lroiJb8I4nhGD8FzhGZwu2Z5OHJtbaFiv7jacvTG6qWSzKdmMZbjX4D78kGnfisvYUtNeN/PDYr8Yxo2iCDiH5ws2ZxN/HY7ob8insKvQtj09Hyz5DOkXLm00rEd3G04iZR3Vn8xpkdKuWjtptW+O1HAyI/Y1vJFpzwYvYUakx17nTjb4Dor0MIH9YgPtpZxJ1emjl6YnGzTMWF2NzbTQ3ZZzRKTfc6KIOSoKiMu4oeRbO36DWK4g62GLGJTbM8LXg2/EKtuIpXRfzthfSe3LOJPub+EpkUr+ERNXTMkT6X6mQcObuLnw/NcWuttyLuGBUoxZkcovZXhWYQA/6efDMh7Hh/ikbcAGTCe+Jf38++M6/Iur7k98ZnWqk/5/oX7SexouJu7edbGF7v/KOYBH8HKGoxJjqqu1cVES5g5s1L+Wh3GbyNvjYm+5oH+QnEq8oxW+t4rSfkqkiDHsFmltvmC3R2zcu5LdbnHm2N9SQxWafJo4iYwwIhbTtCiXF1S/AI3bwqfWnnNeEAepJuRK6VOi4vlZnGmKK+4QPq6JuVmcZ86KBXI+2R6wtoNz4iS+Ik7txYFv0lCFNj45TuJguqL/ZeMxa/eaHhonZ07/C8H1wonE26EFPhKfSK4H9ia+Dh06dOjw/8G/sXcmUir28IcAAAAASUVORK5CYII='
imgstring = imgstring.split('base64,')[-1].strip()
pic = cStringIO.StringIO()
image_string = cStringIO.StringIO(base64.b64decode(imgstring))
image = Image.open(image_string)

# Overlay on white background, see 
bg = Image.new("RGB", image.size, (255,255,255))
bg.paste(image,image)

print pytesseract.image_to_string(bg)

# Save the image passed to pytesseract for debugging purposes
bg.save('pic.png')

适用于 Python 3.* 的版本:

from PIL import Image
import base64
import pytesseract
import io

imgstring = 'data: image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGcAAAAVCAYAAABbq/AzAAAACXBIWXMAAA7EAAAOxAGVKw4bAAADiUlEQVRoge3YTWgeVRQG4IcQJJQSQihBNIQiXYUSpJSgIF1IkSKllCJFQggutBQRQRfFH3RTRFwVERdBRHcuREVEupASRIP4C7VoFSmWSq2gtRGjNm21Ls79+k0mM3cmLrpxXhjm++aec973/sy55w4dOnTo0KFDhw4FLOIe3IWrmWtbwWcfLuDhinj7avzvTjyLDXp24F2cxx84gQMVdgfxHVbwFe5tqaEObXxynDCAQ/g2af+8hjM3ftcwIzoPQ5iouObEQG1INvP4PvlVBT+IYxVxhlJ7Vad6GMICHsQkNqWO/I7Zgt1s6twejGEvfhMLrI2GKjT5NHHCs6l/dyTtM8nnzkL/msbvGhbFTOdwDIfT7514GyN4ryb403glE+8JvN/AWcY8Xi/8P45HK+J+0FJDFZp8mjgHxRu1o2TzkH62aDN+YFS8tlMZQdP4S6yCMhZqgr8oVlAdtuFvDGdsyngHL6Xfw0L31pq4Qy00VCHn04bzlmSzsWQznmw2lJ7XjZ8BbMclfJ0R/CRexS8ZmzJGcb/+nnFcrJ6B1P4lroiJb8I4nhGD8FzhGZwu2Z5OHJtbaFiv7jacvTG6qWSzKdmMZbjX4D78kGnfisvYUtNeN/PDYr8Yxo2iCDiH5ws2ZxN/HY7ob8insKvQtj09Hyz5DOkXLm00rEd3G04iZR3Vn8xpkdKuWjtptW+O1HAyI/Y1vJFpzwYvYUakx17nTjb4Dor0MIH9YgPtpZxJ1emjl6YnGzTMWF2NzbTQ3ZZzRKTfc6KIOSoKiMu4oeRbO36DWK4g62GLGJTbM8LXg2/EKtuIpXRfzthfSe3LOJPub+EpkUr+ERNXTMkT6X6mQcObuLnw/NcWuttyLuGBUoxZkcovZXhWYQA/6efDMh7Hh/ikbcAGTCe+Jf38++M6/Iur7k98ZnWqk/5/oX7SexouJu7edbGF7v/KOYBH8HKGoxJjqqu1cVES5g5s1L+Wh3GbyNvjYm+5oH+QnEq8oxW+t4rSfkqkiDHsFmltvmC3R2zcu5LdbnHm2N9SQxWafJo4iYwwIhbTtCiXF1S/AI3bwqfWnnNeEAepJuRK6VOi4vlZnGmKK+4QPq6JuVmcZ86KBXI+2R6wtoNz4iS+Ik7txYFv0lCFNj45TuJguqL/ZeMxa/eaHhonZ07/C8H1wonE26EFPhKfSK4H9ia+Dh06dOjw/8G/sXcmUir28IcAAAAASUVORK5CYII='
# Drop the "data:...;base64," prefix so only the base64 payload remains.
imgstring = imgstring.split('base64,')[-1].strip()
# Decode the payload and open it straight from memory; no temporary file is needed.
image_string = io.BytesIO(base64.b64decode(imgstring))
# Normalise to RGBA so an alpha band is always available as the paste mask,
# even for PNGs stored without transparency or in palette mode.
image = Image.open(image_string).convert("RGBA")

# Overlay on a white background: PNG transparency appears to be what prevents
# the OCR from reading the digits, so flatten it away first.
bg = Image.new("RGB", image.size, (255, 255, 255))
bg.paste(image, mask=image)

print(pytesseract.image_to_string(bg))

# Save the image passed to pytesseract for debugging purposes
bg.save('pic.png')