Predict extracted data from the image

    import nltk
# Fetch the NLTK stop-word corpus (needed when the vectorizer is built).
nltk.download('stopwords')
import pandas as pd
import numpy as np

# Load the labelled data set from disk.
df = pd.read_csv("All.csv")
# A bare `df` expression only renders in a notebook cell; print() works
# both in a notebook and when the file is run as a script.
print(df)

# First two rows, kept for a quick look at the available columns.
get_features = df.iloc[0:2]
print(get_features)

# Discard rows that have no 'Bank Detail' text.
df = df[pd.notnull(df['Bank Detail'])]
# info is a method: without the call parentheses nothing is reported.
df.info()

# Keep only the text column and the label column.  .copy() gives an
# independent frame, so the column assignment below does not write into a
# slice of the filtered frame (pandas SettingWithCopyWarning).
col = ['Bank Detail', 'Classes']
df = df[col].copy()

# Integer code for every distinct 'Bank Detail' string.
# NOTE(review): this encodes the text column, not the 'Classes' label; if a
# numeric label id was intended, factorize df['Classes'] instead - confirm.
df['classes'] = df['Bank Detail'].factorize()[0]
print(df)
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from sklearn.metrics import accuracy_score
import pickle
from sklearn.model_selection import train_test_split
import cv2
import pytesseract
from pytesseract import image_to_string


# English stop words that the vectorizer should ignore.
stopWords = set(nltk.corpus.stopwords.words('english'))

# scikit-learn documents stop_words as 'english', a list, or None, so the
# set is converted to a (sorted, hence reproducible) list when passed in.
vect = TfidfVectorizer(sublinear_tf=True, encoding='utf-8',
                       decode_error='ignore',
                       stop_words=sorted(stopWords))

# Hold out a third of the rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df["Bank Detail"], df["Classes"], test_size=0.33, random_state=42)

xTrain = X_train
yTrain = y_train

# Fit the vocabulary once, on the training text only.  The original fitted
# twice (fit, then fit_transform) and cast to unicode in only one of the two
# calls; a single fit_transform with the cast does the same job consistently.
xTrainvect = vect.fit_transform(xTrain.values.astype('U'))
# fit() returned the vectorizer itself, so keep the old name bound to it.
tfidf = vect
yTrainvect = yTrain

# Transform (never re-fit) the held-out text with the same unicode cast.
xTestvect = vect.transform(X_test.values.astype('U'))
yTestvect = y_test

model = MultinomialNB(alpha=0.01, fit_prior=True)
model.fit(xTrainvect, yTrainvect)

# Accuracy on the held-out split.
ypred = model.predict(xTestvect)
score = accuracy_score(yTestvect, ypred)
print("Accuracy: ", score)

# Smoke test: classify one hand-written statement line.
test = "DEBIT CARD PURCHASE AT BUFFALO WILD WINGS, FARMINGTON HI, MI ON 061919 . "
new_pred = model.predict(vect.transform([test]))
print(new_pred)


# Read the statement image.  cv2.imread does not raise on a missing or
# unreadable file - it returns None - so fail here with a clear message
# instead of an obscure error inside the OCR call.
img = cv2.imread("chase_bank.jpg")
if img is None:
    raise FileNotFoundError("chase_bank.jpg could not be read as an image")
get_text = pytesseract.image_to_string(img)

# Split the OCR output into word tokens.
a_list = nltk.tokenize.word_tokenize(get_text)
print(a_list)

text_length = len(get_text)
print(text_length)

# getting dates
# Classify every token in one batch instead of one model call per token,
# and keep the tokens the model labels 'Date'.  The empty case is guarded
# because scikit-learn estimators reject a matrix with zero samples.
dates = []
if a_list:
    token_labels = model.predict(vect.transform(a_list))
    dates = [token for token, label in zip(a_list, token_labels)
             if label == 'Date']

print(dates)

I am working on bank statements. My task is to extract three columns of data (dates, amounts, and descriptions) and insert them into Excel sheets. How can I find the dates, descriptions, and amounts in the text extracted from the image so that I can append them to the lists? If anyone knows a better approach, I would like to use it. Please guide me on this project; I would highly appreciate your effort. Thank you.

import nltk
import numpy as np
import pandas as pd

# One-time environment setup.  This is a shell command, not Python, so it
# must not appear as a bare statement in the script.  Run it in a terminal
# (or as `%pip install ocrspace` in a notebook cell):
#     pip install ocrspace

# NLTK resources: the 'punkt' tokenizer models and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

# Load the labelled data set from disk.
df = pd.read_csv("All.csv")
# A bare `df` expression only renders in a notebook cell; print() works
# both in a notebook and when the file is run as a script.
print(df)

# First two rows, kept for a quick look at the available columns.
get_features = df.iloc[0:2]
print(get_features)

# Discard rows that have no 'Bank Detail' text.
df = df[pd.notnull(df['Bank Detail'])]

# info is a method: without the call parentheses nothing is reported.
df.info()

# Keep only the text column and the label column.  .copy() gives an
# independent frame, so the column assignment below does not write into a
# slice of the filtered frame (pandas SettingWithCopyWarning).
col = ['Bank Detail', 'Classes']
df = df[col].copy()

# Integer code for every distinct 'Bank Detail' string.
# NOTE(review): this encodes the text column, not the 'Classes' label; if a
# numeric label id was intended, factorize df['Classes'] instead - confirm.
df['classes'] = df['Bank Detail'].factorize()[0]
print(df)

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from sklearn.metrics import accuracy_score
import pickle
from sklearn.model_selection import train_test_split
import cv2
import pytesseract
from pytesseract import image_to_string

# English stop words that the vectorizer should ignore.
stopWords = set(nltk.corpus.stopwords.words('english'))

# scikit-learn documents stop_words as 'english', a list, or None, so the
# set is converted to a (sorted, hence reproducible) list when passed in.
vect = TfidfVectorizer(sublinear_tf=True, encoding='utf-8',
                       decode_error='ignore',
                       stop_words=sorted(stopWords))

# Hold out a third of the rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df["Bank Detail"], df["Classes"], test_size=0.33, random_state=42)

xTrain = X_train
yTrain = y_train

# Fit the vocabulary once, on the training text only.  The original fitted
# twice (fit, then fit_transform) and cast to unicode in only one of the two
# calls; a single fit_transform with the cast does the same job consistently.
xTrainvect = vect.fit_transform(xTrain.values.astype('U'))
# fit() returned the vectorizer itself, so keep the old name bound to it.
tfidf = vect
yTrainvect = yTrain

# Transform (never re-fit) the held-out text with the same unicode cast.
xTestvect = vect.transform(X_test.values.astype('U'))
yTestvect = y_test


model = MultinomialNB(alpha=0.01, fit_prior=True)
model.fit(xTrainvect, yTrainvect)

# Accuracy on the held-out split.
ypred = model.predict(xTestvect)
score = accuracy_score(yTestvect, ypred)
print("Accuracy: ", score)

def test_predict(text=("DEBIT CARD PURCHASE AT BUFFALO WILD WINGS, "
                       "FARMINGTON HI, MI ON 061919 . ")):
    """Classify one statement line with the trained model and print the result.

    Args:
        text: The line to classify.  Defaults to the sample purchase line
            that was previously hard-coded (its literal was split across
            two source lines, which is a syntax error).

    Returns:
        The array returned by ``model.predict`` for the single input line.
    """
    # The vectorizer expects an iterable of documents, hence the list.
    new_pred = model.predict(vect.transform([text]))
    print(new_pred)
    return new_pred


test_predict()
# NOTE(review): this image array is not used by the OCR.space path below,
# which uploads the file by name; it is kept in case later code needs it.
img = cv2.imread("bank_sheet.jpg")


# extracting text data from the image
def extracting_text_from_image(image_path='bank_sheet.jpg', api_key=None):
    """Send an image to the OCR.space service and return the recognised text.

    Args:
        image_path: File to OCR.  Defaults to the previously hard-coded
            'bank_sheet.jpg'.
        api_key: OCR.space API key.  When omitted, the OCRSPACE_API_KEY
            environment variable is used, falling back to the key that was
            embedded in the original script.

    Returns:
        Whatever ``ocrspace.API.ocr_file`` returns for the image (the
        original printed it but never handed it back to the caller).
    """
    import os

    import ocrspace

    # NOTE(security): a credential committed in source is effectively
    # public.  The literal is kept only so existing runs keep working;
    # prefer the environment variable and rotate this key.
    if api_key is None:
        api_key = os.environ.get('OCRSPACE_API_KEY', '6f80b6ff6288957')

    # Keyword arguments are used because the constructor's positional order
    # is not guaranteed to be the same in every ocrspace release
    # (assumption - confirm against the installed version).
    api = ocrspace.API(api_key=api_key, language=ocrspace.Language.English)

    # One request only: the original called the service twice and threw the
    # first response away.
    banktext = api.ocr_file(image_path)
    print(banktext)
    return banktext


# The call and the result lists live at module level.  In the original they
# were indented into the function body, which made the function call itself
# without end and turned the lists into unreachable locals.
banktext = extracting_text_from_image()

# get_text = pytesseract.image_to_string(banktest)
dates = []
amounts = []
description = []


# tokenizing the string and separately getting dates, description and amount
def getting_dates_description_amount_separately(text=None):
    """Classify each OCR line and sort it into dates/description/amounts.

    Args:
        text: OCR output to process.  Defaults to the module-level
            ``banktext`` (the original read a misspelled name, ``banktest``,
            that was never defined).

    Returns:
        The module-level ``(dates, description, amounts)`` lists after the
        matching lines have been appended to them.
    """
    if text is None:
        text = banktext

    a_list = nltk.tokenize.line_tokenize(text)

    # Predicted label -> list that collects lines with that label.
    buckets = {'Date': dates, 'Amount': amounts, 'Description': description}

    # One batched prediction instead of three model calls per line; skipped
    # for empty input because the model cannot predict on zero samples.
    if a_list:
        labels = model.predict(vect.transform(a_list))
        for s, label in zip(a_list, labels):
            bucket = buckets.get(label)
            if bucket is not None:
                bucket.append(s)

    print(dates, '\n')
    print(description, '\n')
    print(amounts, '\n')
    return dates, description, amounts


getting_dates_description_amount_separately()

import pickle

# Persist the trained classifier.
# NOTE(review): only the model is saved; predicting with it later also needs
# the fitted vectorizer `vect`, which is not written here.
with open('untitled19.pkl', 'wb') as f:
    pickle.dump(model, f)

import csv
from itertools import zip_longest

# Every write happens inside the `with` block: the original wrote the header
# and the row after the file had already been closed.
with open('mycsv.csv', 'w', newline='') as f:
    fieldname = ['Dates', 'Description', 'Amount']
    thewriter = csv.DictWriter(f, fieldnames=fieldname)
    thewriter.writeheader()
    # One CSV row per extracted entry rather than three whole lists packed
    # into a single row.  zip_longest pads with '' when the lists differ in
    # length.  NOTE(review): rows are paired purely by position, which
    # assumes the OCR lines arrive in a consistent order - confirm.
    for date, desc, amount in zip_longest(dates, description, amounts,
                                          fillvalue=''):
        thewriter.writerow({'Dates': date, 'Description': desc,
                            'Amount': amount})

# Read the file back to check what was written.
data_frame = pd.read_csv("mycsv.csv")
print(data_frame)