Python: tf-idf-cosine: 如何在 CSV 文件中实现文档相似度
Python: tf-idf-cosine: How to implement document similarity in CSV file
我有 book.csv 文件,其中包含一些书籍的书目列表。我在数据库中也有用户 table,其中包含用户信息需求。我的目标是做 tf-idf,数据库 table 作为查询的用户信息需求与作为文档的 book.csv 行之间的余弦相似度,当 user_Id 被插入时。所以我在将 csv rows(行)设置为文档时遇到了一些问题。请帮助解决此错误:IndexError: list index out of range
。另一个问题是,即使我向右插入 User_Id,它也会重播错误消息,直到我达到该用户的号码。即如果用户在数据库中位于第 3 位 table 我必须像这样尝试三次
insert User_Id
JU/MF3024/04
no such User exist
insert User_Id
JU/MF3024/04
insert User_Id
JU/MF3024/04
no such User exist
Fit Vectorizer to train set [[0 1]
[1 0]]
Transform Vectorizer to test set [[0 0]
[0 0]
这是我在 python 2.7.11 中的实现代码。我使用了 Python: tf-idf-cosine: to find document similarity
中的一些代码
from sklearn.feature_extraction. text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import numpy.linalg as LA
import pandas as pd
from nltk.corpus import stopwords
from collections import defaultdict
import csv
import mysql.connector as sql
from mysql.connector import connection
# Read Book.csv and collect one document per row (the "train set").
# Bugs fixed: the original initialised `docs` as a dict and then REBOUND it
# to `row[1].split()` on every iteration, so only the last row survived;
# rows with fewer than two columns raised IndexError; `count` was unused.
with open("Book.csv", "rb") as books:
    reader = csv.reader(books, delimiter=',')
    reader.next()  # skip the header row
    docs = []      # one entry per CSV row -> one document per book
    for row in reader:
        if len(row) > 1:         # guard against short/blank rows (IndexError)
            docs.append(row[1])  # keep the whole cell as a single document
query = "" # like test_set on the above post
config = {'user': 'root', 'password': '929255@Tenth', 'host': '127.0.0.1','database': 'juls', 'raise_on_warnings': True,}
db = ql.connect(**config)
cursor = db.cursor()
query = "SELECT * FROM user"
cursor.execute(query)
result = cursor.fetchall()
for r in result:
User_Id = r[0]
First_Name = r[1]
Last_Name = r[2]
College = r[3]
Department = r[4]
Info_need = r[5]
email = r[6]
print "insert User_Id"
Id = str(raw_input())
if Id not in User_Id:
print "no such User exist"
pass
elif Id =="":
print "User ID is blank"
pass
else:
query = "SELECT Info_need from user WHERE User_Id = '%s'" % Id
cursor.execute(query)
stopWords = set(stopwords.words('english'))
vectorizer = CountVectorizer(stop_words = stopWords)
transformer = TfidfTransformer()
trainVectorizerArray = vectorizer.fit_transform(docs).toarray()
testVectorizerArray = vectorizer.transform(query).toarray()
print 'Fit Vectorizer to train set', trainVectorizerArray
print 'Transform Vectorizer to test set', testVectorizerArray
cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
for vector in trainVectorizerArray:
for testV in testVectorizerArray:
cosine = cx(vector, testV)
transformer.fit(trainVectorizerArray)
transformer.fit(testVectorizerArray)
tfidf = transformer.transform(testVectorizerArray)
print tfidf.todense()
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs)
print "RANKED TF-IDF"
print tfidf[0:1]
cosine_similarities = linear_kernel( tfidf[ 0: 1], tfidf). flatten()
print cosine_similarities
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
print related_docs_indices
print cosine_similarities[related_docs_indices]
print docs[14]
我已经解决了将 csv 原始文件作为文档的问题。所以下面的代码是我的解决方案之一。请帮助解决 post.
中的其他问题
# Build the document list: one document per Booklist.csv row (column 1).
stopWords = set(stopwords.words('english'))
lines = []
with open('Booklist.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        if reader.line_num == 1:
            continue  # skip the header line
        # Guard short/blank rows — this is the reported IndexError fix.
        if len(row) > 1:
            lines.append(row[1])  # the cell itself is one document
def build_lexicon(corpus, stop_words=None):
    """Return the set of distinct non-stop-word tokens found in *corpus*.

    corpus     -- iterable of document strings (whitespace-tokenised)
    stop_words -- collection of words to exclude; defaults to the
                  module-level NLTK English ``stopWords`` set, so existing
                  callers are unaffected
    """
    if stop_words is None:
        stop_words = stopWords  # module global built from NLTK
    lexicon = set()
    for doc in corpus:
        lexicon.update(word for word in doc.split() if word not in stop_words)
    return lexicon
vocabulary = build_lexicon(lines)
print 'My vocabulary vector is [' + ',' .join(list(vocabulary)) + ']' # prints whole vocabulary words in the column (row[1]) without stop words
for doc in lines:
print 'The doc %d is: %s' % ((lines.index(doc) +1), doc) #prints each line as document which is my intention
这个答案是针对问题中数据库部分提取用户信息的需要
# Extract the user's information need (major + optional subsidiary subject
# interest) and collect each need into the `queries` list.
db = sql.connect(**config)
cursor = db.cursor()
Id = str(raw_input("insert User_Id: "))
# Parameterised query instead of '%s' interpolation (SQL injection risk).
cursor.execute("SELECT MajorSubjectInterest, SubsidiarySubjectInterest "
               "FROM user WHERE User_Id = %s", (Id,))
result = cursor.fetchall()
queries = []  # bug fixed: the original appended to a list it never created
for r in result:
    Major = r[0]
    Subsidiary = r[1]
    # SubsidiarySubjectInterest is NULLable, so a missing value arrives as
    # None, not "" — `not Subsidiary` covers both cases.
    if not Subsidiary:
        need = Major
    else:
        need = Major + '; ' + Subsidiary
    # Keep each user's need as one query document; it could also be split
    # into separate terms with: needs = need.split(';', 5)
    queries.append(need)
db.close()
我有 book.csv 文件,其中包含一些书籍的书目列表。我在数据库中也有用户 table,其中包含用户信息需求。我的目标是做 tf-idf,数据库 table 作为查询的用户信息需求与作为文档的 book.csv 行之间的余弦相似度,当 user_Id 被插入时。所以我在将 csv rows(行)设置为文档时遇到了一些问题。请帮助解决此错误:IndexError: list index out of range
。另一个问题是,即使我向右插入 User_Id,它也会重播错误消息,直到我达到该用户的号码。即如果用户在数据库中位于第 3 位 table 我必须像这样尝试三次
insert User_Id
JU/MF3024/04
no such User exist
insert User_Id
JU/MF3024/04
insert User_Id
JU/MF3024/04
no such User exist
Fit Vectorizer to train set [[0 1]
[1 0]]
Transform Vectorizer to test set [[0 0]
[0 0]
这是我在 python 2.7.11 中的实现代码。我使用了 Python: tf-idf-cosine: to find document similarity
中的一些代码
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import numpy.linalg as LA
import pandas as pd
from nltk.corpus import stopwords
from collections import defaultdict
import csv
import mysql.connector as sql
from mysql.connector import connection
# Read Book.csv and collect one document per row (the "train set").
# Bugs fixed: the original initialised `docs` as a dict and then REBOUND it
# to `row[1].split()` on every iteration, so only the last row survived;
# rows with fewer than two columns raised IndexError; `count` was unused.
with open("Book.csv", "rb") as books:
    reader = csv.reader(books, delimiter=',')
    reader.next()  # skip the header row
    docs = []      # one entry per CSV row -> one document per book
    for row in reader:
        if len(row) > 1:         # guard against short/blank rows (IndexError)
            docs.append(row[1])  # keep the whole cell as a single document
query = "" # like test_set on the above post
config = {'user': 'root', 'password': '929255@Tenth', 'host': '127.0.0.1','database': 'juls', 'raise_on_warnings': True,}
db = ql.connect(**config)
cursor = db.cursor()
query = "SELECT * FROM user"
cursor.execute(query)
result = cursor.fetchall()
for r in result:
User_Id = r[0]
First_Name = r[1]
Last_Name = r[2]
College = r[3]
Department = r[4]
Info_need = r[5]
email = r[6]
print "insert User_Id"
Id = str(raw_input())
if Id not in User_Id:
print "no such User exist"
pass
elif Id =="":
print "User ID is blank"
pass
else:
query = "SELECT Info_need from user WHERE User_Id = '%s'" % Id
cursor.execute(query)
stopWords = set(stopwords.words('english'))
vectorizer = CountVectorizer(stop_words = stopWords)
transformer = TfidfTransformer()
trainVectorizerArray = vectorizer.fit_transform(docs).toarray()
testVectorizerArray = vectorizer.transform(query).toarray()
print 'Fit Vectorizer to train set', trainVectorizerArray
print 'Transform Vectorizer to test set', testVectorizerArray
cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
for vector in trainVectorizerArray:
for testV in testVectorizerArray:
cosine = cx(vector, testV)
transformer.fit(trainVectorizerArray)
transformer.fit(testVectorizerArray)
tfidf = transformer.transform(testVectorizerArray)
print tfidf.todense()
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs)
print "RANKED TF-IDF"
print tfidf[0:1]
cosine_similarities = linear_kernel( tfidf[ 0: 1], tfidf). flatten()
print cosine_similarities
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
print related_docs_indices
print cosine_similarities[related_docs_indices]
print docs[14]
我已经解决了将 csv 原始文件作为文档的问题。所以下面的代码是我的解决方案之一。请帮助解决 post.
中的其他问题
stopWords = set(stopwords.words('english'))
# Build the document list: one document per Booklist.csv row (column 1).
lines = []
with open('Booklist.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        if reader.line_num == 1:
            continue  # skip the header line
        # Guard short/blank rows — this is the reported IndexError fix.
        if len(row) > 1:
            lines.append(row[1])  # the cell itself is one document
def build_lexicon(corpus, stop_words=None):
    """Return the set of distinct non-stop-word tokens found in *corpus*.

    corpus     -- iterable of document strings (whitespace-tokenised)
    stop_words -- collection of words to exclude; defaults to the
                  module-level NLTK English ``stopWords`` set, so existing
                  callers are unaffected
    """
    if stop_words is None:
        stop_words = stopWords  # module global built from NLTK
    lexicon = set()
    for doc in corpus:
        lexicon.update(word for word in doc.split() if word not in stop_words)
    return lexicon
vocabulary = build_lexicon(lines)
print 'My vocabulary vector is [' + ',' .join(list(vocabulary)) + ']' # prints whole vocabulary words in the column (row[1]) without stop words
for doc in lines:
print 'The doc %d is: %s' % ((lines.index(doc) +1), doc) #prints each line as document which is my intention
这个答案是针对问题中数据库部分提取用户信息的需要
# Extract the user's information need (major + optional subsidiary subject
# interest) and collect each need into the `queries` list.
db = sql.connect(**config)
cursor = db.cursor()
Id = str(raw_input("insert User_Id: "))
# Parameterised query instead of '%s' interpolation (SQL injection risk).
cursor.execute("SELECT MajorSubjectInterest, SubsidiarySubjectInterest "
               "FROM user WHERE User_Id = %s", (Id,))
result = cursor.fetchall()
queries = []  # bug fixed: the original appended to a list it never created
for r in result:
    Major = r[0]
    Subsidiary = r[1]
    # SubsidiarySubjectInterest is NULLable, so a missing value arrives as
    # None, not "" — `not Subsidiary` covers both cases.
    if not Subsidiary:
        need = Major
    else:
        need = Major + '; ' + Subsidiary
    # Keep each user's need as one query document; it could also be split
    # into separate terms with: needs = need.split(';', 5)
    queries.append(need)
db.close()