使用 python 对文件目录自动执行一系列文件比较功能

Question

下面的函数必须是运行才能输出两个文件之间以度数和弧度表示的余弦相似度。我想自动执行此操作以针对第一个文件处理同一目录中的 500 个文件（针对第一个文件一次处理一个文件）：RUMO4009_M.txt。我希望保留所有已处理文件的输出，以查看 500 个文件中每个文件与第一个文件相比的输出。示例输出：

File RUMO4009_M.txt : 2836 lines, 509 words, 278 distinct words File RUMO5038_M.txt : 2672 lines, 479 words, 212 distinct words The cosine between the documents is 0.768315. The angle between the documents is 0.694592 radians or 40 degrees.

import math
import string
import sys

##################################
### Read a text file ###
##################################
def read_file(filename):
    try:
        with open(filename, 'r') as f:
            data = f.read()
        return data
    except IOError:
        print("Error opening or reading input file: ", filename)
        sys.exit()

# using a global variable to map upper case to lower case which will help functions speed up further down.
translation_table = str.maketrans(string.punctuation+string.ascii_uppercase," "*len(string.punctuation)+string.ascii_lowercase)
        
#################################################
### Split the text lines into words ###
#################################################
def get_words_from_line_list(text):
    text = text.translate(translation_table)
    word_list = text.split()
    return word_list
get_words_from_line_list(read_file('RUMO4009_M.txt'))

#################################################
### Split the text lines into words ###
#################################################
def get_words_from_line_list(text):
    text = text.translate(translation_table)
    word_list = text.split()
    return word_list
get_words_from_line_list(read_file('RUMO5038_M.txt'))


#################################################
### Define dotProduct and vector_angle ###
#################################################

def dotProduct(P1, P2):
    tot=0.0

    for key in P1:

        if key in P2:
            tot+=(P1[key]*P2[key])

    return tot

# angle in radians between document vectors
def vector_angle(P1, P2): 
    num=dotProduct(P1, P2)
    den=math.sqrt(dotProduct(P1, P1)*dotProduct(P2, P2))

    return math.acos(num/den)



##############################################
### Count frequency of each word ###
##############################################
def count_frequency(word_list):
    D = {}
    for new_word in word_list:
        if new_word in D:
            D[new_word] = D[new_word] + 1
        else:
            D[new_word] = 1
    return D

count_frequency(get_words_from_line_list(read_file('RUMO4009_M.txt')))

##############################################
### Count frequency of each word ###
##############################################
def count_frequency(word_list):
    D = {}
    for new_word in word_list:
        if new_word in D:
            D[new_word] = D[new_word] + 1
        else:
            D[new_word] = 1
    return D

count_frequency(get_words_from_line_list(read_file('RUMO5038_M.txt')))


###### Define Word Frequencies #########

def word_frequencies_for_file(filename,verbose=False):

    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list) 
    freq_mapping = count_frequency(word_list)

    print("File", filename, ":", ) 
    print(len(line_list), "lines, ", ) 
    print(len(word_list), "words, ", ) 
    print(len(freq_mapping), "distinct words")
    return freq_mapping 

word_frequencies_for_file('RUMO4009_M.txt')


def word_frequencies_for_file(filename,verbose=False):

    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list) 
    freq_mapping = count_frequency(word_list)

    print("File", filename, ":", ) 
    print(len(line_list), "lines, ", ) 
    print(len(word_list), "words, ", ) 
    print(len(freq_mapping), "distinct words")
    return freq_mapping 

word_frequencies_for_file('RUMO5038_M.txt')


##### Cosine Similarity function #######

def document_similarity(filename_1, filename_2, verbose=True):
    """DOCSTRING"""
    '''document_similarity function compares two docs for similarities in line, word, and distinct words count.
    The line: sorted_word_list_1 = word_frequencies_for_file(filename_1, verbose) is biggest
    issue with efficiency.'''
    sorted_word_list_1 = word_frequencies_for_file(filename_1, verbose)
    sorted_word_list_2 = word_frequencies_for_file(filename_2, verbose)
    cosine = vector_angle(sorted_word_list_1,sorted_word_list_2)
    # Use f-strings; see https://realpython.com/python-f-strings/ for more information
    if verbose:
        print(f"The cosine between the documents is {cosine : 0.6f}.")
        print(f"The angle between the documents is {math.acos(cosine) : 0.6f} radians or {math.acos(cosine)*180/math.pi : .0f} degrees.")


document_similarity('RUMO4009_M.txt','RUMO5038_M.txt')

输出：

文件RUMO4009_M.txt： 2836行， 509个字， 278个不同的词文件 RUMO5038_M.txt : 2672行， 479个字， 212个不同的词文件 RUMO4009_M.txt : 2836行， 509个字， 278个不同的词文件 RUMO5038_M.txt : 2672行， 479个字， 212个不同的词文档之间的余弦为 0.768315。文档之间的角度为 0.694592 弧度或 40 度。

Answer 1

### COSINE SIMILARITY
### USE THIS BELOW TO LOOP THROUGH DIRECTORY OF TXT DOCUMENTS 
### COMPARED TO SINGLE SPECIFIED TXT FILE

### %%writefile example.txt

# Import libraries
import math
import string
import sys
import glob

# Locate filenames
filenames = glob.glob('*.txt')
print(filenames)

# Using a global variable to map upper case to lower case which will help functions speed up further down.
translation_table = str.maketrans(string.punctuation+string.ascii_uppercase," "*len(string.punctuation)+string.ascii_lowercase)

##################################
### Read a text file ###
##################################

def read_file(filename):
    try:
        with open(filename, 'r') as f:
            data = f.read()
        return data
    except IOError:
        print("Error opening or reading input file: ", filename)
        sys.exit()
        

#################################################
### Split the text lines into words ###
#################################################

def get_words_from_line_list(text):
    text = text.translate(translation_table)
    word_list = text.split()
    return word_list


#################################################
### Define dotProduct and vector_angle ###
#################################################

# dotProduct
def dotProduct(P1, P2):
    tot=0.0

    for key in P1:

        if key in P2:
            tot+=(P1[key]*P2[key])

    return tot

# angle in radians between document vectors
def vector_angle(P1, P2): 
    num=dotProduct(P1, P2)
    den=math.sqrt(dotProduct(P1, P1)*dotProduct(P2, P2))

    return math.acos(num/den)


##############################################
### Count frequency of each word ###
##############################################

def count_frequency(word_list):
    D = {}
    for new_word in word_list:
        if new_word in D:
            D[new_word] = D[new_word] + 1
        else:
            D[new_word] = 1
    return D


##############################################
### Define Word Frequencies ###
##############################################

def word_frequencies_for_file(filename,verbose=False):

    line_list = read_file(filename)
    word_list = get_words_from_line_list(line_list) 
    freq_mapping = count_frequency(word_list)

    print("File", filename, ":", ) 
    print(len(line_list), "lines, ", ) 
    print(len(word_list), "words, ", ) 
    print(len(freq_mapping), "distinct words")
    return freq_mapping 


##############################################
### Cosine Similarity function  ###
##############################################

def document_similarity(filename_1, filename_2, verbose=True):
    """DOCSTRING"""
    '''document_similarity function compares two docs for similarities in line, word, and distinct words count.
    The line: sorted_word_list_1 = word_frequencies_for_file(filename_1, verbose) is biggest
    issue with efficiency.'''
    sorted_word_list_1 = word_frequencies_for_file(filename_1, verbose)
    sorted_word_list_2 = word_frequencies_for_file(filename_2, verbose)
    angle = vector_angle(sorted_word_list_1,sorted_word_list_2)
    # Use f-strings; see https://realpython.com/python-f-strings/ for more information
    if verbose:
        print(f"The cosine between the documents is {math.cos(angle) : 0.6f}.")
        print(f"The angle between the documents is {angle : 0.6f} radians or {angle*180/math.pi : .0f} degrees.")
    return math.cos(angle)
        

# create empty list
cosine_array=[]

# LOOP THROUGH DIRECTORY OF TXT DOCUMENTS
# s represents all target TXT files to compare against source (main) TXT 1 i.e.: 'RUMO4009.M.txt')
for s in filenames:
    get_words_from_line_list(read_file('RUMO4009_M.txt')) #Change TXT 1 here to compare all files against
    get_words_from_line_list(read_file(s))
    count_frequency(get_words_from_line_list(read_file('RUMO4009_M.txt'))) #Change TXT 1 to compare all files against
    count_frequency(get_words_from_line_list(read_file(s)))
    new_cosine = document_similarity('RUMO4009_M.txt',s) #Change TXT 1 to compare all files against; keep s variable
    cosine_array.append((new_cosine,s))

### NOW SORT RESULTS IN COSINE DESCENDING ORDER ### 
# Most similar target txt files (to source file) will be listed first
# 1.0 value is highest value of similarity; if your source txt is in the target directory,
# it will be compared with itself and have the highest similarity possible: a cosine of 1.0.
# 0.0 value would be the lowest cosine value of similarity

cosine_array.sort(reverse=True,key=lambda x: x[0])

### NOW PRINT OUT SORTED RESULTS ### 

cosine_array


### NOW PLACE SORTED RESULTS INTO A PANDAS DATAFRAME AND EXPORT TO CSV FILE ###

import pandas as pd

df=pd.DataFrame(cosine_array)
df=df.rename(columns={0:'Cosine between Source and Target TXT',1:'Target TXT'})
df.to_csv("results.csv",index=False)
df

使用 python 对文件目录自动执行一系列文件比较功能

Use python to automate series of file comparison functions against directory of files

python

automation

function