使用 Python 比较两个文件夹中的文件
Compare files in two folders using Python
我正在尝试比较两个文件夹 'test1' 和 'test2' 中的所有文件对(具有相同文件名的文件)并打印它们可能存在的任何差异。我有下面的这段代码,部分工作正常。它获取具有相同文件名的文件,但只比较第一个文件对而不是文件夹中的所有文件。我如何解决它?示例 csv 文件可能如下所示
import os
from collections import defaultdict
import csv
def relative_files(path):
for root, dirnames, files in os.walk(path):
relroot = os.path.relpath(root, path)
for filename in files:
yield os.path.join(relroot, filename)
def difference_in_files (root_one, root_two):
files_one = set(relative_files(root_one))
for same in files_one.intersection(relative_files(root_two)):
try:
with open(os.path.join(root_one, same)) as csvfile, open(os.path.join(root_two, same)) as csvfile2:
d = defaultdict(list)
d2 = defaultdict(list)
header = next(csvfile).rstrip().split("\t")
h1 = next((i for i, x in enumerate(header) if x == "h1"),None)
h2 = next((i for i, x in enumerate(header) if x == "h2"),None)
header2 = next(csvfile2).rstrip().split("\t")
h12 = next((i for i, x in enumerate(header2) if x == "h1"),None)
h22 = next((i for i, x in enumerate(header2) if x == "h2"),None)
if h1 is not None and h2 is not None:
r = csv.reader(csvfile,delimiter="\t")
for row in r:
d[row[h1]].append(row[h2])
if h12 is not None and h22 is not None:
r = csv.reader(csvfile2,delimiter="\t")
for row in r:
d2[row[h12]].append(row[h22])
d2 = {k: list(set(v)) for k,v in dict(d2).items()}
d = {k: list(set(v)) for k,v in dict(d).items()}
diff = dict([ (key, d2.get(key, d.get(key))) for key in set(d.keys()+d2.keys()) if (key in d and (not key in d2 or d2[key] != d[key])) or (key in d2 and (not key in d or d[key] != d2[key])) ])
diff2 = dict([ (key, d.get(key, d2.get(key))) for key in set(d2.keys()+d.keys()) if (key in d2 and (not key in d or d[key] != d2[key])) or (key in d and (not key in d2 or d2[key] != d[key])) ])
return diff, diff2
except TypeError:
pass
if __name__ == '__main__':
root_one = 'test1'
root_two = 'test2'
difference_in_files (root_one, root_two)
test1/csv1.csv
h1,h2,h3
aa,90,io
bb,86,0n
test1.csv2.csv
h1,h8,h2
jj,kj,64
df,hj,12
test2/csv1.csv
h1,h2,h3
aa,90,io
bb,66,0n
test2.csv2.csv
h1,h8,h2
jj,kj,64
df,hj,12
mm,h9,09
它只比较两个文件中的 csv1 而不是 csv2。
所以才做一个正式的回答。问题是:
return diff, diff2 is in the for loop. It will be executed at the end of
the first iteration of the loop. Thus no other iterations will be
executed.
我正在尝试比较两个文件夹 'test1' 和 'test2' 中的所有文件对(具有相同文件名的文件)并打印它们可能存在的任何差异。我有下面的这段代码,部分工作正常。它获取具有相同文件名的文件,但只比较第一个文件对而不是文件夹中的所有文件。我如何解决它?示例 csv 文件可能如下所示
import os
from collections import defaultdict
import csv
def relative_files(path):
for root, dirnames, files in os.walk(path):
relroot = os.path.relpath(root, path)
for filename in files:
yield os.path.join(relroot, filename)
def difference_in_files (root_one, root_two):
files_one = set(relative_files(root_one))
for same in files_one.intersection(relative_files(root_two)):
try:
with open(os.path.join(root_one, same)) as csvfile, open(os.path.join(root_two, same)) as csvfile2:
d = defaultdict(list)
d2 = defaultdict(list)
header = next(csvfile).rstrip().split("\t")
h1 = next((i for i, x in enumerate(header) if x == "h1"),None)
h2 = next((i for i, x in enumerate(header) if x == "h2"),None)
header2 = next(csvfile2).rstrip().split("\t")
h12 = next((i for i, x in enumerate(header2) if x == "h1"),None)
h22 = next((i for i, x in enumerate(header2) if x == "h2"),None)
if h1 is not None and h2 is not None:
r = csv.reader(csvfile,delimiter="\t")
for row in r:
d[row[h1]].append(row[h2])
if h12 is not None and h22 is not None:
r = csv.reader(csvfile2,delimiter="\t")
for row in r:
d2[row[h12]].append(row[h22])
d2 = {k: list(set(v)) for k,v in dict(d2).items()}
d = {k: list(set(v)) for k,v in dict(d).items()}
diff = dict([ (key, d2.get(key, d.get(key))) for key in set(d.keys()+d2.keys()) if (key in d and (not key in d2 or d2[key] != d[key])) or (key in d2 and (not key in d or d[key] != d2[key])) ])
diff2 = dict([ (key, d.get(key, d2.get(key))) for key in set(d2.keys()+d.keys()) if (key in d2 and (not key in d or d[key] != d2[key])) or (key in d and (not key in d2 or d2[key] != d[key])) ])
return diff, diff2
except TypeError:
pass
if __name__ == '__main__':
root_one = 'test1'
root_two = 'test2'
difference_in_files (root_one, root_two)
test1/csv1.csv
h1,h2,h3
aa,90,io
bb,86,0n
test1.csv2.csv
h1,h8,h2
jj,kj,64
df,hj,12
test2/csv1.csv
h1,h2,h3
aa,90,io
bb,66,0n
test2.csv2.csv
h1,h8,h2
jj,kj,64
df,hj,12
mm,h9,09
它只比较两个文件中的 csv1 而不是 csv2。
所以才做一个正式的回答。问题是:
return diff, diff2 is in the for loop. It will be executed at the end of the first iteration of the loop. Thus no other iterations will be executed.