[Python]比较两个 zip 文件的函数,一个位于 FTP 目录,另一个在我的本地机器上

[Python]Function that compares two zip files, one located in FTP dir, the other on my local machine

我在编写一个比较两个 zip 文件的函数时遇到了问题(需要判断它们的内容是否相同,而不仅仅是文件名相同)。这是我的代码示例:

def validate_zip_files(self):
    """Download the remote archive over FTP and compare it with a local file.

    The FTP endpoint, credentials, remote directory and both file paths are
    hard-coded. The remote file ``test`` is saved as ``test.zip`` in the
    current working directory and then compared byte-for-byte with
    ``/home/user/file/text.zip``.

    Returns:
        bool: True when both files have identical contents, False otherwise.
    """
    host = '192.168.0.1'
    port = 2323
    username = '123'
    password = '123'
    downloaded_path = 'test.zip'
    local_path = '/home/user/file/text.zip'

    ftp = FTP()
    ftp.connect(host, port)
    try:
        ftp.login(username, password)
        ftp.cwd('test')
        print(ftp.pwd())
        # Close the download target before comparing so every byte is flushed
        # to disk.
        with open(downloaded_path, 'wb') as target:
            ftp.retrbinary('RETR test', target.write)
    finally:
        ftp.close()

    # filecmp.cmp expects path names, not file objects, and re-opening the
    # files in 'wb' mode would truncate both of them. shallow=False forces a
    # real content comparison instead of trusting os.stat() signatures.
    return filecmp.cmp(downloaded_path, local_path, shallow=False)

其中一个问题是:第二个 zip 文件位于另一个位置('/home/user/file/text.zip'),而我是把 zip 文件下载到 python 脚本所在的目录中。另外,我并不能 100% 确定 filecmp.cmp 是否适用于 .zip 文件。

任何想法都会很棒 :) 谢谢。

我不会直接比较文件,而是比较两个文件的散列值。这样就不必依赖 filecmp——正如您所说,它可能不适用于压缩文件。

import hashlib

def _sha256_of_file(path, chunk_size=65536):
    """Return the raw SHA-256 digest of the file at *path*, read in chunks."""
    digest = hashlib.sha256()
    with open(path, 'rb') as stream:
        # Feed the hash incrementally so large archives are never held in
        # memory all at once.
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.digest()


def compare_files(a, b):
    """Tell whether two files have identical contents.

    Args:
        a: Path of the first file.
        b: Path of the second file.

    Returns:
        bool: True when the SHA-256 digests of both files match.
    """
    return _sha256_of_file(a) == _sha256_of_file(b)

可以看看我的 gist:它比较两个 zip 文件的内容,并生成从一个 zip 到另一个 zip 的补丁文件。例如,如果两个 zip 文件包含同名条目但内容不同,我的 gist 能够找出来;如果它们包含不同的条目,gist 同样可以处理。gist 会忽略修改时间上的差异。不过,如果您只关心浅层比较,那么 hashlib 可能是更好的选择。

gist 中的代码如下,供您参考:

import os
import argparse
import collections
import tempfile
import zipfile
import filecmp
import shutil
import shlex

# Result of comparing two archives by entry name:
#   to_rm  - names present only in the old archive
#   to_cmp - names present in both archives
#   to_add - names present only in the new archive
ZipCmpResult = collections.namedtuple('ZipCmpResult', 'to_rm to_cmp to_add')


def make_parser():
    """Build the command-line parser for the patch tool.

    Registers ``--oldfile``, ``--newfile`` and ``--toname``; each defaults to
    a path inside the ``share`` directory and shows that default in its help.

    Returns:
        argparse.ArgumentParser: The configured parser.
    """
    parser = argparse.ArgumentParser(
        description='Make patch zip file from two similar zip files.')
    option_defaults = (
        ('--oldfile', 'old.zip'),
        ('--newfile', 'new.zip'),
        ('--toname', 'patch'),
    )
    for flag, basename in option_defaults:
        parser.add_argument(
            flag,
            default=os.path.join('share', basename),
            help='default: %(default)s')
    return parser


def zipcmp(old, new):
    """Classify the entry names of two zip archives.

    Args:
        old: Path (or file object) of the old archive.
        new: Path (or file object) of the new archive.

    Returns:
        ZipCmpResult: ``to_rm`` holds names found only in *old*, ``to_cmp``
        names found in both, and ``to_add`` names found only in *new*.
    """
    def entry_names(archive_path):
        # Collect the archive's entry names as a set for set arithmetic.
        with zipfile.ZipFile(archive_path) as archive:
            return set(archive.namelist())

    before = entry_names(old)
    after = entry_names(new)
    return ZipCmpResult(
        to_rm=before.difference(after),
        to_cmp=before.intersection(after),
        to_add=after.difference(before),
    )


def _entries_match(zinfile_old, zinfile_new, filename, chunk_size=65536):
    """Return True when entry *filename* has the same content in both zips."""
    old_info = zinfile_old.getinfo(filename)
    new_info = zinfile_new.getinfo(filename)
    # Directory entries carry no data; they match only if both are
    # directories.
    if old_info.is_dir() or new_info.is_dir():
        return old_info.is_dir() and new_info.is_dir()
    # Different uncompressed sizes can never hold equal content.
    if old_info.file_size != new_info.file_size:
        return False
    with zinfile_old.open(old_info) as old_stream, \
         zinfile_new.open(new_info) as new_stream:
        while True:
            old_chunk = old_stream.read(chunk_size)
            new_chunk = new_stream.read(chunk_size)
            if old_chunk != new_chunk:
                return False
            if not old_chunk:
                return True


def compare_files(old, new, cmpresult):
    """Move entries whose content changed from ``to_cmp`` into ``to_add``.

    Every name in ``cmpresult.to_cmp`` is read from both archives and compared
    byte-for-byte. Names whose contents differ are added to
    ``cmpresult.to_add`` in place; ``to_cmp`` itself is left untouched.

    Entries are streamed straight from the archives instead of being
    extracted to disk. Extraction sanitizes member names (absolute paths,
    ``..`` components), so a path rebuilt from the raw name could point at the
    wrong file or outside the temporary directory.

    Args:
        old: Path of the old archive.
        new: Path of the new archive.
        cmpresult: Object with ``to_cmp`` (iterable of names) and ``to_add``
            (mutable set) attributes, e.g. a ``ZipCmpResult``.

    Returns:
        None. The result is reported by mutating ``cmpresult.to_add``.
    """
    with zipfile.ZipFile(old) as zinfile_old, \
         zipfile.ZipFile(new) as zinfile_new:
        for filename in cmpresult.to_cmp:
            if not _entries_match(zinfile_old, zinfile_new, filename):
                cmpresult.to_add.add(filename)


def mkpatch(new, cmpresult, to_name):
    """Write the patch archive and the removal script.

    Creates ``<to_name>.zip`` containing every entry named in
    ``cmpresult.to_add`` (copied from *new*), and ``<to_name>.sh``, a shell
    script with one shell-quoted ``rm`` line per name in ``cmpresult.to_rm``.

    Args:
        new: Path of the new archive the entries are copied from.
        cmpresult: Object exposing ``to_add`` and ``to_rm`` iterables of names.
        to_name: Output path prefix, without extension.
    """
    patch_zip_path = to_name + '.zip'
    script_path = to_name + '.sh'

    with zipfile.ZipFile(new) as source, \
         zipfile.ZipFile(patch_zip_path, 'w') as patch:
        for member in cmpresult.to_add:
            with source.open(member) as reader, \
                 patch.open(member, 'w') as writer:
                shutil.copyfileobj(reader, writer)

    with open(script_path, 'w', encoding='utf-8') as script:
        script.write('#!/bin/sh\n')
        script.writelines(
            'rm {}\n'.format(shlex.quote(stale)) for stale in cmpresult.to_rm)


def main():
    """Run the tool: parse arguments, diff the archives, write the patch."""
    options = make_parser().parse_args()
    old_zip, new_zip = options.oldfile, options.newfile
    diff = zipcmp(old_zip, new_zip)
    # compare_files updates diff.to_add in place with changed entries.
    compare_files(old_zip, new_zip, diff)
    mkpatch(new_zip, diff, options.toname)


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()