如何递归提取zip文件？

Question

我有一个 zip 文件，其中包含三个 zip 文件，如下所示：

zipfile.zip\  
    dirA.zip\
         a  
    dirB.zip\
         b  
    dirC.zip\
         c

我想将 zip 文件中的所有内部 zip 文件提取到具有这些名称（dirA、dirB、dirC）的目录中。
基本上，我希望最终得到如下的目录结构：

output\  
    dirA\
         a  
    dirB\
         b  
    dirC\
         c

我尝试了以下方法：

import os, re
from zipfile import ZipFile

# Fragment from a larger program: `directory`, `data`, `pattern`, `filename`
# and `self` are all defined outside the lines shown here.
# Create the output root; os.makedirs raises FileExistsError if it already exists.
os.makedirs(directory)  # where directory is "\output"
with ZipFile(self.archive_name, "r") as archive:
    # presumably `data` maps an output directory name to the archive members
    # that belong in it -- TODO confirm against the code that builds it.
    for id, files in data.items():
        if files:
            print("Creating", id)
            dirpath = os.path.join(directory, id)

            os.mkdir(dirpath)

            for file in files:
                # NOTE(review): `filename` is never assigned in this snippet; the
                # loop variable is `file` -- confirm which one is intended.
                match = pattern.match(filename)
                new = match.group(2)
                new_filename = os.path.join(dirpath, new)

                # Reads the member's raw bytes. A member that is itself a .zip is
                # copied as-is, not unpacked.
                content = archive.open(file).read()
            # NOTE(review): this `with` sits at the indentation of the `for file`
            # loop, so it runs once after the loop ends and writes only the last
            # member's content.
            with open(new_filename, "wb") as outfile:
                outfile.write(content)

但它只提取 zip 文件，我最终得到：

output\  
    dirA\
         dirA.zip 
    dirB\
         dirB.zip 
    dirC\
         dirC.zip

任何包含代码片段的建议都将不胜感激，因为我阅读了文档并尝试了很多不同的方法，但都没有成功。

Answer 1

解压缩 zip 文件时，您可能希望将内部 zip 文件写入内存，而不是将它们写入磁盘。为此，我使用了 BytesIO.

查看此代码：

import os
import io
import zipfile

def extract(filename):
    """Extract every zip archive stored directly inside ``filename``.

    Each member of the outer archive that is itself a zip archive is unpacked
    into a directory named after the member with its extension removed
    (``dirA.zip`` -> ``dirA``). Directories are created relative to the current
    working directory. Inner archives are read into memory and never written
    to disk. Members that are not zip archives (plain files, directory
    entries) are skipped.

    Args:
        filename: Path to the outer zip archive.

    Raises:
        zipfile.BadZipFile: If ``filename`` itself is not a valid zip archive.
    """
    # Context managers guarantee both archive handles are closed.
    with zipfile.ZipFile(filename) as outer:
        for member in outer.namelist():
            # Read the inner archive into an in-memory buffer.
            buffer = io.BytesIO(outer.read(member))
            if not zipfile.is_zipfile(buffer):
                # Not a nested archive: nothing to unpack, and no directory
                # should be created for it.
                continue
            dirname = os.path.splitext(member)[0]
            # exist_ok keeps re-runs working; makedirs also creates any
            # missing parent directories for members stored under a sub-path.
            os.makedirs(dirname, exist_ok=True)
            with zipfile.ZipFile(buffer) as inner:
                inner.extractall(dirname)

如果您运行 extract("zipfile.zip")，并且 zipfile.zip 的结构为：

zipfile.zip/
    dirA.zip/
        a
    dirB.zip/
        b
    dirC.zip/
        c

输出应该是：

dirA/
  a
dirB/
  b
dirC/
  c

Answer 2

下面是一个提取嵌套 zip 文件（任意嵌套层级）并清理原始 zip 文件的函数：

import zipfile, re, os

def extract_nested_zip(zippedFile, toFolder):
    """Unpack ``zippedFile`` into ``toFolder``, then unpack any zips it contained.

    The archive is deleted once its contents have been extracted. Every file
    under ``toFolder`` whose name ends in ``.zip`` is then handled the same
    way, being extracted into the directory in which it was found.

    Args:
        zippedFile: Path of the archive to extract (removed afterwards).
        toFolder: Directory that receives the extracted contents.
    """
    with zipfile.ZipFile(zippedFile, 'r') as archive:
        archive.extractall(path=toFolder)
    os.remove(zippedFile)

    for current_dir, _subdirs, names in os.walk(toFolder):
        for name in names:
            if re.search(r'\.zip$', name) is None:
                continue
            # Nested archive: unpack it next to where it was found.
            extract_nested_zip(os.path.join(current_dir, name), current_dir)

Answer 3

我尝试了其他一些解决方案，但无法让它们“就地”（in place）工作。因此我把自己处理“就地”解压的解决方案贴出来。注意：它会删除 zip 文件，并用同名目录“替换”它们，所以如果您想保留 zip 文件，请先备份。

策略很简单：解压目录（及其子目录）中的所有 zip 文件，然后不断重复，直到不再剩下任何 zip 文件。之所以需要重复，是因为 zip 文件内部可能还包含 zip 文件。

import os
import io
import zipfile
import re

def unzip_directory(directory):
    """Unzip, then delete, every ``.zip`` file found under ``directory``.

    Each archive is extracted into a sibling directory named after the archive
    with its trailing ``.zip`` removed. If that directory already exists the
    contents are extracted into it, so the archive is always consumed and a
    caller that loops until no zips remain will terminate.

    Archives extracted during this pass may themselves contain zips inside
    newly created directories; those are not visited by this same walk, so
    call the function again to process them.

    Args:
        directory: Root of the tree to scan.
    """
    for root, dirs, files in os.walk(directory):
        for filename in files:
            match = re.search(r'\.zip$', filename)
            if match is None:
                continue
            # Strip only the trailing suffix; splitting on '.zip' would cut
            # names such as 'my.zip.v2.zip' down to 'my'.
            to_path = os.path.join(root, filename[:match.start()])
            zipped_file = os.path.join(root, filename)
            os.makedirs(to_path, exist_ok=True)
            with zipfile.ZipFile(zipped_file, 'r') as zfile:
                zfile.extractall(path=to_path)
            # deletes zip file
            os.remove(zipped_file)

def exists_zip(directory):
    """Return whether any file under ``directory`` has a name ending in ``.zip``.

    The tree is searched recursively and the search stops at the first match.

    Args:
        directory: Root of the tree to scan.

    Returns:
        True if at least one ``.zip`` file exists, otherwise False (including
        when ``directory`` does not exist, since os.walk then yields nothing).
    """
    for _root, _dirs, files in os.walk(directory):
        for filename in files:
            if re.search(r'\.zip$', filename):
                # One hit answers the question; no need to walk the rest.
                return True
    return False

def unzip_directory_recursively(directory, max_iter=1000):
    """Call unzip_directory until all contained zip files are unzipped.

    Zip files revealed by a previous pass are picked up by the next one. The
    loop stops when exists_zip reports no remaining zips or after ``max_iter``
    passes, whichever comes first. Progress is reported with print().

    Args:
        directory: Root of the tree to unzip in place.
        max_iter: Upper bound on the number of passes.
    """
    print("Does the directory path exist? ", os.path.exists(directory))
    iterate = 0
    while exists_zip(directory) and iterate < max_iter:
        unzip_directory(directory)
        iterate += 1
    # Hitting the limit is only a timeout if zips are still left; finishing
    # on exactly the last allowed pass is a success.
    timed_out = iterate >= max_iter and exists_zip(directory)
    pre = "Did" if timed_out else "Did not "
    print(pre, "time out based on max_iter limit of", max_iter, ". Took iterations:", iterate)

假设您的 zip 文件已备份，您可以通过调用 unzip_directory_recursively(your_directory) 来完成以上全部操作。

Answer 4

这对我有用。只需将此脚本与嵌套的 zip 放在同一目录下即可。它会将 zip 解压缩到与原始 zip 同名的目录中，并清理原始 zip。它还将计算嵌套 zip 中的文件总数

import os

from zipfile import ZipFile


def unzip(path, total_count):
    """Recursively unzip every ``.zip`` under ``path`` and count regular files.

    Each archive is extracted into a sibling directory with the same name
    minus the ``.zip`` suffix, the archive is deleted, and the extracted
    directory is processed recursively. Every file that is not a ``.zip`` is
    counted once.

    Args:
        path: Root of the tree to process.
        total_count: Running count carried in from the caller.

    Returns:
        ``total_count`` plus the number of non-zip files found under ``path``
        after all nested archives have been expanded.
    """
    for root, dirs, files in os.walk(path):
        for file in files:
            file_name = os.path.join(root, file)
            if not file_name.endswith('.zip'):
                total_count += 1
                continue
            currentdir = file_name[:-4]
            os.makedirs(currentdir, exist_ok=True)
            with ZipFile(file_name) as zipObj:
                zipObj.extractall(currentdir)
            os.remove(file_name)
            total_count = unzip(currentdir, total_count)
            # The recursive call has already counted everything under
            # currentdir. If that directory existed before extraction it is
            # in `dirs`, and the outer walk would descend into it and count
            # the same files again, so prune it in place.
            extracted_name = file[:-4]
            if extracted_name in dirs:
                dirs.remove(extracted_name)
    return total_count

if __name__ == "__main__":
    # Run only when executed as a script: unzip() deletes archives under the
    # current directory, which must not happen as a side effect of an import.
    total_count = unzip('.', 0)
    print(total_count)

如何递归提取zip文件？

How to extract zip file recursively?

python

zip

unzip

python-3.x