使用 Python 的 tarfile 提取 tar.gz.part* 格式的所有分卷文件
Extract all part-files using python tarfile of format tar.gz.part*
在远程服务器上,由于某些限制,我使用如下命令(如此处所述)
生成了按 2000 MB 拆分的 tar 归档文件:
tar -cvzf - tdd*20210914*.csv | split -b 2000M - archives/20210914.tar.gz.part
现在,我有一个文件列表:[20210914.tar.gz.partaa, 20210914.tar.gz.partab, 20210914.tar.gz.partac]
,并且需要在 Windows 机器上使用 Python 解压所有这些分卷文件。
def extract(infile: str, path: str) -> None:
    """Extract every member of a gzip-compressed tar archive into a directory.

    Args:
        infile: Path to a complete ``.tar.gz`` archive. A single piece
            produced by ``split`` is not a complete archive: the first piece
            ends mid-stream and later pieces do not start with a gzip header,
            so the pieces must be concatenated before calling this function.
        path: Directory the archive members are extracted into.

    Raises:
        EOFError: If the compressed stream ends before its end-of-stream
            marker (for example when only the first split piece is given).
        tarfile.ReadError: If ``infile`` is not a gzip file.
    """
    # Local import keeps this snippet self-contained; it has no import section.
    import tarfile

    # The context manager closes the archive even when extraction fails
    # part-way, which a trailing tar.close() would not do.
    # NOTE(review): extractall() trusts member paths; only use on archives
    # from a trusted source.
    with tarfile.open(infile, "r:gz") as tar:
        tar.extractall(path=path)
extract("20210914.tar.gz.partaa", path = "tmp") # only the first split piece is passed here, so the gzip stream is incomplete
但是,我得到了 EOFError: Compressed file ended before the end-of-stream marker was reached
,这是预期的,因为(我想)还有两个文件需要提取。
我的问题:如何修改这个函数,使其读取所有分卷文件,并解压到同一目录?
我试图将第二个文件直接传递给函数,但出现了以下错误:
OSError Traceback (most recent call last)
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1643 try:
-> 1644 t = cls.taropen(name, mode, fileobj, **kwargs)
1645 except OSError:
~\.conda\envs\python37\lib\tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
1620 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621 return cls(name, mode, fileobj, **kwargs)
1622
~\.conda\envs\python37\lib\tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
1483 self.firstmember = None
-> 1484 self.firstmember = self.next()
1485
~\.conda\envs\python37\lib\tarfile.py in next(self)
2286 try:
-> 2287 tarinfo = self.tarinfo.fromtarfile(self)
2288 except EOFHeaderError as e:
~\.conda\envs\python37\lib\tarfile.py in fromtarfile(cls, tarfile)
1093
-> 1094 buf = tarfile.fileobj.read(BLOCKSIZE)
1095 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
~\.conda\envs\python37\lib\gzip.py in read(self, size)
286 raise OSError(errno.EBADF, "read() on write-only GzipFile object")
--> 287 return self._buffer.read(size)
288
~\.conda\envs\python37\lib\_compression.py in readinto(self, b)
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data
~\.conda\envs\python37\lib\gzip.py in read(self, size)
473 self._init_read()
--> 474 if not self._read_gzip_header():
475 self._size = self._pos
~\.conda\envs\python37\lib\gzip.py in _read_gzip_header(self)
421 if magic != b'\037\213':
--> 422 raise OSError('Not a gzipped file (%r)' % magic)
423
OSError: Not a gzipped file (b'|\x19')
During handling of the above exception, another exception occurred:
ReadError Traceback (most recent call last)
<ipython-input-77-29d5169be949> in <module>
----> 1 extract("20210914.tar.gz.partab", path = "tmp") # where file is first file
<ipython-input-75-60cd4e78bf4e> in extract(infile, path, chunk, **kwargs)
1 def extract(infile : str, path : str, chunk : int = 2000, **kwargs):
----> 2 tar = tarfile.open(infile, "r:gz")
3 tar.extractall(path = path)
4 tar.close()
~\.conda\envs\python37\lib\tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
1589 else:
1590 raise CompressionError("unknown compression type %r" % comptype)
-> 1591 return func(name, filemode, fileobj, **kwargs)
1592
1593 elif "|" in mode:
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1646 fileobj.close()
1647 if mode == 'r':
-> 1648 raise ReadError("not a gzip file")
1649 raise
1650 except:
ReadError: not a gzip file
split 所做的正如其名——把文件拆分成若干部分。因此你应该先把手头的所有分卷按顺序拼接起来,
然后把结果当作普通的 *.tar.gz 文件处理。你可以用 Python 来拼接它们,例如创建如下的文件 concater.py:
import shutil
import sys


def concat_parts(part_names, output_name='total.tar.gz'):
    """Concatenate split pieces, in the given order, into a single file.

    Args:
        part_names: Iterable of paths to the pieces produced by ``split``,
            in the order they must be joined.
        output_name: Path of the combined file to create (overwritten if it
            already exists).
    """
    with open(output_name, 'wb') as combined:
        for part_name in part_names:
            with open(part_name, 'rb') as part:
                # Stream in chunks: a piece can be ~2000 MB, and read() would
                # load the whole piece into memory at once.
                shutil.copyfileobj(part, combined)


if __name__ == '__main__':
    # sys.argv[0] is the script name; the remaining arguments are the pieces.
    concat_parts(sys.argv[1:])
然后做
python concater.py 20210914.tar.gz.partaa 20210914.tar.gz.partab 20210914.tar.gz.partac
这应该会创建 total.tar.gz,
它可以被当作单个 *.tar.gz 文件处理。sys.argv
保存的是当前脚本名称及其后的命令行参数,因此我丢弃了其中的第一个元素(即脚本名称)。
在远程服务器上,由于某些限制,我使用如下命令(如此处所述)生成了按 2000 MB 拆分的 tar 归档文件:
tar -cvzf - tdd*20210914*.csv | split -b 2000M - archives/20210914.tar.gz.part
现在,我有一个文件列表:[20210914.tar.gz.partaa, 20210914.tar.gz.partab, 20210914.tar.gz.partac]
,并且需要在 Windows 机器上使用 Python 解压所有这些分卷文件。
def extract(infile: str, path: str) -> None:
    """Extract every member of a gzip-compressed tar archive into a directory.

    Args:
        infile: Path to a complete ``.tar.gz`` archive. A single piece
            produced by ``split`` is not a complete archive: the first piece
            ends mid-stream and later pieces do not start with a gzip header,
            so the pieces must be concatenated before calling this function.
        path: Directory the archive members are extracted into.

    Raises:
        EOFError: If the compressed stream ends before its end-of-stream
            marker (for example when only the first split piece is given).
        tarfile.ReadError: If ``infile`` is not a gzip file.
    """
    # Local import keeps this snippet self-contained; it has no import section.
    import tarfile

    # The context manager closes the archive even when extraction fails
    # part-way, which a trailing tar.close() would not do.
    # NOTE(review): extractall() trusts member paths; only use on archives
    # from a trusted source.
    with tarfile.open(infile, "r:gz") as tar:
        tar.extractall(path=path)
extract("20210914.tar.gz.partaa", path = "tmp") # only the first split piece is passed here, so the gzip stream is incomplete
但是,我得到了 EOFError: Compressed file ended before the end-of-stream marker was reached
,这是预期的,因为(我想)还有两个文件需要提取。
我的问题:如何修改这个函数,使其读取所有分卷文件,并解压到同一目录?
我试图将第二个文件直接传递给函数,但出现了以下错误:
OSError Traceback (most recent call last)
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1643 try:
-> 1644 t = cls.taropen(name, mode, fileobj, **kwargs)
1645 except OSError:
~\.conda\envs\python37\lib\tarfile.py in taropen(cls, name, mode, fileobj, **kwargs)
1620 raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
-> 1621 return cls(name, mode, fileobj, **kwargs)
1622
~\.conda\envs\python37\lib\tarfile.py in __init__(self, name, mode, fileobj, format, tarinfo, dereference, ignore_zeros, encoding, errors, pax_headers, debug, errorlevel, copybufsize)
1483 self.firstmember = None
-> 1484 self.firstmember = self.next()
1485
~\.conda\envs\python37\lib\tarfile.py in next(self)
2286 try:
-> 2287 tarinfo = self.tarinfo.fromtarfile(self)
2288 except EOFHeaderError as e:
~\.conda\envs\python37\lib\tarfile.py in fromtarfile(cls, tarfile)
1093
-> 1094 buf = tarfile.fileobj.read(BLOCKSIZE)
1095 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
~\.conda\envs\python37\lib\gzip.py in read(self, size)
286 raise OSError(errno.EBADF, "read() on write-only GzipFile object")
--> 287 return self._buffer.read(size)
288
~\.conda\envs\python37\lib\_compression.py in readinto(self, b)
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data
~\.conda\envs\python37\lib\gzip.py in read(self, size)
473 self._init_read()
--> 474 if not self._read_gzip_header():
475 self._size = self._pos
~\.conda\envs\python37\lib\gzip.py in _read_gzip_header(self)
421 if magic != b'\037\213':
--> 422 raise OSError('Not a gzipped file (%r)' % magic)
423
OSError: Not a gzipped file (b'|\x19')
During handling of the above exception, another exception occurred:
ReadError Traceback (most recent call last)
<ipython-input-77-29d5169be949> in <module>
----> 1 extract("20210914.tar.gz.partab", path = "tmp") # where file is first file
<ipython-input-75-60cd4e78bf4e> in extract(infile, path, chunk, **kwargs)
1 def extract(infile : str, path : str, chunk : int = 2000, **kwargs):
----> 2 tar = tarfile.open(infile, "r:gz")
3 tar.extractall(path = path)
4 tar.close()
~\.conda\envs\python37\lib\tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
1589 else:
1590 raise CompressionError("unknown compression type %r" % comptype)
-> 1591 return func(name, filemode, fileobj, **kwargs)
1592
1593 elif "|" in mode:
~\.conda\envs\python37\lib\tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
1646 fileobj.close()
1647 if mode == 'r':
-> 1648 raise ReadError("not a gzip file")
1649 raise
1650 except:
ReadError: not a gzip file
split 所做的正如其名——把文件拆分成若干部分。因此你应该先把手头的所有分卷按顺序拼接起来,
然后把结果当作普通的 *.tar.gz 文件处理。你可以用 Python 来拼接它们,例如创建如下的文件 concater.py:
import shutil
import sys


def concat_parts(part_names, output_name='total.tar.gz'):
    """Concatenate split pieces, in the given order, into a single file.

    Args:
        part_names: Iterable of paths to the pieces produced by ``split``,
            in the order they must be joined.
        output_name: Path of the combined file to create (overwritten if it
            already exists).
    """
    with open(output_name, 'wb') as combined:
        for part_name in part_names:
            with open(part_name, 'rb') as part:
                # Stream in chunks: a piece can be ~2000 MB, and read() would
                # load the whole piece into memory at once.
                shutil.copyfileobj(part, combined)


if __name__ == '__main__':
    # sys.argv[0] is the script name; the remaining arguments are the pieces.
    concat_parts(sys.argv[1:])
然后做
python concater.py 20210914.tar.gz.partaa 20210914.tar.gz.partab 20210914.tar.gz.partac
这应该会创建 total.tar.gz,
它可以被当作单个 *.tar.gz 文件处理。sys.argv
保存的是当前脚本名称及其后的命令行参数,因此我丢弃了其中的第一个元素(即脚本名称)。