如何在不使用任何外部库的情况下解压超过 100mb 的大文件
How to decompress a large file (over 100 MB) without using any external libraries
我试过使用 NuGet 包来解压 tgz 文件,但该 tgz 中有些文件的名称包含不受支持的字符,例如:1111-11-1111:11:11.111.AA
使用 sharpcompress 库验证了这个问题。
所以我只能参考下面这个 gist 链接:
https://gist.github.com/ForeverZer0/a2cd292bd2f3b5e114956c00bb6e872b
这是我按照该链接解压 tgz 文件所用的方法。这段代码写得很好,运行也正常。但是当我尝试解压超过 100MB 的大型 tgz 文件时,会出现“流太长”(Stream was too long)之类的错误。
该错误表示您尝试向 MemoryStream 写入过多字节,其最大容量为 int.MaxValue(约 2GB)。
如果找不到合适的库,想使用提供的代码,可以修改如下。
请注意,原代码会先把整个 GZipStream 复制到 MemoryStream。为什么?正如代码中的注释所述:
// A GZipStream is not seekable, so copy it first to a MemoryStream
然而,在后续代码中,只有两个操作要求流可查找(seekable):stream.Seek(x, SeekOrigin.Current)(其中 x 始终为正数)和 stream.Position。这两个操作都可以通过读取流来模拟,而无需真正查找。例如,要向前跳过若干字节,只需读取相应数量的字节并丢弃:
/// <summary>
/// Advances <paramref name="stream"/> by <paramref name="offset"/> bytes, seeking when the
/// stream supports it and otherwise reading and discarding the bytes.
/// </summary>
/// <param name="stream">The stream to advance.</param>
/// <param name="offset">Number of bytes to move forward from the current position.</param>
/// <exception cref="EndOfStreamException">
/// A non-seekable stream ended before <paramref name="offset"/> bytes were consumed.
/// </exception>
private static void FakeSeekForward(Stream stream, int offset) {
    // Seekable streams can simply jump ahead.
    if (stream.CanSeek) {
        stream.Seek(offset, SeekOrigin.Current);
        return;
    }

    // Otherwise consume exactly `offset` bytes; Read may deliver them in several pieces.
    var discard = new byte[offset];
    for (int consumed = 0; consumed < offset; ) {
        int got = stream.Read(discard, consumed, offset - consumed);
        if (got == 0)
            throw new EndOfStreamException();
        consumed += got;
    }
}
要跟踪当前流位置,只需记录已读取的字节数即可。这样就可以去掉向 MemoryStream 的复制,链接中的代码变为:
/// <summary>
/// Minimal streaming extractor for <c>.tar</c> and <c>.tar.gz</c> archives.
/// </summary>
/// <remarks>
/// Only the name field (first 100 bytes) and the octal size field of each 512-byte header
/// are interpreted. Entry data is streamed to disk through a fixed-size buffer, so archive
/// and entry sizes are not limited by available memory. Entry names are combined with the
/// output directory as-is; no check is made that the resulting path stays inside it.
/// </remarks>
public class Tar
{
    private const int BlockSize = 512;
    private const int NameFieldLength = 100;
    private const int SkipAfterName = 24;    // bytes between the name field and the size field
    private const int SizeFieldLength = 12;
    private const int SkipAfterSize = 376;   // remainder of the 512-byte header
    private const int CopyBufferSize = 81920;

    /// <summary>
    /// Extracts a <i>.tar.gz</i> archive to the specified directory.
    /// </summary>
    /// <param name="filename">The <i>.tar.gz</i> to decompress and extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTarGz(string filename, string outputDir)
    {
        using (var stream = File.OpenRead(filename))
        {
            ExtractTarGz(stream, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <i>.tar.gz</i> archive stream to the specified directory.
    /// </summary>
    /// <param name="stream">The <i>.tar.gz</i> to decompress and extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTarGz(Stream stream, string outputDir)
    {
        using (var gzip = new GZipStream(stream, CompressionMode.Decompress))
        {
            // The decompressed data is consumed directly; it is never buffered as a whole.
            ExtractTar(gzip, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <c>tar</c> archive to the specified directory.
    /// </summary>
    /// <param name="filename">The <i>.tar</i> to extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTar(string filename, string outputDir)
    {
        using (var stream = File.OpenRead(filename))
        {
            ExtractTar(stream, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <c>tar</c> archive stream to the specified directory.
    /// </summary>
    /// <param name="stream">The <i>.tar</i> to extract. It does not need to be seekable.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    /// <exception cref="EndOfStreamException">
    /// The stream ended in the middle of a header or of an entry's data.
    /// </exception>
    public static void ExtractTar(Stream stream, string outputDir)
    {
        var field = new byte[NameFieldLength];
        var copyBuffer = new byte[CopyBufferSize];

        while (true)
        {
            // Name field. A clean end of stream, or an all-zero name, terminates the archive.
            int nameBytes = ReadBlock(stream, field, NameFieldLength);
            if (nameBytes == 0)
                break;
            if (nameBytes < NameFieldLength)
                throw new EndOfStreamException();

            var name = Encoding.ASCII.GetString(field, 0, NameFieldLength).Trim('\0');
            if (String.IsNullOrWhiteSpace(name))
                break;

            FakeSeekForward(stream, SkipAfterName);

            // Size field: octal digits, padded with NULs and/or spaces.
            if (ReadBlock(stream, field, SizeFieldLength) < SizeFieldLength)
                throw new EndOfStreamException();
            var size = Convert.ToInt64(
                Encoding.UTF8.GetString(field, 0, SizeFieldLength).Trim('\0').Trim(), 8);

            FakeSeekForward(stream, SkipAfterSize);

            var output = Path.Combine(outputDir, name);
            var parent = Path.GetDirectoryName(output);
            if (!String.IsNullOrEmpty(parent) && !Directory.Exists(parent))
                Directory.CreateDirectory(parent);

            if (name.EndsWith("/", StringComparison.Ordinal))
            {
                // Directory entry (this includes "./"): the directory was created above,
                // so just discard any data attached to the entry.
                CopyBytes(stream, null, size, copyBuffer);
            }
            else
            {
                // FileMode.Create truncates an existing file so no stale bytes remain.
                using (var str = File.Open(output, FileMode.Create, FileAccess.Write))
                {
                    CopyBytes(stream, str, size, copyBuffer);
                }
            }

            // Entry data is padded with zeros up to the next 512-byte boundary.
            int padding = (int)((BlockSize - (size % BlockSize)) % BlockSize);
            FakeSeekForward(stream, padding);
        }
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> bytes into the start of <paramref name="buffer"/>,
    /// looping because a single <see cref="Stream.Read(byte[], int, int)"/> call may return
    /// fewer bytes than requested.
    /// </summary>
    /// <returns>The number of bytes read; less than <paramref name="count"/> only at end of stream.</returns>
    private static int ReadBlock(Stream stream, byte[] buffer, int count)
    {
        int total = 0;
        while (total < count)
        {
            int read = stream.Read(buffer, total, count - total);
            if (read == 0)
                break;
            total += read;
        }
        return total;
    }

    /// <summary>
    /// Moves exactly <paramref name="count"/> bytes from <paramref name="source"/> to
    /// <paramref name="destination"/> in chunks; a null destination discards the bytes.
    /// </summary>
    /// <exception cref="EndOfStreamException">The source ended before all bytes were read.</exception>
    private static void CopyBytes(Stream source, Stream destination, long count, byte[] buffer)
    {
        long remaining = count;
        while (remaining > 0)
        {
            int want = (int)Math.Min(buffer.Length, remaining);
            int read = source.Read(buffer, 0, want);
            if (read == 0)
                throw new EndOfStreamException();
            if (destination != null)
                destination.Write(buffer, 0, read);
            remaining -= read;
        }
    }

    /// <summary>
    /// Advances <paramref name="stream"/> by <paramref name="offset"/> bytes, seeking when
    /// possible and otherwise reading and discarding the bytes.
    /// </summary>
    /// <exception cref="EndOfStreamException">
    /// A non-seekable stream ended before <paramref name="offset"/> bytes were consumed.
    /// </exception>
    private static void FakeSeekForward(Stream stream, int offset)
    {
        if (stream.CanSeek)
        {
            stream.Seek(offset, SeekOrigin.Current);
            return;
        }

        // Bounded scratch buffer: the skip distance does not dictate the allocation size.
        var scratch = new byte[Math.Min(Math.Max(offset, 1), 4096)];
        int remaining = offset;
        while (remaining > 0)
        {
            int read = stream.Read(scratch, 0, Math.Min(scratch.Length, remaining));
            if (read == 0)
                throw new EndOfStreamException();
            remaining -= read;
        }
    }
}
我试过使用 NuGet 包来解压 tgz 文件,但该 tgz 中有些文件的名称包含不受支持的字符,例如:1111-11-1111:11:11.111.AA
使用 sharpcompress 库验证了这个问题。
所以我只能参考下面这个 gist 链接:
https://gist.github.com/ForeverZer0/a2cd292bd2f3b5e114956c00bb6e872b
这是我按照该链接解压 tgz 文件所用的方法。这段代码写得很好,运行也正常。但是当我尝试解压超过 100MB 的大型 tgz 文件时,会出现“流太长”(Stream was too long)之类的错误。
该错误表示您尝试向 MemoryStream 写入过多字节,其最大容量为 int.MaxValue(约 2GB)。
如果找不到合适的库,想使用提供的代码,可以修改如下。
请注意,原代码会先把整个 GZipStream 复制到 MemoryStream。为什么?正如代码中的注释所述:
// A GZipStream is not seekable, so copy it first to a MemoryStream
然而,在后续代码中,只有两个操作要求流可查找(seekable):stream.Seek(x, SeekOrigin.Current)(其中 x 始终为正数)和 stream.Position。这两个操作都可以通过读取流来模拟,而无需真正查找。例如,要向前跳过若干字节,只需读取相应数量的字节并丢弃:
/// <summary>
/// Moves <paramref name="stream"/> forward by <paramref name="offset"/> bytes. A seekable
/// stream is repositioned directly; any other stream has the bytes read and thrown away.
/// </summary>
/// <param name="stream">The stream to advance.</param>
/// <param name="offset">How many bytes to skip from the current position.</param>
/// <exception cref="EndOfStreamException">
/// A non-seekable stream ran out of data before <paramref name="offset"/> bytes were skipped.
/// </exception>
private static void FakeSeekForward(Stream stream, int offset) {
    if (!stream.CanSeek) {
        // No seeking available: drain the requested number of bytes into a throwaway buffer.
        var sink = new byte[offset];
        int skipped = 0;
        do {
            if (skipped >= offset)
                break;
            int chunk = stream.Read(sink, skipped, offset - skipped);
            if (chunk == 0)
                throw new EndOfStreamException();
            skipped += chunk;
        } while (true);
        return;
    }

    stream.Seek(offset, SeekOrigin.Current);
}
要跟踪当前流位置,只需记录已读取的字节数即可。这样就可以去掉向 MemoryStream 的复制,链接中的代码变为:
/// <summary>
/// Minimal streaming extractor for <c>.tar</c> and <c>.tar.gz</c> archives.
/// </summary>
/// <remarks>
/// Only the name field (first 100 bytes) and the octal size field of each 512-byte header
/// are interpreted. Entry data is streamed to disk through a fixed-size buffer, so archive
/// and entry sizes are not limited by available memory. Entry names are combined with the
/// output directory as-is; no check is made that the resulting path stays inside it.
/// </remarks>
public class Tar
{
    private const int BlockSize = 512;
    private const int NameFieldLength = 100;
    private const int SkipAfterName = 24;    // bytes between the name field and the size field
    private const int SizeFieldLength = 12;
    private const int SkipAfterSize = 376;   // remainder of the 512-byte header
    private const int CopyBufferSize = 81920;

    /// <summary>
    /// Extracts a <i>.tar.gz</i> archive to the specified directory.
    /// </summary>
    /// <param name="filename">The <i>.tar.gz</i> to decompress and extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTarGz(string filename, string outputDir)
    {
        using (var stream = File.OpenRead(filename))
        {
            ExtractTarGz(stream, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <i>.tar.gz</i> archive stream to the specified directory.
    /// </summary>
    /// <param name="stream">The <i>.tar.gz</i> to decompress and extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTarGz(Stream stream, string outputDir)
    {
        using (var gzip = new GZipStream(stream, CompressionMode.Decompress))
        {
            // The decompressed data is consumed directly; it is never buffered as a whole.
            ExtractTar(gzip, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <c>tar</c> archive to the specified directory.
    /// </summary>
    /// <param name="filename">The <i>.tar</i> to extract.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    public static void ExtractTar(string filename, string outputDir)
    {
        using (var stream = File.OpenRead(filename))
        {
            ExtractTar(stream, outputDir);
        }
    }

    /// <summary>
    /// Extracts a <c>tar</c> archive stream to the specified directory.
    /// </summary>
    /// <param name="stream">The <i>.tar</i> to extract. It does not need to be seekable.</param>
    /// <param name="outputDir">Output directory to write the files.</param>
    /// <exception cref="EndOfStreamException">
    /// The stream ended in the middle of a header or of an entry's data.
    /// </exception>
    public static void ExtractTar(Stream stream, string outputDir)
    {
        var field = new byte[NameFieldLength];
        var copyBuffer = new byte[CopyBufferSize];

        while (true)
        {
            // Name field. A clean end of stream, or an all-zero name, terminates the archive.
            int nameBytes = ReadBlock(stream, field, NameFieldLength);
            if (nameBytes == 0)
                break;
            if (nameBytes < NameFieldLength)
                throw new EndOfStreamException();

            var name = Encoding.ASCII.GetString(field, 0, NameFieldLength).Trim('\0');
            if (String.IsNullOrWhiteSpace(name))
                break;

            FakeSeekForward(stream, SkipAfterName);

            // Size field: octal digits, padded with NULs and/or spaces.
            if (ReadBlock(stream, field, SizeFieldLength) < SizeFieldLength)
                throw new EndOfStreamException();
            var size = Convert.ToInt64(
                Encoding.UTF8.GetString(field, 0, SizeFieldLength).Trim('\0').Trim(), 8);

            FakeSeekForward(stream, SkipAfterSize);

            var output = Path.Combine(outputDir, name);
            var parent = Path.GetDirectoryName(output);
            if (!String.IsNullOrEmpty(parent) && !Directory.Exists(parent))
                Directory.CreateDirectory(parent);

            if (name.EndsWith("/", StringComparison.Ordinal))
            {
                // Directory entry (this includes "./"): the directory was created above,
                // so just discard any data attached to the entry.
                CopyBytes(stream, null, size, copyBuffer);
            }
            else
            {
                // FileMode.Create truncates an existing file so no stale bytes remain.
                using (var str = File.Open(output, FileMode.Create, FileAccess.Write))
                {
                    CopyBytes(stream, str, size, copyBuffer);
                }
            }

            // Entry data is padded with zeros up to the next 512-byte boundary.
            int padding = (int)((BlockSize - (size % BlockSize)) % BlockSize);
            FakeSeekForward(stream, padding);
        }
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> bytes into the start of <paramref name="buffer"/>,
    /// looping because a single <see cref="Stream.Read(byte[], int, int)"/> call may return
    /// fewer bytes than requested.
    /// </summary>
    /// <returns>The number of bytes read; less than <paramref name="count"/> only at end of stream.</returns>
    private static int ReadBlock(Stream stream, byte[] buffer, int count)
    {
        int total = 0;
        while (total < count)
        {
            int read = stream.Read(buffer, total, count - total);
            if (read == 0)
                break;
            total += read;
        }
        return total;
    }

    /// <summary>
    /// Moves exactly <paramref name="count"/> bytes from <paramref name="source"/> to
    /// <paramref name="destination"/> in chunks; a null destination discards the bytes.
    /// </summary>
    /// <exception cref="EndOfStreamException">The source ended before all bytes were read.</exception>
    private static void CopyBytes(Stream source, Stream destination, long count, byte[] buffer)
    {
        long remaining = count;
        while (remaining > 0)
        {
            int want = (int)Math.Min(buffer.Length, remaining);
            int read = source.Read(buffer, 0, want);
            if (read == 0)
                throw new EndOfStreamException();
            if (destination != null)
                destination.Write(buffer, 0, read);
            remaining -= read;
        }
    }

    /// <summary>
    /// Advances <paramref name="stream"/> by <paramref name="offset"/> bytes, seeking when
    /// possible and otherwise reading and discarding the bytes.
    /// </summary>
    /// <exception cref="EndOfStreamException">
    /// A non-seekable stream ended before <paramref name="offset"/> bytes were consumed.
    /// </exception>
    private static void FakeSeekForward(Stream stream, int offset)
    {
        if (stream.CanSeek)
        {
            stream.Seek(offset, SeekOrigin.Current);
            return;
        }

        // Bounded scratch buffer: the skip distance does not dictate the allocation size.
        var scratch = new byte[Math.Min(Math.Max(offset, 1), 4096)];
        int remaining = offset;
        while (remaining > 0)
        {
            int read = stream.Read(scratch, 0, Math.Min(scratch.Length, remaining));
            if (read == 0)
                throw new EndOfStreamException();
            remaining -= read;
        }
    }
}