并行下载大量文件的有效方法
Efficient way to download a huge load of files in parallel
我正在尝试从 Internet 下载大量文件(图片)。我正在为 async/parallel 苦苦挣扎,因为
a) 我无法预先知道文件是否存在。我只拿到了一百万个链接,每个链接要么返回一张图片(300 KB 到 3 MB),要么返回 404(页面不存在)。因此,为了避免下载到 0 字节的文件,我对同一个地址请求两次:第一次用来检查是否为 404,第二次才下载图片。另一种做法是先把所有 0 字节的文件也下载下来,事后再删除这数百万个文件——但这会让 Windows 10 一直卡在这个删除任务上,直到我重启为止。
b) 下载(非常慢)进行期间,每当我查看任何一个“已成功下载的文件”,它都是一个 0 字节的文件,里面没有图片。我需要修改什么,才能确保当前文件真正下载完成后再去下载下一个文件?
我该如何解决这两个问题?有没有更好的方法来下载数以百万计的文件(无法在服务器端进行压缩或创建 .zip)?
//loopResult = Parallel.ForEach(_downloadLinkList, new ParallelOptions { MaxDegreeOfParallelism = 10 }, DownloadFilesParallel);
// Downloads the file behind `path` into savePathLocalComputer, skipping links for which the
// web server answers with an error (e.g. 404).
//
// path: absolute URL of the file; its last segment is used as the local file name.
//
// NOTE: the method is `async void` because it is handed to Parallel.ForEach as an Action<string>.
// It returns to the caller at the first await, so the caller cannot wait for the download to
// finish and cannot observe exceptions; everything is therefore handled inside this method.
private async void DownloadFilesParallel(string path)
{
    // Use the value returned by Interlocked.Increment. Re-reading the shared field afterwards may
    // already include increments made by other threads, so the GUI would show duplicated or
    // skipped numbers.
    var currentCount = System.Threading.Interlocked.Increment(ref downloadCount);
    OnNewListEntry(currentCount.ToString() + " / " + linkCount.ToString() + " heruntergeladen"); // tell the GUI to update

    try
    {
        using (WebClient webClient = new WebClient())
        {
            string downloadToDirectory = Path.Combine(savePathLocalComputer, Path.GetFileName(path)); // path on local computer
            webClient.Credentials = CredentialCache.DefaultNetworkCredentials;

            // Probe request: throws a WebException when the server answers with an error
            // (e.g. 404); in that case this link is skipped and no local file is created.
            await webClient.DownloadStringTaskAsync(new Uri(path));

            // The probe succeeded: create the target folder if needed, then fetch the file itself.
            Directory.CreateDirectory(Path.GetDirectoryName(downloadToDirectory));
            await webClient.DownloadFileTaskAsync(new Uri(path), downloadToDirectory);
        }
    }
    catch (WebException)
    {
        // Deliberately ignored: a failed request means there is nothing to download for this link.
    }
    catch (Exception ex)
    {
        System.Diagnostics.Debug.WriteLine(ex.Message);
    }
}
//图片是sfw,link是nsfw
下面是一个使用 HttpClient 并限制最大并发下载数的示例。
// One HttpClient instance shared by every download.
private static readonly HttpClient client = new HttpClient();

// Requests `path` and, if the response has a success status code, streams the body into a file
// named after the last path segment inside savePathLocalComputer. Non-success responses are
// skipped without creating a file.
//
// path:      address to request.
// semaphore: concurrency limiter; exactly one slot is released when this method finishes,
//            whether it succeeded or threw (the caller is expected to have acquired that slot).
// status:    optional progress sink; receives the semaphore's current free-slot count once,
//            before the request is sent.
private async Task DownloadAndSaveFileAsync(string path, SemaphoreSlim semaphore, IProgress<int> status)
{
    try
    {
        if (status != null)
        {
            status.Report(semaphore.CurrentCount);
        }

        // ResponseHeadersRead: continue as soon as the headers arrive so the body can be streamed
        // to disk instead of being buffered first.
        using (HttpResponseMessage httpResponse = await client.GetAsync(path, HttpCompletionOption.ResponseHeadersRead).ConfigureAwait(false))
        {
            if (!httpResponse.IsSuccessStatusCode)
            {
                return; // ignoring if not success
            }

            string targetFile = Path.Combine(savePathLocalComputer, Path.GetFileName(path));
            string targetDirectory = Path.GetDirectoryName(targetFile);
            if (!Directory.Exists(targetDirectory))
            {
                Directory.CreateDirectory(targetDirectory);
            }

            using (Stream body = await httpResponse.Content.ReadAsStreamAsync().ConfigureAwait(false))
            {
                using (FileStream output = File.Create(targetFile))
                {
                    await body.CopyToAsync(output).ConfigureAwait(false);
                }
            }
        }
    }
    finally
    {
        semaphore.Release();
    }
}
并发数
// HttpClient.BaseAddress is a Uri, so the address string has to be wrapped in one.
client.BaseAddress = new Uri("http://somesite");
int downloadCount = 0;
List<string> pathList = new List<string>();
// fill the list here
List<Task> tasks = new List<Task>();
int maxConcurrentTasks = Environment.ProcessorCount * 2; // 16 for me
// Each report carries the number of free semaphore slots at the moment a download started.
IProgress<int> status = new Progress<int>(availableTasks =>
{
    downloadCount++;
    OnNewListEntry(downloadCount + " / " + pathList.Count + " heruntergeladen\r\nRunning " + (maxConcurrentTasks - availableTasks) + " downloads.");
});
using (SemaphoreSlim semaphore = new SemaphoreSlim(maxConcurrentTasks))
{
    foreach (string path in pathList)
    {
        // Wait for a free slot before starting the next download, so that at most
        // maxConcurrentTasks downloads are in flight at any time.
        await semaphore.WaitAsync();
        tasks.Add(DownloadAndSaveFileAsync(path, semaphore, status));
    }
    try
    {
        // Wait for the downloads that are still running after the loop has ended.
        await Task.WhenAll(tasks);
    }
    catch (Exception ex)
    {
        // handle the Exception here
        // Awaiting Task.WhenAll rethrows only the first failure; inspect the individual
        // tasks if every error is needed. At minimum, do not swallow it silently.
        System.Diagnostics.Debug.WriteLine(ex.Message);
    }
}
这里的 Progress 只是在 UI 线程上执行回调(前提是它在 UI 线程上创建),因此回调内部不需要 Interlocked,并且可以安全地更新 UI。
如果使用的是 .NET Framework,可以把下面这一行加入应用程序启动代码,让下载更快(在 .NET Core 中这一行不起作用,也不需要):
// Sets ServicePointManager's default connection limit to 10. Meant to run once in the
// application's startup code on .NET Framework.
ServicePointManager.DefaultConnectionLimit = 10;
我正在尝试从 Internet 下载大量文件(图片)。我正在为 async/parallel 苦苦挣扎,因为
a) 我无法预先知道文件是否存在。我只拿到了一百万个链接,每个链接要么返回一张图片(300 KB 到 3 MB),要么返回 404(页面不存在)。因此,为了避免下载到 0 字节的文件,我对同一个地址请求两次:第一次用来检查是否为 404,第二次才下载图片。另一种做法是先把所有 0 字节的文件也下载下来,事后再删除这数百万个文件——但这会让 Windows 10 一直卡在这个删除任务上,直到我重启为止。
b) 下载(非常慢)进行期间,每当我查看任何一个“已成功下载的文件”,它都是一个 0 字节的文件,里面没有图片。我需要修改什么,才能确保当前文件真正下载完成后再去下载下一个文件?
我该如何解决这两个问题?有没有更好的方法来下载数以百万计的文件(无法在服务器端进行压缩或创建 .zip)?
//loopResult = Parallel.ForEach(_downloadLinkList, new ParallelOptions { MaxDegreeOfParallelism = 10 }, DownloadFilesParallel);
// Downloads the file behind `path` into savePathLocalComputer, skipping links for which the
// web server answers with an error (e.g. 404).
//
// path: absolute URL of the file; its last segment is used as the local file name.
//
// NOTE: the method is `async void` because it is handed to Parallel.ForEach as an Action<string>.
// It returns to the caller at the first await, so the caller cannot wait for the download to
// finish and cannot observe exceptions; everything is therefore handled inside this method.
private async void DownloadFilesParallel(string path)
{
    // Use the value returned by Interlocked.Increment. Re-reading the shared field afterwards may
    // already include increments made by other threads, so the GUI would show duplicated or
    // skipped numbers.
    var currentCount = System.Threading.Interlocked.Increment(ref downloadCount);
    OnNewListEntry(currentCount.ToString() + " / " + linkCount.ToString() + " heruntergeladen"); // tell the GUI to update

    try
    {
        using (WebClient webClient = new WebClient())
        {
            string downloadToDirectory = Path.Combine(savePathLocalComputer, Path.GetFileName(path)); // path on local computer
            webClient.Credentials = CredentialCache.DefaultNetworkCredentials;

            // Probe request: throws a WebException when the server answers with an error
            // (e.g. 404); in that case this link is skipped and no local file is created.
            await webClient.DownloadStringTaskAsync(new Uri(path));

            // The probe succeeded: create the target folder if needed, then fetch the file itself.
            Directory.CreateDirectory(Path.GetDirectoryName(downloadToDirectory));
            await webClient.DownloadFileTaskAsync(new Uri(path), downloadToDirectory);
        }
    }
    catch (WebException)
    {
        // Deliberately ignored: a failed request means there is nothing to download for this link.
    }
    catch (Exception ex)
    {
        System.Diagnostics.Debug.WriteLine(ex.Message);
    }
}
//图片是sfw,link是nsfw
下面是一个使用 HttpClient 并限制最大并发下载数的示例。
// One HttpClient instance shared by every download.
private static readonly HttpClient client = new HttpClient();

// Requests `path` and, if the response has a success status code, streams the body into a file
// named after the last path segment inside savePathLocalComputer. Non-success responses are
// skipped without creating a file.
//
// path:      address to request.
// semaphore: concurrency limiter; exactly one slot is released when this method finishes,
//            whether it succeeded or threw (the caller is expected to have acquired that slot).
// status:    optional progress sink; receives the semaphore's current free-slot count once,
//            before the request is sent.
private async Task DownloadAndSaveFileAsync(string path, SemaphoreSlim semaphore, IProgress<int> status)
{
    try
    {
        if (status != null)
        {
            status.Report(semaphore.CurrentCount);
        }

        // ResponseHeadersRead: continue as soon as the headers arrive so the body can be streamed
        // to disk instead of being buffered first.
        using (HttpResponseMessage httpResponse = await client.GetAsync(path, HttpCompletionOption.ResponseHeadersRead).ConfigureAwait(false))
        {
            if (!httpResponse.IsSuccessStatusCode)
            {
                return; // ignoring if not success
            }

            string targetFile = Path.Combine(savePathLocalComputer, Path.GetFileName(path));
            string targetDirectory = Path.GetDirectoryName(targetFile);
            if (!Directory.Exists(targetDirectory))
            {
                Directory.CreateDirectory(targetDirectory);
            }

            using (Stream body = await httpResponse.Content.ReadAsStreamAsync().ConfigureAwait(false))
            {
                using (FileStream output = File.Create(targetFile))
                {
                    await body.CopyToAsync(output).ConfigureAwait(false);
                }
            }
        }
    }
    finally
    {
        semaphore.Release();
    }
}
并发数
// HttpClient.BaseAddress is a Uri, so the address string has to be wrapped in one.
client.BaseAddress = new Uri("http://somesite");
int downloadCount = 0;
List<string> pathList = new List<string>();
// fill the list here
List<Task> tasks = new List<Task>();
int maxConcurrentTasks = Environment.ProcessorCount * 2; // 16 for me
// Each report carries the number of free semaphore slots at the moment a download started.
IProgress<int> status = new Progress<int>(availableTasks =>
{
    downloadCount++;
    OnNewListEntry(downloadCount + " / " + pathList.Count + " heruntergeladen\r\nRunning " + (maxConcurrentTasks - availableTasks) + " downloads.");
});
using (SemaphoreSlim semaphore = new SemaphoreSlim(maxConcurrentTasks))
{
    foreach (string path in pathList)
    {
        // Wait for a free slot before starting the next download, so that at most
        // maxConcurrentTasks downloads are in flight at any time.
        await semaphore.WaitAsync();
        tasks.Add(DownloadAndSaveFileAsync(path, semaphore, status));
    }
    try
    {
        // Wait for the downloads that are still running after the loop has ended.
        await Task.WhenAll(tasks);
    }
    catch (Exception ex)
    {
        // handle the Exception here
        // Awaiting Task.WhenAll rethrows only the first failure; inspect the individual
        // tasks if every error is needed. At minimum, do not swallow it silently.
        System.Diagnostics.Debug.WriteLine(ex.Message);
    }
}
这里的 Progress 只是在 UI 线程上执行回调(前提是它在 UI 线程上创建),因此回调内部不需要 Interlocked,并且可以安全地更新 UI。
如果使用的是 .NET Framework,可以把下面这一行加入应用程序启动代码,让下载更快(在 .NET Core 中这一行不起作用,也不需要):
// Sets ServicePointManager's default connection limit to 10. Meant to run once in the
// application's startup code on .NET Framework.
ServicePointManager.DefaultConnectionLimit = 10;