阻塞集合(BlockingCollection)中出现重复条目
Blocking Collection shows Duplicate entries
我首先检索 table 中的总行数(比如 100),然后将它们分成块(比如每块 25 行)。然后我创建一个任务 (taskFetch),它使用 Parallel.ForEach() 方法从 MyTable 中以块的形式将行提取到 DataTable(每个包含 25 条记录)中。其中还嵌套了另一个 Parallel.ForEach(),它使用 Partitioner.Create() 从每个 DataTable 中读取数据并将其添加到阻塞集合 (sourceCollection) 中。出于测试目的,我在添加时把结果输出到了控制台,输出没有问题。
但是,当我尝试从 sourceCollection 中取出数据时,我发现了重复的条目。实际上,我的 table 中有超过 1,500,000 条记录。我并不认为重复条目是在添加时产生的,而是对自己取出(Take)数据的方式有些怀疑。
代码
/// <summary>
/// Reads MyTable in fixed-size row ranges, turns every row into a
/// <c>MigrationObject</c> queued on a <see cref="BlockingCollection{T}"/>,
/// then drains the collection and prints each item to the console.
/// </summary>
/// <param name="clearPVK">Not used by this method.</param>
/// <param name="EncZPK">Not used by this method.</param>
/// <returns>A task that completes once every queued item has been printed.</returns>
public async Task BulkMigrationAsync(string clearPVK, string EncZPK)
{
    const int recordsInSet = 25;

    // The range bounds are passed as SQL parameters; embedding the C# expression
    // text inside the string literal never substitutes its value.
    // NOTE(review): the chunks are disjoint only if (RELATIONSHIP_NUM, CUST_ID)
    // is a unique ordering; otherwise ROW_NUMBER() may number tied rows
    // differently from one query to the next - confirm against the schema.
    const string chunkQuery = @"SELECT * FROM ( SELECT RELATIONSHIP_NUM, CUST_ID, CODE, BLOCK_NEW, ROW_NUMBER() over (
        order by RELATIONSHIP_NUM, CUST_ID) as RowNum FROM MyTable) SUB
        WHERE SUB.RowNum BETWEEN @rangeFrom AND @rangeTo";

    // Total number of rows in MyTable. The command must be bound to the connection it runs on.
    int totalRows;
    using (SqlConnection conn = new SqlConnection(_Database.ConnectionString))
    using (SqlCommand cmd = new SqlCommand("SELECT COUNT(*) FROM MyTable", conn))
    {
        conn.Open();
        totalRows = Convert.ToInt32(cmd.ExecuteScalar());
    }

    // Inclusive 1-based row ranges: (1, 25), (26, 50), ...
    // The last range is clamped to totalRows so a trailing partial chunk is not lost.
    List<Tuple<int, int>> chunks = new List<Tuple<int, int>>();
    for (int rangeFrom = 1; rangeFrom <= totalRows; rangeFrom += recordsInSet)
    {
        int rangeTo = Math.Min(rangeFrom + recordsInSet - 1, totalRows);
        chunks.Add(Tuple.Create(rangeFrom, rangeTo));
    }

    using (BlockingCollection<MigrationObject> sourceCollection = new BlockingCollection<MigrationObject>())
    {
        // Producer: fetch each chunk into its own DataTable and queue one object per row.
        Task taskFetch = Task.Run(() =>
        {
            Parallel.ForEach(chunks, chunk =>
            {
                DataTable dt = new DataTable();
                using (SqlConnection localConn = new SqlConnection(_Database.ConnectionString))
                using (SqlDataAdapter da = new SqlDataAdapter(chunkQuery, localConn))
                {
                    da.SelectCommand.Parameters.Add("@rangeFrom", SqlDbType.Int).Value = chunk.Item1;
                    da.SelectCommand.Parameters.Add("@rangeTo", SqlDbType.Int).Value = chunk.Item2;
                    da.Fill(dt); // Fill opens and closes the connection itself
                }

                // Partitioner.Create(0, 0) throws ArgumentOutOfRangeException,
                // so skip a chunk that came back empty.
                if (dt.Rows.Count == 0)
                {
                    return;
                }

                Parallel.ForEach(Partitioner.Create(0, dt.Rows.Count), range =>
                {
                    for (int i = range.Item1; i < range.Item2; i++)
                    {
                        DataRow row = dt.Rows[i];

                        // A fresh instance per row. Reusing one instance for the whole
                        // range queues the same reference repeatedly, which the consumer
                        // then sees as duplicate entries.
                        MigrationObject migSource = new MigrationObject();
                        migSource.PAN = row["CUST_ID"].ToString();
                        migSource.PinOffset = row["CODE"].ToString();
                        migSource.PinBlockNew = row["BLOCK_NEW"].ToString();
                        migSource.RelationshipNum = row["RELATIONSHIP_NUM"].ToString();

                        Console.WriteLine(@"PAN " + migSource.PAN + " Rel " + migSource.RelationshipNum
                            + " for ranges : " + range.Item1 + " TO " + range.Item2);

                        sourceCollection.Add(migSource);
                    }
                });
            });
        });

        await taskFetch;
        sourceCollection.CompleteAdding();

        // Consumer: all producers have finished, so TryTake returns false only
        // once the collection is empty.
        MigrationObject mig;
        while (sourceCollection.TryTake(out mig))
        {
            // NOTE(review): 50 ms pause per item retained from the original;
            // presumably a stand-in for real per-item work - confirm.
            await Task.Delay(50);
            Console.WriteLine(" Rel " + mig.RelationshipNum + " PAN " + mig.PAN);
        }
    }
}
我的错,实际上问题区域是:
// Defective version, shown unchanged to illustrate the cause of the duplicate entries.
// Walks the rows of dt in index ranges produced by Partitioner.Create and queues
// one item per row on sourceCollection.
Parallel.ForEach(Partitioner.Create(0, dt.Rows.Count),
(range, state) =>
{
// BUG: a single MigrationObject is allocated per range, outside the row loop.
MigrationObject migSource = new MigrationObject(); // creating the object outside the for loop
for (int i = range.Item1; i < range.Item2; i++)
{
// Every iteration overwrites the fields of that one instance...
migSource.PAN = dt.Rows[i]["CUST_ID"].ToString();
migSource.PinOffset = dt.Rows[i]["CODE"].ToString();
migSource.PinBlockNew = dt.Rows[i]["BLOCK_NEW"].ToString();
migSource.RelationshipNum = dt.Rows[i]["RELATIONSHIP_NUM"].ToString();
// The values printed here are read right after assignment, so this output looks correct.
Console.WriteLine(@"PAN " + migSource.PAN + " Rel " + migSource.RelationshipNum
+ " for ranges : " + range.Item1 + " TO " + range.Item2);
// ...and the same reference is queued again. Assuming MigrationObject is a class
// (reference type), all items queued for this range are one object, so a consumer
// reading them later sees identical values.
sourceCollection.TryAdd(migSource);
}
});
正确的做法是把“MigrationObject migSource = new MigrationObject();”放到 for 循环内部,让每一行都创建一个新的对象:
// Corrected version: queues one newly allocated MigrationObject per row of dt.
Parallel.ForEach(Partitioner.Create(0, dt.Rows.Count), range =>
{
    int first = range.Item1;
    int end = range.Item2; // exclusive upper bound of this range

    for (int rowIndex = first; rowIndex < end; rowIndex++)
    {
        DataRow row = dt.Rows[rowIndex];

        // Allocated inside the loop so each queued item is a distinct instance.
        MigrationObject migSource = new MigrationObject();
        migSource.PAN = row["CUST_ID"].ToString();
        migSource.PinOffset = row["CODE"].ToString();
        migSource.PinBlockNew = row["BLOCK_NEW"].ToString();
        migSource.RelationshipNum = row["RELATIONSHIP_NUM"].ToString();

        string line = "PAN " + migSource.PAN + " Rel " + migSource.RelationshipNum
            + " for ranges : " + first + " TO " + end;
        Console.WriteLine(line);

        sourceCollection.TryAdd(migSource);
    }
});
我首先检索 table 中的总行数(比如 100),然后将它们分成块(比如每块 25 行)。然后我创建一个任务 (taskFetch),它使用 Parallel.ForEach() 方法从 MyTable 中以块的形式将行提取到 DataTable(每个包含 25 条记录)中。其中还嵌套了另一个 Parallel.ForEach(),它使用 Partitioner.Create() 从每个 DataTable 中读取数据并将其添加到阻塞集合 (sourceCollection) 中。出于测试目的,我在添加时把结果输出到了控制台,输出没有问题。
但是,当我尝试从 sourceCollection 中取出数据时,我发现了重复的条目。实际上,我的 table 中有超过 1,500,000 条记录。我并不认为重复条目是在添加时产生的,而是对自己取出(Take)数据的方式有些怀疑。
代码
/// <summary>
/// Reads MyTable in fixed-size row ranges, turns every row into a
/// <c>MigrationObject</c> queued on a <see cref="BlockingCollection{T}"/>,
/// then drains the collection and prints each item to the console.
/// </summary>
/// <param name="clearPVK">Not used by this method.</param>
/// <param name="EncZPK">Not used by this method.</param>
/// <returns>A task that completes once every queued item has been printed.</returns>
public async Task BulkMigrationAsync(string clearPVK, string EncZPK)
{
    const int recordsInSet = 25;

    // The range bounds are passed as SQL parameters; embedding the C# expression
    // text inside the string literal never substitutes its value.
    // NOTE(review): the chunks are disjoint only if (RELATIONSHIP_NUM, CUST_ID)
    // is a unique ordering; otherwise ROW_NUMBER() may number tied rows
    // differently from one query to the next - confirm against the schema.
    const string chunkQuery = @"SELECT * FROM ( SELECT RELATIONSHIP_NUM, CUST_ID, CODE, BLOCK_NEW, ROW_NUMBER() over (
        order by RELATIONSHIP_NUM, CUST_ID) as RowNum FROM MyTable) SUB
        WHERE SUB.RowNum BETWEEN @rangeFrom AND @rangeTo";

    // Total number of rows in MyTable. The command must be bound to the connection it runs on.
    int totalRows;
    using (SqlConnection conn = new SqlConnection(_Database.ConnectionString))
    using (SqlCommand cmd = new SqlCommand("SELECT COUNT(*) FROM MyTable", conn))
    {
        conn.Open();
        totalRows = Convert.ToInt32(cmd.ExecuteScalar());
    }

    // Inclusive 1-based row ranges: (1, 25), (26, 50), ...
    // The last range is clamped to totalRows so a trailing partial chunk is not lost.
    List<Tuple<int, int>> chunks = new List<Tuple<int, int>>();
    for (int rangeFrom = 1; rangeFrom <= totalRows; rangeFrom += recordsInSet)
    {
        int rangeTo = Math.Min(rangeFrom + recordsInSet - 1, totalRows);
        chunks.Add(Tuple.Create(rangeFrom, rangeTo));
    }

    using (BlockingCollection<MigrationObject> sourceCollection = new BlockingCollection<MigrationObject>())
    {
        // Producer: fetch each chunk into its own DataTable and queue one object per row.
        Task taskFetch = Task.Run(() =>
        {
            Parallel.ForEach(chunks, chunk =>
            {
                DataTable dt = new DataTable();
                using (SqlConnection localConn = new SqlConnection(_Database.ConnectionString))
                using (SqlDataAdapter da = new SqlDataAdapter(chunkQuery, localConn))
                {
                    da.SelectCommand.Parameters.Add("@rangeFrom", SqlDbType.Int).Value = chunk.Item1;
                    da.SelectCommand.Parameters.Add("@rangeTo", SqlDbType.Int).Value = chunk.Item2;
                    da.Fill(dt); // Fill opens and closes the connection itself
                }

                // Partitioner.Create(0, 0) throws ArgumentOutOfRangeException,
                // so skip a chunk that came back empty.
                if (dt.Rows.Count == 0)
                {
                    return;
                }

                Parallel.ForEach(Partitioner.Create(0, dt.Rows.Count), range =>
                {
                    for (int i = range.Item1; i < range.Item2; i++)
                    {
                        DataRow row = dt.Rows[i];

                        // A fresh instance per row. Reusing one instance for the whole
                        // range queues the same reference repeatedly, which the consumer
                        // then sees as duplicate entries.
                        MigrationObject migSource = new MigrationObject();
                        migSource.PAN = row["CUST_ID"].ToString();
                        migSource.PinOffset = row["CODE"].ToString();
                        migSource.PinBlockNew = row["BLOCK_NEW"].ToString();
                        migSource.RelationshipNum = row["RELATIONSHIP_NUM"].ToString();

                        Console.WriteLine(@"PAN " + migSource.PAN + " Rel " + migSource.RelationshipNum
                            + " for ranges : " + range.Item1 + " TO " + range.Item2);

                        sourceCollection.Add(migSource);
                    }
                });
            });
        });

        await taskFetch;
        sourceCollection.CompleteAdding();

        // Consumer: all producers have finished, so TryTake returns false only
        // once the collection is empty.
        MigrationObject mig;
        while (sourceCollection.TryTake(out mig))
        {
            // NOTE(review): 50 ms pause per item retained from the original;
            // presumably a stand-in for real per-item work - confirm.
            await Task.Delay(50);
            Console.WriteLine(" Rel " + mig.RelationshipNum + " PAN " + mig.PAN);
        }
    }
}
我的错,实际上问题区域是:
// Defective version, shown unchanged to illustrate the cause of the duplicate entries.
// Walks the rows of dt in index ranges produced by Partitioner.Create and queues
// one item per row on sourceCollection.
Parallel.ForEach(Partitioner.Create(0, dt.Rows.Count),
(range, state) =>
{
// BUG: a single MigrationObject is allocated per range, outside the row loop.
MigrationObject migSource = new MigrationObject(); // creating the object outside the for loop
for (int i = range.Item1; i < range.Item2; i++)
{
// Every iteration overwrites the fields of that one instance...
migSource.PAN = dt.Rows[i]["CUST_ID"].ToString();
migSource.PinOffset = dt.Rows[i]["CODE"].ToString();
migSource.PinBlockNew = dt.Rows[i]["BLOCK_NEW"].ToString();
migSource.RelationshipNum = dt.Rows[i]["RELATIONSHIP_NUM"].ToString();
// The values printed here are read right after assignment, so this output looks correct.
Console.WriteLine(@"PAN " + migSource.PAN + " Rel " + migSource.RelationshipNum
+ " for ranges : " + range.Item1 + " TO " + range.Item2);
// ...and the same reference is queued again. Assuming MigrationObject is a class
// (reference type), all items queued for this range are one object, so a consumer
// reading them later sees identical values.
sourceCollection.TryAdd(migSource);
}
});
正确的做法是把“MigrationObject migSource = new MigrationObject();”放到 for 循环内部,让每一行都创建一个新的对象:
// Corrected version: queues one newly allocated MigrationObject per row of dt.
Parallel.ForEach(Partitioner.Create(0, dt.Rows.Count), range =>
{
    int first = range.Item1;
    int end = range.Item2; // exclusive upper bound of this range

    for (int rowIndex = first; rowIndex < end; rowIndex++)
    {
        DataRow row = dt.Rows[rowIndex];

        // Allocated inside the loop so each queued item is a distinct instance.
        MigrationObject migSource = new MigrationObject();
        migSource.PAN = row["CUST_ID"].ToString();
        migSource.PinOffset = row["CODE"].ToString();
        migSource.PinBlockNew = row["BLOCK_NEW"].ToString();
        migSource.RelationshipNum = row["RELATIONSHIP_NUM"].ToString();

        string line = "PAN " + migSource.PAN + " Rel " + migSource.RelationshipNum
            + " for ranges : " + first + " TO " + end;
        Console.WriteLine(line);

        sourceCollection.TryAdd(migSource);
    }
});