I have to run a one-off C# calculation on millions of rows of data and save the results in another table. I haven't worked with threading in C# for a couple of years. I'm using .NET v4.5 and EF v5.
The original code is something along the lines of:
/// <summary>
/// Single-threaded entry point: loads every client, runs the calculation
/// for all of them on the calling thread, and prints the elapsed time.
/// </summary>
public static void Main()
{
    Stopwatch sw = new Stopwatch();
    sw.Start();

    // The context is disposable; the using block releases it (and its
    // connection) deterministically. It stays open until DoCalc returns
    // so the loaded Client entities remain attached to a live context.
    using (Entities db = new Entities())
    {
        DoCalc(db.Clients.ToList());
    }

    sw.Stop();
    Console.WriteLine(sw.Elapsed);
}
/// <summary>
/// Runs the calculation for each client in <paramref name="clients"/> and
/// stores one result per client through a context owned by this call.
/// </summary>
/// <param name="clients">The clients to process, in order.</param>
private static void DoCalc(List<Client> clients)
{
    // This call owns its context, so it is disposed here when the work is done.
    using (Entities db = new Entities())
    {
        foreach (var c in clients)
        {
            var transactions = db.GetTransactions(c);
            var result = calulate(transactions); // the actual calc
            db.Results.Add(result);

            // Saved once per client, so each result is written before the
            // next client is processed.
            db.SaveChanges();
        }
    }
}
Here is my attempt at multi-threading:
// Number of groups the client list is split into; this is also the upper
// bound on the number of tasks started by Main.
private static int numberOfThreads = 15;

/// <summary>
/// Multi-threaded entry point: loads every client, splits them into groups,
/// processes each group on its own task, waits for all tasks to finish, and
/// prints the elapsed time.
/// </summary>
public static void Main()
{
    Stopwatch sw = new Stopwatch();
    sw.Start();

    using (Entities db = new Entities())
    {
        // Materialize the clients first: SplitUpClients takes a List<Client>.
        var splitUpClients = SplitUpClients(db.Clients.ToList());

        // Size the array by the groups actually produced. There can be fewer
        // than numberOfThreads groups (e.g. fewer clients than threads), and
        // Task.WaitAll rejects arrays that contain null entries.
        Task[] allTasks = new Task[splitUpClients.Count];
        for (int i = 0; i < allTasks.Length; i++)
        {
            // Copy the group into a local before creating the lambda. A lambda
            // captures the variable i itself, not its current value, so by the
            // time the task runs i may already have advanced (or be past the
            // end of the list).
            var group = splitUpClients[i];
            allTasks[i] = Task.Factory.StartNew(() => DoCalc(group));
        }

        // Blocks until every task has completed; failures from the tasks
        // surface here as an AggregateException.
        Task.WaitAll(allTasks);
    }

    sw.Stop();
    Console.WriteLine(sw.Elapsed);
}
/// <summary>
/// Runs the calculation for each client in <paramref name="clients"/> and
/// stores one result per client. Every call creates its own context, so
/// concurrent calls on different tasks do not share a context instance.
/// </summary>
/// <param name="clients">The group of clients this call is responsible for.</param>
private static void DoCalc(List<Client> clients)
{
    // One context per call (and therefore per task); disposed when the
    // group has been processed.
    using (Entities db = new Entities())
    {
        foreach (var c in clients)
        {
            var transactions = db.GetTransactions(c);
            var result = calulate(transactions);
            db.Results.Add(result);

            // Saved once per client, so each result is written before the
            // next client is processed.
            db.SaveChanges();
        }
    }
}
/// <summary>
/// Splits the list of clients into consecutive subgroups, producing at most
/// <c>numberOfThreads</c> groups of (at most) equal size.
/// </summary>
/// <param name="clients">The clients to partition.</param>
/// <returns>
/// The groups in original order. Fewer than <c>numberOfThreads</c> groups
/// are returned when there are not enough clients to fill them, and an empty
/// list is returned for an empty input.
/// </returns>
private static List<List<Client>> SplitUpClients(List<Client> clients)
{
    // Ceiling division so that numberOfThreads groups are enough to hold
    // every client.
    int maxPerGroup = (int)Math.Ceiling((double)clients.Count / numberOfThreads);

    // Pair each client with its position, then bucket by position / group size.
    // For an empty list the key selector never runs, so maxPerGroup == 0 does
    // not cause a division by zero.
    return clients.Select((s, i) => new { Str = s, Index = i }).
        GroupBy(o => o.Index / maxPerGroup, o => o.Str).
        Select(coll => coll.ToList()).
        ToList();
}
My question is:
Is this the safe and correct way to do it and are there any obvious shortcomings (especially with regard to EF)?
Also, how do I find the optimum number of threads? Is it the more the merrier?
You should wrap your contexts in using (Entities db = new Entities()) { ... } so that they are disposed, especially when you create them on a thread.