I have code where I'm trying to split PDFs into a list of JPG MemoryStream files. I have the split portion working: it takes under a second and creates 100 PDF streams. However, once I get to the point where I'm turning the PDFs into images, performance slows to a snail's pace. ImageMagick uses Ghostscript to perform this action. My theory is that each call out to Ghostscript has to set it up again, which produces overhead. I'm wondering if there is a way to make batch calls. As I understand it, MagickImageCollection can only take one page at a time, which is why I do it in a separate method.
I'm open to using a different tool to split the images or convert them. I'm looking into BlackIce but I'm waiting to hear about our license.
namespace PDFTools;
using ImageMagick;
using iText.Kernel.Pdf;
using iText.Layout;
/// <summary>
/// Converts a PDF into one JPEG per page by first splitting the document into
/// single-page PDFs with iText and then rendering each of them with Magick.NET.
/// </summary>
/// <param name="temporaryDirectory">
/// Directory handed to <c>MagickNET.SetTempDirectory</c> before any conversion runs.
/// </param>
public class PDFUtilities(string temporaryDirectory)
{
    private readonly string TemporaryDirectory = temporaryDirectory;

    /// <summary>
    /// Splits <paramref name="stream"/> into single-page PDFs and converts each page to a JPEG.
    /// </summary>
    /// <param name="stream">Stream containing the source PDF.</param>
    /// <returns>The encoded JPEG bytes of every page, in page order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
    public async Task<List<byte[]>> ConvertPdfToImageAsync(Stream stream)
    {
        ArgumentNullException.ThrowIfNull(stream);

        MagickNET.SetTempDirectory(this.TemporaryDirectory);

        List<MemoryStream> pdfPages = this.SplitPdf(stream);

        // Task.WhenAll yields its results in the same order as the tasks it was given,
        // so the page order is preserved without tagging and re-sorting the tasks.
        MemoryStream[] imageStreams = await Task.WhenAll(
            pdfPages.Select(pdfPage => this.ConvertPageToImageStreamAsync(pdfPage)));

        List<byte[]> results = new List<byte[]>(imageStreams.Length);
        foreach (MemoryStream imageStream in imageStreams)
        {
            // The bytes are copied out, so the intermediate stream can be released right away.
            using (imageStream)
            {
                results.Add(imageStream.ToArray());
            }
        }

        return results;
    }

    /// <summary>
    /// Renders a single-page PDF held in <paramref name="file"/> to a JPEG.
    /// </summary>
    /// <param name="file">Stream containing one single-page PDF; it is disposed by this method.</param>
    /// <returns>A stream positioned at 0 that contains the JPEG data.</returns>
    private async Task<MemoryStream> ConvertPageToImageStreamAsync(MemoryStream file)
    {
        // Both the input stream and the image collection are released even when
        // reading or writing throws.
        using (file)
        using (MagickImageCollection images = new MagickImageCollection())
        {
            MemoryStream outputStream = new MemoryStream();

            // Author's note: only one PDF is read per call; feeding several PDFs at once
            // was observed to yield only the last image.
            await images.ReadAsync(file);

            foreach (MagickImage image in images)
            {
                image.Quality = 100;
                await image.WriteAsync(outputStream, MagickFormat.Jpg);
            }

            outputStream.Position = 0;
            return outputStream;
        }
    }

    /// <summary>
    /// Splits the PDF in <paramref name="stream"/> into one in-memory PDF per page.
    /// </summary>
    /// <param name="stream">Stream containing the source PDF.</param>
    /// <returns>One readable stream per page, in page order.</returns>
    private List<MemoryStream> SplitPdf(Stream stream)
    {
        using (PdfDocument pdfDocument = new PdfDocument(new PdfReader(stream)))
        {
            // Read the page count once instead of on every loop iteration.
            int pageCount = pdfDocument.GetNumberOfPages();
            List<MemoryStream> pdfPages = new List<MemoryStream>(pageCount);

            // The loop starts at 1 and the same number is passed as both the first
            // and the last page to copy.
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                using (MemoryStream tempStream = new MemoryStream())
                {
                    using (PdfWriter writer = new PdfWriter(tempStream))
                    using (PdfDocument newPdf = new PdfDocument(writer))
                    {
                        _ = pdfDocument.CopyPagesTo(pageNumber, pageNumber, newPdf);
                    }

                    // The bytes are copied only after the new document and its writer are
                    // disposed, then wrapped in a fresh stream for the caller to read.
                    pdfPages.Add(new MemoryStream(tempStream.ToArray()));
                }
            }

            return pdfPages;
        }
    }
}
/// <summary>
/// Associates a pending <see cref="System.Threading.Tasks.Task{TResult}"/> that produces a
/// <see cref="MemoryStream"/> with an integer index, so results can be put back in order.
/// </summary>
internal class OrderedTask
{
    /// <summary>Index associated with <see cref="Task"/>; must be set in the initializer.</summary>
    public required int Index { get; set; }

    /// <summary>The task that produces the stream; must be set in the initializer.</summary>
    public required Task<MemoryStream> Task { get; set; }
}