I tried to parallelize the loop and got a good result, but it is still not fast enough. This post is a follow-up to a recent one in which I optimized other parts of the code using a lookup table and spatial and temporal relationships. Those optimizations are not included in the following code, for simplicity.
The loop in question is in the hist function. Do you have any suggestions for optimizing this loop so that it runs faster?
I think it is important to mention the hardware I'll be using: Ambarella's CV25. I know there are hardware-level optimizations such as SIMD, but I'm not very familiar with that kind of low-level programming. I'm open to any solution, though.
Here are more details about the hardware:
#include <opencv2/opencv.hpp>
#include <iostream>
#include <vector>
// Parameters that are computed by compute_data() and reused by hist() on
// frames where they are not recomputed. The _b/_g/_r suffixes correspond to
// channel indices 0/1/2 of each pixel as accessed in hist().
struct Cache {
// Per-channel data, 256 entries each once compute_data() has run. In this
// simplified example they hold placeholder constants and are not read by hist().
std::vector<int> data_b;
std::vector<int> data_g;
std::vector<int> data_r;
// Per-channel lookup tables, 256 entries each once compute_data() has run.
// hist() indexes them with the current channel value to get the new value.
std::vector<uchar> lut_b;
std::vector<uchar> lut_g;
std::vector<uchar> lut_r;
};
// Fills `cache` with the example per-channel data and lookup tables.
//
// image: accepted for interface compatibility; this simplified example does
//        not read it.
// cache: every vector is (re)sized to 256 entries. The data vectors receive
//        the constants 1 (b), 2 (g) and 3 (r); each lookup table becomes the
//        identity mapping (entry i holds the value i).
void compute_data(const cv::Mat& image, Cache& cache)
{
    // Builds an identity table: `levels` entries where entry v maps to v.
    const auto make_identity = [](std::vector<uchar>& lut, int levels) {
        lut.resize(levels);
        for (int v = 0; v < levels; ++v) {
            lut[v] = static_cast<uchar>(v);
        }
    };

    // Placeholder per-channel data for the example.
    cache.data_b.assign(256, 1);
    cache.data_g.assign(256, 2);
    cache.data_r.assign(256, 3);

    // One identity lookup table per channel.
    make_identity(cache.lut_b, 256);
    make_identity(cache.lut_g, 256);
    make_identity(cache.lut_r, 256);
}
// Applies the per-channel lookup tables stored in `cache` to `image`, in place.
//
// image:     modified in place. Must be an 8-bit, 3-channel matrix (CV_8UC3)
//            because every row is accessed as an array of cv::Vec3b; any other
//            non-empty type trips CV_Assert. An empty image is left untouched.
// cache:     holds the lookup tables. It is refreshed through compute_data()
//            when `use_cache` is false, and also when any table has fewer than
//            256 entries, so a uchar index can never read past the end of a
//            table (e.g. use_cache == true on a never-populated cache).
// use_cache: true to reuse the tables already in `cache`, false to recompute
//            them before applying.
void hist(cv::Mat& image, Cache& cache, bool use_cache)
{
    const bool luts_ready = cache.lut_b.size() >= 256u
                         && cache.lut_g.size() >= 256u
                         && cache.lut_r.size() >= 256u;
    if (!use_cache || !luts_ready) {
        compute_data(image, cache);
    }

    if (image.empty()) {
        return;  // nothing to transform
    }
    // Rows are reinterpreted as cv::Vec3b below; reject other layouts up front.
    CV_Assert(image.type() == CV_8UC3);

    // Hoist loop-invariant values. The pixel stores are uchar writes, so the
    // compiler cannot assume they leave the vectors' internals untouched and
    // would otherwise re-read the table base pointers for every pixel.
    const uchar* const lut_b = cache.lut_b.data();
    const uchar* const lut_g = cache.lut_g.data();
    const uchar* const lut_r = cache.lut_r.data();
    const int cols = image.cols;

    // Apply the transformation row by row; cv::parallel_for_ hands each
    // invocation of the lambda a sub-range of rows.
    cv::parallel_for_(cv::Range(0, image.rows),
                      [&image, lut_b, lut_g, lut_r, cols](const cv::Range& range) {
        for (int y = range.start; y < range.end; ++y) {
            cv::Vec3b* const row = image.ptr<cv::Vec3b>(y);
            for (int x = 0; x < cols; ++x) {
                cv::Vec3b& px = row[x];
                px[0] = lut_b[px[0]];
                px[1] = lut_g[px[1]];
                px[2] = lut_r[px[2]];
            }
        }
    });
}
// Plays "../video.mp4" in a window, running hist() on every frame.
//
// The lookup data in the cache is recomputed on every 5th frame (starting
// with the first) and reused on the frames in between. Playback stops at the
// end of the video or when 'q' is pressed.
//
// Returns 0 on normal termination, -1 if the video file cannot be opened.
// The command-line arguments are not used.
int main(int argc, char** argv)
{
    // Open the video file
    cv::VideoCapture cap("../video.mp4");
    if (!cap.isOpened()) {
        std::cerr << "Error opening video file" << std::endl;
        return -1;
    }

    // Derive the per-frame display delay from the video's frame rate.
    // The reported rate may be 0 (or otherwise unusable) when the backend does
    // not know it; dividing by it and casting the result to int would be
    // undefined. A computed delay of 0 must also be avoided, because
    // cv::waitKey(0) waits for a key press indefinitely instead of advancing.
    constexpr int kFallbackDelayMs = 33;  // roughly 30 frames per second
    constexpr double kMaxDelayMs = 1.0e9; // keeps the cast to int in range
    const double fps = cap.get(cv::CAP_PROP_FPS);
    int delay = kFallbackDelayMs;
    if (fps > 0.0) {
        const double frame_ms = 1000.0 / fps;
        if (frame_ms < 1.0) {
            delay = 1;
        } else if (frame_ms > kMaxDelayMs) {
            delay = static_cast<int>(kMaxDelayMs);
        } else {
            delay = static_cast<int>(frame_ms);
        }
    }

    // Create a window to display the video
    cv::namedWindow("Processed Video", cv::WINDOW_NORMAL);

    cv::Mat frame;
    Cache cache;
    int frame_count = 0;
    const int recompute_interval = 5; // Recompute every 5 frames

    while (true) {
        cap >> frame;
        if (frame.empty()) {
            break; // end of stream
        }

        // Frame 0 and every recompute_interval-th frame after it recompute
        // the data; all other frames reuse the cache.
        const bool use_cache = (frame_count % recompute_interval != 0);

        // Process the frame using cached or recomputed parameters
        hist(frame, cache, use_cache);

        // Display the processed frame
        cv::imshow("Processed Video", frame);

        // Break the loop if 'q' is pressed
        if (cv::waitKey(delay) == 'q') {
            break;
        }
        frame_count++;
    }

    cap.release();
    cv::destroyAllWindows();
    return 0;
}


What parallel_for_ does is create threads, split the range by the number of threads, and call your worker function once within each thread. Your task is to move that thread creation to the start of the program.