Hi,
I am developing a small program to present at a code modernization workshop. The code shown here is the unoptimized version. What surprises me is that Intel Advisor tells me that the loop on line 62 is not optimized because "unsigned types for induction variable and/or for lower/upper iteration bounds make loop uncountable".
I don't really understand why this loop is uncountable. Do you have any idea?
I use Parallel Studio XE 2017 on Linux. The code is below:
icpc -g -std=c++11 -O3 -xHost kmeans.cpp main.cpp -o main
#include <cstddef>
#include <cstdio>

// Implemented in kmeans.cpp. Returns the elapsed clustering time in seconds.
double kmeans_clustering(std::size_t nb_point, std::size_t nb_cluster, std::size_t nb_iteration);

// Runs the k-means benchmark with a fixed problem size and prints the
// elapsed time. Always exits with status 0.
int main() {
  const std::size_t nb_point = 1000000;
  const std::size_t nb_cluster = 1000;
  const std::size_t nb_iteration = 10;

  const double time = kmeans_clustering(nb_point, nb_cluster, nb_iteration);
  std::printf("Time: %7.2f\n", time);

  // Do not return `time`: converting the elapsed seconds to int would make
  // the process report a non-zero (failure) exit status whenever the run
  // takes one second or longer.
  return 0;
}
#include <chrono>
#include <cmath>
#include <cstddef>
#include <limits>
#include <random>
#include <vector>

// One RGB sample with three float channels.
struct Pixel {
  float red;
  float green;
  float blue;
};

// Runs a fixed number of k-means iterations on randomly generated RGB points
// and reports how long the clustering loop took.
//
//   nb_point      number of random points to generate
//   nb_cluster    number of clusters (centroids)
//   nb_iteration  number of "reassign points" passes; the centroids are
//                 recomputed nb_iteration + 1 times
//
// Returns the time spent in the clustering loop, in seconds. The random data
// generation is not included in the measurement. Returns 0.0 without doing
// any work when nb_cluster is 0.
double kmeans_clustering(std::size_t nb_point, std::size_t nb_cluster, std::size_t nb_iteration) {
  // With no clusters, `nb_cluster - 1` below would wrap around to the largest
  // std::size_t and the random initial assignments would index past the end
  // of the (empty) centroid arrays.
  if (nb_cluster == 0) {
    return 0.0;
  }

  std::vector<Pixel> point(nb_point);
  std::vector<std::size_t> cluster(nb_point);
  std::vector<Pixel> centroid(nb_cluster);
  std::vector<std::size_t> point_per_cluster(nb_cluster);

  // The engine is default-constructed (default seed), so the same input is
  // generated on every run with a given standard library.
  std::default_random_engine engine{};
  std::uniform_real_distribution<float> r_dist{0.0f, 1.0f};
  std::uniform_int_distribution<std::size_t> i_dist{0, nb_cluster - 1};
  for (std::size_t k = 0; k < nb_point; ++k) {
    point[k].red = r_dist(engine);
    point[k].green = r_dist(engine);
    point[k].blue = r_dist(engine);
    cluster[k] = i_dist(engine);
  }

  // steady_clock is monotonic, which is what an interval measurement needs.
  const auto start = std::chrono::steady_clock::now();

  for (std::size_t iteration = 0;; ++iteration) {
    // Compute the centroid of the clusters
    for (std::size_t i = 0; i < nb_cluster; ++i) {
      centroid[i] = Pixel{0.0f, 0.0f, 0.0f};
      point_per_cluster[i] = 0;
    }
    for (std::size_t k = 0; k < nb_point; ++k) {
      const std::size_t i = cluster[k];
      centroid[i].red += point[k].red;
      centroid[i].green += point[k].green;
      centroid[i].blue += point[k].blue;
      ++point_per_cluster[i];
    }
    for (std::size_t i = 0; i < nb_cluster; ++i) {
      // An empty cluster divides 0 by 0 here. Under IEEE-754 arithmetic its
      // centroid becomes NaN, the `distance < best_distance` test below is
      // then always false for it, and the cluster is never selected again.
      const float count = static_cast<float>(point_per_cluster[i]);
      centroid[i].red /= count;
      centroid[i].green /= count;
      centroid[i].blue /= count;
    }

    // Exit once the requested number of iterations has been performed
    // (there is no convergence test).
    if (iteration == nb_iteration) {
      break;
    }

    // Reassign points to clusters
    for (std::size_t k = 0; k < nb_point; ++k) {
      const Pixel p = point[k];
      float best_distance = std::numeric_limits<float>::max();
      // Keep the current assignment if no centroid compares closer, instead
      // of storing an out-of-range sentinel index.
      std::size_t best_centroid = cluster[k];
      for (std::size_t i = 0; i < nb_cluster; ++i) {
        const float x = p.red - centroid[i].red;
        const float y = p.green - centroid[i].green;
        const float z = p.blue - centroid[i].blue;
        // Plain multiplications stay in float; std::pow(x, 2) would promote
        // to double and call a general-purpose routine in the hottest loop.
        const float distance = x * x + y * y + z * z;
        if (distance < best_distance) {
          best_distance = distance;
          best_centroid = i;
        }
      }
      cluster[k] = best_centroid;
    }
  }

  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double>(end - start).count();
}