Quantcast
Channel: Intel® Software - Intel® C++ Compiler
Viewing all articles
Browse latest Browse all 1175

Bandwidth tests

$
0
0

Hi,

I am playing with some programs to compute "bandwidth" on my system which is a Dual-Xeon Skylake Gold 6140 (2 sockets of 18 cores) with 12 DIMMS (6 per socket) of RAM at 2666 MHz for a total of 96 GB. I wrote my own "stream" benchmark, and I am surprised by some results. On this platform, Intel Advisor (the roofline) claims 207 GB/s of memory bandwidth. The Intel Memory Latency Checker gives exactly the same result for the bandwidth. Here are the results given by my program.

Bandwidth, sum += a[i] * b[i]        : 182.698 Gb/s
Bandwidth, a[i] = 0.0                : 103.311 Gb/s
Bandwidth, a[i] = 2 * a[i]           : 128.075 Gb/s
Bandwidth, a[i] = b[i]               : 136.004 Gb/s
Bandwidth, a[i] = 2 * b[i]           : 102.294 Gb/s
Bandwidth, a[i] += 2 * b[i]          : 101.337 Gb/s
Bandwidth, a[i] = 2 * b[i] + 3 * c[i]: 114.601 Gb/s
Bandwidth, a[i] = b[i] + 3 * c[i]    : 114.525 Gb/s

I have a few questions:

1/ Is there a way to reach the peak performance of 207 GB/s with the reduction (sum += a[i] * b[i]) ? Can we tune prefetching to do so?

2/ Why is the bandwidth for setting a to 0.0 so low? Can we make it faster?

Best regards

 

PS: The following code has been compiled with

icpc -g -std=c++11 -O3 -xCORE-AVX512 -qopenmp -DNDEBUG main.cpp -o main

and launched with thread pinning with 1 thread per core.

export OMP_PLACES=cores
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=36
./main

Here is the full listing

#include <chrono>
#include <cstddef>
#include <iostream>

namespace {

// Seconds elapsed between two time points of the benchmarking clock.
double elapsed_seconds(std::chrono::high_resolution_clock::time_point begin,
                       std::chrono::high_resolution_clock::time_point end) {
  return 1.0e-9 *
         std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin)
             .count();
}

// Print one result line. `bytes` is the explicit traffic of the kernel
// (loads + stores visible in the source). The divisor is 1024^3, so the
// figure is in GiB/s — the original label "Gb/s" (gigabits) was wrong.
// NOTE(review): write-allocate traffic (the cache-line read a regular store
// incurs before writing) is NOT counted here, so store-only kernels such as
// a[i] = 0.0 under-report the real bus traffic unless streaming
// (non-temporal) stores are generated — likely the answer to why that
// kernel looks slow.
void print_bandwidth(const char *label, double bytes, double seconds) {
  std::cout << label << bytes / (seconds * 1024.0 * 1024.0 * 1024.0)
            << " GiB/s" << std::endl;
}

} // namespace

// Micro-benchmark of sustained memory bandwidth for several STREAM-like
// kernels on three 8-GiB arrays. Each kernel is repeated nb_times and timed
// as a whole; empty asm blocks act as compiler barriers so the repetitions
// cannot be merged or dead-code eliminated.
int main() {
  const std::ptrdiff_t n = 1024 * 1024 * 1024; // 8 GiB per array of double
  double *a = new double[n];
  double *b = new double[n];
  double *c = new double[n];
  // Parallel initialization: with the documented launch settings
  // (OMP_PLACES=cores, OMP_PROC_BIND=spread) each page is first touched by
  // the thread that later works on it, which places pages on the local NUMA
  // node — presumably intentional; confirm the same schedule is used below.
#pragma omp parallel for
  for (std::ptrdiff_t i = 0; i < n; ++i) {
    a[i] = 0.0;
    b[i] = 0.0;
    c[i] = 0.0;
  }

  const std::ptrdiff_t nb_times = 20;
  // Bytes moved by ONE array stream across all repetitions; each kernel's
  // total traffic is a small integer multiple of this.
  const double bytes_per_stream =
      static_cast<double>(n) * sizeof(double) * nb_times;

  // Kernel 1: dot-product reduction — reads a and b (2 streams).
  double sum = 0.0;
  auto point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for reduction(+ : sum)
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      sum += a[i] * b[i];
    }
    // Compiler barrier: pretend a/b may have changed between repetitions.
    asm volatile("" : : "g"(a) : "memory");
    asm volatile("" : : "g"(b) : "memory");
  }
  auto point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, sum += a[i] * b[i]        : ",
                  2 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // Kernel 2: set — one explicit store stream (see write-allocate caveat
  // on print_bandwidth).
  point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = 0.0;
    }
    asm volatile("" : : "g"(a) : "memory");
  }
  point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, a[i] = 0.0                : ",
                  1 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // Kernel 3: in-place scale — read a, write a (2 streams).
  point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = 2 * a[i];
    }
    asm volatile("" : : "g"(a) : "memory");
  }
  point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, a[i] = 2 * a[i]           : ",
                  2 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // Kernel 4: copy — read b, write a (2 streams).
  point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = b[i];
    }
    asm volatile("" : : "g"(a) : "memory");
    asm volatile("" : : "g"(b) : "memory");
  }
  point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, a[i] = b[i]               : ",
                  2 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // Kernel 5: scaled copy — read b, write a (2 streams).
  point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = 2 * b[i];
    }
    asm volatile("" : : "g"(a) : "memory");
    asm volatile("" : : "g"(b) : "memory");
  }
  point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, a[i] = 2 * b[i]           : ",
                  2 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // Kernel 6: update — read a, read b, write a. That is 3 streams by this
  // program's own convention (kernel 3 counts read+write as 2); the
  // original counted only 2 here, under-reporting this kernel's bandwidth.
  point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] += 2 * b[i];
    }
    asm volatile("" : : "g"(a) : "memory");
    asm volatile("" : : "g"(b) : "memory");
  }
  point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, a[i] += 2 * b[i]          : ",
                  3 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // Kernel 7: triad variant — read b, read c, write a (3 streams).
  point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = 2 * b[i] + 3 * c[i];
    }
    asm volatile("" : : "g"(a) : "memory");
    asm volatile("" : : "g"(b) : "memory");
    asm volatile("" : : "g"(c) : "memory");
  }
  point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, a[i] = 2 * b[i] + 3 * c[i]: ",
                  3 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // Kernel 8: STREAM triad — read b, read c, write a (3 streams).
  point_begin = std::chrono::high_resolution_clock::now();
  for (std::ptrdiff_t k = 0; k < nb_times; ++k) {
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < n; ++i) {
      a[i] = b[i] + 3 * c[i];
    }
    asm volatile("" : : "g"(a) : "memory");
    asm volatile("" : : "g"(b) : "memory");
    asm volatile("" : : "g"(c) : "memory");
  }
  point_end = std::chrono::high_resolution_clock::now();
  print_bandwidth("Bandwidth, a[i] = b[i] + 3 * c[i]    : ",
                  3 * bytes_per_stream,
                  elapsed_seconds(point_begin, point_end));

  // The arrays are all-zero when kernel 1 runs, so this prints 0; its job
  // is only to make the reduction result observable to the optimizer.
  std::cout << "Check: " << sum << std::endl;

  delete[] c;
  delete[] b;
  delete[] a;

  return 0;
}

 


Viewing all articles
Browse latest Browse all 1175

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>