Fellow code developers,
I have several years of experience parallelizing code with MPI. I recently began using OpenMP and quickly ran into a number of problems. Right now the most troubling one is the interaction between the Intel compiler's own optimization and my hand-written OpenMP parallelization. Allow me to demonstrate the problem with a simple case:
Suppose we have two functions. Each function loops over a vector, and there is no data dependency between the two functions. Now I use OpenMP to create two threads and let each thread handle one of these two functions. In theory, the wall time of the two-thread version should be half that of the one-thread version. In my experiments, this holds only when the compiler optimization flag is set to -O0. With -O1, it no longer holds.
If anyone can offer some insight into the problem, it would be greatly appreciated.
This is the test code:
main.cpp
#include <iostream>
#include <omp.h>
#include <vector>
#include <stdio.h>
#include <chrono>
#include "tools.h"
#define N 60000000
using namespace std;
using namespace chrono;
/**
 * In-place running (prefix) sum over one row of `data`.
 *
 * After the call, element j of row `i` holds the sum of the row's original
 * elements 0..j, for every j inside the processed range.
 *
 * @param i     Index of the row to process. It must be a valid index into
 *              `data`; this is not checked here.
 * @param data  Table of rows; only row `i` is read and written.
 *
 * At most N elements are processed, and never more than the row actually
 * holds, so a row shorter than N is not indexed out of bounds.
 * Sums are accumulated in `int`; keeping them within range is up to the caller.
 */
void func(int i, std::vector<std::vector<int> > &data) {
    // Look the row up once instead of re-indexing the outer vector on every access.
    std::vector<int> &row = data[i];

    const std::size_t wanted = static_cast<std::size_t>(N);
    const std::size_t limit = row.size() < wanted ? row.size() : wanted;

    // Every element depends on its predecessor, so this loop is inherently sequential.
    for (std::size_t j = 1; j < limit; ++j) {
        row[j] += row[j - 1];
    }
}
int main(int argc, char *argv[]) {
vector<vector<int> > data(24, vector<int>(N, 1));
string hostName, Ip;
if (GetHostInfo(hostName, Ip)) {
}
cout << "hostname: "<< hostName << ", ip: "<< Ip << endl;
auto start = system_clock::now();
#pragma omp parallel shared(data)
{
#pragma omp sections
{
#pragma omp section
func(0, data);
#pragma omp section
func(1, data);
#pragma omp section
func(2, data);
#pragma omp section
func(3, data);
#pragma omp section
func(4, data);
#pragma omp section
func(5, data);
#pragma omp section
func(6, data);
#pragma omp section
func(7, data);
#pragma omp section
func(8, data);
#pragma omp section
func(9, data);
#pragma omp section
func(10, data);
#pragma omp section
func(11, data);
#pragma omp section
func(12, data);
#pragma omp section
func(13, data);
#pragma omp section
func(14, data);
#pragma omp section
func(15, data);
#pragma omp section
func(16, data);
#pragma omp section
func(17, data);
#pragma omp section
func(18, data);
#pragma omp section
func(19, data);
#pragma omp section
func(20, data);
#pragma omp section
func(21, data);
#pragma omp section
func(22, data);
#pragma omp section
func(23, data);
}
}
auto end = system_clock::now();
auto duration = duration_cast<microseconds>(end-start);
cout << "time: "<< double(duration.count()) * microseconds::period::num / microseconds::period::den << "s\n";
return 0;
}
tools.h
#include <iostream> /* cout */
#include <unistd.h>/* gethostname */
#include <netdb.h> /* struct hostent */
#include <arpa/inet.h> /* inet_ntop */
#include <stdlib.h> /* system */
/**
 * Looks up the local host name and the textual form of its first address.
 *
 * @param hostName  Receives the name reported by gethostname(). It is set as
 *                  soon as that call succeeds, even if the address lookup
 *                  fails afterwards.
 * @param Ip        Receives the printable form of the first address returned
 *                  by gethostbyname(); left untouched on failure.
 * @return true when both outputs were filled in, false otherwise. A short
 *         diagnostic is written to std::cout on failure.
 *
 * Declared inline because it is defined in a header: without it, including
 * the header from more than one translation unit breaks the link.
 */
inline bool GetHostInfo(std::string& hostName, std::string& Ip) {
    char name[256] = {};
    if (gethostname(name, sizeof(name)) != 0) {
        std::cout << "get hostname failed";
        return false;
    }
    // POSIX leaves it unspecified whether a truncated name is terminated.
    name[sizeof(name) - 1] = '\0';
    hostName = name;

    // gethostbyname() returns a null pointer when the name cannot be resolved,
    // and the address list itself is null-terminated, so check before use.
    const struct hostent* host = gethostbyname(name);
    if (nullptr == host || nullptr == host->h_addr_list || nullptr == host->h_addr_list[0]) {
        std::cout << "hostname transform to ip failed";
        return false;
    }

    // Large enough for the longest IPv6 text form, and therefore for IPv4 too.
    char ipStr[INET6_ADDRSTRLEN];
    const char* ret = inet_ntop(host->h_addrtype, host->h_addr_list[0], ipStr, sizeof(ipStr));
    if (nullptr == ret) {
        std::cout << "hostname transform to ip failed";
        return false;
    }
    Ip = ipStr;
    return true;
}
/*
int main(int argc, char *argv[]) {
std::string hostName;
std::string Ip;
bool ret = GetHostInfo(hostName, Ip);
if (true == ret) {
std::cout << "hostname: "<< hostName << std::endl;
std::cout << "Ip: "<< Ip << std::endl;
}
system("cat /proc/cpuinfo | grep 'core id'");
return 0;
}
*/