矩陣乘法的並行化基本都是用加農算法,但是在共享內存的情況下,我覺得加農並沒有優勢。
加農保證了在每個變量全局單副本的情況下,並行度的提升。在共享內存時,沒有變量復制的成本,所以直接使用帶狀劃分可以避免迭代中間的barrier開銷,提高效率。
SMP下實現矩陣乘法
[cpp]
#include "stdafx.h"
#include "matrixOperation.h"
#include <omp.h>
int _tmain(int argc, _TCHAR* argv[])
{
const int size=5000;
double **a,**b,**c;
a=new double*[size];
b=new double*[size];
c=new double*[size];
for(int i=0;i<size;++i)
{
a[i]=new double[size];
b[i]=new double[size];
c[i]=new double[size];
}
cout<<"mem set"<<endl;
//read file
cout<<readMatrix("matrix",a,size)<<endl;
cout<<readMatrix("matrix",b,size)<<endl;
cout<<compareMatrix(a,b,size)<<endl;
//for more cache hits
//transposition b and place data needed in one cache block
matrixTransposition(b,size);
cout<<"data prepared"<<endl<<"calculating"<<endl;
long start=time(0);
// omp_set_nested(true);
#pragma omp parallel for num_threads(16) schedule(dynamic)
for(int i=0;i<size;++i)
{
// #pragma omp parallel for firstprivate(i) num_threads(4)
for(int j=0;j<size;++j)
{
c[i][j]=0;
for(int k=0;k<size;++k)
{
c[i][j]+=a[i][k]*b[j][k];//different from the original formulation
}
}
cout<<".";
}
long end=time(0);
cout<<end-start<<" seconds"<<endl;
writeMatrix("out",c,size);
for(int i=0;i<size;++i)
{
delete[] a[i];
delete[] b[i];
delete[] c[i];
}
delete[] a;
delete[] b;
delete[] c;
cin>>start;
return 0;
}
#include "stdafx.h"
#include "matrixOperation.h"
#include <omp.h>
int _tmain(int argc, _TCHAR* argv[])
{
const int size=5000;
double **a,**b,**c;
a=new double*[size];
b=new double*[size];
c=new double*[size];
for(int i=0;i<size;++i)
{
a[i]=new double[size];
b[i]=new double[size];
c[i]=new double[size];
}
cout<<"mem set"<<endl;
//read file
cout<<readMatrix("matrix",a,size)<<endl;
cout<<readMatrix("matrix",b,size)<<endl;
cout<<compareMatrix(a,b,size)<<endl;
//for more cache hits
//transposition b and place data needed in one cache block
matrixTransposition(b,size);
cout<<"data prepared"<<endl<<"calculating"<<endl;
long start=time(0);
// omp_set_nested(true);
#pragma omp parallel for num_threads(16) schedule(dynamic)
for(int i=0;i<size;++i)
{
// #pragma omp parallel for firstprivate(i) num_threads(4)
for(int j=0;j<size;++j)
{
c[i][j]=0;
for(int k=0;k<size;++k)
{
c[i][j]+=a[i][k]*b[j][k];//different from the original formulation
}
}
cout<<".";
}
long end=time(0);
cout<<end-start<<" seconds"<<endl;
writeMatrix("out",c,size);
for(int i=0;i<size;++i)
{
delete[] a[i];
delete[] b[i];
delete[] c[i];
}
delete[] a;
delete[] b;
delete[] c;
cin>>start;
return 0;
}
i7 2600處理器,5000*5000的矩陣相乘上面的參數效果較好,純計算時間在126秒左右。