针对向量化进行优化的实验:
在使用g++ -O3编译的条件下,向量化将运行时间降至baseline的38.4%
在使用icc -O3编译的条件下,向量化将运行时间降至baseline的25.3%

baseline:

#include <iostream>
#include <fstream>
#include <cstdlib>
#include <sys/time.h>
using namespace std;

// Number of interior grid cells along each axis.
const int dimX = 100;
const int dimY = 100;
const int dimZ = 100;

// Grid storage. Each axis carries one extra cell on both ends so that the
// 6-point stencil can read index-1 / index+1 for every interior cell without
// bounds checks. Only indices 1..dim are ever written; the outer layer keeps
// the zero value it gets from static initialisation.
double matrix[dimX + 2][dimY + 2][dimZ + 2];

// Fills the interior of `matrix` from the text file at `path`.
//
// The file is read as whitespace-separated tokens, one per cell, with i as
// the outermost and k as the innermost index. Each token is converted with
// atof, so a token that is not a number becomes 0.
//
// Returns true when all dimX*dimY*dimZ tokens were read, false when the file
// could not be opened or ended early.
static bool loadMatrix(const char *path) {
	std::ifstream fin(path);
	if (!fin) {
		return false;
	}
	for (int i = 1; i < dimX + 1; i++) {
		for (int j = 1; j < dimY + 1; j++) {
			for (int k = 1; k < dimZ + 1; k++) {
				char buffer[100];
				// Cap the extraction at the buffer size (terminator included) so
				// an over-long token cannot overrun the stack buffer. The width
				// is reset by every extraction, hence it is set on each pass.
				fin.width(static_cast<std::streamsize>(sizeof buffer));
				if (!(fin >> buffer)) {
					// Nothing was extracted: buffer holds no valid token.
					return false;
				}
				matrix[i][j][k] = std::atof(buffer);
			}
		}
	}
	return true;
}

// Performs one in-place relaxation pass over the interior of `matrix`.
//
// Cells are visited in ascending (i, j, k) order and each one is replaced by
// the mean of its six face neighbours. Because the update is in place, the
// neighbours at i-1, j-1 and k-1 already hold this pass's values when they
// are read, while those at i+1, j+1 and k+1 still hold the previous ones.
static void relaxOnce() {
	for (int i = 1; i < dimX + 1; i++) {
		for (int j = 1; j < dimY + 1; j++) {
			for (int k = 1; k < dimZ + 1; k++) {
				matrix[i][j][k] = (matrix[i][j][k-1] + matrix[i][j][k+1] + matrix[i][j-1][k] + matrix[i][j+1][k] + matrix[i-1][j][k] + matrix[i+1][j][k]) / 6;
			}
		}
	}
}

// Baseline benchmark driver.
//
// argv[1]: path of the input file with the initial grid values.
// argv[2]: number of relaxation passes to time (parsed with atoi, so a
//          non-numeric value yields 0 passes).
//
// Prints the wall-clock time of the passes in microseconds. Returns 0 on
// success and 1 on a usage error or when the input cannot be loaded.
int main(int argc, char **argv) {

	if (argc != 3) {
		std::cerr << "Usage: ./1 <input_file> <times>" << std::endl;
		return 1;
	}

	std::cout << "Loading data... ";
	if (!loadMatrix(argv[1])) {
		std::cout << "failed." << std::endl;
		std::cerr << "Error: could not read " << dimX * dimY * dimZ
		          << " values from " << argv[1] << std::endl;
		return 1;
	}
	std::cout << "complete." << std::endl;

	const int times = std::atoi(argv[2]);

	// Only the relaxation passes are timed; loading is excluded.
	timeval t1;
	gettimeofday(&t1, NULL);
	for (int n = 0; n < times; n++) {
		relaxOnce();
	}
	timeval t2;
	gettimeofday(&t2, NULL);

	// Widen before multiplying so the microsecond count cannot overflow on
	// platforms where time_t is 32 bits wide.
	const long long elapsedUs =
		static_cast<long long>(t2.tv_sec - t1.tv_sec) * 1000000LL
		+ (t2.tv_usec - t1.tv_usec);

	std::cout << "Time elapsed: " << elapsedUs << " microseconds" << std::endl;
	return 0;
}

针对向量化优化的版本:

#include <iostream>
#include <fstream>
#include <cstdlib>
#include <sys/time.h>
using namespace std;

// Number of interior grid cells along each axis.
const int dimX = 100;
const int dimY = 100;
const int dimZ = 100;

// Grid storage. Each axis carries one extra cell on both ends so that the
// 6-point stencil can read index-1 / index+1 for every interior cell without
// bounds checks. Only indices 1..dim are ever written; the outer layer keeps
// the zero value it gets from static initialisation.
double matrix[dimX + 2][dimY + 2][dimZ + 2];

// Fills the interior of `matrix` from the text file at `path`.
//
// The file is read as whitespace-separated tokens, one per cell, with i as
// the outermost and k as the innermost index. Each token is converted with
// atof, so a token that is not a number becomes 0.
//
// Returns true when all dimX*dimY*dimZ tokens were read, false when the file
// could not be opened or ended early.
static bool loadMatrix(const char *path) {
	std::ifstream fin(path);
	if (!fin) {
		return false;
	}
	for (int i = 1; i < dimX + 1; i++) {
		for (int j = 1; j < dimY + 1; j++) {
			for (int k = 1; k < dimZ + 1; k++) {
				char buffer[100];
				// Cap the extraction at the buffer size (terminator included) so
				// an over-long token cannot overrun the stack buffer. The width
				// is reset by every extraction, hence it is set on each pass.
				fin.width(static_cast<std::streamsize>(sizeof buffer));
				if (!(fin >> buffer)) {
					// Nothing was extracted: buffer holds no valid token.
					return false;
				}
				matrix[i][j][k] = std::atof(buffer);
			}
		}
	}
	return true;
}

// Relaxes every second cell of row (i, j), starting at k = kStart.
//
// Each visited cell is replaced by the mean of its six face neighbours.
// With a stride of 2 the loop writes cells of one k-parity only, while the
// k-1 / k+1 reads touch the other parity and the remaining reads touch other
// rows, so no iteration reads a value written by another iteration of this
// loop. That independence is presumably what allows the compiler to
// vectorise it.
static inline void relaxRow(int i, int j, int kStart) {
	for (int k = kStart; k < dimZ + 1; k += 2) {
		matrix[i][j][k] = (matrix[i][j][k-1] + matrix[i][j][k+1] + matrix[i][j-1][k] + matrix[i][j+1][k] + matrix[i-1][j][k] + matrix[i+1][j][k]) / 6;
	}
}

// Relaxes every second row of plane i, starting at j = jStart. Within each
// row the odd-k cells are updated first, then the even-k cells.
static inline void relaxRowsFrom(int i, int jStart) {
	for (int j = jStart; j < dimY + 1; j += 2) {
		relaxRow(i, j, 1);
		relaxRow(i, j, 2);
	}
}

// Relaxes every second plane, starting at i = iStart. Within each plane the
// odd-j rows are updated first, then the even-j rows.
static inline void relaxPlanesFrom(int iStart) {
	for (int i = iStart; i < dimX + 1; i += 2) {
		relaxRowsFrom(i, 1);
		relaxRowsFrom(i, 2);
	}
}

// Performs one in-place relaxation pass over the interior of `matrix`:
// odd-i planes first, then even-i planes.
//
// NOTE: the update is in place, so the visit order affects which neighbour
// values (this pass's or the previous pass's) each cell sees. This
// odd/even-interleaved order therefore generally produces different values
// than a plain ascending (i, j, k) sweep over the same data.
static void relaxOnce() {
	relaxPlanesFrom(1);
	relaxPlanesFrom(2);
}

// Benchmark driver for the odd/even-interleaved relaxation.
//
// argv[1]: path of the input file with the initial grid values.
// argv[2]: number of relaxation passes to time (parsed with atoi, so a
//          non-numeric value yields 0 passes).
//
// Prints the wall-clock time of the passes in microseconds. Returns 0 on
// success and 1 on a usage error or when the input cannot be loaded.
int main(int argc, char **argv) {

	if (argc != 3) {
		std::cerr << "Usage: ./1 <input_file> <times>" << std::endl;
		return 1;
	}

	std::cout << "Loading data... ";
	if (!loadMatrix(argv[1])) {
		std::cout << "failed." << std::endl;
		std::cerr << "Error: could not read " << dimX * dimY * dimZ
		          << " values from " << argv[1] << std::endl;
		return 1;
	}
	std::cout << "complete." << std::endl;

	const int times = std::atoi(argv[2]);

	// Only the relaxation passes are timed; loading is excluded.
	timeval t1;
	gettimeofday(&t1, NULL);
	for (int n = 0; n < times; n++) {
		relaxOnce();
	}
	timeval t2;
	gettimeofday(&t2, NULL);

	// Widen before multiplying so the microsecond count cannot overflow on
	// platforms where time_t is 32 bits wide.
	const long long elapsedUs =
		static_cast<long long>(t2.tv_sec - t1.tv_sec) * 1000000LL
		+ (t2.tv_usec - t1.tv_usec);

	std::cout << "Time elapsed: " << elapsedUs << " microseconds" << std::endl;
	return 0;
}