/* ising.cu
Single spin-flip Metropolis simulation of a two-dimensional
ferromagnetic Ising model on graphics processing units (GPUs)
using the NVIDIA CUDA framework.
Copyright (C) 2009 Martin Weigel
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.
You should have received a copy of the GNU General Public License along with
this program; if not, see <http://www.gnu.org/licenses/>.
Please check (and, if relevant, cite) the related preprint arXiv:1006.3865 at
http://arxiv.org/abs/1006.3865
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cutil.h>
/* --- simulation parameters and helper macros --- */
#define DIM 2                              /* spatial dimension of the lattice */
#define L 1024                             /* linear lattice size (L x L spins) */
#define BLOCKL 16                          /* linear size of the tile handled by one thread block */
#define GRIDL (L/BLOCKL)                   /* tiles per lattice edge */
#define BLOCKS ((GRIDL*GRIDL)/2)           /* blocks per kernel launch: half the tiles (tile checkerboard) */
#define THREADS ((BLOCKL*BLOCKL)/2)        /* threads per block: each thread owns two adjacent spins */
#define N (L*L)                            /* total number of spins */
#define SPINS_PER_BLOCK (N/2)
#define TOTTHREADS (BLOCKS*THREADS)        /* total GPU threads, one RNG state each */
#define SWEEPS_EQUI 100                    /* equilibration sweeps (not measured) */
#define SWEEPS_GLOBAL 100                  /* outer measurement iterations */
#define SWEEPS_LOCAL 100                   /* multi-hit sweeps inside one kernel call (fixed halo) */
#define SWEEPS_EMPTY 1                     /* kernel-call pairs per outer iteration */
#define TOT_SWEEPS (SWEEPS_GLOBAL*SWEEPS_LOCAL*SWEEPS_EMPTY)
#define BETA 0.4                           /* inverse temperature (beta_c ~ 0.4407 for 2D Ising) */
#define SMALLMAGIC 1664525                 /* LCG multiplier (Numerical Recipes) */
#define BIGMAGIC 1013904223                /* LCG increment */
/* 2D access into the shared tile including its one-spin halo border;
   (x,y) in [-1, BLOCKL] maps into a (BLOCKL+2)^2 array */
#define sS(x,y) sS[(y+1)*(BLOCKL+2)+x+1]
/* advance the 32-bit LCG state n in place and yield the new state;
   NOTE: has a side effect on its argument */
#define RAN(n) (n = SMALLMAGIC*n + BIGMAGIC)
//#define MEAS
typedef int spin_t;                        /* one spin, value +1 or -1 */
/* Boltzmann factors exp(-2*BETA*i), i = 0..2*DIM, tabulated on the host
   and uploaded to constant memory before the first kernel launch */
__constant__ float boltzD[2*DIM+1];
/* Multi-hit Metropolis update of one checkerboard sub-lattice of tiles.
   Expected launch layout: grid = (GRIDL/2, GRIDL), block = (BLOCKL/2, BLOCKL).
   Each block updates one BLOCKL x BLOCKL tile held in shared memory together
   with a one-spin halo; each thread owns two horizontally adjacent spins.
   'offset' (0 or 1) selects which half of the tile checkerboard is updated.
   The halo is loaded once and NOT refreshed during the SWEEPS_LOCAL inner
   sweeps, so boundary spins see a frozen neighborhood within one call.
   ranvec holds one 32-bit LCG state per thread; with MEAS defined, the
   accumulated energy change of each block is added into result[]. */
__global__ void metro_sublattice_shared(spin_t *s, int *ranvec, int *result, int offset)
{
/* linear thread index within the block (0..THREADS-1) */
int n = threadIdx.y*(BLOCKL/2)+threadIdx.x;
/* global x/y origin of this block's tile; the (blockIdx.y+offset)%2 term
   staggers blocks so that concurrently updated tiles never touch */
int xoffset = (2*blockIdx.x+(blockIdx.y+offset)%2)*BLOCKL;
int yoffset = blockIdx.y*BLOCKL;
/* tile plus one-spin halo on every side */
__shared__ spin_t sS[(BLOCKL+2)*(BLOCKL+2)];
/* each thread loads its two adjacent interior spins */
sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+1] = s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x];
sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+2] = s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x+1];
/* top halo row, with periodic wrap-around at the lattice edge */
if(threadIdx.y == 0) {
if(blockIdx.y == 0) {
sS[2*threadIdx.x+1] = s[(L-1)*L+xoffset+2*threadIdx.x];
sS[2*threadIdx.x+2] = s[(L-1)*L+xoffset+2*threadIdx.x+1];
} else {
sS[2*threadIdx.x+1] = s[(yoffset-1)*L+xoffset+2*threadIdx.x];
sS[2*threadIdx.x+2] = s[(yoffset-1)*L+xoffset+2*threadIdx.x+1];
}
}
/* bottom halo row, with periodic wrap-around */
if(threadIdx.y == BLOCKL-1) {
if(blockIdx.y == GRIDL-1) {
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+1] = s[xoffset+2*threadIdx.x];
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+2] = s[xoffset+2*threadIdx.x+1];
} else {
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+1] = s[(yoffset+BLOCKL)*L+xoffset+2*threadIdx.x];
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+2] = s[(yoffset+BLOCKL)*L+xoffset+2*threadIdx.x+1];
}
}
/* left halo column, with periodic wrap-around */
if(threadIdx.x == 0) {
if(xoffset == 0) sS[(threadIdx.y+1)*(BLOCKL+2)] = s[(yoffset+threadIdx.y)*L+(L-1)];
else sS[(threadIdx.y+1)*(BLOCKL+2)] = s[(yoffset+threadIdx.y)*L+xoffset-1];
}
/* right halo column, with periodic wrap-around */
if(threadIdx.x == BLOCKL/2-1) {
if(xoffset == L-BLOCKL) sS[(threadIdx.y+1)*(BLOCKL+2)+BLOCKL+1] = s[(yoffset+threadIdx.y)*L];
else sS[(threadIdx.y+1)*(BLOCKL+2)+BLOCKL+1] = s[(yoffset+threadIdx.y)*L+xoffset+BLOCKL];
}
/* stage this thread's LCG state in shared memory for the inner loop */
__shared__ int ranvecS[THREADS];
ranvecS[n] = ranvec[(blockIdx.y*(GRIDL/2)+blockIdx.x)*THREADS+n];
__syncthreads();
#ifdef MEAS
int ie = 0;  /* accumulated energy change from accepted flips */
#endif
/* second-level checkerboard inside the tile: the column offset alternates
   with the row parity, so x1-sites and x2-sites are interleaved and the
   two half-updates below never touch neighboring spins concurrently */
int x1 = (threadIdx.y%2)+2*threadIdx.x;
int x2 = ((threadIdx.y+1)%2)+2*threadIdx.x;
int y = threadIdx.y;
for(int i = 0; i < SWEEPS_LOCAL; ++i) {
/* ide = s_i * sum of 4 neighbors; flip accepted if ide <= 0 or with
   probability boltzD[ide] = exp(-2*BETA*ide).  RAN advances the LCG in
   place and its output is mapped to [0,1) via |x| * 2^-31.
   NOTE(review): the RNG is not advanced when ide <= 0 (short-circuit) —
   presumably a deliberate speed trade-off; confirm against the paper. */
int ide = sS(x1,y)*(sS(x1-1,y)+sS(x1+1,y)+sS(x1,y-1)+sS(x1,y+1));
if(ide <= 0 || fabs(RAN(ranvecS[n])*4.656612e-10f) < boltzD[ide]) {
sS(x1,y) = -sS(x1,y);
#ifdef MEAS
ie -= 2*ide;
#endif
}
__syncthreads();
/* same Metropolis step for the second checkerboard half of the tile */
ide = sS(x2,y)*(sS(x2-1,y)+sS(x2+1,y)+sS(x2,y-1)+sS(x2,y+1));
if(ide <= 0 || fabs(RAN(ranvecS[n])*4.656612e-10f) < boltzD[ide]) {
sS(x2,y) = -sS(x2,y);
#ifdef MEAS
ie -= 2*ide;
#endif
}
__syncthreads();
}
/* write the updated tile interior back to global memory */
s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x] = sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+1];
s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x+1] = sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+2];
/* persist the advanced RNG state */
ranvec[(blockIdx.y*(GRIDL/2)+blockIdx.x)*THREADS+n] = ranvecS[n];
#ifdef MEAS
/* tree reduction of the per-thread energy changes, reusing ranvecS
   (the RNG states were already written out above) */
__syncthreads();
ranvecS[n] = ie;
__syncthreads();
for(int off = 1; off < THREADS; off *= 2) {
if(n%(2*off) == 0) {
ranvecS[n]+=ranvecS[n+off];
}
__syncthreads();
}
/* one thread per block adds the block total into the per-block tally */
if(n == 0) result[blockIdx.y*(GRIDL/2)+blockIdx.x] += ranvecS[0];
#endif
}
/* Reference (host-side) computation of the interaction sum of spin
   configuration s on the periodic L x L lattice: for every site, the spin
   is multiplied by the sum of its four nearest neighbors.  Each bond is
   counted twice by the double loop, hence the final division by two. */
int cpu_energy(spin_t *s)
{
int sum = 0;
for(int y = 0; y < L; ++y) {
for(int x = 0; x < L; ++x) {
int xm = (x == 0) ? L-1 : x-1;    /* left neighbor, periodic */
int xp = (x == L-1) ? 0 : x+1;    /* right neighbor, periodic */
int ym = (y == 0) ? L-1 : y-1;    /* top neighbor, periodic */
int yp = (y == L-1) ? 0 : y+1;    /* bottom neighbor, periodic */
sum += s[L*y+x]*(s[L*y+xm]+s[L*y+xp]+s[L*ym+x]+s[L*yp+x]);
}
}
return sum/2;
}
/* Run the full benchmark: initialize a random spin configuration, perform
   SWEEPS_EQUI equilibration sweeps on the GPU, then time SWEEPS_GLOBAL
   measured GPU sweeps followed by an equivalent single-threaded CPU run,
   printing the time per spin flip for both.  With MEAS defined, energy
   time series are written to gpu_en.dat / cpu_en.dat.
   Fixes vs. the original listing: the kernel execution configurations
   (`<<<grid, block>>>`) are restored (they were garbled to `<<>>`), and
   cudaMemcpyToSymbol is passed the symbol itself instead of its name as a
   string — string symbol lookup was removed from the CUDA runtime API. */
void simulate()
{
spin_t *s, *sD;
int ranvec[TOTTHREADS], *ranvecD, result[BLOCKS], *resultD;
float boltz[2*DIM+1];
/* one block per checkerboard half-tile; two spins per thread */
dim3 grid(GRIDL/2, GRIDL);
dim3 block(BLOCKL/2, BLOCKL);
srand48(3145627);
/* random (hot) initial configuration of +/-1 spins */
s = (spin_t*)malloc(N*sizeof(spin_t));
for(int i = 0; i < N; ++i) s[i] = (drand48() < 0.5) ? 1 : -1;
/* seed one LCG state per GPU thread from a multiplicative generator */
ranvec[0] = 1;
for(int i = 1; i < TOTTHREADS; ++i) ranvec[i]=16807*ranvec[i-1];
for(int i = 0; i < BLOCKS; ++i) result[i] = 0;
/* tabulate acceptance probabilities exp(-2*BETA*i), i = 0..2*DIM */
for(int i = 0; i <= 2*DIM; ++i) boltz[i] = exp(-2*BETA*i);
/* pass the device symbol, not the string "boltzD" */
cudaMemcpyToSymbol(boltzD, boltz, (2*DIM+1)*sizeof(float));
cudaMalloc((void**)&sD, N*sizeof(spin_t));
cudaMemcpy(sD, s, N*sizeof(spin_t), cudaMemcpyHostToDevice);
cudaMalloc((void**)&ranvecD, TOTTHREADS*sizeof(int));
cudaMemcpy(ranvecD, ranvec, TOTTHREADS*sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&resultD, BLOCKS*sizeof(int));
cudaMemcpy(resultD, result, BLOCKS*sizeof(int), cudaMemcpyHostToDevice);
int ie = cpu_energy(s);
printf("equilibration\n");
/* each full lattice sweep = one kernel call per tile sub-lattice */
for(int i = 0; i < SWEEPS_EQUI; ++i) {
for(int j = 0; j < SWEEPS_EMPTY; ++j) {
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 0);
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 1);
}
}
#ifdef MEAS
FILE *fp = fopen("gpu_en.dat","w");
#endif
printf("GPU simulation\n");
unsigned int timer;
cudaThreadSynchronize();
cutCreateTimer(&timer);
cutStartTimer(timer);
for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
for(int j = 0; j < SWEEPS_EMPTY; ++j) {
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 0);
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 1);
}
#ifdef MEAS
/* current energy = initial energy + accumulated per-block changes */
int sum = ie;
cudaMemcpy(result, resultD, BLOCKS*sizeof(int), cudaMemcpyDeviceToHost);
for(int j = 0; j < BLOCKS; ++j) sum += result[j];
fwrite(&sum, sizeof(int), 1, fp);
#endif
}
#ifdef MEAS
fclose(fp);
#endif
CUT_CHECK_ERROR("Kernel execution failed");
cudaThreadSynchronize();
cutStopTimer(timer);
printf("GPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
cudaMemcpy(s, sD, N*sizeof(spin_t), cudaMemcpyDeviceToHost);
cudaMemcpy(ranvec, ranvecD, TOTTHREADS*sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(sD);
cudaFree(ranvecD);
cudaFree(resultD);
/* single-threaded CPU reference run with the same LCG recipe */
int ran = 16807;
#ifdef MEAS
ie = cpu_energy(s);
fp = fopen("cpu_en.dat","w");
#endif
printf("CPU simulation\n");
cutCreateTimer(&timer);
cutStartTimer(timer);
for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
for(int j = 0; j < SWEEPS_EMPTY*SWEEPS_LOCAL; ++j) {
for(int x = 0; x < L; ++x) {
for(int y = 0; y < L; ++y) {
/* same Metropolis criterion as the kernel (sequential site order) */
int ide = s[L*y+x]*(s[L*y+((x==0)?L-1:x-1)]+s[L*y+((x==L-1)?0:x+1)]+s[L*((y==0)?L-1:y-1)+x]+s[L*((y==L-1)?0:y+1)+x]);
if(ide <= 0 || fabs(RAN(ran)*4.656612e-10f) < boltz[ide]) {
s[L*y+x] = -s[L*y+x];
#ifdef MEAS
ie -= 2*ide;
#endif
}
}
}
}
#ifdef MEAS
fwrite(&ie, sizeof(int), 1, fp);
#endif
}
cutStopTimer(timer);
#ifdef MEAS
fclose(fp);
#endif
printf("CPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
free(s);
}
/* Entry point: initialize the CUDA device via the cutil helper macro,
   then run the complete GPU + CPU benchmark. */
int main(int argc, char *argv[])
{
CUT_DEVICE_INIT(argc, argv);
simulate();
return 0;
}