/* ising.cu
Single spin-flip Metropolis simulation of a two-dimensional
ferromagnetic Ising model on graphics processing units (GPUs)
using the NVIDIA CUDA framework.
Copyright (C) 2009 Martin Weigel
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.
You should have received a copy of the GNU General Public License along with
this program; if not, see <http://www.gnu.org/licenses/>.
Please check (and, if relevant, cite) the related preprint arXiv:1006.3865 at
http://arxiv.org/abs/1006.3865
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cutil.h>
/* --- simulation parameters and helper macros --- */
#define DIM 2                              /* spatial dimension of the lattice */
#define L 1024                             /* linear lattice size (L x L spins) */
#define BLOCKL 16                          /* linear size of the tile handled by one thread block */
#define GRIDL (L/BLOCKL)                   /* tiles per lattice edge */
#define BLOCKS ((GRIDL*GRIDL)/2)           /* blocks per kernel launch: half the tiles (tile checkerboard) */
#define THREADS ((BLOCKL*BLOCKL)/2)        /* threads per block: each thread owns two adjacent spins */
#define N (L*L)                            /* total number of spins */
#define SPINS_PER_BLOCK (N/2)
#define TOTTHREADS (BLOCKS*THREADS)        /* total GPU threads, one RNG state each */
#define SWEEPS_EQUI 100                    /* equilibration sweeps (not measured) */
#define SWEEPS_GLOBAL 100                  /* outer measurement iterations */
#define SWEEPS_LOCAL 100                   /* multi-hit sweeps inside one kernel call (fixed halo) */
#define SWEEPS_EMPTY 1                     /* kernel-call pairs per outer iteration */
#define TOT_SWEEPS (SWEEPS_GLOBAL*SWEEPS_LOCAL*SWEEPS_EMPTY)
#define BETA 0.4                           /* inverse temperature (beta_c ~ 0.4407 for 2D Ising) */
#define SMALLMAGIC 1664525                 /* LCG multiplier (Numerical Recipes) */
#define BIGMAGIC 1013904223                /* LCG increment */
/* 2D access into the shared tile including its one-spin halo border;
   (x,y) in [-1, BLOCKL] maps into a (BLOCKL+2)^2 array */
#define sS(x,y) sS[(y+1)*(BLOCKL+2)+x+1]
/* advance the 32-bit LCG state n in place and yield the new state;
   NOTE: has a side effect on its argument */
#define RAN(n) (n = SMALLMAGIC*n + BIGMAGIC)
//#define MEAS
typedef int spin_t;                        /* one spin, value +1 or -1 */
/* Boltzmann factors exp(-2*BETA*i), i = 0..2*DIM, tabulated on the host
   and uploaded to constant memory before the first kernel launch */
__constant__ float boltzD[2*DIM+1];
/* Multi-hit Metropolis update of one checkerboard sub-lattice of tiles.
   Expected launch layout: grid = (GRIDL/2, GRIDL), block = (BLOCKL/2, BLOCKL).
   Each block updates one BLOCKL x BLOCKL tile held in shared memory together
   with a one-spin halo; each thread owns two horizontally adjacent spins.
   'offset' (0 or 1) selects which half of the tile checkerboard is updated.
   The halo is loaded once and NOT refreshed during the SWEEPS_LOCAL inner
   sweeps, so boundary spins see a frozen neighborhood within one call.
   ranvec holds one 32-bit LCG state per thread; with MEAS defined, the
   accumulated energy change of each block is added into result[]. */
__global__ void metro_sublattice_shared(spin_t *s, int *ranvec, int *result, int offset)
{
/* linear thread index within the block (0..THREADS-1) */
int n = threadIdx.y*(BLOCKL/2)+threadIdx.x;
/* global x/y origin of this block's tile; the (blockIdx.y+offset)%2 term
   staggers blocks so that concurrently updated tiles never touch */
int xoffset = (2*blockIdx.x+(blockIdx.y+offset)%2)*BLOCKL;
int yoffset = blockIdx.y*BLOCKL;
/* tile plus one-spin halo on every side */
__shared__ spin_t sS[(BLOCKL+2)*(BLOCKL+2)];
/* each thread loads its two adjacent interior spins */
sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+1] = s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x];
sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+2] = s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x+1];
/* top halo row, with periodic wrap-around at the lattice edge */
if(threadIdx.y == 0) {
if(blockIdx.y == 0) {
sS[2*threadIdx.x+1] = s[(L-1)*L+xoffset+2*threadIdx.x];
sS[2*threadIdx.x+2] = s[(L-1)*L+xoffset+2*threadIdx.x+1];
} else {
sS[2*threadIdx.x+1] = s[(yoffset-1)*L+xoffset+2*threadIdx.x];
sS[2*threadIdx.x+2] = s[(yoffset-1)*L+xoffset+2*threadIdx.x+1];
}
}
/* bottom halo row, with periodic wrap-around */
if(threadIdx.y == BLOCKL-1) {
if(blockIdx.y == GRIDL-1) {
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+1] = s[xoffset+2*threadIdx.x];
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+2] = s[xoffset+2*threadIdx.x+1];
} else {
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+1] = s[(yoffset+BLOCKL)*L+xoffset+2*threadIdx.x];
sS[(BLOCKL+1)*(BLOCKL+2)+2*threadIdx.x+2] = s[(yoffset+BLOCKL)*L+xoffset+2*threadIdx.x+1];
}
}
/* left halo column, with periodic wrap-around */
if(threadIdx.x == 0) {
if(xoffset == 0) sS[(threadIdx.y+1)*(BLOCKL+2)] = s[(yoffset+threadIdx.y)*L+(L-1)];
else sS[(threadIdx.y+1)*(BLOCKL+2)] = s[(yoffset+threadIdx.y)*L+xoffset-1];
}
/* right halo column, with periodic wrap-around */
if(threadIdx.x == BLOCKL/2-1) {
if(xoffset == L-BLOCKL) sS[(threadIdx.y+1)*(BLOCKL+2)+BLOCKL+1] = s[(yoffset+threadIdx.y)*L];
else sS[(threadIdx.y+1)*(BLOCKL+2)+BLOCKL+1] = s[(yoffset+threadIdx.y)*L+xoffset+BLOCKL];
}
/* stage this thread's LCG state in shared memory for the inner loop */
__shared__ int ranvecS[THREADS];
ranvecS[n] = ranvec[(blockIdx.y*(GRIDL/2)+blockIdx.x)*THREADS+n];
__syncthreads();
#ifdef MEAS
int ie = 0;  /* accumulated energy change from accepted flips */
#endif
/* second-level checkerboard inside the tile: the column offset alternates
   with the row parity, so x1-sites and x2-sites are interleaved and the
   two half-updates below never touch neighboring spins concurrently */
int x1 = (threadIdx.y%2)+2*threadIdx.x;
int x2 = ((threadIdx.y+1)%2)+2*threadIdx.x;
int y = threadIdx.y;
for(int i = 0; i < SWEEPS_LOCAL; ++i) {
/* ide = s_i * sum of 4 neighbors; flip accepted if ide <= 0 or with
   probability boltzD[ide] = exp(-2*BETA*ide).  RAN advances the LCG in
   place and its output is mapped to [0,1) via |x| * 2^-31.
   NOTE(review): the RNG is not advanced when ide <= 0 (short-circuit) —
   presumably a deliberate speed trade-off; confirm against the paper. */
int ide = sS(x1,y)*(sS(x1-1,y)+sS(x1+1,y)+sS(x1,y-1)+sS(x1,y+1));
if(ide <= 0 || fabs(RAN(ranvecS[n])*4.656612e-10f) < boltzD[ide]) {
sS(x1,y) = -sS(x1,y);
#ifdef MEAS
ie -= 2*ide;
#endif
}
__syncthreads();
/* same Metropolis step for the second checkerboard half of the tile */
ide = sS(x2,y)*(sS(x2-1,y)+sS(x2+1,y)+sS(x2,y-1)+sS(x2,y+1));
if(ide <= 0 || fabs(RAN(ranvecS[n])*4.656612e-10f) < boltzD[ide]) {
sS(x2,y) = -sS(x2,y);
#ifdef MEAS
ie -= 2*ide;
#endif
}
__syncthreads();
}
/* write the updated tile interior back to global memory */
s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x] = sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+1];
s[(yoffset+threadIdx.y)*L+xoffset+2*threadIdx.x+1] = sS[(threadIdx.y+1)*(BLOCKL+2)+2*threadIdx.x+2];
/* persist the advanced RNG state */
ranvec[(blockIdx.y*(GRIDL/2)+blockIdx.x)*THREADS+n] = ranvecS[n];
#ifdef MEAS
/* tree reduction of the per-thread energy changes, reusing ranvecS
   (the RNG states were already written out above) */
__syncthreads();
ranvecS[n] = ie;
__syncthreads();
for(int off = 1; off < THREADS; off *= 2) {
if(n%(2*off) == 0) {
ranvecS[n]+=ranvecS[n+off];
}
__syncthreads();
}
/* one thread per block adds the block total into the per-block tally */
if(n == 0) result[blockIdx.y*(GRIDL/2)+blockIdx.x] += ranvecS[0];
#endif
}
/* Reference (host-side) computation of the interaction sum of spin
   configuration s on the periodic L x L lattice: for every site, the spin
   is multiplied by the sum of its four nearest neighbors.  Each bond is
   counted twice by the double loop, hence the final division by two. */
int cpu_energy(spin_t *s)
{
int sum = 0;
for(int y = 0; y < L; ++y) {
for(int x = 0; x < L; ++x) {
int xm = (x == 0) ? L-1 : x-1;    /* left neighbor, periodic */
int xp = (x == L-1) ? 0 : x+1;    /* right neighbor, periodic */
int ym = (y == 0) ? L-1 : y-1;    /* top neighbor, periodic */
int yp = (y == L-1) ? 0 : y+1;    /* bottom neighbor, periodic */
sum += s[L*y+x]*(s[L*y+xm]+s[L*y+xp]+s[L*ym+x]+s[L*yp+x]);
}
}
return sum/2;
}
/* Run the full benchmark: initialize a random spin configuration, perform
   SWEEPS_EQUI equilibration sweeps on the GPU, then time SWEEPS_GLOBAL
   measured GPU sweeps followed by an equivalent single-threaded CPU run,
   printing the time per spin flip for both.  With MEAS defined, energy
   time series are written to gpu_en.dat / cpu_en.dat.
   Fixes vs. the original listing: the kernel execution configurations
   (`<<<grid, block>>>`) are restored (they were garbled to `<<>>`), and
   cudaMemcpyToSymbol is passed the symbol itself instead of its name as a
   string — string symbol lookup was removed from the CUDA runtime API. */
void simulate()
{
spin_t *s, *sD;
int ranvec[TOTTHREADS], *ranvecD, result[BLOCKS], *resultD;
float boltz[2*DIM+1];
/* one block per checkerboard half-tile; two spins per thread */
dim3 grid(GRIDL/2, GRIDL);
dim3 block(BLOCKL/2, BLOCKL);
srand48(3145627);
/* random (hot) initial configuration of +/-1 spins */
s = (spin_t*)malloc(N*sizeof(spin_t));
for(int i = 0; i < N; ++i) s[i] = (drand48() < 0.5) ? 1 : -1;
/* seed one LCG state per GPU thread from a multiplicative generator */
ranvec[0] = 1;
for(int i = 1; i < TOTTHREADS; ++i) ranvec[i]=16807*ranvec[i-1];
for(int i = 0; i < BLOCKS; ++i) result[i] = 0;
/* tabulate acceptance probabilities exp(-2*BETA*i), i = 0..2*DIM */
for(int i = 0; i <= 2*DIM; ++i) boltz[i] = exp(-2*BETA*i);
/* pass the device symbol, not the string "boltzD" */
cudaMemcpyToSymbol(boltzD, boltz, (2*DIM+1)*sizeof(float));
cudaMalloc((void**)&sD, N*sizeof(spin_t));
cudaMemcpy(sD, s, N*sizeof(spin_t), cudaMemcpyHostToDevice);
cudaMalloc((void**)&ranvecD, TOTTHREADS*sizeof(int));
cudaMemcpy(ranvecD, ranvec, TOTTHREADS*sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&resultD, BLOCKS*sizeof(int));
cudaMemcpy(resultD, result, BLOCKS*sizeof(int), cudaMemcpyHostToDevice);
int ie = cpu_energy(s);
printf("equilibration\n");
/* each full lattice sweep = one kernel call per tile sub-lattice */
for(int i = 0; i < SWEEPS_EQUI; ++i) {
for(int j = 0; j < SWEEPS_EMPTY; ++j) {
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 0);
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 1);
}
}
#ifdef MEAS
FILE *fp = fopen("gpu_en.dat","w");
#endif
printf("GPU simulation\n");
unsigned int timer;
cudaThreadSynchronize();
cutCreateTimer(&timer);
cutStartTimer(timer);
for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
for(int j = 0; j < SWEEPS_EMPTY; ++j) {
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 0);
metro_sublattice_shared<<<grid, block>>>(sD, ranvecD, resultD, 1);
}
#ifdef MEAS
/* current energy = initial energy + accumulated per-block changes */
int sum = ie;
cudaMemcpy(result, resultD, BLOCKS*sizeof(int), cudaMemcpyDeviceToHost);
for(int j = 0; j < BLOCKS; ++j) sum += result[j];
fwrite(&sum, sizeof(int), 1, fp);
#endif
}
#ifdef MEAS
fclose(fp);
#endif
CUT_CHECK_ERROR("Kernel execution failed");
cudaThreadSynchronize();
cutStopTimer(timer);
printf("GPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
cudaMemcpy(s, sD, N*sizeof(spin_t), cudaMemcpyDeviceToHost);
cudaMemcpy(ranvec, ranvecD, TOTTHREADS*sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(sD);
cudaFree(ranvecD);
cudaFree(resultD);
/* single-threaded CPU reference run with the same LCG recipe */
int ran = 16807;
#ifdef MEAS
ie = cpu_energy(s);
fp = fopen("cpu_en.dat","w");
#endif
printf("CPU simulation\n");
cutCreateTimer(&timer);
cutStartTimer(timer);
for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
for(int j = 0; j < SWEEPS_EMPTY*SWEEPS_LOCAL; ++j) {
for(int x = 0; x < L; ++x) {
for(int y = 0; y < L; ++y) {
/* same Metropolis criterion as the kernel (sequential site order) */
int ide = s[L*y+x]*(s[L*y+((x==0)?L-1:x-1)]+s[L*y+((x==L-1)?0:x+1)]+s[L*((y==0)?L-1:y-1)+x]+s[L*((y==L-1)?0:y+1)+x]);
if(ide <= 0 || fabs(RAN(ran)*4.656612e-10f) < boltz[ide]) {
s[L*y+x] = -s[L*y+x];
#ifdef MEAS
ie -= 2*ide;
#endif
}
}
}
}
#ifdef MEAS
fwrite(&ie, sizeof(int), 1, fp);
#endif
}
cutStopTimer(timer);
#ifdef MEAS
fclose(fp);
#endif
printf("CPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
free(s);
}
/* Entry point: initialize the CUDA device via the cutil helper macro,
   then run the complete GPU + CPU benchmark. */
int main(int argc, char *argv[])
{
CUT_DEVICE_INIT(argc, argv);
simulate();
return 0;
}