ESPResSo 3.2.0-11-g9950804-git
Extensible Simulation Package for Soft Matter Research
lbgpu.cu
Go to the documentation of this file.
00001 /* 
00002    Copyright (C) 2010,2011,2012,2013 The ESPResSo project
00003 
00004    This file is part of ESPResSo.
00005   
00006    ESPResSo is free software: you can redistribute it and/or modify
00007    it under the terms of the GNU General Public License as published by
00008    the Free Software Foundation, either version 3 of the License, or
00009    (at your option) any later version.
00010    
00011    ESPResSo is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014    GNU General Public License for more details.
00015    
00016    You should have received a copy of the GNU General Public License
00017    along with this program.  If not, see <http://www.gnu.org/licenses/>.
00018 */
00019 
00020 /** \file lbgpu.cu
00021  *
00022  * Cuda (.cu) file for the Lattice Boltzmann implementation on GPUs.
00023  * Header file for \ref lbgpu.h.
00024  */
00025 
00026 #include <stdio.h>
00027 #include <cuda.h>
00028 #include <stdlib.h>
00029 
00030 extern "C" {
00031 #include "lbgpu.h"
00032 #include "config.h"
00033 }
00034 
00035 #ifdef LB_GPU
00036 #ifndef GAUSSRANDOM
00037 #define GAUSSRANDOM
00038 #endif
00039 
00040 /**defining structures residing in global memory */
00041 /** struct for phys. values */
00042 static LB_values_gpu *device_values = NULL;
00043 /** structs for velocity densities */
00044 static LB_nodes_gpu nodes_a;
00045 static LB_nodes_gpu nodes_b;
00046 /** struct for particle force */
00047 static LB_particle_force_gpu *particle_force = NULL;
00048 /** struct for particle position and veloctiy */
00049 static LB_particle_gpu *particle_data = NULL;
00050 /** struct for node force */
00051 static LB_node_force_gpu node_f;
00052 /** struct for storing particle rn seed */
00053 static LB_particle_seed_gpu *part = NULL;
00054 
00055 static LB_extern_nodeforce_gpu *extern_nodeforces = NULL;
00056 
00057 #ifdef LB_BOUNDARIES_GPU
00058 static float* LB_boundary_force = NULL;
00059 static float* LB_boundary_velocity = NULL;
00060 /** pointer for bound index array*/
00061 static int *boundary_node_list;
00062 static int *boundary_index_list;
00063 static __device__ __constant__ int n_lb_boundaries_gpu = 0;
00064 static size_t size_of_boundindex;
00065 #endif
00066 /** pointers for additional cuda check flag*/
00067 static int *gpu_check = NULL;
00068 static int *h_gpu_check = NULL;
00069 
00070 static unsigned int intflag = 1;
00071 static LB_nodes_gpu *current_nodes = NULL;
00072 /**defining size values for allocating global memory */
00073 static size_t size_of_values;
00074 static size_t size_of_forces;
00075 static size_t size_of_positions;
00076 static size_t size_of_seed;
00077 static size_t size_of_extern_nodeforces;
00078 
00079 /**parameters residing in constant memory */
00080 static __device__ __constant__ LB_parameters_gpu para;
00081 static const float c_sound_sq = 1.f/3.f;
00082 /**cudasteams for parallel computing on cpu and gpu */
00083 cudaStream_t stream[1];
00084 
00085 cudaError_t err;
00086 cudaError_t _err;
00087 int initflag = 0;
00088 int partinitflag = 0;
00089 /*-------------------------------------------------------*/
00090 /*********************************************************/
00091 /** \name device funktions called by kernel funktions */
00092 /*********************************************************/
00093 /*-------------------------------------------------------*/
00094 
00095 /*-------------------------------------------------------*/
00096 
00097 /** atomic add function for sveral cuda architectures 
00098 */
00099 __device__ inline void atomicadd(float* address, float value){
00100 #if !defined __CUDA_ARCH__ || __CUDA_ARCH__ >= 200 // for Fermi, atomicAdd supports floats
00101   atomicAdd(address, value);
00102 #elif __CUDA_ARCH__ >= 110
00103 #warning Using slower atomicAdd emulation
00104 // float-atomic-add from 
00105 // [url="http://forums.nvidia.com/index.php?showtopic=158039&view=findpost&p=991561"]
00106   float old = value;
00107   while ((old = atomicExch(address, atomicExch(address, 0.0f)+old))!=0.0f);
00108 #else
00109 #error I need at least compute capability 1.1
00110 #endif
00111 }
00112 
00113 /**randomgenerator which generates numbers [0,1]
00114  * @param *rn   Pointer to randomnumber array of the local node or particle 
00115 */
00116 __device__ void random_01(LB_randomnr_gpu *rn){
00117 
00118   const float mxi = 1.f/(float)(1ul<<31);
00119   unsigned int curr = rn->seed;
00120 
00121   curr = 1103515245 * curr + 12345;
00122   rn->randomnr[0] = (float)(curr & ((1ul<<31)-1))*mxi;
00123   curr = 1103515245 * curr + 12345;
00124   rn->randomnr[1] = (float)(curr & ((1ul<<31)-1))*mxi;
00125   rn->seed = curr;
00126 
00127 }
00128 
00129 /** gaussian random nummber generator for thermalisation
00130  * @param *rn   Pointer to randomnumber array of the local node node or particle 
00131 */
00132 __device__ void gaussian_random(LB_randomnr_gpu *rn){
00133 
00134   float x1, x2;
00135   float r2, fac;
00136   /** On every second call two gaussian random numbers are calculated
00137    via the Box-Muller transformation.*/
00138   /** draw two uniform random numbers in the unit circle */
00139   do {
00140     random_01(rn);
00141     x1 = 2.f*rn->randomnr[0]-1.f;
00142     x2 = 2.f*rn->randomnr[1]-1.f;
00143     r2 = x1*x1 + x2*x2;
00144   } while (r2 >= 1.f || r2 == 0.f);
00145 
00146   /** perform Box-Muller transformation */
00147   fac = sqrtf(-2.f*__logf(r2)/r2);
00148   rn->randomnr[0] = x2*fac;
00149   rn->randomnr[1] = x1*fac;
00150   
00151 }
00152 
00153 /**tranformation from 1d array-index to xyz
00154  * @param index         node index / thread index (Input)
00155  * @param xyz           Pointer to calculated xyz array (Output)
00156  */
00157 __device__ void index_to_xyz(unsigned int index, unsigned int *xyz){
00158 
00159   xyz[0] = index%para.dim_x;
00160   index /= para.dim_x;
00161   xyz[1] = index%para.dim_y;
00162   index /= para.dim_y;
00163   xyz[2] = index;
00164 }
00165 
00166 /**calculation of the modes from the velocitydensities (space-transform.)
00167  * @param n_a           Pointer to local node residing in array a (Input)
00168  * @param index         node index / thread index (Input)
00169  * @param mode          Pointer to the local register values mode (Output)
00170 */
00171 __device__ void calc_m_from_n(LB_nodes_gpu n_a, unsigned int index, float *mode){
00172 
00173   /* mass mode */
00174   mode[0] = n_a.vd[0*para.number_of_nodes + index] + n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]
00175           + n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index] + n_a.vd[5*para.number_of_nodes + index]
00176           + n_a.vd[6*para.number_of_nodes + index] + n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index]
00177           + n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index] + n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]
00178           + n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index] + n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]
00179           + n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index];
00180 
00181   /* momentum modes */
00182   mode[1] = (n_a.vd[1*para.number_of_nodes + index] - n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index])
00183           + (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index])
00184           + (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]);
00185   mode[2] = (n_a.vd[3*para.number_of_nodes + index] - n_a.vd[4*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index])
00186           - (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index])
00187           + (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]);
00188   mode[3] = (n_a.vd[5*para.number_of_nodes + index] - n_a.vd[6*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index])
00189           - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index])
00190           - (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]);
00191 
00192   /* stress modes */
00193   mode[4] = -(n_a.vd[0*para.number_of_nodes + index]) + n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index] + n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index]
00194           + n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index] + n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]
00195           + n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index] + n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index];
00196   mode[5] = n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index] - (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index])
00197           + (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) + (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index])
00198           - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]);
00199   mode[6] = (n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index])
00200           - (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index])
00201           - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index])
00202           - 2.f*(n_a.vd[5*para.number_of_nodes + index] + n_a.vd[6*para.number_of_nodes + index] - (n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index])
00203           - (n_a.vd[9*para.number_of_nodes + index] +n_a.vd[10*para.number_of_nodes + index]));
00204   mode[7] = n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index] - (n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index]);
00205   mode[8] = n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index] - (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]);
00206   mode[9] = n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index] - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]);
00207 
00208   /* kinetic modes */
00209   mode[10] = -2.f*(n_a.vd[1*para.number_of_nodes + index] - n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index])
00210            + (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index])
00211            + (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]);
00212   mode[11] = -2.f*(n_a.vd[3*para.number_of_nodes + index] - n_a.vd[4*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index])
00213            - (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index])
00214            + (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]);
00215   mode[12] = -2.f*(n_a.vd[5*para.number_of_nodes + index] - n_a.vd[6*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index])
00216            - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index])
00217            - (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]);
00218   mode[13] = (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) + (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index])
00219            - (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]);
00220   mode[14] = (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) - (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index])
00221            - (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]);
00222   mode[15] = (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index])
00223            - (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) + (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]);
00224   mode[16] = n_a.vd[0*para.number_of_nodes + index] + n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index] + n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index]
00225            + n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index] + n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]
00226            + n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index] + n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]
00227            - 2.f*((n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index])
00228            + (n_a.vd[5*para.number_of_nodes + index] + n_a.vd[6*para.number_of_nodes + index]));
00229   mode[17] = -(n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index])
00230            + (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) + (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index])
00231            - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]);
00232   mode[18] = -(n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) - (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index])
00233            - (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index])
00234            - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index])
00235            + 2.f*((n_a.vd[5*para.number_of_nodes + index] + n_a.vd[6*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index])
00236            + (n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index]));
00237 
00238 }
00239 
00240 /**lb_relax_modes, means collision update of the modes
00241  * @param index         node index / thread index (Input)
00242  * @param mode          Pointer to the local register values mode (Input/Output)
00243  * @param node_f        Pointer to local node force (Input)
00244 */
00245 __device__ void relax_modes(float *mode, unsigned int index, LB_node_force_gpu node_f){
00246 
00247   float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid;
00248   float j[3], pi_eq[6];
00249 
00250   /** re-construct the real density
00251   * remember that the populations are stored as differences to their
00252   * equilibrium value */
00253 
00254   j[0] = mode[1];
00255   j[1] = mode[2];
00256   j[2] = mode[3];
00257 
00258   /** if forces are present, the momentum density is redefined to
00259   * inlcude one half-step of the force action.  See the
00260   * Chapman-Enskog expansion in [Ladd & Verberg]. */
00261 
00262   j[0] += 0.5f*node_f.force[0*para.number_of_nodes + index];
00263   j[1] += 0.5f*node_f.force[1*para.number_of_nodes + index];
00264   j[2] += 0.5f*node_f.force[2*para.number_of_nodes + index];
00265 
00266   /** equilibrium part of the stress modes (eq13 schiller)*/
00267   pi_eq[0] = ((j[0]*j[0])+(j[1]*j[1])+(j[2]*j[2]))/Rho;
00268   pi_eq[1] = ((j[0]*j[0])-(j[1]*j[1]))/Rho;
00269   pi_eq[2] = (((j[0]*j[0])+(j[1]*j[1])+(j[2]*j[2])) - 3.0f*(j[2]*j[2]))/Rho;
00270   pi_eq[3] = j[0]*j[1]/Rho;
00271   pi_eq[4] = j[0]*j[2]/Rho;
00272   pi_eq[5] = j[1]*j[2]/Rho;
00273 
00274   /** relax the stress modes (eq14 schiller)*/
00275   mode[4] = pi_eq[0] + para.gamma_bulk*(mode[4] - pi_eq[0]);
00276   mode[5] = pi_eq[1] + para.gamma_shear*(mode[5] - pi_eq[1]);
00277   mode[6] = pi_eq[2] + para.gamma_shear*(mode[6] - pi_eq[2]);
00278   mode[7] = pi_eq[3] + para.gamma_shear*(mode[7] - pi_eq[3]);
00279   mode[8] = pi_eq[4] + para.gamma_shear*(mode[8] - pi_eq[4]);
00280   mode[9] = pi_eq[5] + para.gamma_shear*(mode[9] - pi_eq[5]);
00281 
00282   /** relax the ghost modes (project them out) */
00283   /** ghost modes have no equilibrium part due to orthogonality */
00284   mode[10] = para.gamma_odd*mode[10];
00285   mode[11] = para.gamma_odd*mode[11];
00286   mode[12] = para.gamma_odd*mode[12];
00287   mode[13] = para.gamma_odd*mode[13];
00288   mode[14] = para.gamma_odd*mode[14];
00289   mode[15] = para.gamma_odd*mode[15];
00290   mode[16] = para.gamma_even*mode[16];
00291   mode[17] = para.gamma_even*mode[17];
00292   mode[18] = para.gamma_even*mode[18];
00293 
00294 }
00295 
00296 /**thermalization of the modes with gaussian random numbers
00297  * @param index         node index / thread index (Input)
00298  * @param mode          Pointer to the local register values mode (Input/Output)
00299  * @param *rn           Pointer to randomnumber array of the local node
00300 */
00301 __device__ void thermalize_modes(float *mode, unsigned int index, LB_randomnr_gpu *rn){
00302 
00303   float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid;
00304 
00305   /*
00306     if (Rho <0)
00307     printf("Rho too small! %f %f %f", Rho, mode[0], para.rho*para.agrid*para.agrid*para.agrid);
00308   */
00309 #ifdef GAUSSRANDOM
00310   /** stress modes */
00311   gaussian_random(rn);
00312   mode[4] += sqrt(Rho*(para.mu*(2.f/3.f)*(1.f-(para.gamma_bulk*para.gamma_bulk)))) * rn->randomnr[1];
00313   mode[5] += sqrt(Rho*(para.mu*(4.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[0];
00314 
00315   gaussian_random(rn);
00316   mode[6] += sqrt(Rho*(para.mu*(4.f/3.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[1];
00317   mode[7] += sqrt(Rho*(para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[0];
00318 
00319   gaussian_random(rn);
00320   mode[8] += sqrt(Rho*(para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[1];
00321   mode[9] += sqrt(Rho*(para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[0];
00322  
00323   /** ghost modes */
00324   gaussian_random(rn);
00325   mode[10] += sqrt(Rho*(para.mu*(2.f/3.f))) * rn->randomnr[1];
00326   mode[11] += sqrt(Rho*(para.mu*(2.f/3.f))) * rn->randomnr[0];
00327 
00328   gaussian_random(rn);
00329   mode[12] += sqrt(Rho*(para.mu*(2.f/3.f))) * rn->randomnr[1];
00330   mode[13] += sqrt(Rho*(para.mu*(2.f/9.f))) * rn->randomnr[0];
00331 
00332   gaussian_random(rn);
00333   mode[14] += sqrt(Rho*(para.mu*(2.f/9.f))) * rn->randomnr[1];
00334   mode[15] += sqrt(Rho*(para.mu*(2.f/9.f))) * rn->randomnr[0];
00335 
00336   gaussian_random(rn);
00337   mode[16] += sqrt(Rho*(para.mu*(2.f))) * rn->randomnr[1];
00338   mode[17] += sqrt(Rho*(para.mu*(4.f/9.f))) * rn->randomnr[0];
00339 
00340   gaussian_random(rn);
00341   mode[18] += sqrt(Rho*(para.mu*(4.f/3.f))) * rn->randomnr[1];
00342 #else
00343   /** stress modes */
00344   random_01(rn);
00345   mode[4] += sqrt(12.f*Rho*para.mu*(2.f/3.f)*(1.f-(para.gamma_bulk*para.gamma_bulk))) * (rn->randomnr[1]-0.5f);
00346   mode[5] += sqrt(12.f*Rho*para.mu*(4.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[0]-0.5f);
00347 
00348   random_01(rn);
00349   mode[6] += sqrt(12.f*Rho*para.mu*(4.f/3.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[1]-0.5f);
00350   mode[7] += sqrt(12.f*Rho*para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[0]-0.5f);
00351 
00352   random_01(rn);
00353   mode[8] += sqrt(12.f*para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[1]-0.5f);
00354   mode[9] += sqrt(12.f*para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[0]-0.5f);
00355  
00356   /** ghost modes */
00357   random_01(rn);
00358   mode[10] += sqrt(12.f*Rho*para.mu*(2.f/3.f)) * (rn->randomnr[1]-0.5f);
00359   mode[11] += sqrt(12.f*Rho*para.mu*(2.f/3.f)) * (rn->randomnr[0]-0.5f);
00360 
00361   random_01(rn);
00362   mode[12] += sqrt(12.f*Rho*para.mu*(2.f/3.f)) * (rn->randomnr[1]-0.5f);
00363   mode[13] += sqrt(12.f*Rho*para.mu*(2.f/9.f)) * (rn->randomnr[0]-0.5f);
00364 
00365   random_01(rn);
00366   mode[14] += sqrt(12.f*Rho*para.mu*(2.f/9.f)) * (rn->randomnr[1]-0.5f);
00367   mode[15] += sqrt(12.f*Rho*para.mu*(2.f/9.f)) * (rn->randomnr[0]-0.5f);
00368 
00369   random_01(rn);
00370   mode[16] += sqrt(12.f*Rho*para.mu*(2.f)) * (rn->randomnr[1]-0.5f);
00371   mode[17] += sqrt(12.f*Rho*para.mu*(4.f/9.f)) * (rn->randomnr[0]-0.5f);
00372 
00373   random_01(rn);
00374   mode[18] += sqrt(12.f*Rho*para.mu*(4.f/3.f)) * (rn->randomnr[1]-0.5f);
00375 #endif
00376 }
00377 /*-------------------------------------------------------*/
00378 /**normalization of the modes need befor backtransformation into velocity space
00379  * @param mode          Pointer to the local register values mode (Input/Output)
00380 */
00381 __device__ void normalize_modes(float* mode){
00382 
00383   /** normalization factors enter in the back transformation */
00384   mode[0] *= 1.f;
00385   mode[1] *= 3.f;
00386   mode[2] *= 3.f;
00387   mode[3] *= 3.f;
00388   mode[4] *= 3.f/2.f;
00389   mode[5] *= 9.f/4.f;
00390   mode[6] *= 3.f/4.f;
00391   mode[7] *= 9.f;
00392   mode[8] *= 9.f;
00393   mode[9] *= 9.f;
00394   mode[10] *= 3.f/2.f;
00395   mode[11] *= 3.f/2.f;
00396   mode[12] *= 3.f/2.f;
00397   mode[13] *= 9.f/2.f;
00398   mode[14] *= 9.f/2.f;
00399   mode[15] *= 9.f/2.f;
00400   mode[16] *= 1.f/2.f;
00401   mode[17] *= 9.f/4.f;
00402   mode[18] *= 3.f/4.f;
00403 
00404 }
00405 /*-------------------------------------------------------*/
00406 /**backtransformation from modespace to desityspace and streaming with the push method using pbc
00407  * @param index         node index / thread index (Input)
00408  * @param mode          Pointer to the local register values mode (Input)
00409  * @param *n_b          Pointer to local node residing in array b (Output)
00410 */
00411 __device__ void calc_n_from_modes_push(LB_nodes_gpu n_b, float *mode, unsigned int index){
00412 
00413   unsigned int xyz[3];
00414   index_to_xyz(index, xyz);
00415   unsigned int x = xyz[0];
00416   unsigned int y = xyz[1];
00417   unsigned int z = xyz[2];
00418 
00419   n_b.vd[0*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*z] = 1.f/3.f * (mode[0] - mode[4] + mode[16]);
00420   n_b.vd[1*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] + mode[1] + mode[5] + mode[6] - mode[17] - mode[18] - 2.f*(mode[10] + mode[16]));
00421   n_b.vd[2*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] - mode[1] + mode[5] + mode[6] - mode[17] - mode[18] + 2.f*(mode[10] - mode[16]));
00422   n_b.vd[3*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] + mode[2] - mode[5] + mode[6] + mode[17] - mode[18] - 2.f*(mode[11] + mode[16]));
00423   n_b.vd[4*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] - mode[2] - mode[5] + mode[6] + mode[17] - mode[18] + 2.f*(mode[11] - mode[16]));
00424   n_b.vd[5*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/18.f * (mode[0] + mode[3] - 2.f*(mode[6] + mode[12] + mode[16] - mode[18]));
00425   n_b.vd[6*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/18.f * (mode[0] - mode[3] - 2.f*(mode[6] - mode[12] + mode[16] - mode[18]));
00426   n_b.vd[7*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] + mode[1] + mode[2] + mode[4] + 2.f*mode[6] + mode[7] + mode[10] + mode[11] + mode[13] + mode[14] + mode[16] + 2.f*mode[18]);
00427   n_b.vd[8*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] - mode[1] - mode[2] + mode[4] + 2.f*mode[6] + mode[7] - mode[10] - mode[11] - mode[13] - mode[14] + mode[16] + 2.f*mode[18]);
00428   n_b.vd[9*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] + mode[1] - mode[2] + mode[4] + 2.f*mode[6] - mode[7] + mode[10] - mode[11] + mode[13] - mode[14] + mode[16] + 2.f*mode[18]);
00429   n_b.vd[10*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] - mode[1] + mode[2] + mode[4] + 2.f*mode[6] - mode[7] - mode[10] + mode[11] - mode[13] + mode[14] + mode[16] + 2.f*mode[18]);
00430   n_b.vd[11*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[1] + mode[3] + mode[4] + mode[5] - mode[6] + mode[8] + mode[10] + mode[12] - mode[13] + mode[15] + mode[16] + mode[17] - mode[18]);
00431   n_b.vd[12*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[1] - mode[3] + mode[4] + mode[5] - mode[6] + mode[8] - mode[10] - mode[12] + mode[13] - mode[15] + mode[16] + mode[17] - mode[18]);
00432   n_b.vd[13*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[1] - mode[3] + mode[4] + mode[5] - mode[6] - mode[8] + mode[10] - mode[12] - mode[13] - mode[15] + mode[16] + mode[17] - mode[18]);
00433   n_b.vd[14*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[1] + mode[3] + mode[4] + mode[5] - mode[6] - mode[8] - mode[10] + mode[12] + mode[13] + mode[15] + mode[16] + mode[17] - mode[18]);
00434   n_b.vd[15*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[2] + mode[3] + mode[4] - mode[5] - mode[6] + mode[9] + mode[11] + mode[12] - mode[14] - mode[15] + mode[16] - mode[17] - mode[18]);
00435   n_b.vd[16*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[2] - mode[3] + mode[4] - mode[5] - mode[6] + mode[9] - mode[11] - mode[12] + mode[14] + mode[15] + mode[16] - mode[17] - mode[18]);
00436   n_b.vd[17*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[2] - mode[3] + mode[4] - mode[5] - mode[6] - mode[9] + mode[11] - mode[12] - mode[14] + mode[15] + mode[16] - mode[17] - mode[18]);
00437   n_b.vd[18*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[2] + mode[3] + mode[4] - mode[5] - mode[6] - mode[9] - mode[11] + mode[12] + mode[14] - mode[15] + mode[16] - mode[17] - mode[18]);
00438 
00439 }
00440 
00441 /** Bounce back boundary conditions.
00442  * The populations that have propagated into a boundary node
00443  * are bounced back to the node they came from. This results
00444  * in no slip boundary conditions.
00445  *
00446  * [cf. Ladd and Verberg, J. Stat. Phys. 104(5/6):1191-1251, 2001]
00447  * @param index                 node index / thread index (Input)
00448  * @param n_b                   Pointer to local node residing in array b (Input)
00449  * @param n_a                   Pointer to local node residing in array a (Output) (temp stored in buffer a)
00450  * @param LB_boundary_velocity                  The constant velocity at the boundary, set by the user (Input)
00451  * @param LB_boundary_force                     The force on the boundary nodes (Output)
00452 */
00453 __device__ void bounce_back_read(LB_nodes_gpu n_b, LB_nodes_gpu n_a, unsigned int index, \
00454     float* LB_boundary_velocity, float* LB_boundary_force){
00455     
00456   unsigned int xyz[3];
00457   int c[3];
00458   float v[3];
00459   float shift, weight, pop_to_bounce_back;
00460   float boundary_force[3] = {0,0,0};
00461   size_t to_index, to_index_x, to_index_y, to_index_z;
00462   int population, inverse;
00463   int boundary_index;
00464 
00465 
00466   boundary_index=n_b.boundary[index];
00467   if(boundary_index != 0){
00468     
00469     v[0]=LB_boundary_velocity[3*(boundary_index-1)+0];
00470     v[1]=LB_boundary_velocity[3*(boundary_index-1)+1];
00471     v[2]=LB_boundary_velocity[3*(boundary_index-1)+2];
00472 
00473     index_to_xyz(index, xyz);
00474 
00475     unsigned int x = xyz[0];
00476     unsigned int y = xyz[1];
00477     unsigned int z = xyz[2];
00478 
00479 /* CPU analog of shift:
00480    lbpar.agrid*lbpar.agrid*lbpar.agrid*lbpar.rho*2*lbmodel.c[i][l]*lb_boundaries[lbfields[k].boundary-1].velocity[l] */
00481   
00482     /** store vd temporary in second lattice to avoid race conditions */
00483 #define BOUNCEBACK  \
00484   shift = para.agrid*para.agrid*para.agrid*para.agrid*para.rho*2.*3.*weight*para.tau*(v[0]*c[0] + v[1]*c[1] + v[2]*c[2]); \
00485   pop_to_bounce_back = n_b.vd[population*para.number_of_nodes + index ]; \
00486   to_index_x = (x+c[0]+para.dim_x)%para.dim_x; \
00487   to_index_y = (y+c[1]+para.dim_y)%para.dim_y; \
00488   to_index_z = (z+c[2]+para.dim_z)%para.dim_z; \
00489   to_index = to_index_x + para.dim_x*to_index_y + para.dim_x*para.dim_y*to_index_z; \
00490   if (n_b.boundary[to_index] == 0) \
00491   { \
00492     boundary_force[0] += (2*pop_to_bounce_back+shift)*c[0]/para.tau/para.tau/para.agrid; \
00493     boundary_force[1] += (2*pop_to_bounce_back+shift)*c[1]/para.tau/para.tau/para.agrid; \
00494     boundary_force[2] += (2*pop_to_bounce_back+shift)*c[2]/para.tau/para.tau/para.agrid; \
00495     n_b.vd[inverse*para.number_of_nodes + to_index ] = pop_to_bounce_back + shift; \
00496   }
00497 
00498 // ***** SHOULDN'T THERE BE AN ELSE STATMENT IN "BOUNCEBACK"?
00499 // ***** THERE IS AN ODD FACTOR OF 2 THAT YOU INCUR IN THE FORCES FOR THE "lb_stokes_sphere_gpu.tcl" TEST CASE
00500 
00501     // the resting population does nothing.
00502     c[0]=1;c[1]=0;c[2]=0; weight=1./18.; population=2; inverse=1; 
00503     BOUNCEBACK
00504     
00505     c[0]=-1;c[1]=0;c[2]=0; weight=1./18.; population=1; inverse=2; 
00506     BOUNCEBACK
00507     
00508     c[0]=0;c[1]=1;c[2]=0;  weight=1./18.; population=4; inverse=3; 
00509     BOUNCEBACK
00510 
00511     c[0]=0;c[1]=-1;c[2]=0; weight=1./18.; population=3; inverse=4; 
00512     BOUNCEBACK
00513     
00514     c[0]=0;c[1]=0;c[2]=1; weight=1./18.; population=6; inverse=5; 
00515     BOUNCEBACK
00516 
00517     c[0]=0;c[1]=0;c[2]=-1; weight=1./18.; population=5; inverse=6; 
00518     BOUNCEBACK 
00519     
00520     c[0]=1;c[1]=1;c[2]=0; weight=1./36.; population=8; inverse=7; 
00521     BOUNCEBACK
00522     
00523     c[0]=-1;c[1]=-1;c[2]=0; weight=1./36.; population=7; inverse=8; 
00524     BOUNCEBACK
00525     
00526     c[0]=1;c[1]=-1;c[2]=0; weight=1./36.; population=10; inverse=9; 
00527     BOUNCEBACK
00528 
00529     c[0]=-1;c[1]=+1;c[2]=0; weight=1./36.; population=9; inverse=10; 
00530     BOUNCEBACK
00531     
00532     c[0]=1;c[1]=0;c[2]=1; weight=1./36.; population=12; inverse=11; 
00533     BOUNCEBACK
00534     
00535     c[0]=-1;c[1]=0;c[2]=-1; weight=1./36.; population=11; inverse=12; 
00536     BOUNCEBACK
00537 
00538     c[0]=1;c[1]=0;c[2]=-1; weight=1./36.; population=14; inverse=13; 
00539     BOUNCEBACK
00540     
00541     c[0]=-1;c[1]=0;c[2]=1; weight=1./36.; population=13; inverse=14; 
00542     BOUNCEBACK
00543 
00544     c[0]=0;c[1]=1;c[2]=1; weight=1./36.; population=16; inverse=15; 
00545     BOUNCEBACK
00546     
00547     c[0]=0;c[1]=-1;c[2]=-1; weight=1./36.; population=15; inverse=16; 
00548     BOUNCEBACK
00549     
00550     c[0]=0;c[1]=1;c[2]=-1; weight=1./36.; population=18; inverse=17; 
00551     BOUNCEBACK
00552     
00553     c[0]=0;c[1]=-1;c[2]=1; weight=1./36.; population=17; inverse=18; 
00554     BOUNCEBACK  
00555     
00556     atomicadd(&LB_boundary_force[3*(n_b.boundary[index]-1)+0], boundary_force[0]);
00557     atomicadd(&LB_boundary_force[3*(n_b.boundary[index]-1)+1], boundary_force[1]);
00558     atomicadd(&LB_boundary_force[3*(n_b.boundary[index]-1)+2], boundary_force[2]);
00559   }
00560 }
00561 /**bounce back read kernel needed to avoid raceconditions
00562  * @param index                 node index / thread index (Input)
00563  * @param n_b                   Pointer to local node residing in array b (Input)
00564  * @param n_a                   Pointer to local node residing in array a (Output) (temp stored in buffer a)
00565 */
00566 __device__ void bounce_back_write(LB_nodes_gpu n_b, LB_nodes_gpu n_a, unsigned int index){
00567 
00568   unsigned int xyz[3];
00569 
00570   if(n_b.boundary[index] != 0){
00571     index_to_xyz(index, xyz);
00572     unsigned int x = xyz[0];
00573     unsigned int y = xyz[1];
00574     unsigned int z = xyz[2];
00575 
00576     /** stream vd from boundary node back to origin node */
00577     n_b.vd[1*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = n_a.vd[1*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z];
00578     n_b.vd[2*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = n_a.vd[2*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z];
00579     n_b.vd[3*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[3*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z];
00580     n_b.vd[4*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[4*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z];
00581     n_b.vd[5*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[5*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)];
00582     n_b.vd[6*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[6*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)];
00583     n_b.vd[7*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[7*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z];
00584     n_b.vd[8*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[8*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z];
00585     n_b.vd[9*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[9*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z];
00586     n_b.vd[10*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[10*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z];
00587     n_b.vd[11*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[11*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)];
00588     n_b.vd[12*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[12*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)];
00589     n_b.vd[13*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[13*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)];
00590     n_b.vd[14*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[14*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)];
00591     n_b.vd[15*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[15*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)];
00592     n_b.vd[16*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[16*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)];
00593     n_b.vd[17*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[17*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)];
00594     n_b.vd[18*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[18*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)];
00595   }
00596 }
00597 /** add of (external) forces within the modespace, needed for particle-interaction
00598  * @param index         node index / thread index (Input)
00599  * @param mode          Pointer to the local register values mode (Input/Output)
00600  * @param node_f        Pointer to local node force (Input)
00601 */
00602 __device__ void apply_forces(unsigned int index, float *mode, LB_node_force_gpu node_f) {
00603 
00604   float Rho, u[3], C[6];
00605   Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid;
00606 
00607   /** hydrodynamic momentum density is redefined when forces present */
00608   u[0] = (mode[1] + 0.5f*node_f.force[0*para.number_of_nodes + index])/Rho;
00609   u[1] = (mode[2] + 0.5f*node_f.force[1*para.number_of_nodes + index])/Rho;
00610   u[2] = (mode[3] + 0.5f*node_f.force[2*para.number_of_nodes + index])/Rho;
00611 
00612   C[0] = (1.f + para.gamma_bulk)*u[0]*node_f.force[0*para.number_of_nodes + index] + 1.f/3.f*(para.gamma_bulk-para.gamma_shear)*(u[0]*node_f.force[0*para.number_of_nodes + index] + u[1]*node_f.force[1*para.number_of_nodes + index] + u[2]*node_f.force[2*para.number_of_nodes + index]);
00613   C[2] = (1.f + para.gamma_bulk)*u[1]*node_f.force[1*para.number_of_nodes + index] + 1.f/3.f*(para.gamma_bulk-para.gamma_shear)*(u[0]*node_f.force[0*para.number_of_nodes + index] + u[1]*node_f.force[1*para.number_of_nodes + index] + u[2]*node_f.force[2*para.number_of_nodes + index]);
00614   C[5] = (1.f + para.gamma_bulk)*u[2]*node_f.force[2*para.number_of_nodes + index] + 1.f/3.f*(para.gamma_bulk-para.gamma_shear)*(u[0]*node_f.force[0*para.number_of_nodes + index] + u[1]*node_f.force[1*para.number_of_nodes + index] + u[2]*node_f.force[2*para.number_of_nodes + index]);
00615   C[1] = 1.f/2.f*(1.f+para.gamma_shear)*(u[0]*node_f.force[1*para.number_of_nodes + index]+u[1]*node_f.force[0*para.number_of_nodes + index]);
00616   C[3] = 1.f/2.f*(1.f+para.gamma_shear)*(u[0]*node_f.force[2*para.number_of_nodes + index]+u[2]*node_f.force[0*para.number_of_nodes + index]);
00617   C[4] = 1.f/2.f*(1.f+para.gamma_shear)*(u[1]*node_f.force[2*para.number_of_nodes + index]+u[2]*node_f.force[1*para.number_of_nodes + index]);
00618 
00619   /** update momentum modes */
00620   mode[1] += node_f.force[0*para.number_of_nodes + index];
00621   mode[2] += node_f.force[1*para.number_of_nodes + index];
00622   mode[3] += node_f.force[2*para.number_of_nodes + index];
00623         
00624   /** update stress modes */
00625   mode[4] += C[0] + C[2] + C[5];
00626   mode[5] += C[0] - C[2];
00627   mode[6] += C[0] + C[2] - 2.f*C[5];
00628   mode[7] += C[1];
00629   mode[8] += C[3];
00630   mode[9] += C[4];
00631 
00632 #ifdef EXTERNAL_FORCES
00633   if(para.external_force){
00634     node_f.force[0*para.number_of_nodes + index] = para.ext_force[0]*powf(para.agrid,4)*para.tau*para.tau;
00635     node_f.force[1*para.number_of_nodes + index] = para.ext_force[1]*powf(para.agrid,4)*para.tau*para.tau;
00636     node_f.force[2*para.number_of_nodes + index] = para.ext_force[2]*powf(para.agrid,4)*para.tau*para.tau;
00637   }
00638   else{
00639   node_f.force[0*para.number_of_nodes + index] = 0.f;
00640   node_f.force[1*para.number_of_nodes + index] = 0.f;
00641   node_f.force[2*para.number_of_nodes + index] = 0.f;
00642   }
00643 #else
00644   /** reset force */
00645   node_f.force[0*para.number_of_nodes + index] = 0.f;
00646   node_f.force[1*para.number_of_nodes + index] = 0.f;
00647   node_f.force[2*para.number_of_nodes + index] = 0.f;
00648 #endif
00649 }
00650 
00651 /**function used to calc physical values of every node
00652  * @param index         node index / thread index (Input)
00653  * @param mode          Pointer to the local register values mode (Input)
00654  * @param n_a           Pointer to local node residing in array a for boundary flag(Input)
00655  * @param *d_v          Pointer to local device values (Input/Output)
00656  * @param singlenode    Flag, if there is only one node
00657  * This function is a clone of lb_calc_local_fields and
00658  * additionally performs unit conversions.
00659 */
00660 __device__ void calc_values(LB_nodes_gpu n_a, float *mode, LB_values_gpu *d_v, unsigned int index, unsigned int singlenode){
00661 
00662   float rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid;
00663         
00664   float *v, *pi;
00665   if(singlenode == 1){ 
00666     v=&(d_v[0].v[0]);
00667     pi=&(d_v[0].pi[0]);
00668     d_v[0].rho = rho;
00669   } else {
00670     v=&(d_v[index].v[0]);
00671     pi=&(d_v[index].pi[0]);
00672     d_v[index].rho = rho;
00673   }
00674   float j[3]; float pi_eq[6];
00675   
00676   j[0] = mode[1];
00677   j[1] = mode[2];
00678   j[2] = mode[3];
00679 
00680 // To Do: Here half the forces have to go in!
00681 //  j[0] += 0.5*lbfields[index].force[0];
00682 //  j[1] += 0.5*lbfields[index].force[1];
00683 //  j[2] += 0.5*lbfields[index].force[2];
00684 
00685   v[0]=j[0]/rho;
00686   v[1]=j[1]/rho;
00687   v[2]=j[2]/rho;
00688 
00689   /* equilibrium part of the stress modes */
00690   pi_eq[0] = (j[0]*j[0]+j[1]*j[1]+j[2]*j[2])/ rho;
00691   pi_eq[1] = ((j[0]*j[0])-(j[1]*j[1]))/ rho;
00692   pi_eq[2] = (j[0]*j[0]+j[1]*j[1]+j[2]*j[2] - 3.0*j[2]*j[2])/ rho;
00693   pi_eq[3] = j[0]*j[1]/ rho;
00694   pi_eq[4] = j[0]*j[2]/ rho;
00695   pi_eq[5] = j[1]*j[2]/ rho;
00696 
00697   /* Now we must predict the outcome of the next collision */
00698   /* We immediately average pre- and post-collision. */
00699   mode[4] = pi_eq[0] + (0.5+0.5*para.gamma_bulk )*(mode[4] - pi_eq[0]);
00700   mode[5] = pi_eq[1] + (0.5+0.5*para.gamma_shear)*(mode[5] - pi_eq[1]);
00701   mode[6] = pi_eq[2] + (0.5+0.5*para.gamma_shear)*(mode[6] - pi_eq[2]);
00702   mode[7] = pi_eq[3] + (0.5+0.5*para.gamma_shear)*(mode[7] - pi_eq[3]);
00703   mode[8] = pi_eq[4] + (0.5+0.5*para.gamma_shear)*(mode[8] - pi_eq[4]);
00704   mode[9] = pi_eq[5] + (0.5+0.5*para.gamma_shear)*(mode[9] - pi_eq[5]);
00705 
00706   /* Now we have to transform to the "usual" stress tensor components */
00707   /* We use eq. 116ff in Duenweg Ladd for that. */
00708   pi[0]=(mode[0]+mode[4]+mode[5])/3.;
00709   pi[2]=(2*mode[0]+2*mode[4]-mode[5]+3*mode[6])/6.;
00710   pi[5]=(2*mode[0]+2*mode[4]-mode[5]+3*mode[6])/6.;
00711   pi[1]=mode[7];
00712   pi[3]=mode[8];
00713   pi[4]=mode[9];
00714 
00715   /* Finally some unit conversions */
00716   rho*=1./para.agrid/para.agrid/para.agrid;
00717   v[0]*=1./para.tau/para.agrid;
00718   v[1]*=1./para.tau/para.agrid;
00719   v[2]*=1./para.tau/para.agrid;
00720 
00721   for (int i =0; i<6; i++) {
00722     pi[i]*=1./para.tau/para.tau/para.agrid/para.agrid/para.agrid;
00723   }
00724 
00725 }
00726 /** 
00727  * @param node_index    node index around (8) particle (Input)
00728  * @param *mode                 Pointer to the local register values mode (Output)
00729  * @param n_a                   Pointer to local node residing in array a(Input)
00730 */
00731 __device__ void calc_mode(float *mode, LB_nodes_gpu n_a, unsigned int node_index){
00732         
00733   /** mass mode */
00734   mode[0] = n_a.vd[0*para.number_of_nodes + node_index] + n_a.vd[1*para.number_of_nodes + node_index] + n_a.vd[2*para.number_of_nodes + node_index]
00735           + n_a.vd[3*para.number_of_nodes + node_index] + n_a.vd[4*para.number_of_nodes + node_index] + n_a.vd[5*para.number_of_nodes + node_index]
00736           + n_a.vd[6*para.number_of_nodes + node_index] + n_a.vd[7*para.number_of_nodes + node_index] + n_a.vd[8*para.number_of_nodes + node_index]
00737           + n_a.vd[9*para.number_of_nodes + node_index] + n_a.vd[10*para.number_of_nodes + node_index] + n_a.vd[11*para.number_of_nodes + node_index] + n_a.vd[12*para.number_of_nodes + node_index]
00738           + n_a.vd[13*para.number_of_nodes + node_index] + n_a.vd[14*para.number_of_nodes + node_index] + n_a.vd[15*para.number_of_nodes + node_index] + n_a.vd[16*para.number_of_nodes + node_index]
00739           + n_a.vd[17*para.number_of_nodes + node_index] + n_a.vd[18*para.number_of_nodes + node_index];
00740 
00741   /** momentum modes */
00742   mode[1] = (n_a.vd[1*para.number_of_nodes + node_index] - n_a.vd[2*para.number_of_nodes + node_index]) + (n_a.vd[7*para.number_of_nodes + node_index] - n_a.vd[8*para.number_of_nodes + node_index])
00743           + (n_a.vd[9*para.number_of_nodes + node_index] - n_a.vd[10*para.number_of_nodes + node_index]) + (n_a.vd[11*para.number_of_nodes + node_index] - n_a.vd[12*para.number_of_nodes + node_index])
00744           + (n_a.vd[13*para.number_of_nodes + node_index] - n_a.vd[14*para.number_of_nodes + node_index]);
00745   mode[2] = (n_a.vd[3*para.number_of_nodes + node_index] - n_a.vd[4*para.number_of_nodes + node_index]) + (n_a.vd[7*para.number_of_nodes + node_index] - n_a.vd[8*para.number_of_nodes + node_index])
00746           - (n_a.vd[9*para.number_of_nodes + node_index] - n_a.vd[10*para.number_of_nodes + node_index]) + (n_a.vd[15*para.number_of_nodes + node_index] - n_a.vd[16*para.number_of_nodes + node_index])
00747           + (n_a.vd[17*para.number_of_nodes + node_index] - n_a.vd[18*para.number_of_nodes + node_index]);
00748   mode[3] = (n_a.vd[5*para.number_of_nodes + node_index] - n_a.vd[6*para.number_of_nodes + node_index]) + (n_a.vd[11*para.number_of_nodes + node_index] - n_a.vd[12*para.number_of_nodes + node_index])
00749           - (n_a.vd[13*para.number_of_nodes + node_index] - n_a.vd[14*para.number_of_nodes + node_index]) + (n_a.vd[15*para.number_of_nodes + node_index] - n_a.vd[16*para.number_of_nodes + node_index])
00750           - (n_a.vd[17*para.number_of_nodes + node_index] - n_a.vd[18*para.number_of_nodes + node_index]);
00751 }
00752 /*********************************************************/
00753 /** \name Coupling part */
00754 /*********************************************************/
00755 /**(Eq. (12) Ahlrichs and Duenweg, JCP 111(17):8225 (1999))
00756  * @param n_a                   Pointer to local node residing in array a (Input)
00757  * @param *delta                Pointer for the weighting of particle position (Output)
00758  * @param *delta_j              Pointer for the weighting of particle momentum (Output)
00759  * @param *particle_data        Pointer to the particle position and velocity (Input)
00760  * @param *particle_force       Pointer to the particle force (Input)
00761  * @param part_index            particle id / thread id (Input)
00762  * @param *rn_part              Pointer to randomnumber array of the particle
00763  * @param node_index            node index around (8) particle (Output)
00764 */
00765 __device__ void calc_viscous_force(LB_nodes_gpu n_a, float *delta, LB_particle_gpu *particle_data, LB_particle_force_gpu *particle_force, unsigned int part_index, LB_randomnr_gpu *rn_part, float *delta_j, unsigned int *node_index){
00766         
00767   float mode[4];
00768   int my_left[3];
00769   float interpolated_u1, interpolated_u2, interpolated_u3;
00770   float Rho;
00771   interpolated_u1 = interpolated_u2 = interpolated_u3 = 0.f;
00772 
00773   float temp_delta[6];
00774   float temp_delta_half[6];
00775 
00776   /** see ahlrichs + duenweg page 8227 equ (10) and (11) */
00777   #pragma unroll
00778   for(int i=0; i<3; ++i){
00779     float scaledpos = particle_data[part_index].p[i]/para.agrid - 0.5f;
00780     my_left[i] = (int)(floorf(scaledpos));
00781     //printf("scaledpos %f \t myleft: %d \n", scaledpos, my_left[i]);
00782     temp_delta[3+i] = scaledpos - my_left[i];
00783     temp_delta[i] = 1.f - temp_delta[3+i];
00784     /**further value used for interpolation of fluid velocity at part pos near boundaries */
00785     temp_delta_half[3+i] = (scaledpos - my_left[i])*2.f;
00786     temp_delta_half[i] = 2.f - temp_delta_half[3+i];
00787   }
00788 
00789   delta[0] = temp_delta[0] * temp_delta[1] * temp_delta[2];
00790   delta[1] = temp_delta[3] * temp_delta[1] * temp_delta[2];
00791   delta[2] = temp_delta[0] * temp_delta[4] * temp_delta[2];
00792   delta[3] = temp_delta[3] * temp_delta[4] * temp_delta[2];
00793   delta[4] = temp_delta[0] * temp_delta[1] * temp_delta[5];
00794   delta[5] = temp_delta[3] * temp_delta[1] * temp_delta[5];
00795   delta[6] = temp_delta[0] * temp_delta[4] * temp_delta[5];
00796   delta[7] = temp_delta[3] * temp_delta[4] * temp_delta[5];
00797 
00798   // modulo for negative numbers is strange at best, shift to make sure we are positive
00799   int x = my_left[0] + para.dim_x;
00800   int y = my_left[1] + para.dim_y;
00801   int z = my_left[2] + para.dim_z;
00802 
00803   node_index[0] = x%para.dim_x     + para.dim_x*(y%para.dim_y)     + para.dim_x*para.dim_y*(z%para.dim_z);
00804   node_index[1] = (x+1)%para.dim_x + para.dim_x*(y%para.dim_y)     + para.dim_x*para.dim_y*(z%para.dim_z);
00805   node_index[2] = x%para.dim_x     + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*(z%para.dim_z);
00806   node_index[3] = (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*(z%para.dim_z);
00807   node_index[4] = x%para.dim_x     + para.dim_x*(y%para.dim_y)     + para.dim_x*para.dim_y*((z+1)%para.dim_z);
00808   node_index[5] = (x+1)%para.dim_x + para.dim_x*(y%para.dim_y)     + para.dim_x*para.dim_y*((z+1)%para.dim_z);
00809   node_index[6] = x%para.dim_x     + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z);
00810   node_index[7] = (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z);
00811   #pragma unroll
00812   for(int i=0; i<8; ++i){
00813     calc_mode(mode, n_a, node_index[i]);
00814     Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid;  
00815     interpolated_u1 += delta[i]*mode[1]/(Rho);
00816     interpolated_u2 += delta[i]*mode[2]/(Rho);
00817     interpolated_u3 += delta[i]*mode[3]/(Rho);
00818   }
00819 
00820   /** calculate viscous force
00821    * take care to rescale velocities with time_step and transform to MD units
00822    * (Eq. (9) Ahlrichs and Duenweg, JCP 111(17):8225 (1999)) */
00823 #ifdef LB_ELECTROHYDRODYNAMICS
00824   particle_force[part_index].f[0] = - para.friction * (particle_data[part_index].v[0]/para.time_step - interpolated_u1*para.agrid/para.tau - particle_data[part_index].mu_E[0]);
00825   particle_force[part_index].f[1] = - para.friction * (particle_data[part_index].v[1]/para.time_step - interpolated_u2*para.agrid/para.tau - particle_data[part_index].mu_E[1]);
00826   particle_force[part_index].f[2] = - para.friction * (particle_data[part_index].v[2]/para.time_step - interpolated_u3*para.agrid/para.tau - particle_data[part_index].mu_E[2]);
00827 #else
00828   particle_force[part_index].f[0] = - para.friction * (particle_data[part_index].v[0]/para.time_step - interpolated_u1*para.agrid/para.tau);
00829   particle_force[part_index].f[1] = - para.friction * (particle_data[part_index].v[1]/para.time_step - interpolated_u2*para.agrid/para.tau);
00830   particle_force[part_index].f[2] = - para.friction * (particle_data[part_index].v[2]/para.time_step - interpolated_u3*para.agrid/para.tau);
00831 #endif
00832   /** add stochastik force of zero mean (Ahlrichs, Duennweg equ. 15)*/
00833 #ifdef GAUSSRANDOM
00834   gaussian_random(rn_part);
00835   particle_force[part_index].f[0] += para.lb_coupl_pref2*rn_part->randomnr[0];
00836   particle_force[part_index].f[1] += para.lb_coupl_pref2*rn_part->randomnr[1];
00837   gaussian_random(rn_part);
00838   particle_force[part_index].f[2] += para.lb_coupl_pref2*rn_part->randomnr[0];
00839 #else
00840   random_01(rn_part);
00841   particle_force[part_index].f[0] += para.lb_coupl_pref*(rn_part->randomnr[0]-0.5f);
00842   particle_force[part_index].f[1] += para.lb_coupl_pref*(rn_part->randomnr[1]-0.5f);
00843   random_01(rn_part);
00844   particle_force[part_index].f[2] += para.lb_coupl_pref*(rn_part->randomnr[0]-0.5f);
00845 #endif    
00846   /** delta_j for transform momentum transfer to lattice units which is done in calc_node_force
00847   (Eq. (12) Ahlrichs and Duenweg, JCP 111(17):8225 (1999)) */
00848   delta_j[0] = - particle_force[part_index].f[0]*para.time_step*para.tau/para.agrid;
00849   delta_j[1] = - particle_force[part_index].f[1]*para.time_step*para.tau/para.agrid;
00850   delta_j[2] = - particle_force[part_index].f[2]*para.time_step*para.tau/para.agrid;    
00851                                                                                                                                                                                                                                                                   
00852 }
00853 
00854 /**calcutlation of the node force caused by the particles, with atomicadd due to avoiding race conditions 
00855         (Eq. (14) Ahlrichs and Duenweg, JCP 111(17):8225 (1999))
00856  * @param *delta                Pointer for the weighting of particle position (Input)
00857  * @param *delta_j              Pointer for the weighting of particle momentum (Input)
00858  * @param node_index            node index around (8) particle (Input)
00859  * @param node_f                Pointer to the node force (Output).
00860 */
00861 __device__ void calc_node_force(float *delta, float *delta_j, unsigned int *node_index, LB_node_force_gpu node_f){
00862 
00863 #if 1
00864   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[0]]), (delta[0]*delta_j[0]));
00865   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[0]]), (delta[0]*delta_j[1]));
00866   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[0]]), (delta[0]*delta_j[2]));
00867 
00868   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[1]]), (delta[1]*delta_j[0]));
00869   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[1]]), (delta[1]*delta_j[1]));
00870   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[1]]), (delta[1]*delta_j[2]));
00871 
00872   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[2]]), (delta[2]*delta_j[0]));
00873   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[2]]), (delta[2]*delta_j[1]));
00874   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[2]]), (delta[2]*delta_j[2]));
00875 
00876   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[3]]), (delta[3]*delta_j[0]));
00877   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[3]]), (delta[3]*delta_j[1]));
00878   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[3]]), (delta[3]*delta_j[2]));
00879 
00880   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[4]]), (delta[4]*delta_j[0]));
00881   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[4]]), (delta[4]*delta_j[1]));
00882   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[4]]), (delta[4]*delta_j[2]));
00883 
00884   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[5]]), (delta[5]*delta_j[0]));
00885   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[5]]), (delta[5]*delta_j[1]));
00886   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[5]]), (delta[5]*delta_j[2]));
00887 
00888   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[6]]), (delta[6]*delta_j[0]));
00889   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[6]]), (delta[6]*delta_j[1]));
00890   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[6]]), (delta[6]*delta_j[2]));
00891 
00892   atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[7]]), (delta[7]*delta_j[0]));
00893   atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[7]]), (delta[7]*delta_j[1]));
00894   atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[7]]), (delta[7]*delta_j[2]));
00895 #endif
00896 }
00897 /*********************************************************/
00898 /** \name System setup and Kernel funktions */
00899 /*********************************************************/
00900 /**kernel to calculate local populations from hydrodynamic fields given by the tcl values.
00901  * The mapping is given in terms of the equilibrium distribution.
00902  *
00903  * Eq. (2.15) Ladd, J. Fluid Mech. 271, 295-309 (1994)
00904  * Eq. (4) in Berk Usta, Ladd and Butler, JCP 122, 094902 (2005)
00905  *
00906  * @param n_a            Pointer to the lattice site (Input).
00907  * @param *gpu_check additional check if gpu kernel are executed(Input).
00908 */
00909 __global__ void calc_n_equilibrium(LB_nodes_gpu n_a, int *gpu_check) {
00910 
00911   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
00912 
00913   if(index<para.number_of_nodes){
00914 
00915     /** default values for fields in lattice units */
00916     gpu_check[0] = 1;
00917 
00918     float Rho = para.rho*para.agrid*para.agrid*para.agrid;
00919     float v[3] = { 0.0f, 0.0f, 0.0f };
00920     float pi[6] = { Rho*c_sound_sq, 0.0f, Rho*c_sound_sq, 0.0f, 0.0f, Rho*c_sound_sq };
00921 
00922     float rhoc_sq = Rho*c_sound_sq;
00923     float avg_rho = para.rho*para.agrid*para.agrid*para.agrid;
00924     float local_rho, local_j[3], *local_pi, trace;
00925 
00926     local_rho  = Rho;
00927 
00928     local_j[0] = Rho * v[0];
00929     local_j[1] = Rho * v[1];
00930     local_j[2] = Rho * v[2];
00931 
00932     local_pi = pi;
00933 
00934     /** reduce the pressure tensor to the part needed here */
00935     local_pi[0] -= rhoc_sq;
00936     local_pi[2] -= rhoc_sq;
00937     local_pi[5] -= rhoc_sq;
00938 
00939     trace = local_pi[0] + local_pi[2] + local_pi[5];
00940 
00941     float rho_times_coeff;
00942     float tmp1,tmp2;
00943 
00944     /** update the q=0 sublattice */
00945     n_a.vd[0*para.number_of_nodes + index] = 1.f/3.f * (local_rho-avg_rho) - 1.f/2.f*trace;
00946 
00947     /** update the q=1 sublattice */
00948     rho_times_coeff = 1.f/18.f * (local_rho-avg_rho);
00949 
00950     n_a.vd[1*para.number_of_nodes + index] = rho_times_coeff + 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace;
00951     n_a.vd[2*para.number_of_nodes + index] = rho_times_coeff - 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace;
00952     n_a.vd[3*para.number_of_nodes + index] = rho_times_coeff + 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace;
00953     n_a.vd[4*para.number_of_nodes + index] = rho_times_coeff - 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace;
00954     n_a.vd[5*para.number_of_nodes + index] = rho_times_coeff + 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace;
00955     n_a.vd[6*para.number_of_nodes + index] = rho_times_coeff - 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace;
00956 
00957     /** update the q=2 sublattice */
00958     rho_times_coeff = 1.f/36.f * (local_rho-avg_rho);
00959 
00960     tmp1 = local_pi[0] + local_pi[2];
00961     tmp2 = 2.0f*local_pi[1];
00962     n_a.vd[7*para.number_of_nodes + index]  = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
00963     n_a.vd[8*para.number_of_nodes + index]  = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
00964     n_a.vd[9*para.number_of_nodes + index]  = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
00965     n_a.vd[10*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
00966 
00967     tmp1 = local_pi[0] + local_pi[5];
00968     tmp2 = 2.0f*local_pi[3];
00969 
00970     n_a.vd[11*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
00971     n_a.vd[12*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
00972     n_a.vd[13*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
00973     n_a.vd[14*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
00974 
00975     tmp1 = local_pi[2] + local_pi[5];
00976     tmp2 = 2.0f*local_pi[4];
00977 
00978     n_a.vd[15*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
00979     n_a.vd[16*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
00980     n_a.vd[17*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
00981     n_a.vd[18*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
00982 
00983     /**set different seed for randomgen on every node */
00984     n_a.seed[index] = para.your_seed + index;
00985   }
00986 }
00987 /** kernel to calculate local populations from hydrodynamic fields
00988  * from given flow field velocities.  The mapping is given in terms of
00989  * the equilibrium distribution.
00990  *
00991  * Eq. (2.15) Ladd, J. Fluid Mech. 271, 295-309 (1994)
00992  * Eq. (4) in Berk Usta, Ladd and Butler, JCP 122, 094902 (2005)
00993  *
00994  * @param n_a              the current nodes array (double buffering!)
00995  * @param single_nodeindex the node to set the velocity for
00996  * @param velocity         the velocity to set
00997  */
00998 __global__ void set_u_equilibrium(LB_nodes_gpu n_a, int single_nodeindex,float *velocity) {
00999 
01000   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01001 
01002   if(index == 0){
01003 
01004     /** default values for fields in lattice units */
01005     float mode[19];
01006     calc_mode(mode, n_a, single_nodeindex);
01007     float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid;
01008 
01009     float v[3];
01010     v[0] = velocity[0];
01011     v[1] = velocity[1];
01012     v[2] = velocity[2];
01013 
01014     float pi[6] = { Rho*c_sound_sq, 0.0f, Rho*c_sound_sq, 0.0f, 0.0f, Rho*c_sound_sq };
01015 
01016     float rhoc_sq = Rho*c_sound_sq;
01017     float avg_rho = para.rho*para.agrid*para.agrid*para.agrid;
01018     float local_rho, local_j[3], *local_pi, trace;
01019 
01020     local_rho  = Rho;
01021 
01022     local_j[0] = Rho * v[0];
01023     local_j[1] = Rho * v[1];
01024     local_j[2] = Rho * v[2];
01025 
01026     local_pi = pi;
01027 
01028     /** reduce the pressure tensor to the part needed here */
01029     local_pi[0] -= rhoc_sq;
01030     local_pi[2] -= rhoc_sq;
01031     local_pi[5] -= rhoc_sq;
01032 
01033     trace = local_pi[0] + local_pi[2] + local_pi[5];
01034 
01035     float rho_times_coeff;
01036     float tmp1,tmp2;
01037 
01038     /** update the q=0 sublattice */
01039     n_a.vd[0*para.number_of_nodes + single_nodeindex] = 1.f/3.f * (local_rho-avg_rho) - 1.f/2.f*trace;
01040 
01041     /** update the q=1 sublattice */
01042     rho_times_coeff = 1.f/18.f * (local_rho-avg_rho);
01043 
01044     n_a.vd[1*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace;
01045     n_a.vd[2*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace;
01046     n_a.vd[3*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace;
01047     n_a.vd[4*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace;
01048     n_a.vd[5*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace;
01049     n_a.vd[6*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace;
01050 
01051     /** update the q=2 sublattice */
01052     rho_times_coeff = 1.f/36.f * (local_rho-avg_rho);
01053 
01054     tmp1 = local_pi[0] + local_pi[2];
01055     tmp2 = 2.0f*local_pi[1];
01056     n_a.vd[7*para.number_of_nodes + single_nodeindex]  = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
01057     n_a.vd[8*para.number_of_nodes + single_nodeindex]  = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
01058     n_a.vd[9*para.number_of_nodes + single_nodeindex]  = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
01059     n_a.vd[10*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
01060 
01061     tmp1 = local_pi[0] + local_pi[5];
01062     tmp2 = 2.0f*local_pi[3];
01063 
01064     n_a.vd[11*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
01065     n_a.vd[12*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
01066     n_a.vd[13*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
01067     n_a.vd[14*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
01068 
01069     tmp1 = local_pi[2] + local_pi[5];
01070     tmp2 = 2.0f*local_pi[4];
01071 
01072     n_a.vd[15*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
01073     n_a.vd[16*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace;
01074     n_a.vd[17*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
01075     n_a.vd[18*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace;
01076 
01077   }
01078 }
01079 /** kernel for the initalisation of the particle force array
01080  * @param *particle_force       Pointer to local particle force (Output)
01081  * @param *part                 Pointer to the particle rn seed storearray (Output)
01082 */
01083 __global__ void init_particle_force(LB_particle_force_gpu *particle_force, LB_particle_seed_gpu *part){
01084         
01085   unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01086         
01087   if(part_index<para.number_of_particles){
01088     particle_force[part_index].f[0] = 0.0f;
01089     particle_force[part_index].f[1] = 0.0f;
01090     particle_force[part_index].f[2] = 0.0f;
01091         
01092     part[part_index].seed = para.your_seed + part_index;
01093   }
01094                         
01095 }
01096 
01097 /** kernel for the initalisation of the partikel force array
01098  * @param *particle_force       pointer to local particle force (Input)
01099 */
01100 __global__ void reset_particle_force(LB_particle_force_gpu *particle_force){
01101         
01102   unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01103         
01104   if(part_index<para.number_of_particles){
01105     particle_force[part_index].f[0] = 0.0f;
01106     particle_force[part_index].f[1] = 0.0f;
01107     particle_force[part_index].f[2] = 0.0f;
01108   }                     
01109 }
01110 
01111 /** (re-)initialization of the node force / set up of external force in lb units
01112  * @param node_f                Pointer to local node force (Input)
01113 */
01114 __global__ void reinit_node_force(LB_node_force_gpu node_f){
01115 
01116   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01117 
01118   if(index<para.number_of_nodes){
01119 #ifdef EXTERNAL_FORCES
01120     if(para.external_force){
01121       node_f.force[0*para.number_of_nodes + index] = para.ext_force[0]*powf(para.agrid,4)*para.tau*para.tau;
01122       node_f.force[1*para.number_of_nodes + index] = para.ext_force[1]*powf(para.agrid,4)*para.tau*para.tau;
01123       node_f.force[2*para.number_of_nodes + index] = para.ext_force[2]*powf(para.agrid,4)*para.tau*para.tau;
01124     }
01125     else{
01126       node_f.force[0*para.number_of_nodes + index] = 0.0f;
01127       node_f.force[1*para.number_of_nodes + index] = 0.0f;
01128       node_f.force[2*para.number_of_nodes + index] = 0.0f;
01129     }
01130 #else
01131     node_f.force[0*para.number_of_nodes + index] = 0.0f;
01132     node_f.force[1*para.number_of_nodes + index] = 0.0f;
01133     node_f.force[2*para.number_of_nodes + index] = 0.0f;
01134 #endif
01135   }
01136 }
01137 
01138 /**set the boundary flag for all boundary nodes
01139  * @param boundary_node_list    The indices of the boundary nodes
01140  * @param boundary_index_list   The flag representing the corresponding boundary
01141  * @param number_of_boundnodes  The number of boundary nodes
01142  * @param n_a                   Pointer to local node residing in array a (Input)
01143  * @param n_b                   Pointer to local node residing in array b (Input)
01144 */
01145 __global__ void init_boundaries(int *boundary_node_list, int *boundary_index_list, int number_of_boundnodes, LB_nodes_gpu n_a, LB_nodes_gpu n_b){
01146 
01147   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01148 
01149   if(index<number_of_boundnodes){
01150     n_a.boundary[boundary_node_list[index]] = boundary_index_list[index];
01151     n_b.boundary[boundary_node_list[index]] = boundary_index_list[index];
01152   }     
01153 }
01154 
01155 /**reset the boundary flag of every node
01156  * @param n_a           Pointer to local node residing in array a (Input)
01157  * @param n_b           Pointer to local node residing in array b (Input)       
01158 */
01159 __global__ void reset_boundaries(LB_nodes_gpu n_a, LB_nodes_gpu n_b){
01160 
01161   size_t index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01162 
01163   if(index<para.number_of_nodes){
01164     n_a.boundary[index] = n_b.boundary[index] = 0;
01165   }
01166 }
01167 
01168 /** integrationstep of the lb-fluid-solver
01169  * @param n_a           Pointer to local node residing in array a (Input)
01170  * @param n_b           Pointer to local node residing in array b (Input)
01171  * @param *d_v          Pointer to local device values (Input)
01172  * @param node_f        Pointer to local node force (Input)
01173 */
01174 __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_values_gpu *d_v, LB_node_force_gpu node_f){
01175     
01176   /**every node is connected to a thread via the index*/
01177   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01178   /**the 19 moments (modes) are only temporary register values */
01179   float mode[19];
01180   LB_randomnr_gpu rng;
01181 
01182   if(index<para.number_of_nodes){
01183     /** storing the seed into a register value*/
01184     rng.seed = n_a.seed[index];
01185     /**calc_m_from_n*/
01186     calc_m_from_n(n_a, index, mode);
01187     /**lb_relax_modes*/
01188     relax_modes(mode, index, node_f);
01189     /**lb_thermalize_modes */
01190     if (para.fluct) thermalize_modes(mode, index, &rng);
01191 #ifdef EXTERNAL_FORCES
01192     /**if external force is used apply node force */
01193     apply_forces(index, mode, node_f);
01194 #else
01195     /**if partcles are used apply node forces*/
01196     if (para.number_of_particles) apply_forces(index, mode, node_f); 
01197 #endif
01198     /**lb_calc_n_from_modes_push*/
01199     normalize_modes(mode);
01200     /**calc of velocity densities and streaming with pbc*/
01201     calc_n_from_modes_push(n_b, mode, index);
01202     /** rewriting the seed back to the global memory*/
01203     n_b.seed[index] = rng.seed;
01204   }  
01205 }
01206 
01207 /** part interaction kernel
01208  * @param n_a                           Pointer to local node residing in array a (Input)
01209  * @param *particle_data                Pointer to the particle position and velocity (Input)
01210  * @param *particle_force               Pointer to the particle force (Input)
01211  * @param *part                         Pointer to the rn array of the particles (Input)
01212  * @param node_f                        Pointer to local node force (Input)
01213 */
01214 __global__ void calc_fluid_particle_ia(LB_nodes_gpu n_a, LB_particle_gpu *particle_data, LB_particle_force_gpu *particle_force, LB_node_force_gpu node_f, LB_particle_seed_gpu *part){
01215         
01216   unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01217   unsigned int node_index[8];
01218   float delta[8];
01219   float delta_j[3];
01220   LB_randomnr_gpu rng_part;
01221         
01222   if(part_index<para.number_of_particles){
01223 
01224     rng_part.seed = part[part_index].seed;
01225     /**calc of the force which act on the particle */
01226     calc_viscous_force(n_a, delta, particle_data, particle_force, part_index, &rng_part, delta_j, node_index);
01227     /**calc of the force which acts back to the fluid node */
01228     calc_node_force(delta, delta_j, node_index, node_f);
01229     part[part_index].seed = rng_part.seed;              
01230   }
01231 }
01232 
01233 /**Bounce back boundary read kernel
01234  * @param n_a                                   Pointer to local node residing in array a (Input)
01235  * @param n_b                                   Pointer to local node residing in array b (Input)
01236  * @param LB_boundary_velocity                  The constant velocity at the boundary, set by the user (Input)
01237  * @param LB_boundary_force                     The force on the boundary nodes (Output)
01238 */
01239 __global__ void bb_read(LB_nodes_gpu n_a, LB_nodes_gpu n_b, float* LB_boundary_velocity, float* LB_boundary_force){
01240 
01241   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01242 
01243   if(index<para.number_of_nodes){
01244     bounce_back_read(n_b, n_a, index, LB_boundary_velocity, LB_boundary_force);
01245   }
01246 }
01247 
01248 /**Bounce back boundary write kernel
01249  * @param n_a                                   Pointer to local node residing in array a (Input)
01250  * @param n_b                                   Pointer to local node residing in array b (Input)
01251 */
01252 __global__ void bb_write(LB_nodes_gpu n_a, LB_nodes_gpu n_b){
01253 
01254   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01255 
01256   if(index<para.number_of_nodes){
01257     bounce_back_write(n_b, n_a, index);
01258   }
01259 }
01260 
01261 /** get physical values of the nodes (density, velocity, ...)
01262  * @param n_a           Pointer to local node residing in array a (Input)
01263  * @param *d_v          Pointer to local device values (Input)
01264 */
01265 __global__ void values(LB_nodes_gpu n_a, LB_values_gpu *d_v){
01266 
01267   float mode[19];
01268   unsigned int singlenode = 0;
01269   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01270 
01271   if(index<para.number_of_nodes){
01272     calc_m_from_n(n_a, index, mode);
01273     calc_values(n_a, mode, d_v, index, singlenode);
01274   }
01275 }
01276 
01277 /** get boundary flags
01278  *  @param n_a                Pointer to local node residing in array a (Input)
01279  *  @param device_bound_array Pointer to local device values (Input)
01280  */
01281 __global__ void lb_get_boundaries(LB_nodes_gpu n_a, unsigned int *device_bound_array){
01282 
01283   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01284 
01285   if(index<para.number_of_nodes){
01286    device_bound_array[index] = n_a.boundary[index];
01287   }
01288 }
01289 
01290 /**set extern force on single nodes kernel
01291  * @param n_extern_nodeforces           number of nodes (Input)
01292  * @param *extern_nodeforces            Pointer to extern node force array (Input)
01293  * @param node_f                        node force struct (Output)
01294 */
01295 __global__ void init_extern_nodeforces(int n_extern_nodeforces, LB_extern_nodeforce_gpu *extern_nodeforces, LB_node_force_gpu node_f){
01296 
01297   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01298 
01299   if(index<n_extern_nodeforces){
01300     node_f.force[0*para.number_of_nodes + extern_nodeforces[index].index] = extern_nodeforces[index].force[0]*powf(para.agrid,4)*para.tau*para.tau;
01301     node_f.force[1*para.number_of_nodes + extern_nodeforces[index].index] = extern_nodeforces[index].force[1]*powf(para.agrid,4)*para.tau*para.tau;
01302     node_f.force[2*para.number_of_nodes + extern_nodeforces[index].index] = extern_nodeforces[index].force[2]*powf(para.agrid,4)*para.tau*para.tau;
01303   }
01304 }
01305 
01306 /**print single node values kernel
01307  * @param single_nodeindex              index of the node (Input)
01308  * @param *d_p_v                        Pointer to result storage array (Input)
01309  * @param n_a                           Pointer to local node residing in array a (Input)
01310 */
01311 __global__ void lb_print_node(int single_nodeindex, LB_values_gpu *d_p_v, LB_nodes_gpu n_a){
01312         
01313   float mode[19];
01314   unsigned int singlenode = 1;
01315   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01316 
01317   if(index == 0){
01318     calc_m_from_n(n_a, single_nodeindex, mode);
01319     calc_values(n_a, mode, d_p_v, single_nodeindex, singlenode);
01320   }     
01321 }
01322 /**calculate mass of the hole fluid kernel
01323  * @param *sum                          Pointer to result storage value (Output)
01324  * @param n_a                           Pointer to local node residing in array a (Input)
01325 */
01326 __global__ void calc_mass(LB_nodes_gpu n_a, float *sum) {
01327   float mode[1];
01328 
01329   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01330 
01331   if(index<para.number_of_nodes){
01332     calc_mode(mode, n_a, index);
01333     float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid;
01334     //if(n_a.boundary[index]){
01335       //mode[0] = 0.f;
01336     //}
01337     atomicadd(&(sum[0]), Rho);
01338   }
01339 }
01340 /**calculate momentum of the hole fluid kernel
01341  * @param node_f                        node force struct (Input)
01342  * @param *sum                          Pointer to result storage value (Output)
01343  * @param n_a                           Pointer to local node residing in array a (Input)
01344 */
01345 __global__ void momentum(LB_nodes_gpu n_a, float *sum, LB_node_force_gpu node_f) {
01346   float mode[4];
01347 
01348   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01349 
01350   if(index<para.number_of_nodes){
01351     calc_mode(mode, n_a, index);
01352     if(n_a.boundary[index]){
01353       mode[1] = mode[2] = mode[3] = 0.f;
01354     }
01355     atomicadd(&(sum[0]), mode[1]+node_f.force[0*para.number_of_nodes + index]);
01356     atomicadd(&(sum[1]), mode[2]+node_f.force[1*para.number_of_nodes + index]);
01357     atomicadd(&(sum[2]), mode[3]+node_f.force[2*para.number_of_nodes + index]);
01358   }
01359 }
01360 
01361 /**calculate temperature of the fluid kernel
01362  * @param *cpu_jsquared                 Pointer to result storage value (Output)
01363  * @param n_a                           Pointer to local node residing in array a (Input)
01364 */
01365 __global__ void temperature(LB_nodes_gpu n_a, float *cpu_jsquared) {
01366   float mode[4];
01367   float jsquared = 0.f;
01368   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01369 
01370   if(index<para.number_of_nodes){
01371     calc_mode(mode, n_a, index);
01372     if(n_a.boundary[index]){
01373       jsquared = 0.f;
01374     }
01375     else{
01376       jsquared = mode[1]*mode[1]+mode[2]*mode[2]+mode[3]*mode[3];
01377     }
01378     atomicadd(cpu_jsquared, jsquared);
01379   }
01380 }
01381 /**print single node boundary flag
01382  * @param single_nodeindex              index of the node (Input)
01383  * @param *device_flag                  Pointer to result storage array (Input)
01384  * @param n_a                           Pointer to local node residing in array a (Input)
01385 */
01386 __global__ void lb_get_boundary_flag(int single_nodeindex, unsigned int *device_flag, LB_nodes_gpu n_a){
01387         
01388   unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x;
01389 
01390   if(index == 0){
01391     device_flag[0] = n_a.boundary[single_nodeindex];
01392   }     
01393 }
01394 /**erroroutput for memory allocation and memory copy 
01395  * @param err cuda error code
01396  * @param *file .cu file were the error took place
01397  * @param line line of the file were the error took place
01398 */
01399 void _cuda_safe_mem(cudaError_t err, char *file, unsigned int line){
01400     if( cudaSuccess != err) {                                             
01401       fprintf(stderr, "Cuda Memory error at %s:%u.\n", file, line);
01402       printf("CUDA error: %s\n", cudaGetErrorString(err));
01403       exit(EXIT_FAILURE);
01404     } else {
01405       _err=cudaGetLastError(); \
01406       if (_err != cudaSuccess) {
01407         fprintf(stderr, "Error found during memory operation. Possibly however from an failed operation before. %s:%u.\n", file, line);
01408         printf("CUDA error: %s\n", cudaGetErrorString(err));
01409         exit(EXIT_FAILURE);
01410       }
01411     }
01412 }
01413 #define cuda_safe_mem(a) _cuda_safe_mem((a), __FILE__, __LINE__)
01414 
01415 #define KERNELCALL(_f, _a, _b, _params) \
01416 _f<<<_a, _b, 0, stream[0]>>>_params; \
01417 _err=cudaGetLastError(); \
01418 if (_err!=cudaSuccess){ \
01419   printf("CUDA error: %s\n", cudaGetErrorString(_err)); \
01420   fprintf(stderr, "error calling %s with dim %d %d %d #thpb %d in %s:%u\n", #_f, _a.x, _a.y, _a.z, _b, __FILE__, __LINE__); \
01421   exit(EXIT_FAILURE); \
01422 }
01423 /*********************************************************/
01424 /** \name Host functions to setup and call kernels */
01425 /*********************************************************/
01426 /**********************************************************************/
01427 /* Host funktions to setup and call kernels*/
01428 /**********************************************************************/
01429 
01430 /**initialization for the lb gpu fluid called from host
01431  * @param *lbpar_gpu    Pointer to parameters to setup the lb field
01432 */
01433 void lb_init_GPU(LB_parameters_gpu *lbpar_gpu){
01434 
01435   if(initflag){
01436     cudaFree(device_values);
01437     cudaFree(nodes_a.vd);
01438     cudaFree(nodes_b.vd);
01439     cudaFree(nodes_a.seed);
01440     cudaFree(nodes_b.seed);
01441     cudaFree(nodes_a.boundary);
01442     cudaFree(nodes_b.boundary);
01443     cudaFree(node_f.force);
01444     cudaFree(gpu_check);
01445   }
01446   /** Allocate structs in device memory*/
01447   size_of_values = lbpar_gpu->number_of_nodes * sizeof(LB_values_gpu);
01448   size_of_forces = lbpar_gpu->number_of_particles * sizeof(LB_particle_force_gpu);
01449   size_of_positions = lbpar_gpu->number_of_particles * sizeof(LB_particle_gpu);
01450   size_of_seed = lbpar_gpu->number_of_particles * sizeof(LB_particle_seed_gpu);
01451 
01452   cuda_safe_mem(cudaMalloc((void**)&device_values, size_of_values));
01453 
01454 
01455   cuda_safe_mem(cudaMalloc((void**)&nodes_a.vd, lbpar_gpu->number_of_nodes * 19 * sizeof(float)));
01456   cuda_safe_mem(cudaMalloc((void**)&nodes_b.vd, lbpar_gpu->number_of_nodes * 19 * sizeof(float)));                                           
01457 
01458   cuda_safe_mem(cudaMalloc((void**)&nodes_a.seed, lbpar_gpu->number_of_nodes * sizeof(unsigned int)));
01459   cuda_safe_mem(cudaMalloc((void**)&nodes_a.boundary, lbpar_gpu->number_of_nodes * sizeof(unsigned int)));
01460   cuda_safe_mem(cudaMalloc((void**)&nodes_b.seed, lbpar_gpu->number_of_nodes * sizeof(unsigned int)));
01461   cuda_safe_mem(cudaMalloc((void**)&nodes_b.boundary, lbpar_gpu->number_of_nodes * sizeof(unsigned int)));
01462 
01463   cuda_safe_mem(cudaMalloc((void**)&node_f.force, lbpar_gpu->number_of_nodes * 3 * sizeof(float)));
01464 //maybe coalesced alloc  
01465   cuda_safe_mem(cudaMalloc((void**)&particle_force, size_of_forces));
01466   cuda_safe_mem(cudaMalloc((void**)&particle_data, size_of_positions));
01467         
01468   cuda_safe_mem(cudaMalloc((void**)&part, size_of_seed));
01469         
01470   /**write parameters in const memory*/
01471   cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu)));
01472   /**check flag if lb gpu init works*/
01473   cuda_safe_mem(cudaMalloc((void**)&gpu_check, sizeof(int)));
01474   initflag = 1;
01475   h_gpu_check = (int*)malloc(sizeof(int));
01476 
01477   /** values for the kernel call */
01478   int threads_per_block = 64;
01479   int blocks_per_grid_y = 4;
01480   int blocks_per_grid_x = (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01481   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01482 
01483   cudaStreamCreate(&stream[0]);
01484   /** values for the particle kernel */
01485   int threads_per_block_particles = 64;
01486   int blocks_per_grid_particles_y = 4;
01487   int blocks_per_grid_particles_x = (lbpar_gpu->number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y);
01488   dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1);
01489 
01490   KERNELCALL(reset_boundaries, dim_grid, threads_per_block, (nodes_a, nodes_b));
01491 
01492   /** calc of veloctiydensities from given parameters and initialize the Node_Force array with zero */
01493   KERNELCALL(calc_n_equilibrium, dim_grid, threads_per_block, (nodes_a, gpu_check));    
01494   /** init part forces with zero*/
01495   if(lbpar_gpu->number_of_particles) KERNELCALL(init_particle_force, dim_grid_particles, threads_per_block_particles, (particle_force, part));
01496   KERNELCALL(reinit_node_force, dim_grid, threads_per_block, (node_f));
01497 
01498   intflag = 1;
01499   current_nodes = &nodes_a;
01500   h_gpu_check[0] = 0;
01501   cuda_safe_mem(cudaMemcpy(h_gpu_check, gpu_check, sizeof(int), cudaMemcpyDeviceToHost));
01502 //fprintf(stderr, "initialization of lb gpu code %i\n", lbpar_gpu->number_of_nodes);
01503   cudaThreadSynchronize();
01504   if(!h_gpu_check[0]){
01505     fprintf(stderr, "initialization of lb gpu code failed! \n");
01506     errexit();  
01507   }     
01508 }
01509 /** reinitialization for the lb gpu fluid called from host
01510  * @param *lbpar_gpu    Pointer to parameters to setup the lb field
01511 */
01512 void lb_reinit_GPU(LB_parameters_gpu *lbpar_gpu){
01513 
01514   /**write parameters in const memory*/
01515   cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu)));
01516   
01517   /** values for the kernel call */
01518   int threads_per_block = 64;
01519   int blocks_per_grid_y = 4;
01520   int blocks_per_grid_x = (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01521   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01522 
01523   /** calc of veloctiydensities from given parameters and initialize the Node_Force array with zero */
01524   KERNELCALL(calc_n_equilibrium, dim_grid, threads_per_block, (nodes_a, gpu_check));
01525 }
01526 
01527 /**setup and call particle reallocation from the host
01528  * @param *lbpar_gpu    Pointer to parameters to setup the lb field
01529  * @param **host_data   Pointer to host information data
01530 */
01531 void lb_realloc_particle_GPU(LB_parameters_gpu *lbpar_gpu, LB_particle_gpu **host_data){
01532 
01533   if(partinitflag){
01534     cudaFreeHost(*host_data);
01535     cudaFree(particle_force);
01536     cudaFree(particle_data);
01537     cudaFree(part);
01538   }
01539   /** Allocate struct for particle positions */
01540   size_of_forces = lbpar_gpu->number_of_particles * sizeof(LB_particle_force_gpu);
01541   size_of_positions = lbpar_gpu->number_of_particles * sizeof(LB_particle_gpu);
01542   size_of_seed = lbpar_gpu->number_of_particles * sizeof(LB_particle_seed_gpu);
01543 #if !defined __CUDA_ARCH__ || __CUDA_ARCH__ >= 200
01544   /**pinned memory mode - use special function to get OS-pinned memory*/
01545   cudaHostAlloc((void**)host_data, size_of_positions, cudaHostAllocWriteCombined);
01546 #else
01547   cudaMallocHost((void**)host_data, size_of_positions);
01548 #endif
01549 
01550   cuda_safe_mem(cudaMalloc((void**)&particle_force, size_of_forces));
01551   cuda_safe_mem(cudaMalloc((void**)&particle_data, size_of_positions));
01552   cuda_safe_mem(cudaMalloc((void**)&part, size_of_seed));
01553   //copy parameters, especially number of parts to gpu mem
01554   cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu)));
01555   //mem init of particles ok
01556   partinitflag  = 1;
01557   /** values for the particle kernel */
01558   int threads_per_block_particles = 64;
01559   int blocks_per_grid_particles_y = 4;
01560   int blocks_per_grid_particles_x = (lbpar_gpu->number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y);
01561   dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1);
01562 
01563   if(lbpar_gpu->number_of_particles) KERNELCALL(init_particle_force, dim_grid_particles, threads_per_block_particles, (particle_force, part));  
01564 }
01565 #ifdef LB_BOUNDARIES_GPU
01566 /** setup and call boundaries from the host
01567  * @param host_n_lb_boundaries number of LB boundaries
01568  * @param number_of_boundnodes  number of boundnodes
01569  * @param host_boundary_node_list    The indices of the boundary nodes
01570  * @param host_boundary_index_list   The flag representing the corresponding boundary
01571  * @param host_LB_Boundary_velocity                     The constant velocity at the boundary, set by the user (Input)
01572 */
01573 void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, int *host_boundary_node_list, int* host_boundary_index_list, float* host_LB_Boundary_velocity){
01574   int temp = host_n_lb_boundaries;
01575 
01576   size_of_boundindex = number_of_boundnodes*sizeof(int);
01577   cuda_safe_mem(cudaMalloc((void**)&boundary_node_list, size_of_boundindex));
01578   cuda_safe_mem(cudaMalloc((void**)&boundary_index_list, size_of_boundindex));
01579   cuda_safe_mem(cudaMemcpy(boundary_index_list, host_boundary_index_list, size_of_boundindex, cudaMemcpyHostToDevice));
01580   cuda_safe_mem(cudaMemcpy(boundary_node_list, host_boundary_node_list, size_of_boundindex, cudaMemcpyHostToDevice));
01581 
01582   cuda_safe_mem(cudaMalloc((void**)&LB_boundary_force   , 3*host_n_lb_boundaries*sizeof(float)));
01583   cuda_safe_mem(cudaMalloc((void**)&LB_boundary_velocity, 3*host_n_lb_boundaries*sizeof(float)));
01584   cuda_safe_mem(cudaMemcpy(LB_boundary_velocity, host_LB_Boundary_velocity, 3*n_lb_boundaries*sizeof(float), cudaMemcpyHostToDevice));
01585   cuda_safe_mem(cudaMemcpyToSymbol(n_lb_boundaries_gpu, &temp, sizeof(int)));
01586 
01587   
01588   /** values for the kernel call */
01589   int threads_per_block = 64;
01590   int blocks_per_grid_y = 4;
01591   int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01592   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01593 
01594   KERNELCALL(reset_boundaries, dim_grid, threads_per_block, (nodes_a, nodes_b));
01595 
01596   if (n_lb_boundaries == 0) {
01597     cudaThreadSynchronize();
01598     return;
01599   }
01600   if(number_of_boundnodes == 0){
01601     fprintf(stderr, "WARNING: boundary cmd executed but no boundary node found!\n");
01602   } else{
01603     int threads_per_block_bound = 64;
01604     int blocks_per_grid_bound_y = 4;
01605     int blocks_per_grid_bound_x = (number_of_boundnodes + threads_per_block_bound * blocks_per_grid_bound_y - 1) /(threads_per_block_bound * blocks_per_grid_bound_y);
01606     dim3 dim_grid_bound = make_uint3(blocks_per_grid_bound_x, blocks_per_grid_bound_y, 1);
01607 
01608     KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block_bound, (boundary_node_list, boundary_index_list, number_of_boundnodes, nodes_a, nodes_b));
01609   }
01610 
01611   cudaThreadSynchronize();
01612 }
01613 #endif
01614 /**setup and call extern single node force initialization from the host
01615  * @param *lbpar_gpu                            Pointer to host parameter struct
01616 */
01617 void lb_reinit_extern_nodeforce_GPU(LB_parameters_gpu *lbpar_gpu){
01618 
01619   cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 
01620 
01621   /** values for the kernel call */
01622   int threads_per_block = 64;
01623   int blocks_per_grid_y = 4;
01624   int blocks_per_grid_x = (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01625   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01626 
01627   KERNELCALL(reinit_node_force, dim_grid, threads_per_block, (node_f));
01628 
01629 }
01630 /**setup and call extern single node force initialization from the host
01631  * @param n_extern_nodeforces                   number of nodes on which the external force has to be applied
01632  * @param *host_extern_nodeforces               Pointer to the host extern node forces
01633  * @param *lbpar_gpu                            Pointer to host parameter struct
01634 */
01635 void lb_init_extern_nodeforces_GPU(int n_extern_nodeforces, LB_extern_nodeforce_gpu *host_extern_nodeforces, LB_parameters_gpu *lbpar_gpu){
01636 
01637   size_of_extern_nodeforces = n_extern_nodeforces*sizeof(LB_extern_nodeforce_gpu);
01638   cuda_safe_mem(cudaMalloc((void**)&extern_nodeforces, size_of_extern_nodeforces));
01639   cudaMemcpy(extern_nodeforces, host_extern_nodeforces, size_of_extern_nodeforces, cudaMemcpyHostToDevice);
01640 
01641   if(lbpar_gpu->external_force == 0)cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 
01642 
01643   int threads_per_block_exf = 64;
01644   int blocks_per_grid_exf_y = 4;
01645   int blocks_per_grid_exf_x = (n_extern_nodeforces + threads_per_block_exf * blocks_per_grid_exf_y - 1) /(threads_per_block_exf * blocks_per_grid_exf_y);
01646   dim3 dim_grid_exf = make_uint3(blocks_per_grid_exf_x, blocks_per_grid_exf_y, 1);
01647         
01648   KERNELCALL(init_extern_nodeforces, dim_grid_exf, threads_per_block_exf, (n_extern_nodeforces, extern_nodeforces, node_f));
01649   cudaFree(extern_nodeforces);
01650 }
01651 
01652 /**setup and call particle kernel from the host
01653  * @param **host_data           Pointer to the host particle positions and velocities
01654 */
01655 void lb_particle_GPU(LB_particle_gpu *host_data){
01656   
01657   /** get espresso md particle values*/
01658   cudaMemcpyAsync(particle_data, host_data, size_of_positions, cudaMemcpyHostToDevice, stream[0]);
01659   /** call of the particle kernel */
01660   /** values for the particle kernel */
01661   int threads_per_block_particles = 64;
01662   int blocks_per_grid_particles_y = 4;
01663   int blocks_per_grid_particles_x = (lbpar_gpu.number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y);
01664   dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1);
01665 
01666   KERNELCALL(calc_fluid_particle_ia, dim_grid_particles, threads_per_block_particles, (*current_nodes, particle_data, particle_force, node_f, part));
01667 }
01668 /** setup and call kernel to copy particle forces to host
01669  * @param *host_forces contains the particle force computed on the GPU
01670 */
01671 void lb_copy_forces_GPU(LB_particle_force_gpu *host_forces){
01672 
01673   /** Copy result from device memory to host memory*/
01674   cudaMemcpy(host_forces, particle_force, size_of_forces, cudaMemcpyDeviceToHost);
01675 
01676     /** values for the particle kernel */
01677   int threads_per_block_particles = 64;
01678   int blocks_per_grid_particles_y = 4;
01679   int blocks_per_grid_particles_x = (lbpar_gpu.number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y);
01680   dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1);
01681 
01682   /** reset part forces with zero*/
01683   KERNELCALL(reset_particle_force, dim_grid_particles, threads_per_block_particles, (particle_force));
01684         
01685   cudaThreadSynchronize();
01686 }
01687 
01688 /** setup and call kernel for getting macroscopic fluid values of all nodes
01689  * @param *host_values struct to save the gpu values
01690 */
01691 void lb_get_values_GPU(LB_values_gpu *host_values){
01692 
01693   /** values for the kernel call */
01694   int threads_per_block = 64;
01695   int blocks_per_grid_y = 4;
01696   int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01697   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01698 
01699   KERNELCALL(values, dim_grid, threads_per_block, (*current_nodes, device_values));
01700   cudaMemcpy(host_values, device_values, size_of_values, cudaMemcpyDeviceToHost);
01701 
01702 }
01703 
01704 /** get all the boundary flags for all nodes
01705  *  @param host_bound_array here go the values of the boundary flag
01706  */
01707 void lb_get_boundary_flags_GPU(unsigned int* host_bound_array){
01708    
01709   unsigned int* device_bound_array;
01710   cuda_safe_mem(cudaMalloc((void**)&device_bound_array, lbpar_gpu.number_of_nodes*sizeof(unsigned int)));       
01711   /** values for the kernel call */
01712   int threads_per_block = 64;
01713   int blocks_per_grid_y = 4;
01714   int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) / (threads_per_block * blocks_per_grid_y);
01715   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01716 
01717   KERNELCALL(lb_get_boundaries, dim_grid, threads_per_block, (*current_nodes, device_bound_array));
01718 
01719   cudaMemcpy(host_bound_array, device_bound_array, lbpar_gpu.number_of_nodes*sizeof(unsigned int), cudaMemcpyDeviceToHost);
01720 
01721   cudaFree(device_bound_array);
01722 
01723 }
01724 
01725 /** setup and call kernel for getting macroscopic fluid values of a single node*/
01726 void lb_print_node_GPU(int single_nodeindex, LB_values_gpu *host_print_values){ 
01727       
01728   LB_values_gpu *device_print_values;
01729   cuda_safe_mem(cudaMalloc((void**)&device_print_values, sizeof(LB_values_gpu)));       
01730   int threads_per_block_print = 1;
01731   int blocks_per_grid_print_y = 1;
01732   int blocks_per_grid_print_x = 1;
01733   dim3 dim_grid_print = make_uint3(blocks_per_grid_print_x, blocks_per_grid_print_y, 1);
01734 
01735   KERNELCALL(lb_print_node, dim_grid_print, threads_per_block_print, (single_nodeindex, device_print_values, *current_nodes));
01736 
01737   cudaMemcpy(host_print_values, device_print_values, sizeof(LB_values_gpu), cudaMemcpyDeviceToHost);
01738   cudaFree(device_print_values);
01739 
01740 }
01741 /** setup and call kernel to calculate the total momentum of the hole fluid
01742  * @param *mass value of the mass calcutated on the GPU
01743 */
01744 void lb_calc_fluid_mass_GPU(double* mass){
01745 
01746   float* tot_mass;
01747   float cpu_mass =  0.f ;
01748   cuda_safe_mem(cudaMalloc((void**)&tot_mass, sizeof(float)));
01749   cudaMemcpy(tot_mass, &cpu_mass, sizeof(float), cudaMemcpyHostToDevice);
01750 
01751   /** values for the kernel call */
01752   int threads_per_block = 64;
01753   int blocks_per_grid_y = 4;
01754   int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01755   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01756 
01757   KERNELCALL(calc_mass, dim_grid, threads_per_block,(*current_nodes, tot_mass));
01758 
01759   cudaMemcpy(&cpu_mass, tot_mass, sizeof(float), cudaMemcpyDeviceToHost);
01760   
01761   cudaFree(tot_mass);
01762   mass[0] = (double)(cpu_mass);
01763 }
01764 
01765 /** setup and call kernel to calculate the total momentum of the hole fluid
01766  *  @param host_mom value of the momentum calcutated on the GPU
01767  */
01768 void lb_calc_fluid_momentum_GPU(double* host_mom){
01769 
01770   float* tot_momentum;
01771   float host_momentum[3] = { 0.f, 0.f, 0.f};
01772   cuda_safe_mem(cudaMalloc((void**)&tot_momentum, 3*sizeof(float)));
01773   cudaMemcpy(tot_momentum, host_momentum, 3*sizeof(float), cudaMemcpyHostToDevice);
01774 
01775   /** values for the kernel call */
01776   int threads_per_block = 64;
01777   int blocks_per_grid_y = 4;
01778   int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01779   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01780 
01781   KERNELCALL(momentum, dim_grid, threads_per_block,(*current_nodes, tot_momentum, node_f));
01782   
01783   cudaMemcpy(host_momentum, tot_momentum, 3*sizeof(float), cudaMemcpyDeviceToHost);
01784   
01785   cudaFree(tot_momentum);
01786   host_mom[0] = (double)(host_momentum[0]* lbpar_gpu.agrid/lbpar_gpu.tau);
01787   host_mom[1] = (double)(host_momentum[1]* lbpar_gpu.agrid/lbpar_gpu.tau);
01788   host_mom[2] = (double)(host_momentum[2]* lbpar_gpu.agrid/lbpar_gpu.tau);
01789 }
01790 /** setup and call kernel to calculate the temperature of the hole fluid
01791  *  @param host_temp value of the temperatur calcutated on the GPU
01792 */
01793 void lb_calc_fluid_temperature_GPU(double* host_temp){
01794   float host_jsquared = 0.f;
01795   float* device_jsquared;
01796   cuda_safe_mem(cudaMalloc((void**)&device_jsquared, sizeof(float)));
01797   cudaMemcpy(device_jsquared, &host_jsquared, sizeof(float), cudaMemcpyHostToDevice);
01798 
01799   /** values for the kernel call */
01800   int threads_per_block = 64;
01801   int blocks_per_grid_y = 4;
01802   int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01803   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01804 
01805   KERNELCALL(temperature, dim_grid, threads_per_block,(*current_nodes, device_jsquared));
01806 
01807   cudaMemcpy(&host_jsquared, device_jsquared, sizeof(float), cudaMemcpyDeviceToHost);
01808 
01809   host_temp[0] = (double)(host_jsquared*1./(3.f*lbpar_gpu.rho*lbpar_gpu.dim_x*lbpar_gpu.dim_y*lbpar_gpu.dim_z*lbpar_gpu.tau*lbpar_gpu.tau*lbpar_gpu.agrid));
01810 }
01811 /** setup and call kernel for getting macroscopic fluid values of all nodes
01812  * @param *host_values struct to save the gpu values
01813 */
01814 void lb_save_checkpoint_GPU(float *host_checkpoint_vd, unsigned int *host_checkpoint_seed, unsigned int *host_checkpoint_boundary, float *host_checkpoint_force){
01815 
01816   cudaMemcpy(host_checkpoint_vd, current_nodes->vd, lbpar_gpu.number_of_nodes * 19 * sizeof(float), cudaMemcpyDeviceToHost);
01817   cudaMemcpy(host_checkpoint_seed, current_nodes->seed, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyDeviceToHost);
01818   cudaMemcpy(host_checkpoint_boundary, current_nodes->boundary, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyDeviceToHost);
01819   cudaMemcpy(host_checkpoint_force, node_f.force, lbpar_gpu.number_of_nodes * 3 * sizeof(float), cudaMemcpyDeviceToHost);
01820 
01821 }
01822 /** setup and call kernel for setting macroscopic fluid values of all nodes
01823  * @param *host_values struct to set stored values
01824 */
01825 void lb_load_checkpoint_GPU(float *host_checkpoint_vd, unsigned int *host_checkpoint_seed, unsigned int *host_checkpoint_boundary, float *host_checkpoint_force){
01826 
01827   cudaMemcpy(current_nodes->vd, host_checkpoint_vd, lbpar_gpu.number_of_nodes * 19 * sizeof(float), cudaMemcpyHostToDevice);
01828   intflag = 1;
01829   cudaMemcpy(current_nodes->seed, host_checkpoint_seed, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice);
01830   cudaMemcpy(current_nodes->boundary, host_checkpoint_boundary, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice);
01831   cudaMemcpy(node_f.force, host_checkpoint_force, lbpar_gpu.number_of_nodes * 3 * sizeof(float), cudaMemcpyHostToDevice);
01832 
01833 }
01834 
01835 /** setup and call kernel to get the boundary flag of a single node
01836  *  @param single_nodeindex number of the node to get the flag for
01837  *  @param host_flag her goes the value of the boundary flag
01838  */
01839 void lb_get_boundary_flag_GPU(int single_nodeindex, unsigned int* host_flag){
01840    
01841   unsigned int* device_flag;
01842   cuda_safe_mem(cudaMalloc((void**)&device_flag, sizeof(unsigned int)));        
01843   int threads_per_block_flag = 1;
01844   int blocks_per_grid_flag_y = 1;
01845   int blocks_per_grid_flag_x = 1;
01846   dim3 dim_grid_flag = make_uint3(blocks_per_grid_flag_x, blocks_per_grid_flag_y, 1);
01847 
01848   KERNELCALL(lb_get_boundary_flag, dim_grid_flag, threads_per_block_flag, (single_nodeindex, device_flag, *current_nodes));
01849 
01850   cudaMemcpy(host_flag, device_flag, sizeof(unsigned int), cudaMemcpyDeviceToHost);
01851 
01852   cudaFree(device_flag);
01853 
01854 }
01855 /** set the net velocity at a single node
01856  *  @param single_nodeindex the node to set the velocity for 
01857  *  @param host_velocity the velocity to set
01858  */
01859 void lb_set_node_velocity_GPU(int single_nodeindex, float* host_velocity){
01860    
01861   float* device_velocity;
01862   cuda_safe_mem(cudaMalloc((void**)&device_velocity, 3*sizeof(float))); 
01863   cudaMemcpy(device_velocity, host_velocity, 3*sizeof(float), cudaMemcpyHostToDevice);
01864   int threads_per_block_flag = 1;
01865   int blocks_per_grid_flag_y = 1;
01866   int blocks_per_grid_flag_x = 1;
01867   dim3 dim_grid_flag = make_uint3(blocks_per_grid_flag_x, blocks_per_grid_flag_y, 1);
01868 
01869   KERNELCALL(set_u_equilibrium, dim_grid_flag, threads_per_block_flag, (*current_nodes, single_nodeindex, device_velocity));
01870 
01871   cudaFree(device_velocity);
01872 
01873 }
01874 /** reinit of params 
01875  * @param *lbpar_gpu struct containing the paramters of the fluid
01876 */
01877 void reinit_parameters_GPU(LB_parameters_gpu *lbpar_gpu){
01878 
01879   /**write parameters in const memory*/
01880   cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu)));
01881 }
01882 /**integration kernel for the lb gpu fluid update called from host */
01883 void lb_integrate_GPU(){
01884   
01885   /** values for the kernel call */
01886   int threads_per_block = 64;
01887   int blocks_per_grid_y = 4;
01888   int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y);
01889   dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1);
01890 
01891 #ifdef LB_BOUNDARIES_GPU
01892   if (n_lb_boundaries > 0) 
01893     cuda_safe_mem(cudaMemset    (       LB_boundary_force, 0, 3*n_lb_boundaries*sizeof(float)));
01894 #endif
01895 
01896   /**call of fluid step*/
01897   if (intflag == 1){
01898     KERNELCALL(integrate, dim_grid, threads_per_block, (nodes_a, nodes_b, device_values, node_f));
01899     current_nodes = &nodes_b;
01900 #ifdef LB_BOUNDARIES_GPU                
01901 
01902     if (n_lb_boundaries > 0) {
01903         KERNELCALL(bb_read, dim_grid, threads_per_block, (nodes_a, nodes_b, LB_boundary_velocity, LB_boundary_force));
01904   //      KERNELCALL(bb_write, dim_grid, threads_per_block, (nodes_a, nodes_b));
01905       }
01906 #endif
01907     intflag = 0;
01908   }
01909   else{
01910     KERNELCALL(integrate, dim_grid, threads_per_block, (nodes_b, nodes_a, device_values, node_f));
01911     current_nodes = &nodes_a;
01912 #ifdef LB_BOUNDARIES_GPU                
01913 
01914     if (n_lb_boundaries > 0) {
01915       KERNELCALL(bb_read, dim_grid, threads_per_block, (nodes_b, nodes_a, LB_boundary_velocity, LB_boundary_force));
01916  //     KERNELCALL(bb_write, dim_grid, threads_per_block, (nodes_b, nodes_a));
01917     }
01918 #endif
01919     intflag = 1;
01920   }             
01921 }
01922 
01923 void lb_gpu_get_boundary_forces(double* forces) {
01924 #ifdef LB_BOUNDARIES_GPU
01925   float* temp = (float*) malloc(3*n_lb_boundaries*sizeof(float));
01926   cuda_safe_mem(cudaMemcpy(temp, LB_boundary_force, 3*n_lb_boundaries*sizeof(float), cudaMemcpyDeviceToHost));
01927   for (int i =0; i<3*n_lb_boundaries; i++) {
01928     forces[i]=(double)temp[i];
01929   }
01930   free(temp);
01931 #endif
01932 }
01933 
01934 /** free gpu memory kernel called from the host (not used anymore) */
01935 void lb_free_GPU(){
01936   // Free device memory
01937   cudaFree(device_values);
01938   cudaFree(&para);
01939   cudaFree(&nodes_a);
01940   cudaFree(&nodes_b);
01941   cudaFree(particle_force);
01942   cudaFree(particle_data);
01943   cudaFree(&node_f);
01944   cudaFree(part);
01945   cudaStreamDestroy(stream[0]);
01946 }
01947 #endif /* LB_GPU */