![]() |
ESPResSo 3.2.0-11-g9950804-git
Extensible Simulation Package for Soft Matter Research
|
00001 /* 00002 Copyright (C) 2010,2011,2012,2013 The ESPResSo project 00003 00004 This file is part of ESPResSo. 00005 00006 ESPResSo is free software: you can redistribute it and/or modify 00007 it under the terms of the GNU General Public License as published by 00008 the Free Software Foundation, either version 3 of the License, or 00009 (at your option) any later version. 00010 00011 ESPResSo is distributed in the hope that it will be useful, 00012 but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 GNU General Public License for more details. 00015 00016 You should have received a copy of the GNU General Public License 00017 along with this program. If not, see <http://www.gnu.org/licenses/>. 00018 */ 00019 00020 /** \file lbgpu.cu 00021 * 00022 * Cuda (.cu) file for the Lattice Boltzmann implementation on GPUs. 00023 * Header file for \ref lbgpu.h. 00024 */ 00025 00026 #include <stdio.h> 00027 #include <cuda.h> 00028 #include <stdlib.h> 00029 00030 extern "C" { 00031 #include "lbgpu.h" 00032 #include "config.h" 00033 } 00034 00035 #ifdef LB_GPU 00036 #ifndef GAUSSRANDOM 00037 #define GAUSSRANDOM 00038 #endif 00039 00040 /**defining structures residing in global memory */ 00041 /** struct for phys. values */ 00042 static LB_values_gpu *device_values = NULL; 00043 /** structs for velocity densities */ 00044 static LB_nodes_gpu nodes_a; 00045 static LB_nodes_gpu nodes_b; 00046 /** struct for particle force */ 00047 static LB_particle_force_gpu *particle_force = NULL; 00048 /** struct for particle position and veloctiy */ 00049 static LB_particle_gpu *particle_data = NULL; 00050 /** struct for node force */ 00051 static LB_node_force_gpu node_f; 00052 /** struct for storing particle rn seed */ 00053 static LB_particle_seed_gpu *part = NULL; 00054 00055 static LB_extern_nodeforce_gpu *extern_nodeforces = NULL; 00056 00057 #ifdef LB_BOUNDARIES_GPU 00058 static float* LB_boundary_force = NULL; 00059 static float* LB_boundary_velocity = NULL; 00060 /** pointer for bound index array*/ 00061 static int *boundary_node_list; 00062 static int *boundary_index_list; 00063 static __device__ __constant__ int n_lb_boundaries_gpu = 0; 00064 static size_t size_of_boundindex; 00065 #endif 00066 /** pointers for additional cuda check flag*/ 00067 static int *gpu_check = NULL; 00068 static int *h_gpu_check = NULL; 00069 00070 static unsigned int intflag = 1; 00071 static LB_nodes_gpu *current_nodes = NULL; 00072 /**defining size values for allocating global memory */ 00073 static size_t size_of_values; 00074 static size_t size_of_forces; 00075 static size_t size_of_positions; 00076 static size_t size_of_seed; 00077 static size_t size_of_extern_nodeforces; 00078 00079 /**parameters residing in constant memory */ 00080 static __device__ __constant__ LB_parameters_gpu para; 00081 static const float c_sound_sq = 1.f/3.f; 00082 /**cudasteams for parallel computing on cpu and gpu */ 00083 cudaStream_t stream[1]; 00084 00085 cudaError_t err; 00086 cudaError_t _err; 00087 int initflag = 0; 00088 int partinitflag = 0; 00089 /*-------------------------------------------------------*/ 00090 /*********************************************************/ 00091 /** \name device funktions called by kernel funktions */ 00092 /*********************************************************/ 00093 /*-------------------------------------------------------*/ 00094 00095 /*-------------------------------------------------------*/ 00096 00097 /** atomic add function for sveral cuda architectures 00098 */ 00099 __device__ inline void atomicadd(float* address, float value){ 00100 #if !defined __CUDA_ARCH__ || __CUDA_ARCH__ >= 200 // for Fermi, atomicAdd supports floats 00101 atomicAdd(address, value); 00102 #elif __CUDA_ARCH__ >= 110 00103 #warning Using slower atomicAdd emulation 00104 // float-atomic-add from 00105 // [url="http://forums.nvidia.com/index.php?showtopic=158039&view=findpost&p=991561"] 00106 float old = value; 00107 while ((old = atomicExch(address, atomicExch(address, 0.0f)+old))!=0.0f); 00108 #else 00109 #error I need at least compute capability 1.1 00110 #endif 00111 } 00112 00113 /**randomgenerator which generates numbers [0,1] 00114 * @param *rn Pointer to randomnumber array of the local node or particle 00115 */ 00116 __device__ void random_01(LB_randomnr_gpu *rn){ 00117 00118 const float mxi = 1.f/(float)(1ul<<31); 00119 unsigned int curr = rn->seed; 00120 00121 curr = 1103515245 * curr + 12345; 00122 rn->randomnr[0] = (float)(curr & ((1ul<<31)-1))*mxi; 00123 curr = 1103515245 * curr + 12345; 00124 rn->randomnr[1] = (float)(curr & ((1ul<<31)-1))*mxi; 00125 rn->seed = curr; 00126 00127 } 00128 00129 /** gaussian random nummber generator for thermalisation 00130 * @param *rn Pointer to randomnumber array of the local node node or particle 00131 */ 00132 __device__ void gaussian_random(LB_randomnr_gpu *rn){ 00133 00134 float x1, x2; 00135 float r2, fac; 00136 /** On every second call two gaussian random numbers are calculated 00137 via the Box-Muller transformation.*/ 00138 /** draw two uniform random numbers in the unit circle */ 00139 do { 00140 random_01(rn); 00141 x1 = 2.f*rn->randomnr[0]-1.f; 00142 x2 = 2.f*rn->randomnr[1]-1.f; 00143 r2 = x1*x1 + x2*x2; 00144 } while (r2 >= 1.f || r2 == 0.f); 00145 00146 /** perform Box-Muller transformation */ 00147 fac = sqrtf(-2.f*__logf(r2)/r2); 00148 rn->randomnr[0] = x2*fac; 00149 rn->randomnr[1] = x1*fac; 00150 00151 } 00152 00153 /**tranformation from 1d array-index to xyz 00154 * @param index node index / thread index (Input) 00155 * @param xyz Pointer to calculated xyz array (Output) 00156 */ 00157 __device__ void index_to_xyz(unsigned int index, unsigned int *xyz){ 00158 00159 xyz[0] = index%para.dim_x; 00160 index /= para.dim_x; 00161 xyz[1] = index%para.dim_y; 00162 index /= para.dim_y; 00163 xyz[2] = index; 00164 } 00165 00166 /**calculation of the modes from the velocitydensities (space-transform.) 00167 * @param n_a Pointer to local node residing in array a (Input) 00168 * @param index node index / thread index (Input) 00169 * @param mode Pointer to the local register values mode (Output) 00170 */ 00171 __device__ void calc_m_from_n(LB_nodes_gpu n_a, unsigned int index, float *mode){ 00172 00173 /* mass mode */ 00174 mode[0] = n_a.vd[0*para.number_of_nodes + index] + n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index] 00175 + n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index] + n_a.vd[5*para.number_of_nodes + index] 00176 + n_a.vd[6*para.number_of_nodes + index] + n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index] 00177 + n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index] + n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index] 00178 + n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index] + n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index] 00179 + n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]; 00180 00181 /* momentum modes */ 00182 mode[1] = (n_a.vd[1*para.number_of_nodes + index] - n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) 00183 + (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) 00184 + (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]); 00185 mode[2] = (n_a.vd[3*para.number_of_nodes + index] - n_a.vd[4*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) 00186 - (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) 00187 + (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]); 00188 mode[3] = (n_a.vd[5*para.number_of_nodes + index] - n_a.vd[6*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) 00189 - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) 00190 - (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]); 00191 00192 /* stress modes */ 00193 mode[4] = -(n_a.vd[0*para.number_of_nodes + index]) + n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index] + n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index] 00194 + n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index] + n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index] 00195 + n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index] + n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]; 00196 mode[5] = n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index] - (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index]) 00197 + (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) + (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]) 00198 - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]); 00199 mode[6] = (n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index]) 00200 - (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]) 00201 - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]) 00202 - 2.f*(n_a.vd[5*para.number_of_nodes + index] + n_a.vd[6*para.number_of_nodes + index] - (n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index]) 00203 - (n_a.vd[9*para.number_of_nodes + index] +n_a.vd[10*para.number_of_nodes + index])); 00204 mode[7] = n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index] - (n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index]); 00205 mode[8] = n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index] - (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]); 00206 mode[9] = n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index] - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]); 00207 00208 /* kinetic modes */ 00209 mode[10] = -2.f*(n_a.vd[1*para.number_of_nodes + index] - n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) 00210 + (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) 00211 + (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]); 00212 mode[11] = -2.f*(n_a.vd[3*para.number_of_nodes + index] - n_a.vd[4*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) 00213 - (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) 00214 + (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]); 00215 mode[12] = -2.f*(n_a.vd[5*para.number_of_nodes + index] - n_a.vd[6*para.number_of_nodes + index]) + (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) 00216 - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]) + (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) 00217 - (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]); 00218 mode[13] = (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) + (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) 00219 - (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]); 00220 mode[14] = (n_a.vd[7*para.number_of_nodes + index] - n_a.vd[8*para.number_of_nodes + index]) - (n_a.vd[9*para.number_of_nodes + index] - n_a.vd[10*para.number_of_nodes + index]) 00221 - (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]); 00222 mode[15] = (n_a.vd[11*para.number_of_nodes + index] - n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] - n_a.vd[14*para.number_of_nodes + index]) 00223 - (n_a.vd[15*para.number_of_nodes + index] - n_a.vd[16*para.number_of_nodes + index]) + (n_a.vd[17*para.number_of_nodes + index] - n_a.vd[18*para.number_of_nodes + index]); 00224 mode[16] = n_a.vd[0*para.number_of_nodes + index] + n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index] + n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index] 00225 + n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index] + n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index] 00226 + n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index] + n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index] 00227 - 2.f*((n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index]) 00228 + (n_a.vd[5*para.number_of_nodes + index] + n_a.vd[6*para.number_of_nodes + index])); 00229 mode[17] = -(n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) + (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index]) 00230 + (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) + (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]) 00231 - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]); 00232 mode[18] = -(n_a.vd[1*para.number_of_nodes + index] + n_a.vd[2*para.number_of_nodes + index]) - (n_a.vd[3*para.number_of_nodes + index] + n_a.vd[4*para.number_of_nodes + index]) 00233 - (n_a.vd[11*para.number_of_nodes + index] + n_a.vd[12*para.number_of_nodes + index]) - (n_a.vd[13*para.number_of_nodes + index] + n_a.vd[14*para.number_of_nodes + index]) 00234 - (n_a.vd[15*para.number_of_nodes + index] + n_a.vd[16*para.number_of_nodes + index]) - (n_a.vd[17*para.number_of_nodes + index] + n_a.vd[18*para.number_of_nodes + index]) 00235 + 2.f*((n_a.vd[5*para.number_of_nodes + index] + n_a.vd[6*para.number_of_nodes + index]) + (n_a.vd[7*para.number_of_nodes + index] + n_a.vd[8*para.number_of_nodes + index]) 00236 + (n_a.vd[9*para.number_of_nodes + index] + n_a.vd[10*para.number_of_nodes + index])); 00237 00238 } 00239 00240 /**lb_relax_modes, means collision update of the modes 00241 * @param index node index / thread index (Input) 00242 * @param mode Pointer to the local register values mode (Input/Output) 00243 * @param node_f Pointer to local node force (Input) 00244 */ 00245 __device__ void relax_modes(float *mode, unsigned int index, LB_node_force_gpu node_f){ 00246 00247 float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid; 00248 float j[3], pi_eq[6]; 00249 00250 /** re-construct the real density 00251 * remember that the populations are stored as differences to their 00252 * equilibrium value */ 00253 00254 j[0] = mode[1]; 00255 j[1] = mode[2]; 00256 j[2] = mode[3]; 00257 00258 /** if forces are present, the momentum density is redefined to 00259 * inlcude one half-step of the force action. See the 00260 * Chapman-Enskog expansion in [Ladd & Verberg]. */ 00261 00262 j[0] += 0.5f*node_f.force[0*para.number_of_nodes + index]; 00263 j[1] += 0.5f*node_f.force[1*para.number_of_nodes + index]; 00264 j[2] += 0.5f*node_f.force[2*para.number_of_nodes + index]; 00265 00266 /** equilibrium part of the stress modes (eq13 schiller)*/ 00267 pi_eq[0] = ((j[0]*j[0])+(j[1]*j[1])+(j[2]*j[2]))/Rho; 00268 pi_eq[1] = ((j[0]*j[0])-(j[1]*j[1]))/Rho; 00269 pi_eq[2] = (((j[0]*j[0])+(j[1]*j[1])+(j[2]*j[2])) - 3.0f*(j[2]*j[2]))/Rho; 00270 pi_eq[3] = j[0]*j[1]/Rho; 00271 pi_eq[4] = j[0]*j[2]/Rho; 00272 pi_eq[5] = j[1]*j[2]/Rho; 00273 00274 /** relax the stress modes (eq14 schiller)*/ 00275 mode[4] = pi_eq[0] + para.gamma_bulk*(mode[4] - pi_eq[0]); 00276 mode[5] = pi_eq[1] + para.gamma_shear*(mode[5] - pi_eq[1]); 00277 mode[6] = pi_eq[2] + para.gamma_shear*(mode[6] - pi_eq[2]); 00278 mode[7] = pi_eq[3] + para.gamma_shear*(mode[7] - pi_eq[3]); 00279 mode[8] = pi_eq[4] + para.gamma_shear*(mode[8] - pi_eq[4]); 00280 mode[9] = pi_eq[5] + para.gamma_shear*(mode[9] - pi_eq[5]); 00281 00282 /** relax the ghost modes (project them out) */ 00283 /** ghost modes have no equilibrium part due to orthogonality */ 00284 mode[10] = para.gamma_odd*mode[10]; 00285 mode[11] = para.gamma_odd*mode[11]; 00286 mode[12] = para.gamma_odd*mode[12]; 00287 mode[13] = para.gamma_odd*mode[13]; 00288 mode[14] = para.gamma_odd*mode[14]; 00289 mode[15] = para.gamma_odd*mode[15]; 00290 mode[16] = para.gamma_even*mode[16]; 00291 mode[17] = para.gamma_even*mode[17]; 00292 mode[18] = para.gamma_even*mode[18]; 00293 00294 } 00295 00296 /**thermalization of the modes with gaussian random numbers 00297 * @param index node index / thread index (Input) 00298 * @param mode Pointer to the local register values mode (Input/Output) 00299 * @param *rn Pointer to randomnumber array of the local node 00300 */ 00301 __device__ void thermalize_modes(float *mode, unsigned int index, LB_randomnr_gpu *rn){ 00302 00303 float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid; 00304 00305 /* 00306 if (Rho <0) 00307 printf("Rho too small! %f %f %f", Rho, mode[0], para.rho*para.agrid*para.agrid*para.agrid); 00308 */ 00309 #ifdef GAUSSRANDOM 00310 /** stress modes */ 00311 gaussian_random(rn); 00312 mode[4] += sqrt(Rho*(para.mu*(2.f/3.f)*(1.f-(para.gamma_bulk*para.gamma_bulk)))) * rn->randomnr[1]; 00313 mode[5] += sqrt(Rho*(para.mu*(4.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[0]; 00314 00315 gaussian_random(rn); 00316 mode[6] += sqrt(Rho*(para.mu*(4.f/3.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[1]; 00317 mode[7] += sqrt(Rho*(para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[0]; 00318 00319 gaussian_random(rn); 00320 mode[8] += sqrt(Rho*(para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[1]; 00321 mode[9] += sqrt(Rho*(para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear)))) * rn->randomnr[0]; 00322 00323 /** ghost modes */ 00324 gaussian_random(rn); 00325 mode[10] += sqrt(Rho*(para.mu*(2.f/3.f))) * rn->randomnr[1]; 00326 mode[11] += sqrt(Rho*(para.mu*(2.f/3.f))) * rn->randomnr[0]; 00327 00328 gaussian_random(rn); 00329 mode[12] += sqrt(Rho*(para.mu*(2.f/3.f))) * rn->randomnr[1]; 00330 mode[13] += sqrt(Rho*(para.mu*(2.f/9.f))) * rn->randomnr[0]; 00331 00332 gaussian_random(rn); 00333 mode[14] += sqrt(Rho*(para.mu*(2.f/9.f))) * rn->randomnr[1]; 00334 mode[15] += sqrt(Rho*(para.mu*(2.f/9.f))) * rn->randomnr[0]; 00335 00336 gaussian_random(rn); 00337 mode[16] += sqrt(Rho*(para.mu*(2.f))) * rn->randomnr[1]; 00338 mode[17] += sqrt(Rho*(para.mu*(4.f/9.f))) * rn->randomnr[0]; 00339 00340 gaussian_random(rn); 00341 mode[18] += sqrt(Rho*(para.mu*(4.f/3.f))) * rn->randomnr[1]; 00342 #else 00343 /** stress modes */ 00344 random_01(rn); 00345 mode[4] += sqrt(12.f*Rho*para.mu*(2.f/3.f)*(1.f-(para.gamma_bulk*para.gamma_bulk))) * (rn->randomnr[1]-0.5f); 00346 mode[5] += sqrt(12.f*Rho*para.mu*(4.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[0]-0.5f); 00347 00348 random_01(rn); 00349 mode[6] += sqrt(12.f*Rho*para.mu*(4.f/3.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[1]-0.5f); 00350 mode[7] += sqrt(12.f*Rho*para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[0]-0.5f); 00351 00352 random_01(rn); 00353 mode[8] += sqrt(12.f*para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[1]-0.5f); 00354 mode[9] += sqrt(12.f*para.mu*(1.f/9.f)*(1.f-(para.gamma_shear*para.gamma_shear))) * (rn->randomnr[0]-0.5f); 00355 00356 /** ghost modes */ 00357 random_01(rn); 00358 mode[10] += sqrt(12.f*Rho*para.mu*(2.f/3.f)) * (rn->randomnr[1]-0.5f); 00359 mode[11] += sqrt(12.f*Rho*para.mu*(2.f/3.f)) * (rn->randomnr[0]-0.5f); 00360 00361 random_01(rn); 00362 mode[12] += sqrt(12.f*Rho*para.mu*(2.f/3.f)) * (rn->randomnr[1]-0.5f); 00363 mode[13] += sqrt(12.f*Rho*para.mu*(2.f/9.f)) * (rn->randomnr[0]-0.5f); 00364 00365 random_01(rn); 00366 mode[14] += sqrt(12.f*Rho*para.mu*(2.f/9.f)) * (rn->randomnr[1]-0.5f); 00367 mode[15] += sqrt(12.f*Rho*para.mu*(2.f/9.f)) * (rn->randomnr[0]-0.5f); 00368 00369 random_01(rn); 00370 mode[16] += sqrt(12.f*Rho*para.mu*(2.f)) * (rn->randomnr[1]-0.5f); 00371 mode[17] += sqrt(12.f*Rho*para.mu*(4.f/9.f)) * (rn->randomnr[0]-0.5f); 00372 00373 random_01(rn); 00374 mode[18] += sqrt(12.f*Rho*para.mu*(4.f/3.f)) * (rn->randomnr[1]-0.5f); 00375 #endif 00376 } 00377 /*-------------------------------------------------------*/ 00378 /**normalization of the modes need befor backtransformation into velocity space 00379 * @param mode Pointer to the local register values mode (Input/Output) 00380 */ 00381 __device__ void normalize_modes(float* mode){ 00382 00383 /** normalization factors enter in the back transformation */ 00384 mode[0] *= 1.f; 00385 mode[1] *= 3.f; 00386 mode[2] *= 3.f; 00387 mode[3] *= 3.f; 00388 mode[4] *= 3.f/2.f; 00389 mode[5] *= 9.f/4.f; 00390 mode[6] *= 3.f/4.f; 00391 mode[7] *= 9.f; 00392 mode[8] *= 9.f; 00393 mode[9] *= 9.f; 00394 mode[10] *= 3.f/2.f; 00395 mode[11] *= 3.f/2.f; 00396 mode[12] *= 3.f/2.f; 00397 mode[13] *= 9.f/2.f; 00398 mode[14] *= 9.f/2.f; 00399 mode[15] *= 9.f/2.f; 00400 mode[16] *= 1.f/2.f; 00401 mode[17] *= 9.f/4.f; 00402 mode[18] *= 3.f/4.f; 00403 00404 } 00405 /*-------------------------------------------------------*/ 00406 /**backtransformation from modespace to desityspace and streaming with the push method using pbc 00407 * @param index node index / thread index (Input) 00408 * @param mode Pointer to the local register values mode (Input) 00409 * @param *n_b Pointer to local node residing in array b (Output) 00410 */ 00411 __device__ void calc_n_from_modes_push(LB_nodes_gpu n_b, float *mode, unsigned int index){ 00412 00413 unsigned int xyz[3]; 00414 index_to_xyz(index, xyz); 00415 unsigned int x = xyz[0]; 00416 unsigned int y = xyz[1]; 00417 unsigned int z = xyz[2]; 00418 00419 n_b.vd[0*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*z] = 1.f/3.f * (mode[0] - mode[4] + mode[16]); 00420 n_b.vd[1*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] + mode[1] + mode[5] + mode[6] - mode[17] - mode[18] - 2.f*(mode[10] + mode[16])); 00421 n_b.vd[2*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] - mode[1] + mode[5] + mode[6] - mode[17] - mode[18] + 2.f*(mode[10] - mode[16])); 00422 n_b.vd[3*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] + mode[2] - mode[5] + mode[6] + mode[17] - mode[18] - 2.f*(mode[11] + mode[16])); 00423 n_b.vd[4*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/18.f * (mode[0] - mode[2] - mode[5] + mode[6] + mode[17] - mode[18] + 2.f*(mode[11] - mode[16])); 00424 n_b.vd[5*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/18.f * (mode[0] + mode[3] - 2.f*(mode[6] + mode[12] + mode[16] - mode[18])); 00425 n_b.vd[6*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/18.f * (mode[0] - mode[3] - 2.f*(mode[6] - mode[12] + mode[16] - mode[18])); 00426 n_b.vd[7*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] + mode[1] + mode[2] + mode[4] + 2.f*mode[6] + mode[7] + mode[10] + mode[11] + mode[13] + mode[14] + mode[16] + 2.f*mode[18]); 00427 n_b.vd[8*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] - mode[1] - mode[2] + mode[4] + 2.f*mode[6] + mode[7] - mode[10] - mode[11] - mode[13] - mode[14] + mode[16] + 2.f*mode[18]); 00428 n_b.vd[9*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] + mode[1] - mode[2] + mode[4] + 2.f*mode[6] - mode[7] + mode[10] - mode[11] + mode[13] - mode[14] + mode[16] + 2.f*mode[18]); 00429 n_b.vd[10*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = 1.f/36.f * (mode[0] - mode[1] + mode[2] + mode[4] + 2.f*mode[6] - mode[7] - mode[10] + mode[11] - mode[13] + mode[14] + mode[16] + 2.f*mode[18]); 00430 n_b.vd[11*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[1] + mode[3] + mode[4] + mode[5] - mode[6] + mode[8] + mode[10] + mode[12] - mode[13] + mode[15] + mode[16] + mode[17] - mode[18]); 00431 n_b.vd[12*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[1] - mode[3] + mode[4] + mode[5] - mode[6] + mode[8] - mode[10] - mode[12] + mode[13] - mode[15] + mode[16] + mode[17] - mode[18]); 00432 n_b.vd[13*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[1] - mode[3] + mode[4] + mode[5] - mode[6] - mode[8] + mode[10] - mode[12] - mode[13] - mode[15] + mode[16] + mode[17] - mode[18]); 00433 n_b.vd[14*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[1] + mode[3] + mode[4] + mode[5] - mode[6] - mode[8] - mode[10] + mode[12] + mode[13] + mode[15] + mode[16] + mode[17] - mode[18]); 00434 n_b.vd[15*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[2] + mode[3] + mode[4] - mode[5] - mode[6] + mode[9] + mode[11] + mode[12] - mode[14] - mode[15] + mode[16] - mode[17] - mode[18]); 00435 n_b.vd[16*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[2] - mode[3] + mode[4] - mode[5] - mode[6] + mode[9] - mode[11] - mode[12] + mode[14] + mode[15] + mode[16] - mode[17] - mode[18]); 00436 n_b.vd[17*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = 1.f/36.f * (mode[0] + mode[2] - mode[3] + mode[4] - mode[5] - mode[6] - mode[9] + mode[11] - mode[12] - mode[14] + mode[15] + mode[16] - mode[17] - mode[18]); 00437 n_b.vd[18*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = 1.f/36.f * (mode[0] - mode[2] + mode[3] + mode[4] - mode[5] - mode[6] - mode[9] - mode[11] + mode[12] + mode[14] - mode[15] + mode[16] - mode[17] - mode[18]); 00438 00439 } 00440 00441 /** Bounce back boundary conditions. 00442 * The populations that have propagated into a boundary node 00443 * are bounced back to the node they came from. This results 00444 * in no slip boundary conditions. 00445 * 00446 * [cf. Ladd and Verberg, J. Stat. Phys. 104(5/6):1191-1251, 2001] 00447 * @param index node index / thread index (Input) 00448 * @param n_b Pointer to local node residing in array b (Input) 00449 * @param n_a Pointer to local node residing in array a (Output) (temp stored in buffer a) 00450 * @param LB_boundary_velocity The constant velocity at the boundary, set by the user (Input) 00451 * @param LB_boundary_force The force on the boundary nodes (Output) 00452 */ 00453 __device__ void bounce_back_read(LB_nodes_gpu n_b, LB_nodes_gpu n_a, unsigned int index, \ 00454 float* LB_boundary_velocity, float* LB_boundary_force){ 00455 00456 unsigned int xyz[3]; 00457 int c[3]; 00458 float v[3]; 00459 float shift, weight, pop_to_bounce_back; 00460 float boundary_force[3] = {0,0,0}; 00461 size_t to_index, to_index_x, to_index_y, to_index_z; 00462 int population, inverse; 00463 int boundary_index; 00464 00465 00466 boundary_index=n_b.boundary[index]; 00467 if(boundary_index != 0){ 00468 00469 v[0]=LB_boundary_velocity[3*(boundary_index-1)+0]; 00470 v[1]=LB_boundary_velocity[3*(boundary_index-1)+1]; 00471 v[2]=LB_boundary_velocity[3*(boundary_index-1)+2]; 00472 00473 index_to_xyz(index, xyz); 00474 00475 unsigned int x = xyz[0]; 00476 unsigned int y = xyz[1]; 00477 unsigned int z = xyz[2]; 00478 00479 /* CPU analog of shift: 00480 lbpar.agrid*lbpar.agrid*lbpar.agrid*lbpar.rho*2*lbmodel.c[i][l]*lb_boundaries[lbfields[k].boundary-1].velocity[l] */ 00481 00482 /** store vd temporary in second lattice to avoid race conditions */ 00483 #define BOUNCEBACK \ 00484 shift = para.agrid*para.agrid*para.agrid*para.agrid*para.rho*2.*3.*weight*para.tau*(v[0]*c[0] + v[1]*c[1] + v[2]*c[2]); \ 00485 pop_to_bounce_back = n_b.vd[population*para.number_of_nodes + index ]; \ 00486 to_index_x = (x+c[0]+para.dim_x)%para.dim_x; \ 00487 to_index_y = (y+c[1]+para.dim_y)%para.dim_y; \ 00488 to_index_z = (z+c[2]+para.dim_z)%para.dim_z; \ 00489 to_index = to_index_x + para.dim_x*to_index_y + para.dim_x*para.dim_y*to_index_z; \ 00490 if (n_b.boundary[to_index] == 0) \ 00491 { \ 00492 boundary_force[0] += (2*pop_to_bounce_back+shift)*c[0]/para.tau/para.tau/para.agrid; \ 00493 boundary_force[1] += (2*pop_to_bounce_back+shift)*c[1]/para.tau/para.tau/para.agrid; \ 00494 boundary_force[2] += (2*pop_to_bounce_back+shift)*c[2]/para.tau/para.tau/para.agrid; \ 00495 n_b.vd[inverse*para.number_of_nodes + to_index ] = pop_to_bounce_back + shift; \ 00496 } 00497 00498 // ***** SHOULDN'T THERE BE AN ELSE STATMENT IN "BOUNCEBACK"? 00499 // ***** THERE IS AN ODD FACTOR OF 2 THAT YOU INCUR IN THE FORCES FOR THE "lb_stokes_sphere_gpu.tcl" TEST CASE 00500 00501 // the resting population does nothing. 00502 c[0]=1;c[1]=0;c[2]=0; weight=1./18.; population=2; inverse=1; 00503 BOUNCEBACK 00504 00505 c[0]=-1;c[1]=0;c[2]=0; weight=1./18.; population=1; inverse=2; 00506 BOUNCEBACK 00507 00508 c[0]=0;c[1]=1;c[2]=0; weight=1./18.; population=4; inverse=3; 00509 BOUNCEBACK 00510 00511 c[0]=0;c[1]=-1;c[2]=0; weight=1./18.; population=3; inverse=4; 00512 BOUNCEBACK 00513 00514 c[0]=0;c[1]=0;c[2]=1; weight=1./18.; population=6; inverse=5; 00515 BOUNCEBACK 00516 00517 c[0]=0;c[1]=0;c[2]=-1; weight=1./18.; population=5; inverse=6; 00518 BOUNCEBACK 00519 00520 c[0]=1;c[1]=1;c[2]=0; weight=1./36.; population=8; inverse=7; 00521 BOUNCEBACK 00522 00523 c[0]=-1;c[1]=-1;c[2]=0; weight=1./36.; population=7; inverse=8; 00524 BOUNCEBACK 00525 00526 c[0]=1;c[1]=-1;c[2]=0; weight=1./36.; population=10; inverse=9; 00527 BOUNCEBACK 00528 00529 c[0]=-1;c[1]=+1;c[2]=0; weight=1./36.; population=9; inverse=10; 00530 BOUNCEBACK 00531 00532 c[0]=1;c[1]=0;c[2]=1; weight=1./36.; population=12; inverse=11; 00533 BOUNCEBACK 00534 00535 c[0]=-1;c[1]=0;c[2]=-1; weight=1./36.; population=11; inverse=12; 00536 BOUNCEBACK 00537 00538 c[0]=1;c[1]=0;c[2]=-1; weight=1./36.; population=14; inverse=13; 00539 BOUNCEBACK 00540 00541 c[0]=-1;c[1]=0;c[2]=1; weight=1./36.; population=13; inverse=14; 00542 BOUNCEBACK 00543 00544 c[0]=0;c[1]=1;c[2]=1; weight=1./36.; population=16; inverse=15; 00545 BOUNCEBACK 00546 00547 c[0]=0;c[1]=-1;c[2]=-1; weight=1./36.; population=15; inverse=16; 00548 BOUNCEBACK 00549 00550 c[0]=0;c[1]=1;c[2]=-1; weight=1./36.; population=18; inverse=17; 00551 BOUNCEBACK 00552 00553 c[0]=0;c[1]=-1;c[2]=1; weight=1./36.; population=17; inverse=18; 00554 BOUNCEBACK 00555 00556 atomicadd(&LB_boundary_force[3*(n_b.boundary[index]-1)+0], boundary_force[0]); 00557 atomicadd(&LB_boundary_force[3*(n_b.boundary[index]-1)+1], boundary_force[1]); 00558 atomicadd(&LB_boundary_force[3*(n_b.boundary[index]-1)+2], boundary_force[2]); 00559 } 00560 } 00561 /**bounce back read kernel needed to avoid raceconditions 00562 * @param index node index / thread index (Input) 00563 * @param n_b Pointer to local node residing in array b (Input) 00564 * @param n_a Pointer to local node residing in array a (Output) (temp stored in buffer a) 00565 */ 00566 __device__ void bounce_back_write(LB_nodes_gpu n_b, LB_nodes_gpu n_a, unsigned int index){ 00567 00568 unsigned int xyz[3]; 00569 00570 if(n_b.boundary[index] != 0){ 00571 index_to_xyz(index, xyz); 00572 unsigned int x = xyz[0]; 00573 unsigned int y = xyz[1]; 00574 unsigned int z = xyz[2]; 00575 00576 /** stream vd from boundary node back to origin node */ 00577 n_b.vd[1*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = n_a.vd[1*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z]; 00578 n_b.vd[2*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z] = n_a.vd[2*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*z]; 00579 n_b.vd[3*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[3*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z]; 00580 n_b.vd[4*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[4*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z]; 00581 n_b.vd[5*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[5*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)]; 00582 n_b.vd[6*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[6*para.number_of_nodes + x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)]; 00583 n_b.vd[7*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[7*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z]; 00584 n_b.vd[8*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[8*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z]; 00585 n_b.vd[9*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[9*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*z]; 00586 n_b.vd[10*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z] = n_a.vd[10*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*z]; 00587 n_b.vd[11*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[11*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)]; 00588 n_b.vd[12*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[12*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)]; 00589 n_b.vd[13*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[13*para.number_of_nodes + (x+1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)]; 00590 n_b.vd[14*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[14*para.number_of_nodes + (para.dim_x+x-1)%para.dim_x + para.dim_x*y + para.dim_x*para.dim_y*((z+1)%para.dim_z)]; 00591 n_b.vd[15*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[15*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)]; 00592 n_b.vd[16*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[16*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)]; 00593 n_b.vd[17*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)] = n_a.vd[17*para.number_of_nodes + x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((para.dim_z+z-1)%para.dim_z)]; 00594 n_b.vd[18*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)] = n_a.vd[18*para.number_of_nodes + x + para.dim_x*((para.dim_y+y-1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z)]; 00595 } 00596 } 00597 /** add of (external) forces within the modespace, needed for particle-interaction 00598 * @param index node index / thread index (Input) 00599 * @param mode Pointer to the local register values mode (Input/Output) 00600 * @param node_f Pointer to local node force (Input) 00601 */ 00602 __device__ void apply_forces(unsigned int index, float *mode, LB_node_force_gpu node_f) { 00603 00604 float Rho, u[3], C[6]; 00605 Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid; 00606 00607 /** hydrodynamic momentum density is redefined when forces present */ 00608 u[0] = (mode[1] + 0.5f*node_f.force[0*para.number_of_nodes + index])/Rho; 00609 u[1] = (mode[2] + 0.5f*node_f.force[1*para.number_of_nodes + index])/Rho; 00610 u[2] = (mode[3] + 0.5f*node_f.force[2*para.number_of_nodes + index])/Rho; 00611 00612 C[0] = (1.f + para.gamma_bulk)*u[0]*node_f.force[0*para.number_of_nodes + index] + 1.f/3.f*(para.gamma_bulk-para.gamma_shear)*(u[0]*node_f.force[0*para.number_of_nodes + index] + u[1]*node_f.force[1*para.number_of_nodes + index] + u[2]*node_f.force[2*para.number_of_nodes + index]); 00613 C[2] = (1.f + para.gamma_bulk)*u[1]*node_f.force[1*para.number_of_nodes + index] + 1.f/3.f*(para.gamma_bulk-para.gamma_shear)*(u[0]*node_f.force[0*para.number_of_nodes + index] + u[1]*node_f.force[1*para.number_of_nodes + index] + u[2]*node_f.force[2*para.number_of_nodes + index]); 00614 C[5] = (1.f + para.gamma_bulk)*u[2]*node_f.force[2*para.number_of_nodes + index] + 1.f/3.f*(para.gamma_bulk-para.gamma_shear)*(u[0]*node_f.force[0*para.number_of_nodes + index] + u[1]*node_f.force[1*para.number_of_nodes + index] + u[2]*node_f.force[2*para.number_of_nodes + index]); 00615 C[1] = 1.f/2.f*(1.f+para.gamma_shear)*(u[0]*node_f.force[1*para.number_of_nodes + index]+u[1]*node_f.force[0*para.number_of_nodes + index]); 00616 C[3] = 1.f/2.f*(1.f+para.gamma_shear)*(u[0]*node_f.force[2*para.number_of_nodes + index]+u[2]*node_f.force[0*para.number_of_nodes + index]); 00617 C[4] = 1.f/2.f*(1.f+para.gamma_shear)*(u[1]*node_f.force[2*para.number_of_nodes + index]+u[2]*node_f.force[1*para.number_of_nodes + index]); 00618 00619 /** update momentum modes */ 00620 mode[1] += node_f.force[0*para.number_of_nodes + index]; 00621 mode[2] += node_f.force[1*para.number_of_nodes + index]; 00622 mode[3] += node_f.force[2*para.number_of_nodes + index]; 00623 00624 /** update stress modes */ 00625 mode[4] += C[0] + C[2] + C[5]; 00626 mode[5] += C[0] - C[2]; 00627 mode[6] += C[0] + C[2] - 2.f*C[5]; 00628 mode[7] += C[1]; 00629 mode[8] += C[3]; 00630 mode[9] += C[4]; 00631 00632 #ifdef EXTERNAL_FORCES 00633 if(para.external_force){ 00634 node_f.force[0*para.number_of_nodes + index] = para.ext_force[0]*powf(para.agrid,4)*para.tau*para.tau; 00635 node_f.force[1*para.number_of_nodes + index] = para.ext_force[1]*powf(para.agrid,4)*para.tau*para.tau; 00636 node_f.force[2*para.number_of_nodes + index] = para.ext_force[2]*powf(para.agrid,4)*para.tau*para.tau; 00637 } 00638 else{ 00639 node_f.force[0*para.number_of_nodes + index] = 0.f; 00640 node_f.force[1*para.number_of_nodes + index] = 0.f; 00641 node_f.force[2*para.number_of_nodes + index] = 0.f; 00642 } 00643 #else 00644 /** reset force */ 00645 node_f.force[0*para.number_of_nodes + index] = 0.f; 00646 node_f.force[1*para.number_of_nodes + index] = 0.f; 00647 node_f.force[2*para.number_of_nodes + index] = 0.f; 00648 #endif 00649 } 00650 00651 /**function used to calc physical values of every node 00652 * @param index node index / thread index (Input) 00653 * @param mode Pointer to the local register values mode (Input) 00654 * @param n_a Pointer to local node residing in array a for boundary flag(Input) 00655 * @param *d_v Pointer to local device values (Input/Output) 00656 * @param singlenode Flag, if there is only one node 00657 * This function is a clone of lb_calc_local_fields and 00658 * additionally performs unit conversions. 00659 */ 00660 __device__ void calc_values(LB_nodes_gpu n_a, float *mode, LB_values_gpu *d_v, unsigned int index, unsigned int singlenode){ 00661 00662 float rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid; 00663 00664 float *v, *pi; 00665 if(singlenode == 1){ 00666 v=&(d_v[0].v[0]); 00667 pi=&(d_v[0].pi[0]); 00668 d_v[0].rho = rho; 00669 } else { 00670 v=&(d_v[index].v[0]); 00671 pi=&(d_v[index].pi[0]); 00672 d_v[index].rho = rho; 00673 } 00674 float j[3]; float pi_eq[6]; 00675 00676 j[0] = mode[1]; 00677 j[1] = mode[2]; 00678 j[2] = mode[3]; 00679 00680 // To Do: Here half the forces have to go in! 00681 // j[0] += 0.5*lbfields[index].force[0]; 00682 // j[1] += 0.5*lbfields[index].force[1]; 00683 // j[2] += 0.5*lbfields[index].force[2]; 00684 00685 v[0]=j[0]/rho; 00686 v[1]=j[1]/rho; 00687 v[2]=j[2]/rho; 00688 00689 /* equilibrium part of the stress modes */ 00690 pi_eq[0] = (j[0]*j[0]+j[1]*j[1]+j[2]*j[2])/ rho; 00691 pi_eq[1] = ((j[0]*j[0])-(j[1]*j[1]))/ rho; 00692 pi_eq[2] = (j[0]*j[0]+j[1]*j[1]+j[2]*j[2] - 3.0*j[2]*j[2])/ rho; 00693 pi_eq[3] = j[0]*j[1]/ rho; 00694 pi_eq[4] = j[0]*j[2]/ rho; 00695 pi_eq[5] = j[1]*j[2]/ rho; 00696 00697 /* Now we must predict the outcome of the next collision */ 00698 /* We immediately average pre- and post-collision. */ 00699 mode[4] = pi_eq[0] + (0.5+0.5*para.gamma_bulk )*(mode[4] - pi_eq[0]); 00700 mode[5] = pi_eq[1] + (0.5+0.5*para.gamma_shear)*(mode[5] - pi_eq[1]); 00701 mode[6] = pi_eq[2] + (0.5+0.5*para.gamma_shear)*(mode[6] - pi_eq[2]); 00702 mode[7] = pi_eq[3] + (0.5+0.5*para.gamma_shear)*(mode[7] - pi_eq[3]); 00703 mode[8] = pi_eq[4] + (0.5+0.5*para.gamma_shear)*(mode[8] - pi_eq[4]); 00704 mode[9] = pi_eq[5] + (0.5+0.5*para.gamma_shear)*(mode[9] - pi_eq[5]); 00705 00706 /* Now we have to transform to the "usual" stress tensor components */ 00707 /* We use eq. 116ff in Duenweg Ladd for that. */ 00708 pi[0]=(mode[0]+mode[4]+mode[5])/3.; 00709 pi[2]=(2*mode[0]+2*mode[4]-mode[5]+3*mode[6])/6.; 00710 pi[5]=(2*mode[0]+2*mode[4]-mode[5]+3*mode[6])/6.; 00711 pi[1]=mode[7]; 00712 pi[3]=mode[8]; 00713 pi[4]=mode[9]; 00714 00715 /* Finally some unit conversions */ 00716 rho*=1./para.agrid/para.agrid/para.agrid; 00717 v[0]*=1./para.tau/para.agrid; 00718 v[1]*=1./para.tau/para.agrid; 00719 v[2]*=1./para.tau/para.agrid; 00720 00721 for (int i =0; i<6; i++) { 00722 pi[i]*=1./para.tau/para.tau/para.agrid/para.agrid/para.agrid; 00723 } 00724 00725 } 00726 /** 00727 * @param node_index node index around (8) particle (Input) 00728 * @param *mode Pointer to the local register values mode (Output) 00729 * @param n_a Pointer to local node residing in array a(Input) 00730 */ 00731 __device__ void calc_mode(float *mode, LB_nodes_gpu n_a, unsigned int node_index){ 00732 00733 /** mass mode */ 00734 mode[0] = n_a.vd[0*para.number_of_nodes + node_index] + n_a.vd[1*para.number_of_nodes + node_index] + n_a.vd[2*para.number_of_nodes + node_index] 00735 + n_a.vd[3*para.number_of_nodes + node_index] + n_a.vd[4*para.number_of_nodes + node_index] + n_a.vd[5*para.number_of_nodes + node_index] 00736 + n_a.vd[6*para.number_of_nodes + node_index] + n_a.vd[7*para.number_of_nodes + node_index] + n_a.vd[8*para.number_of_nodes + node_index] 00737 + n_a.vd[9*para.number_of_nodes + node_index] + n_a.vd[10*para.number_of_nodes + node_index] + n_a.vd[11*para.number_of_nodes + node_index] + n_a.vd[12*para.number_of_nodes + node_index] 00738 + n_a.vd[13*para.number_of_nodes + node_index] + n_a.vd[14*para.number_of_nodes + node_index] + n_a.vd[15*para.number_of_nodes + node_index] + n_a.vd[16*para.number_of_nodes + node_index] 00739 + n_a.vd[17*para.number_of_nodes + node_index] + n_a.vd[18*para.number_of_nodes + node_index]; 00740 00741 /** momentum modes */ 00742 mode[1] = (n_a.vd[1*para.number_of_nodes + node_index] - n_a.vd[2*para.number_of_nodes + node_index]) + (n_a.vd[7*para.number_of_nodes + node_index] - n_a.vd[8*para.number_of_nodes + node_index]) 00743 + (n_a.vd[9*para.number_of_nodes + node_index] - n_a.vd[10*para.number_of_nodes + node_index]) + (n_a.vd[11*para.number_of_nodes + node_index] - n_a.vd[12*para.number_of_nodes + node_index]) 00744 + (n_a.vd[13*para.number_of_nodes + node_index] - n_a.vd[14*para.number_of_nodes + node_index]); 00745 mode[2] = (n_a.vd[3*para.number_of_nodes + node_index] - n_a.vd[4*para.number_of_nodes + node_index]) + (n_a.vd[7*para.number_of_nodes + node_index] - n_a.vd[8*para.number_of_nodes + node_index]) 00746 - (n_a.vd[9*para.number_of_nodes + node_index] - n_a.vd[10*para.number_of_nodes + node_index]) + (n_a.vd[15*para.number_of_nodes + node_index] - n_a.vd[16*para.number_of_nodes + node_index]) 00747 + (n_a.vd[17*para.number_of_nodes + node_index] - n_a.vd[18*para.number_of_nodes + node_index]); 00748 mode[3] = (n_a.vd[5*para.number_of_nodes + node_index] - n_a.vd[6*para.number_of_nodes + node_index]) + (n_a.vd[11*para.number_of_nodes + node_index] - n_a.vd[12*para.number_of_nodes + node_index]) 00749 - (n_a.vd[13*para.number_of_nodes + node_index] - n_a.vd[14*para.number_of_nodes + node_index]) + (n_a.vd[15*para.number_of_nodes + node_index] - n_a.vd[16*para.number_of_nodes + node_index]) 00750 - (n_a.vd[17*para.number_of_nodes + node_index] - n_a.vd[18*para.number_of_nodes + node_index]); 00751 } 00752 /*********************************************************/ 00753 /** \name Coupling part */ 00754 /*********************************************************/ 00755 /**(Eq. (12) Ahlrichs and Duenweg, JCP 111(17):8225 (1999)) 00756 * @param n_a Pointer to local node residing in array a (Input) 00757 * @param *delta Pointer for the weighting of particle position (Output) 00758 * @param *delta_j Pointer for the weighting of particle momentum (Output) 00759 * @param *particle_data Pointer to the particle position and velocity (Input) 00760 * @param *particle_force Pointer to the particle force (Input) 00761 * @param part_index particle id / thread id (Input) 00762 * @param *rn_part Pointer to randomnumber array of the particle 00763 * @param node_index node index around (8) particle (Output) 00764 */ 00765 __device__ void calc_viscous_force(LB_nodes_gpu n_a, float *delta, LB_particle_gpu *particle_data, LB_particle_force_gpu *particle_force, unsigned int part_index, LB_randomnr_gpu *rn_part, float *delta_j, unsigned int *node_index){ 00766 00767 float mode[4]; 00768 int my_left[3]; 00769 float interpolated_u1, interpolated_u2, interpolated_u3; 00770 float Rho; 00771 interpolated_u1 = interpolated_u2 = interpolated_u3 = 0.f; 00772 00773 float temp_delta[6]; 00774 float temp_delta_half[6]; 00775 00776 /** see ahlrichs + duenweg page 8227 equ (10) and (11) */ 00777 #pragma unroll 00778 for(int i=0; i<3; ++i){ 00779 float scaledpos = particle_data[part_index].p[i]/para.agrid - 0.5f; 00780 my_left[i] = (int)(floorf(scaledpos)); 00781 //printf("scaledpos %f \t myleft: %d \n", scaledpos, my_left[i]); 00782 temp_delta[3+i] = scaledpos - my_left[i]; 00783 temp_delta[i] = 1.f - temp_delta[3+i]; 00784 /**further value used for interpolation of fluid velocity at part pos near boundaries */ 00785 temp_delta_half[3+i] = (scaledpos - my_left[i])*2.f; 00786 temp_delta_half[i] = 2.f - temp_delta_half[3+i]; 00787 } 00788 00789 delta[0] = temp_delta[0] * temp_delta[1] * temp_delta[2]; 00790 delta[1] = temp_delta[3] * temp_delta[1] * temp_delta[2]; 00791 delta[2] = temp_delta[0] * temp_delta[4] * temp_delta[2]; 00792 delta[3] = temp_delta[3] * temp_delta[4] * temp_delta[2]; 00793 delta[4] = temp_delta[0] * temp_delta[1] * temp_delta[5]; 00794 delta[5] = temp_delta[3] * temp_delta[1] * temp_delta[5]; 00795 delta[6] = temp_delta[0] * temp_delta[4] * temp_delta[5]; 00796 delta[7] = temp_delta[3] * temp_delta[4] * temp_delta[5]; 00797 00798 // modulo for negative numbers is strange at best, shift to make sure we are positive 00799 int x = my_left[0] + para.dim_x; 00800 int y = my_left[1] + para.dim_y; 00801 int z = my_left[2] + para.dim_z; 00802 00803 node_index[0] = x%para.dim_x + para.dim_x*(y%para.dim_y) + para.dim_x*para.dim_y*(z%para.dim_z); 00804 node_index[1] = (x+1)%para.dim_x + para.dim_x*(y%para.dim_y) + para.dim_x*para.dim_y*(z%para.dim_z); 00805 node_index[2] = x%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*(z%para.dim_z); 00806 node_index[3] = (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*(z%para.dim_z); 00807 node_index[4] = x%para.dim_x + para.dim_x*(y%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z); 00808 node_index[5] = (x+1)%para.dim_x + para.dim_x*(y%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z); 00809 node_index[6] = x%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z); 00810 node_index[7] = (x+1)%para.dim_x + para.dim_x*((y+1)%para.dim_y) + para.dim_x*para.dim_y*((z+1)%para.dim_z); 00811 #pragma unroll 00812 for(int i=0; i<8; ++i){ 00813 calc_mode(mode, n_a, node_index[i]); 00814 Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid; 00815 interpolated_u1 += delta[i]*mode[1]/(Rho); 00816 interpolated_u2 += delta[i]*mode[2]/(Rho); 00817 interpolated_u3 += delta[i]*mode[3]/(Rho); 00818 } 00819 00820 /** calculate viscous force 00821 * take care to rescale velocities with time_step and transform to MD units 00822 * (Eq. (9) Ahlrichs and Duenweg, JCP 111(17):8225 (1999)) */ 00823 #ifdef LB_ELECTROHYDRODYNAMICS 00824 particle_force[part_index].f[0] = - para.friction * (particle_data[part_index].v[0]/para.time_step - interpolated_u1*para.agrid/para.tau - particle_data[part_index].mu_E[0]); 00825 particle_force[part_index].f[1] = - para.friction * (particle_data[part_index].v[1]/para.time_step - interpolated_u2*para.agrid/para.tau - particle_data[part_index].mu_E[1]); 00826 particle_force[part_index].f[2] = - para.friction * (particle_data[part_index].v[2]/para.time_step - interpolated_u3*para.agrid/para.tau - particle_data[part_index].mu_E[2]); 00827 #else 00828 particle_force[part_index].f[0] = - para.friction * (particle_data[part_index].v[0]/para.time_step - interpolated_u1*para.agrid/para.tau); 00829 particle_force[part_index].f[1] = - para.friction * (particle_data[part_index].v[1]/para.time_step - interpolated_u2*para.agrid/para.tau); 00830 particle_force[part_index].f[2] = - para.friction * (particle_data[part_index].v[2]/para.time_step - interpolated_u3*para.agrid/para.tau); 00831 #endif 00832 /** add stochastik force of zero mean (Ahlrichs, Duennweg equ. 15)*/ 00833 #ifdef GAUSSRANDOM 00834 gaussian_random(rn_part); 00835 particle_force[part_index].f[0] += para.lb_coupl_pref2*rn_part->randomnr[0]; 00836 particle_force[part_index].f[1] += para.lb_coupl_pref2*rn_part->randomnr[1]; 00837 gaussian_random(rn_part); 00838 particle_force[part_index].f[2] += para.lb_coupl_pref2*rn_part->randomnr[0]; 00839 #else 00840 random_01(rn_part); 00841 particle_force[part_index].f[0] += para.lb_coupl_pref*(rn_part->randomnr[0]-0.5f); 00842 particle_force[part_index].f[1] += para.lb_coupl_pref*(rn_part->randomnr[1]-0.5f); 00843 random_01(rn_part); 00844 particle_force[part_index].f[2] += para.lb_coupl_pref*(rn_part->randomnr[0]-0.5f); 00845 #endif 00846 /** delta_j for transform momentum transfer to lattice units which is done in calc_node_force 00847 (Eq. (12) Ahlrichs and Duenweg, JCP 111(17):8225 (1999)) */ 00848 delta_j[0] = - particle_force[part_index].f[0]*para.time_step*para.tau/para.agrid; 00849 delta_j[1] = - particle_force[part_index].f[1]*para.time_step*para.tau/para.agrid; 00850 delta_j[2] = - particle_force[part_index].f[2]*para.time_step*para.tau/para.agrid; 00851 00852 } 00853 00854 /**calcutlation of the node force caused by the particles, with atomicadd due to avoiding race conditions 00855 (Eq. (14) Ahlrichs and Duenweg, JCP 111(17):8225 (1999)) 00856 * @param *delta Pointer for the weighting of particle position (Input) 00857 * @param *delta_j Pointer for the weighting of particle momentum (Input) 00858 * @param node_index node index around (8) particle (Input) 00859 * @param node_f Pointer to the node force (Output). 00860 */ 00861 __device__ void calc_node_force(float *delta, float *delta_j, unsigned int *node_index, LB_node_force_gpu node_f){ 00862 00863 #if 1 00864 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[0]]), (delta[0]*delta_j[0])); 00865 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[0]]), (delta[0]*delta_j[1])); 00866 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[0]]), (delta[0]*delta_j[2])); 00867 00868 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[1]]), (delta[1]*delta_j[0])); 00869 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[1]]), (delta[1]*delta_j[1])); 00870 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[1]]), (delta[1]*delta_j[2])); 00871 00872 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[2]]), (delta[2]*delta_j[0])); 00873 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[2]]), (delta[2]*delta_j[1])); 00874 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[2]]), (delta[2]*delta_j[2])); 00875 00876 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[3]]), (delta[3]*delta_j[0])); 00877 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[3]]), (delta[3]*delta_j[1])); 00878 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[3]]), (delta[3]*delta_j[2])); 00879 00880 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[4]]), (delta[4]*delta_j[0])); 00881 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[4]]), (delta[4]*delta_j[1])); 00882 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[4]]), (delta[4]*delta_j[2])); 00883 00884 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[5]]), (delta[5]*delta_j[0])); 00885 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[5]]), (delta[5]*delta_j[1])); 00886 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[5]]), (delta[5]*delta_j[2])); 00887 00888 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[6]]), (delta[6]*delta_j[0])); 00889 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[6]]), (delta[6]*delta_j[1])); 00890 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[6]]), (delta[6]*delta_j[2])); 00891 00892 atomicadd(&(node_f.force[0*para.number_of_nodes + node_index[7]]), (delta[7]*delta_j[0])); 00893 atomicadd(&(node_f.force[1*para.number_of_nodes + node_index[7]]), (delta[7]*delta_j[1])); 00894 atomicadd(&(node_f.force[2*para.number_of_nodes + node_index[7]]), (delta[7]*delta_j[2])); 00895 #endif 00896 } 00897 /*********************************************************/ 00898 /** \name System setup and Kernel funktions */ 00899 /*********************************************************/ 00900 /**kernel to calculate local populations from hydrodynamic fields given by the tcl values. 00901 * The mapping is given in terms of the equilibrium distribution. 00902 * 00903 * Eq. (2.15) Ladd, J. Fluid Mech. 271, 295-309 (1994) 00904 * Eq. (4) in Berk Usta, Ladd and Butler, JCP 122, 094902 (2005) 00905 * 00906 * @param n_a Pointer to the lattice site (Input). 00907 * @param *gpu_check additional check if gpu kernel are executed(Input). 00908 */ 00909 __global__ void calc_n_equilibrium(LB_nodes_gpu n_a, int *gpu_check) { 00910 00911 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 00912 00913 if(index<para.number_of_nodes){ 00914 00915 /** default values for fields in lattice units */ 00916 gpu_check[0] = 1; 00917 00918 float Rho = para.rho*para.agrid*para.agrid*para.agrid; 00919 float v[3] = { 0.0f, 0.0f, 0.0f }; 00920 float pi[6] = { Rho*c_sound_sq, 0.0f, Rho*c_sound_sq, 0.0f, 0.0f, Rho*c_sound_sq }; 00921 00922 float rhoc_sq = Rho*c_sound_sq; 00923 float avg_rho = para.rho*para.agrid*para.agrid*para.agrid; 00924 float local_rho, local_j[3], *local_pi, trace; 00925 00926 local_rho = Rho; 00927 00928 local_j[0] = Rho * v[0]; 00929 local_j[1] = Rho * v[1]; 00930 local_j[2] = Rho * v[2]; 00931 00932 local_pi = pi; 00933 00934 /** reduce the pressure tensor to the part needed here */ 00935 local_pi[0] -= rhoc_sq; 00936 local_pi[2] -= rhoc_sq; 00937 local_pi[5] -= rhoc_sq; 00938 00939 trace = local_pi[0] + local_pi[2] + local_pi[5]; 00940 00941 float rho_times_coeff; 00942 float tmp1,tmp2; 00943 00944 /** update the q=0 sublattice */ 00945 n_a.vd[0*para.number_of_nodes + index] = 1.f/3.f * (local_rho-avg_rho) - 1.f/2.f*trace; 00946 00947 /** update the q=1 sublattice */ 00948 rho_times_coeff = 1.f/18.f * (local_rho-avg_rho); 00949 00950 n_a.vd[1*para.number_of_nodes + index] = rho_times_coeff + 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace; 00951 n_a.vd[2*para.number_of_nodes + index] = rho_times_coeff - 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace; 00952 n_a.vd[3*para.number_of_nodes + index] = rho_times_coeff + 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace; 00953 n_a.vd[4*para.number_of_nodes + index] = rho_times_coeff - 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace; 00954 n_a.vd[5*para.number_of_nodes + index] = rho_times_coeff + 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace; 00955 n_a.vd[6*para.number_of_nodes + index] = rho_times_coeff - 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace; 00956 00957 /** update the q=2 sublattice */ 00958 rho_times_coeff = 1.f/36.f * (local_rho-avg_rho); 00959 00960 tmp1 = local_pi[0] + local_pi[2]; 00961 tmp2 = 2.0f*local_pi[1]; 00962 n_a.vd[7*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 00963 n_a.vd[8*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 00964 n_a.vd[9*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 00965 n_a.vd[10*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 00966 00967 tmp1 = local_pi[0] + local_pi[5]; 00968 tmp2 = 2.0f*local_pi[3]; 00969 00970 n_a.vd[11*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 00971 n_a.vd[12*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 00972 n_a.vd[13*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 00973 n_a.vd[14*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 00974 00975 tmp1 = local_pi[2] + local_pi[5]; 00976 tmp2 = 2.0f*local_pi[4]; 00977 00978 n_a.vd[15*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 00979 n_a.vd[16*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 00980 n_a.vd[17*para.number_of_nodes + index] = rho_times_coeff + 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 00981 n_a.vd[18*para.number_of_nodes + index] = rho_times_coeff - 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 00982 00983 /**set different seed for randomgen on every node */ 00984 n_a.seed[index] = para.your_seed + index; 00985 } 00986 } 00987 /** kernel to calculate local populations from hydrodynamic fields 00988 * from given flow field velocities. The mapping is given in terms of 00989 * the equilibrium distribution. 00990 * 00991 * Eq. (2.15) Ladd, J. Fluid Mech. 271, 295-309 (1994) 00992 * Eq. (4) in Berk Usta, Ladd and Butler, JCP 122, 094902 (2005) 00993 * 00994 * @param n_a the current nodes array (double buffering!) 00995 * @param single_nodeindex the node to set the velocity for 00996 * @param velocity the velocity to set 00997 */ 00998 __global__ void set_u_equilibrium(LB_nodes_gpu n_a, int single_nodeindex,float *velocity) { 00999 01000 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01001 01002 if(index == 0){ 01003 01004 /** default values for fields in lattice units */ 01005 float mode[19]; 01006 calc_mode(mode, n_a, single_nodeindex); 01007 float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid; 01008 01009 float v[3]; 01010 v[0] = velocity[0]; 01011 v[1] = velocity[1]; 01012 v[2] = velocity[2]; 01013 01014 float pi[6] = { Rho*c_sound_sq, 0.0f, Rho*c_sound_sq, 0.0f, 0.0f, Rho*c_sound_sq }; 01015 01016 float rhoc_sq = Rho*c_sound_sq; 01017 float avg_rho = para.rho*para.agrid*para.agrid*para.agrid; 01018 float local_rho, local_j[3], *local_pi, trace; 01019 01020 local_rho = Rho; 01021 01022 local_j[0] = Rho * v[0]; 01023 local_j[1] = Rho * v[1]; 01024 local_j[2] = Rho * v[2]; 01025 01026 local_pi = pi; 01027 01028 /** reduce the pressure tensor to the part needed here */ 01029 local_pi[0] -= rhoc_sq; 01030 local_pi[2] -= rhoc_sq; 01031 local_pi[5] -= rhoc_sq; 01032 01033 trace = local_pi[0] + local_pi[2] + local_pi[5]; 01034 01035 float rho_times_coeff; 01036 float tmp1,tmp2; 01037 01038 /** update the q=0 sublattice */ 01039 n_a.vd[0*para.number_of_nodes + single_nodeindex] = 1.f/3.f * (local_rho-avg_rho) - 1.f/2.f*trace; 01040 01041 /** update the q=1 sublattice */ 01042 rho_times_coeff = 1.f/18.f * (local_rho-avg_rho); 01043 01044 n_a.vd[1*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace; 01045 n_a.vd[2*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/6.f*local_j[0] + 1.f/4.f*local_pi[0] - 1.f/12.f*trace; 01046 n_a.vd[3*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace; 01047 n_a.vd[4*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/6.f*local_j[1] + 1.f/4.f*local_pi[2] - 1.f/12.f*trace; 01048 n_a.vd[5*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace; 01049 n_a.vd[6*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/6.f*local_j[2] + 1.f/4.f*local_pi[5] - 1.f/12.f*trace; 01050 01051 /** update the q=2 sublattice */ 01052 rho_times_coeff = 1.f/36.f * (local_rho-avg_rho); 01053 01054 tmp1 = local_pi[0] + local_pi[2]; 01055 tmp2 = 2.0f*local_pi[1]; 01056 n_a.vd[7*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 01057 n_a.vd[8*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[1]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 01058 n_a.vd[9*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 01059 n_a.vd[10*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[1]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 01060 01061 tmp1 = local_pi[0] + local_pi[5]; 01062 tmp2 = 2.0f*local_pi[3]; 01063 01064 n_a.vd[11*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 01065 n_a.vd[12*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[0]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 01066 n_a.vd[13*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 01067 n_a.vd[14*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[0]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 01068 01069 tmp1 = local_pi[2] + local_pi[5]; 01070 tmp2 = 2.0f*local_pi[4]; 01071 01072 n_a.vd[15*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 01073 n_a.vd[16*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[1]+local_j[2]) + 1.f/8.f*(tmp1+tmp2) - 1.f/24.f*trace; 01074 n_a.vd[17*para.number_of_nodes + single_nodeindex] = rho_times_coeff + 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 01075 n_a.vd[18*para.number_of_nodes + single_nodeindex] = rho_times_coeff - 1.f/12.f*(local_j[1]-local_j[2]) + 1.f/8.f*(tmp1-tmp2) - 1.f/24.f*trace; 01076 01077 } 01078 } 01079 /** kernel for the initalisation of the particle force array 01080 * @param *particle_force Pointer to local particle force (Output) 01081 * @param *part Pointer to the particle rn seed storearray (Output) 01082 */ 01083 __global__ void init_particle_force(LB_particle_force_gpu *particle_force, LB_particle_seed_gpu *part){ 01084 01085 unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01086 01087 if(part_index<para.number_of_particles){ 01088 particle_force[part_index].f[0] = 0.0f; 01089 particle_force[part_index].f[1] = 0.0f; 01090 particle_force[part_index].f[2] = 0.0f; 01091 01092 part[part_index].seed = para.your_seed + part_index; 01093 } 01094 01095 } 01096 01097 /** kernel for the initalisation of the partikel force array 01098 * @param *particle_force pointer to local particle force (Input) 01099 */ 01100 __global__ void reset_particle_force(LB_particle_force_gpu *particle_force){ 01101 01102 unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01103 01104 if(part_index<para.number_of_particles){ 01105 particle_force[part_index].f[0] = 0.0f; 01106 particle_force[part_index].f[1] = 0.0f; 01107 particle_force[part_index].f[2] = 0.0f; 01108 } 01109 } 01110 01111 /** (re-)initialization of the node force / set up of external force in lb units 01112 * @param node_f Pointer to local node force (Input) 01113 */ 01114 __global__ void reinit_node_force(LB_node_force_gpu node_f){ 01115 01116 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01117 01118 if(index<para.number_of_nodes){ 01119 #ifdef EXTERNAL_FORCES 01120 if(para.external_force){ 01121 node_f.force[0*para.number_of_nodes + index] = para.ext_force[0]*powf(para.agrid,4)*para.tau*para.tau; 01122 node_f.force[1*para.number_of_nodes + index] = para.ext_force[1]*powf(para.agrid,4)*para.tau*para.tau; 01123 node_f.force[2*para.number_of_nodes + index] = para.ext_force[2]*powf(para.agrid,4)*para.tau*para.tau; 01124 } 01125 else{ 01126 node_f.force[0*para.number_of_nodes + index] = 0.0f; 01127 node_f.force[1*para.number_of_nodes + index] = 0.0f; 01128 node_f.force[2*para.number_of_nodes + index] = 0.0f; 01129 } 01130 #else 01131 node_f.force[0*para.number_of_nodes + index] = 0.0f; 01132 node_f.force[1*para.number_of_nodes + index] = 0.0f; 01133 node_f.force[2*para.number_of_nodes + index] = 0.0f; 01134 #endif 01135 } 01136 } 01137 01138 /**set the boundary flag for all boundary nodes 01139 * @param boundary_node_list The indices of the boundary nodes 01140 * @param boundary_index_list The flag representing the corresponding boundary 01141 * @param number_of_boundnodes The number of boundary nodes 01142 * @param n_a Pointer to local node residing in array a (Input) 01143 * @param n_b Pointer to local node residing in array b (Input) 01144 */ 01145 __global__ void init_boundaries(int *boundary_node_list, int *boundary_index_list, int number_of_boundnodes, LB_nodes_gpu n_a, LB_nodes_gpu n_b){ 01146 01147 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01148 01149 if(index<number_of_boundnodes){ 01150 n_a.boundary[boundary_node_list[index]] = boundary_index_list[index]; 01151 n_b.boundary[boundary_node_list[index]] = boundary_index_list[index]; 01152 } 01153 } 01154 01155 /**reset the boundary flag of every node 01156 * @param n_a Pointer to local node residing in array a (Input) 01157 * @param n_b Pointer to local node residing in array b (Input) 01158 */ 01159 __global__ void reset_boundaries(LB_nodes_gpu n_a, LB_nodes_gpu n_b){ 01160 01161 size_t index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01162 01163 if(index<para.number_of_nodes){ 01164 n_a.boundary[index] = n_b.boundary[index] = 0; 01165 } 01166 } 01167 01168 /** integrationstep of the lb-fluid-solver 01169 * @param n_a Pointer to local node residing in array a (Input) 01170 * @param n_b Pointer to local node residing in array b (Input) 01171 * @param *d_v Pointer to local device values (Input) 01172 * @param node_f Pointer to local node force (Input) 01173 */ 01174 __global__ void integrate(LB_nodes_gpu n_a, LB_nodes_gpu n_b, LB_values_gpu *d_v, LB_node_force_gpu node_f){ 01175 01176 /**every node is connected to a thread via the index*/ 01177 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01178 /**the 19 moments (modes) are only temporary register values */ 01179 float mode[19]; 01180 LB_randomnr_gpu rng; 01181 01182 if(index<para.number_of_nodes){ 01183 /** storing the seed into a register value*/ 01184 rng.seed = n_a.seed[index]; 01185 /**calc_m_from_n*/ 01186 calc_m_from_n(n_a, index, mode); 01187 /**lb_relax_modes*/ 01188 relax_modes(mode, index, node_f); 01189 /**lb_thermalize_modes */ 01190 if (para.fluct) thermalize_modes(mode, index, &rng); 01191 #ifdef EXTERNAL_FORCES 01192 /**if external force is used apply node force */ 01193 apply_forces(index, mode, node_f); 01194 #else 01195 /**if partcles are used apply node forces*/ 01196 if (para.number_of_particles) apply_forces(index, mode, node_f); 01197 #endif 01198 /**lb_calc_n_from_modes_push*/ 01199 normalize_modes(mode); 01200 /**calc of velocity densities and streaming with pbc*/ 01201 calc_n_from_modes_push(n_b, mode, index); 01202 /** rewriting the seed back to the global memory*/ 01203 n_b.seed[index] = rng.seed; 01204 } 01205 } 01206 01207 /** part interaction kernel 01208 * @param n_a Pointer to local node residing in array a (Input) 01209 * @param *particle_data Pointer to the particle position and velocity (Input) 01210 * @param *particle_force Pointer to the particle force (Input) 01211 * @param *part Pointer to the rn array of the particles (Input) 01212 * @param node_f Pointer to local node force (Input) 01213 */ 01214 __global__ void calc_fluid_particle_ia(LB_nodes_gpu n_a, LB_particle_gpu *particle_data, LB_particle_force_gpu *particle_force, LB_node_force_gpu node_f, LB_particle_seed_gpu *part){ 01215 01216 unsigned int part_index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01217 unsigned int node_index[8]; 01218 float delta[8]; 01219 float delta_j[3]; 01220 LB_randomnr_gpu rng_part; 01221 01222 if(part_index<para.number_of_particles){ 01223 01224 rng_part.seed = part[part_index].seed; 01225 /**calc of the force which act on the particle */ 01226 calc_viscous_force(n_a, delta, particle_data, particle_force, part_index, &rng_part, delta_j, node_index); 01227 /**calc of the force which acts back to the fluid node */ 01228 calc_node_force(delta, delta_j, node_index, node_f); 01229 part[part_index].seed = rng_part.seed; 01230 } 01231 } 01232 01233 /**Bounce back boundary read kernel 01234 * @param n_a Pointer to local node residing in array a (Input) 01235 * @param n_b Pointer to local node residing in array b (Input) 01236 * @param LB_boundary_velocity The constant velocity at the boundary, set by the user (Input) 01237 * @param LB_boundary_force The force on the boundary nodes (Output) 01238 */ 01239 __global__ void bb_read(LB_nodes_gpu n_a, LB_nodes_gpu n_b, float* LB_boundary_velocity, float* LB_boundary_force){ 01240 01241 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01242 01243 if(index<para.number_of_nodes){ 01244 bounce_back_read(n_b, n_a, index, LB_boundary_velocity, LB_boundary_force); 01245 } 01246 } 01247 01248 /**Bounce back boundary write kernel 01249 * @param n_a Pointer to local node residing in array a (Input) 01250 * @param n_b Pointer to local node residing in array b (Input) 01251 */ 01252 __global__ void bb_write(LB_nodes_gpu n_a, LB_nodes_gpu n_b){ 01253 01254 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01255 01256 if(index<para.number_of_nodes){ 01257 bounce_back_write(n_b, n_a, index); 01258 } 01259 } 01260 01261 /** get physical values of the nodes (density, velocity, ...) 01262 * @param n_a Pointer to local node residing in array a (Input) 01263 * @param *d_v Pointer to local device values (Input) 01264 */ 01265 __global__ void values(LB_nodes_gpu n_a, LB_values_gpu *d_v){ 01266 01267 float mode[19]; 01268 unsigned int singlenode = 0; 01269 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01270 01271 if(index<para.number_of_nodes){ 01272 calc_m_from_n(n_a, index, mode); 01273 calc_values(n_a, mode, d_v, index, singlenode); 01274 } 01275 } 01276 01277 /** get boundary flags 01278 * @param n_a Pointer to local node residing in array a (Input) 01279 * @param device_bound_array Pointer to local device values (Input) 01280 */ 01281 __global__ void lb_get_boundaries(LB_nodes_gpu n_a, unsigned int *device_bound_array){ 01282 01283 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01284 01285 if(index<para.number_of_nodes){ 01286 device_bound_array[index] = n_a.boundary[index]; 01287 } 01288 } 01289 01290 /**set extern force on single nodes kernel 01291 * @param n_extern_nodeforces number of nodes (Input) 01292 * @param *extern_nodeforces Pointer to extern node force array (Input) 01293 * @param node_f node force struct (Output) 01294 */ 01295 __global__ void init_extern_nodeforces(int n_extern_nodeforces, LB_extern_nodeforce_gpu *extern_nodeforces, LB_node_force_gpu node_f){ 01296 01297 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01298 01299 if(index<n_extern_nodeforces){ 01300 node_f.force[0*para.number_of_nodes + extern_nodeforces[index].index] = extern_nodeforces[index].force[0]*powf(para.agrid,4)*para.tau*para.tau; 01301 node_f.force[1*para.number_of_nodes + extern_nodeforces[index].index] = extern_nodeforces[index].force[1]*powf(para.agrid,4)*para.tau*para.tau; 01302 node_f.force[2*para.number_of_nodes + extern_nodeforces[index].index] = extern_nodeforces[index].force[2]*powf(para.agrid,4)*para.tau*para.tau; 01303 } 01304 } 01305 01306 /**print single node values kernel 01307 * @param single_nodeindex index of the node (Input) 01308 * @param *d_p_v Pointer to result storage array (Input) 01309 * @param n_a Pointer to local node residing in array a (Input) 01310 */ 01311 __global__ void lb_print_node(int single_nodeindex, LB_values_gpu *d_p_v, LB_nodes_gpu n_a){ 01312 01313 float mode[19]; 01314 unsigned int singlenode = 1; 01315 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01316 01317 if(index == 0){ 01318 calc_m_from_n(n_a, single_nodeindex, mode); 01319 calc_values(n_a, mode, d_p_v, single_nodeindex, singlenode); 01320 } 01321 } 01322 /**calculate mass of the hole fluid kernel 01323 * @param *sum Pointer to result storage value (Output) 01324 * @param n_a Pointer to local node residing in array a (Input) 01325 */ 01326 __global__ void calc_mass(LB_nodes_gpu n_a, float *sum) { 01327 float mode[1]; 01328 01329 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01330 01331 if(index<para.number_of_nodes){ 01332 calc_mode(mode, n_a, index); 01333 float Rho = mode[0] + para.rho*para.agrid*para.agrid*para.agrid; 01334 //if(n_a.boundary[index]){ 01335 //mode[0] = 0.f; 01336 //} 01337 atomicadd(&(sum[0]), Rho); 01338 } 01339 } 01340 /**calculate momentum of the hole fluid kernel 01341 * @param node_f node force struct (Input) 01342 * @param *sum Pointer to result storage value (Output) 01343 * @param n_a Pointer to local node residing in array a (Input) 01344 */ 01345 __global__ void momentum(LB_nodes_gpu n_a, float *sum, LB_node_force_gpu node_f) { 01346 float mode[4]; 01347 01348 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01349 01350 if(index<para.number_of_nodes){ 01351 calc_mode(mode, n_a, index); 01352 if(n_a.boundary[index]){ 01353 mode[1] = mode[2] = mode[3] = 0.f; 01354 } 01355 atomicadd(&(sum[0]), mode[1]+node_f.force[0*para.number_of_nodes + index]); 01356 atomicadd(&(sum[1]), mode[2]+node_f.force[1*para.number_of_nodes + index]); 01357 atomicadd(&(sum[2]), mode[3]+node_f.force[2*para.number_of_nodes + index]); 01358 } 01359 } 01360 01361 /**calculate temperature of the fluid kernel 01362 * @param *cpu_jsquared Pointer to result storage value (Output) 01363 * @param n_a Pointer to local node residing in array a (Input) 01364 */ 01365 __global__ void temperature(LB_nodes_gpu n_a, float *cpu_jsquared) { 01366 float mode[4]; 01367 float jsquared = 0.f; 01368 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01369 01370 if(index<para.number_of_nodes){ 01371 calc_mode(mode, n_a, index); 01372 if(n_a.boundary[index]){ 01373 jsquared = 0.f; 01374 } 01375 else{ 01376 jsquared = mode[1]*mode[1]+mode[2]*mode[2]+mode[3]*mode[3]; 01377 } 01378 atomicadd(cpu_jsquared, jsquared); 01379 } 01380 } 01381 /**print single node boundary flag 01382 * @param single_nodeindex index of the node (Input) 01383 * @param *device_flag Pointer to result storage array (Input) 01384 * @param n_a Pointer to local node residing in array a (Input) 01385 */ 01386 __global__ void lb_get_boundary_flag(int single_nodeindex, unsigned int *device_flag, LB_nodes_gpu n_a){ 01387 01388 unsigned int index = blockIdx.y * gridDim.x * blockDim.x + blockDim.x * blockIdx.x + threadIdx.x; 01389 01390 if(index == 0){ 01391 device_flag[0] = n_a.boundary[single_nodeindex]; 01392 } 01393 } 01394 /**erroroutput for memory allocation and memory copy 01395 * @param err cuda error code 01396 * @param *file .cu file were the error took place 01397 * @param line line of the file were the error took place 01398 */ 01399 void _cuda_safe_mem(cudaError_t err, char *file, unsigned int line){ 01400 if( cudaSuccess != err) { 01401 fprintf(stderr, "Cuda Memory error at %s:%u.\n", file, line); 01402 printf("CUDA error: %s\n", cudaGetErrorString(err)); 01403 exit(EXIT_FAILURE); 01404 } else { 01405 _err=cudaGetLastError(); \ 01406 if (_err != cudaSuccess) { 01407 fprintf(stderr, "Error found during memory operation. Possibly however from an failed operation before. %s:%u.\n", file, line); 01408 printf("CUDA error: %s\n", cudaGetErrorString(err)); 01409 exit(EXIT_FAILURE); 01410 } 01411 } 01412 } 01413 #define cuda_safe_mem(a) _cuda_safe_mem((a), __FILE__, __LINE__) 01414 01415 #define KERNELCALL(_f, _a, _b, _params) \ 01416 _f<<<_a, _b, 0, stream[0]>>>_params; \ 01417 _err=cudaGetLastError(); \ 01418 if (_err!=cudaSuccess){ \ 01419 printf("CUDA error: %s\n", cudaGetErrorString(_err)); \ 01420 fprintf(stderr, "error calling %s with dim %d %d %d #thpb %d in %s:%u\n", #_f, _a.x, _a.y, _a.z, _b, __FILE__, __LINE__); \ 01421 exit(EXIT_FAILURE); \ 01422 } 01423 /*********************************************************/ 01424 /** \name Host functions to setup and call kernels */ 01425 /*********************************************************/ 01426 /**********************************************************************/ 01427 /* Host funktions to setup and call kernels*/ 01428 /**********************************************************************/ 01429 01430 /**initialization for the lb gpu fluid called from host 01431 * @param *lbpar_gpu Pointer to parameters to setup the lb field 01432 */ 01433 void lb_init_GPU(LB_parameters_gpu *lbpar_gpu){ 01434 01435 if(initflag){ 01436 cudaFree(device_values); 01437 cudaFree(nodes_a.vd); 01438 cudaFree(nodes_b.vd); 01439 cudaFree(nodes_a.seed); 01440 cudaFree(nodes_b.seed); 01441 cudaFree(nodes_a.boundary); 01442 cudaFree(nodes_b.boundary); 01443 cudaFree(node_f.force); 01444 cudaFree(gpu_check); 01445 } 01446 /** Allocate structs in device memory*/ 01447 size_of_values = lbpar_gpu->number_of_nodes * sizeof(LB_values_gpu); 01448 size_of_forces = lbpar_gpu->number_of_particles * sizeof(LB_particle_force_gpu); 01449 size_of_positions = lbpar_gpu->number_of_particles * sizeof(LB_particle_gpu); 01450 size_of_seed = lbpar_gpu->number_of_particles * sizeof(LB_particle_seed_gpu); 01451 01452 cuda_safe_mem(cudaMalloc((void**)&device_values, size_of_values)); 01453 01454 01455 cuda_safe_mem(cudaMalloc((void**)&nodes_a.vd, lbpar_gpu->number_of_nodes * 19 * sizeof(float))); 01456 cuda_safe_mem(cudaMalloc((void**)&nodes_b.vd, lbpar_gpu->number_of_nodes * 19 * sizeof(float))); 01457 01458 cuda_safe_mem(cudaMalloc((void**)&nodes_a.seed, lbpar_gpu->number_of_nodes * sizeof(unsigned int))); 01459 cuda_safe_mem(cudaMalloc((void**)&nodes_a.boundary, lbpar_gpu->number_of_nodes * sizeof(unsigned int))); 01460 cuda_safe_mem(cudaMalloc((void**)&nodes_b.seed, lbpar_gpu->number_of_nodes * sizeof(unsigned int))); 01461 cuda_safe_mem(cudaMalloc((void**)&nodes_b.boundary, lbpar_gpu->number_of_nodes * sizeof(unsigned int))); 01462 01463 cuda_safe_mem(cudaMalloc((void**)&node_f.force, lbpar_gpu->number_of_nodes * 3 * sizeof(float))); 01464 //maybe coalesced alloc 01465 cuda_safe_mem(cudaMalloc((void**)&particle_force, size_of_forces)); 01466 cuda_safe_mem(cudaMalloc((void**)&particle_data, size_of_positions)); 01467 01468 cuda_safe_mem(cudaMalloc((void**)&part, size_of_seed)); 01469 01470 /**write parameters in const memory*/ 01471 cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 01472 /**check flag if lb gpu init works*/ 01473 cuda_safe_mem(cudaMalloc((void**)&gpu_check, sizeof(int))); 01474 initflag = 1; 01475 h_gpu_check = (int*)malloc(sizeof(int)); 01476 01477 /** values for the kernel call */ 01478 int threads_per_block = 64; 01479 int blocks_per_grid_y = 4; 01480 int blocks_per_grid_x = (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01481 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01482 01483 cudaStreamCreate(&stream[0]); 01484 /** values for the particle kernel */ 01485 int threads_per_block_particles = 64; 01486 int blocks_per_grid_particles_y = 4; 01487 int blocks_per_grid_particles_x = (lbpar_gpu->number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y); 01488 dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1); 01489 01490 KERNELCALL(reset_boundaries, dim_grid, threads_per_block, (nodes_a, nodes_b)); 01491 01492 /** calc of veloctiydensities from given parameters and initialize the Node_Force array with zero */ 01493 KERNELCALL(calc_n_equilibrium, dim_grid, threads_per_block, (nodes_a, gpu_check)); 01494 /** init part forces with zero*/ 01495 if(lbpar_gpu->number_of_particles) KERNELCALL(init_particle_force, dim_grid_particles, threads_per_block_particles, (particle_force, part)); 01496 KERNELCALL(reinit_node_force, dim_grid, threads_per_block, (node_f)); 01497 01498 intflag = 1; 01499 current_nodes = &nodes_a; 01500 h_gpu_check[0] = 0; 01501 cuda_safe_mem(cudaMemcpy(h_gpu_check, gpu_check, sizeof(int), cudaMemcpyDeviceToHost)); 01502 //fprintf(stderr, "initialization of lb gpu code %i\n", lbpar_gpu->number_of_nodes); 01503 cudaThreadSynchronize(); 01504 if(!h_gpu_check[0]){ 01505 fprintf(stderr, "initialization of lb gpu code failed! \n"); 01506 errexit(); 01507 } 01508 } 01509 /** reinitialization for the lb gpu fluid called from host 01510 * @param *lbpar_gpu Pointer to parameters to setup the lb field 01511 */ 01512 void lb_reinit_GPU(LB_parameters_gpu *lbpar_gpu){ 01513 01514 /**write parameters in const memory*/ 01515 cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 01516 01517 /** values for the kernel call */ 01518 int threads_per_block = 64; 01519 int blocks_per_grid_y = 4; 01520 int blocks_per_grid_x = (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01521 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01522 01523 /** calc of veloctiydensities from given parameters and initialize the Node_Force array with zero */ 01524 KERNELCALL(calc_n_equilibrium, dim_grid, threads_per_block, (nodes_a, gpu_check)); 01525 } 01526 01527 /**setup and call particle reallocation from the host 01528 * @param *lbpar_gpu Pointer to parameters to setup the lb field 01529 * @param **host_data Pointer to host information data 01530 */ 01531 void lb_realloc_particle_GPU(LB_parameters_gpu *lbpar_gpu, LB_particle_gpu **host_data){ 01532 01533 if(partinitflag){ 01534 cudaFreeHost(*host_data); 01535 cudaFree(particle_force); 01536 cudaFree(particle_data); 01537 cudaFree(part); 01538 } 01539 /** Allocate struct for particle positions */ 01540 size_of_forces = lbpar_gpu->number_of_particles * sizeof(LB_particle_force_gpu); 01541 size_of_positions = lbpar_gpu->number_of_particles * sizeof(LB_particle_gpu); 01542 size_of_seed = lbpar_gpu->number_of_particles * sizeof(LB_particle_seed_gpu); 01543 #if !defined __CUDA_ARCH__ || __CUDA_ARCH__ >= 200 01544 /**pinned memory mode - use special function to get OS-pinned memory*/ 01545 cudaHostAlloc((void**)host_data, size_of_positions, cudaHostAllocWriteCombined); 01546 #else 01547 cudaMallocHost((void**)host_data, size_of_positions); 01548 #endif 01549 01550 cuda_safe_mem(cudaMalloc((void**)&particle_force, size_of_forces)); 01551 cuda_safe_mem(cudaMalloc((void**)&particle_data, size_of_positions)); 01552 cuda_safe_mem(cudaMalloc((void**)&part, size_of_seed)); 01553 //copy parameters, especially number of parts to gpu mem 01554 cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 01555 //mem init of particles ok 01556 partinitflag = 1; 01557 /** values for the particle kernel */ 01558 int threads_per_block_particles = 64; 01559 int blocks_per_grid_particles_y = 4; 01560 int blocks_per_grid_particles_x = (lbpar_gpu->number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y); 01561 dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1); 01562 01563 if(lbpar_gpu->number_of_particles) KERNELCALL(init_particle_force, dim_grid_particles, threads_per_block_particles, (particle_force, part)); 01564 } 01565 #ifdef LB_BOUNDARIES_GPU 01566 /** setup and call boundaries from the host 01567 * @param host_n_lb_boundaries number of LB boundaries 01568 * @param number_of_boundnodes number of boundnodes 01569 * @param host_boundary_node_list The indices of the boundary nodes 01570 * @param host_boundary_index_list The flag representing the corresponding boundary 01571 * @param host_LB_Boundary_velocity The constant velocity at the boundary, set by the user (Input) 01572 */ 01573 void lb_init_boundaries_GPU(int host_n_lb_boundaries, int number_of_boundnodes, int *host_boundary_node_list, int* host_boundary_index_list, float* host_LB_Boundary_velocity){ 01574 int temp = host_n_lb_boundaries; 01575 01576 size_of_boundindex = number_of_boundnodes*sizeof(int); 01577 cuda_safe_mem(cudaMalloc((void**)&boundary_node_list, size_of_boundindex)); 01578 cuda_safe_mem(cudaMalloc((void**)&boundary_index_list, size_of_boundindex)); 01579 cuda_safe_mem(cudaMemcpy(boundary_index_list, host_boundary_index_list, size_of_boundindex, cudaMemcpyHostToDevice)); 01580 cuda_safe_mem(cudaMemcpy(boundary_node_list, host_boundary_node_list, size_of_boundindex, cudaMemcpyHostToDevice)); 01581 01582 cuda_safe_mem(cudaMalloc((void**)&LB_boundary_force , 3*host_n_lb_boundaries*sizeof(float))); 01583 cuda_safe_mem(cudaMalloc((void**)&LB_boundary_velocity, 3*host_n_lb_boundaries*sizeof(float))); 01584 cuda_safe_mem(cudaMemcpy(LB_boundary_velocity, host_LB_Boundary_velocity, 3*n_lb_boundaries*sizeof(float), cudaMemcpyHostToDevice)); 01585 cuda_safe_mem(cudaMemcpyToSymbol(n_lb_boundaries_gpu, &temp, sizeof(int))); 01586 01587 01588 /** values for the kernel call */ 01589 int threads_per_block = 64; 01590 int blocks_per_grid_y = 4; 01591 int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01592 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01593 01594 KERNELCALL(reset_boundaries, dim_grid, threads_per_block, (nodes_a, nodes_b)); 01595 01596 if (n_lb_boundaries == 0) { 01597 cudaThreadSynchronize(); 01598 return; 01599 } 01600 if(number_of_boundnodes == 0){ 01601 fprintf(stderr, "WARNING: boundary cmd executed but no boundary node found!\n"); 01602 } else{ 01603 int threads_per_block_bound = 64; 01604 int blocks_per_grid_bound_y = 4; 01605 int blocks_per_grid_bound_x = (number_of_boundnodes + threads_per_block_bound * blocks_per_grid_bound_y - 1) /(threads_per_block_bound * blocks_per_grid_bound_y); 01606 dim3 dim_grid_bound = make_uint3(blocks_per_grid_bound_x, blocks_per_grid_bound_y, 1); 01607 01608 KERNELCALL(init_boundaries, dim_grid_bound, threads_per_block_bound, (boundary_node_list, boundary_index_list, number_of_boundnodes, nodes_a, nodes_b)); 01609 } 01610 01611 cudaThreadSynchronize(); 01612 } 01613 #endif 01614 /**setup and call extern single node force initialization from the host 01615 * @param *lbpar_gpu Pointer to host parameter struct 01616 */ 01617 void lb_reinit_extern_nodeforce_GPU(LB_parameters_gpu *lbpar_gpu){ 01618 01619 cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 01620 01621 /** values for the kernel call */ 01622 int threads_per_block = 64; 01623 int blocks_per_grid_y = 4; 01624 int blocks_per_grid_x = (lbpar_gpu->number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01625 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01626 01627 KERNELCALL(reinit_node_force, dim_grid, threads_per_block, (node_f)); 01628 01629 } 01630 /**setup and call extern single node force initialization from the host 01631 * @param n_extern_nodeforces number of nodes on which the external force has to be applied 01632 * @param *host_extern_nodeforces Pointer to the host extern node forces 01633 * @param *lbpar_gpu Pointer to host parameter struct 01634 */ 01635 void lb_init_extern_nodeforces_GPU(int n_extern_nodeforces, LB_extern_nodeforce_gpu *host_extern_nodeforces, LB_parameters_gpu *lbpar_gpu){ 01636 01637 size_of_extern_nodeforces = n_extern_nodeforces*sizeof(LB_extern_nodeforce_gpu); 01638 cuda_safe_mem(cudaMalloc((void**)&extern_nodeforces, size_of_extern_nodeforces)); 01639 cudaMemcpy(extern_nodeforces, host_extern_nodeforces, size_of_extern_nodeforces, cudaMemcpyHostToDevice); 01640 01641 if(lbpar_gpu->external_force == 0)cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 01642 01643 int threads_per_block_exf = 64; 01644 int blocks_per_grid_exf_y = 4; 01645 int blocks_per_grid_exf_x = (n_extern_nodeforces + threads_per_block_exf * blocks_per_grid_exf_y - 1) /(threads_per_block_exf * blocks_per_grid_exf_y); 01646 dim3 dim_grid_exf = make_uint3(blocks_per_grid_exf_x, blocks_per_grid_exf_y, 1); 01647 01648 KERNELCALL(init_extern_nodeforces, dim_grid_exf, threads_per_block_exf, (n_extern_nodeforces, extern_nodeforces, node_f)); 01649 cudaFree(extern_nodeforces); 01650 } 01651 01652 /**setup and call particle kernel from the host 01653 * @param **host_data Pointer to the host particle positions and velocities 01654 */ 01655 void lb_particle_GPU(LB_particle_gpu *host_data){ 01656 01657 /** get espresso md particle values*/ 01658 cudaMemcpyAsync(particle_data, host_data, size_of_positions, cudaMemcpyHostToDevice, stream[0]); 01659 /** call of the particle kernel */ 01660 /** values for the particle kernel */ 01661 int threads_per_block_particles = 64; 01662 int blocks_per_grid_particles_y = 4; 01663 int blocks_per_grid_particles_x = (lbpar_gpu.number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y); 01664 dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1); 01665 01666 KERNELCALL(calc_fluid_particle_ia, dim_grid_particles, threads_per_block_particles, (*current_nodes, particle_data, particle_force, node_f, part)); 01667 } 01668 /** setup and call kernel to copy particle forces to host 01669 * @param *host_forces contains the particle force computed on the GPU 01670 */ 01671 void lb_copy_forces_GPU(LB_particle_force_gpu *host_forces){ 01672 01673 /** Copy result from device memory to host memory*/ 01674 cudaMemcpy(host_forces, particle_force, size_of_forces, cudaMemcpyDeviceToHost); 01675 01676 /** values for the particle kernel */ 01677 int threads_per_block_particles = 64; 01678 int blocks_per_grid_particles_y = 4; 01679 int blocks_per_grid_particles_x = (lbpar_gpu.number_of_particles + threads_per_block_particles * blocks_per_grid_particles_y - 1)/(threads_per_block_particles * blocks_per_grid_particles_y); 01680 dim3 dim_grid_particles = make_uint3(blocks_per_grid_particles_x, blocks_per_grid_particles_y, 1); 01681 01682 /** reset part forces with zero*/ 01683 KERNELCALL(reset_particle_force, dim_grid_particles, threads_per_block_particles, (particle_force)); 01684 01685 cudaThreadSynchronize(); 01686 } 01687 01688 /** setup and call kernel for getting macroscopic fluid values of all nodes 01689 * @param *host_values struct to save the gpu values 01690 */ 01691 void lb_get_values_GPU(LB_values_gpu *host_values){ 01692 01693 /** values for the kernel call */ 01694 int threads_per_block = 64; 01695 int blocks_per_grid_y = 4; 01696 int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01697 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01698 01699 KERNELCALL(values, dim_grid, threads_per_block, (*current_nodes, device_values)); 01700 cudaMemcpy(host_values, device_values, size_of_values, cudaMemcpyDeviceToHost); 01701 01702 } 01703 01704 /** get all the boundary flags for all nodes 01705 * @param host_bound_array here go the values of the boundary flag 01706 */ 01707 void lb_get_boundary_flags_GPU(unsigned int* host_bound_array){ 01708 01709 unsigned int* device_bound_array; 01710 cuda_safe_mem(cudaMalloc((void**)&device_bound_array, lbpar_gpu.number_of_nodes*sizeof(unsigned int))); 01711 /** values for the kernel call */ 01712 int threads_per_block = 64; 01713 int blocks_per_grid_y = 4; 01714 int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) / (threads_per_block * blocks_per_grid_y); 01715 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01716 01717 KERNELCALL(lb_get_boundaries, dim_grid, threads_per_block, (*current_nodes, device_bound_array)); 01718 01719 cudaMemcpy(host_bound_array, device_bound_array, lbpar_gpu.number_of_nodes*sizeof(unsigned int), cudaMemcpyDeviceToHost); 01720 01721 cudaFree(device_bound_array); 01722 01723 } 01724 01725 /** setup and call kernel for getting macroscopic fluid values of a single node*/ 01726 void lb_print_node_GPU(int single_nodeindex, LB_values_gpu *host_print_values){ 01727 01728 LB_values_gpu *device_print_values; 01729 cuda_safe_mem(cudaMalloc((void**)&device_print_values, sizeof(LB_values_gpu))); 01730 int threads_per_block_print = 1; 01731 int blocks_per_grid_print_y = 1; 01732 int blocks_per_grid_print_x = 1; 01733 dim3 dim_grid_print = make_uint3(blocks_per_grid_print_x, blocks_per_grid_print_y, 1); 01734 01735 KERNELCALL(lb_print_node, dim_grid_print, threads_per_block_print, (single_nodeindex, device_print_values, *current_nodes)); 01736 01737 cudaMemcpy(host_print_values, device_print_values, sizeof(LB_values_gpu), cudaMemcpyDeviceToHost); 01738 cudaFree(device_print_values); 01739 01740 } 01741 /** setup and call kernel to calculate the total momentum of the hole fluid 01742 * @param *mass value of the mass calcutated on the GPU 01743 */ 01744 void lb_calc_fluid_mass_GPU(double* mass){ 01745 01746 float* tot_mass; 01747 float cpu_mass = 0.f ; 01748 cuda_safe_mem(cudaMalloc((void**)&tot_mass, sizeof(float))); 01749 cudaMemcpy(tot_mass, &cpu_mass, sizeof(float), cudaMemcpyHostToDevice); 01750 01751 /** values for the kernel call */ 01752 int threads_per_block = 64; 01753 int blocks_per_grid_y = 4; 01754 int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01755 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01756 01757 KERNELCALL(calc_mass, dim_grid, threads_per_block,(*current_nodes, tot_mass)); 01758 01759 cudaMemcpy(&cpu_mass, tot_mass, sizeof(float), cudaMemcpyDeviceToHost); 01760 01761 cudaFree(tot_mass); 01762 mass[0] = (double)(cpu_mass); 01763 } 01764 01765 /** setup and call kernel to calculate the total momentum of the hole fluid 01766 * @param host_mom value of the momentum calcutated on the GPU 01767 */ 01768 void lb_calc_fluid_momentum_GPU(double* host_mom){ 01769 01770 float* tot_momentum; 01771 float host_momentum[3] = { 0.f, 0.f, 0.f}; 01772 cuda_safe_mem(cudaMalloc((void**)&tot_momentum, 3*sizeof(float))); 01773 cudaMemcpy(tot_momentum, host_momentum, 3*sizeof(float), cudaMemcpyHostToDevice); 01774 01775 /** values for the kernel call */ 01776 int threads_per_block = 64; 01777 int blocks_per_grid_y = 4; 01778 int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01779 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01780 01781 KERNELCALL(momentum, dim_grid, threads_per_block,(*current_nodes, tot_momentum, node_f)); 01782 01783 cudaMemcpy(host_momentum, tot_momentum, 3*sizeof(float), cudaMemcpyDeviceToHost); 01784 01785 cudaFree(tot_momentum); 01786 host_mom[0] = (double)(host_momentum[0]* lbpar_gpu.agrid/lbpar_gpu.tau); 01787 host_mom[1] = (double)(host_momentum[1]* lbpar_gpu.agrid/lbpar_gpu.tau); 01788 host_mom[2] = (double)(host_momentum[2]* lbpar_gpu.agrid/lbpar_gpu.tau); 01789 } 01790 /** setup and call kernel to calculate the temperature of the hole fluid 01791 * @param host_temp value of the temperatur calcutated on the GPU 01792 */ 01793 void lb_calc_fluid_temperature_GPU(double* host_temp){ 01794 float host_jsquared = 0.f; 01795 float* device_jsquared; 01796 cuda_safe_mem(cudaMalloc((void**)&device_jsquared, sizeof(float))); 01797 cudaMemcpy(device_jsquared, &host_jsquared, sizeof(float), cudaMemcpyHostToDevice); 01798 01799 /** values for the kernel call */ 01800 int threads_per_block = 64; 01801 int blocks_per_grid_y = 4; 01802 int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01803 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01804 01805 KERNELCALL(temperature, dim_grid, threads_per_block,(*current_nodes, device_jsquared)); 01806 01807 cudaMemcpy(&host_jsquared, device_jsquared, sizeof(float), cudaMemcpyDeviceToHost); 01808 01809 host_temp[0] = (double)(host_jsquared*1./(3.f*lbpar_gpu.rho*lbpar_gpu.dim_x*lbpar_gpu.dim_y*lbpar_gpu.dim_z*lbpar_gpu.tau*lbpar_gpu.tau*lbpar_gpu.agrid)); 01810 } 01811 /** setup and call kernel for getting macroscopic fluid values of all nodes 01812 * @param *host_values struct to save the gpu values 01813 */ 01814 void lb_save_checkpoint_GPU(float *host_checkpoint_vd, unsigned int *host_checkpoint_seed, unsigned int *host_checkpoint_boundary, float *host_checkpoint_force){ 01815 01816 cudaMemcpy(host_checkpoint_vd, current_nodes->vd, lbpar_gpu.number_of_nodes * 19 * sizeof(float), cudaMemcpyDeviceToHost); 01817 cudaMemcpy(host_checkpoint_seed, current_nodes->seed, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyDeviceToHost); 01818 cudaMemcpy(host_checkpoint_boundary, current_nodes->boundary, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyDeviceToHost); 01819 cudaMemcpy(host_checkpoint_force, node_f.force, lbpar_gpu.number_of_nodes * 3 * sizeof(float), cudaMemcpyDeviceToHost); 01820 01821 } 01822 /** setup and call kernel for setting macroscopic fluid values of all nodes 01823 * @param *host_values struct to set stored values 01824 */ 01825 void lb_load_checkpoint_GPU(float *host_checkpoint_vd, unsigned int *host_checkpoint_seed, unsigned int *host_checkpoint_boundary, float *host_checkpoint_force){ 01826 01827 cudaMemcpy(current_nodes->vd, host_checkpoint_vd, lbpar_gpu.number_of_nodes * 19 * sizeof(float), cudaMemcpyHostToDevice); 01828 intflag = 1; 01829 cudaMemcpy(current_nodes->seed, host_checkpoint_seed, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice); 01830 cudaMemcpy(current_nodes->boundary, host_checkpoint_boundary, lbpar_gpu.number_of_nodes * sizeof(unsigned int), cudaMemcpyHostToDevice); 01831 cudaMemcpy(node_f.force, host_checkpoint_force, lbpar_gpu.number_of_nodes * 3 * sizeof(float), cudaMemcpyHostToDevice); 01832 01833 } 01834 01835 /** setup and call kernel to get the boundary flag of a single node 01836 * @param single_nodeindex number of the node to get the flag for 01837 * @param host_flag her goes the value of the boundary flag 01838 */ 01839 void lb_get_boundary_flag_GPU(int single_nodeindex, unsigned int* host_flag){ 01840 01841 unsigned int* device_flag; 01842 cuda_safe_mem(cudaMalloc((void**)&device_flag, sizeof(unsigned int))); 01843 int threads_per_block_flag = 1; 01844 int blocks_per_grid_flag_y = 1; 01845 int blocks_per_grid_flag_x = 1; 01846 dim3 dim_grid_flag = make_uint3(blocks_per_grid_flag_x, blocks_per_grid_flag_y, 1); 01847 01848 KERNELCALL(lb_get_boundary_flag, dim_grid_flag, threads_per_block_flag, (single_nodeindex, device_flag, *current_nodes)); 01849 01850 cudaMemcpy(host_flag, device_flag, sizeof(unsigned int), cudaMemcpyDeviceToHost); 01851 01852 cudaFree(device_flag); 01853 01854 } 01855 /** set the net velocity at a single node 01856 * @param single_nodeindex the node to set the velocity for 01857 * @param host_velocity the velocity to set 01858 */ 01859 void lb_set_node_velocity_GPU(int single_nodeindex, float* host_velocity){ 01860 01861 float* device_velocity; 01862 cuda_safe_mem(cudaMalloc((void**)&device_velocity, 3*sizeof(float))); 01863 cudaMemcpy(device_velocity, host_velocity, 3*sizeof(float), cudaMemcpyHostToDevice); 01864 int threads_per_block_flag = 1; 01865 int blocks_per_grid_flag_y = 1; 01866 int blocks_per_grid_flag_x = 1; 01867 dim3 dim_grid_flag = make_uint3(blocks_per_grid_flag_x, blocks_per_grid_flag_y, 1); 01868 01869 KERNELCALL(set_u_equilibrium, dim_grid_flag, threads_per_block_flag, (*current_nodes, single_nodeindex, device_velocity)); 01870 01871 cudaFree(device_velocity); 01872 01873 } 01874 /** reinit of params 01875 * @param *lbpar_gpu struct containing the paramters of the fluid 01876 */ 01877 void reinit_parameters_GPU(LB_parameters_gpu *lbpar_gpu){ 01878 01879 /**write parameters in const memory*/ 01880 cuda_safe_mem(cudaMemcpyToSymbol(para, lbpar_gpu, sizeof(LB_parameters_gpu))); 01881 } 01882 /**integration kernel for the lb gpu fluid update called from host */ 01883 void lb_integrate_GPU(){ 01884 01885 /** values for the kernel call */ 01886 int threads_per_block = 64; 01887 int blocks_per_grid_y = 4; 01888 int blocks_per_grid_x = (lbpar_gpu.number_of_nodes + threads_per_block * blocks_per_grid_y - 1) /(threads_per_block * blocks_per_grid_y); 01889 dim3 dim_grid = make_uint3(blocks_per_grid_x, blocks_per_grid_y, 1); 01890 01891 #ifdef LB_BOUNDARIES_GPU 01892 if (n_lb_boundaries > 0) 01893 cuda_safe_mem(cudaMemset ( LB_boundary_force, 0, 3*n_lb_boundaries*sizeof(float))); 01894 #endif 01895 01896 /**call of fluid step*/ 01897 if (intflag == 1){ 01898 KERNELCALL(integrate, dim_grid, threads_per_block, (nodes_a, nodes_b, device_values, node_f)); 01899 current_nodes = &nodes_b; 01900 #ifdef LB_BOUNDARIES_GPU 01901 01902 if (n_lb_boundaries > 0) { 01903 KERNELCALL(bb_read, dim_grid, threads_per_block, (nodes_a, nodes_b, LB_boundary_velocity, LB_boundary_force)); 01904 // KERNELCALL(bb_write, dim_grid, threads_per_block, (nodes_a, nodes_b)); 01905 } 01906 #endif 01907 intflag = 0; 01908 } 01909 else{ 01910 KERNELCALL(integrate, dim_grid, threads_per_block, (nodes_b, nodes_a, device_values, node_f)); 01911 current_nodes = &nodes_a; 01912 #ifdef LB_BOUNDARIES_GPU 01913 01914 if (n_lb_boundaries > 0) { 01915 KERNELCALL(bb_read, dim_grid, threads_per_block, (nodes_b, nodes_a, LB_boundary_velocity, LB_boundary_force)); 01916 // KERNELCALL(bb_write, dim_grid, threads_per_block, (nodes_b, nodes_a)); 01917 } 01918 #endif 01919 intflag = 1; 01920 } 01921 } 01922 01923 void lb_gpu_get_boundary_forces(double* forces) { 01924 #ifdef LB_BOUNDARIES_GPU 01925 float* temp = (float*) malloc(3*n_lb_boundaries*sizeof(float)); 01926 cuda_safe_mem(cudaMemcpy(temp, LB_boundary_force, 3*n_lb_boundaries*sizeof(float), cudaMemcpyDeviceToHost)); 01927 for (int i =0; i<3*n_lb_boundaries; i++) { 01928 forces[i]=(double)temp[i]; 01929 } 01930 free(temp); 01931 #endif 01932 } 01933 01934 /** free gpu memory kernel called from the host (not used anymore) */ 01935 void lb_free_GPU(){ 01936 // Free device memory 01937 cudaFree(device_values); 01938 cudaFree(¶); 01939 cudaFree(&nodes_a); 01940 cudaFree(&nodes_b); 01941 cudaFree(particle_force); 01942 cudaFree(particle_data); 01943 cudaFree(&node_f); 01944 cudaFree(part); 01945 cudaStreamDestroy(stream[0]); 01946 } 01947 #endif /* LB_GPU */
1.7.5.1