ESPResSo
Extensible Simulation Package for Research on Soft Matter Systems
p3m_gpu_cuda.cu
/*
 * Copyright (C) 2010-2022 The ESPResSo project
 *
 * This file is part of ESPResSo.
 *
 * ESPResSo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ESPResSo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * @file
 *
 * P3M electrostatics on GPU.
 *
 * The corresponding header file is @ref p3m_gpu_cuda.cuh.
 */
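/*
 * Overview of the far-field computation implemented below
 * (see p3m_gpu_add_farfield_force()):
 *   1. interpolate the particle charges onto the mesh (assign_charge_kernel),
 *   2. forward real-to-complex FFT of the charge mesh,
 *   3. multiply by the optimal influence function G_hat,
 *   4. apply the ik-differentiation operator to obtain the field components,
 *   5. backward complex-to-real FFTs of the three field meshes,
 *   6. interpolate the mesh forces back onto the particles
 *      (assign_forces_kernel).
 */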

#include "config/config.hpp"

#ifdef ELECTROSTATICS

#define P3M_GPU_FLOAT
// #define P3M_GPU_REAL_DOUBLE

#ifdef P3M_GPU_FLOAT
#define REAL_TYPE float
#define FFT_TYPE_COMPLEX cufftComplex
#define FFT_FORW_FFT cufftExecR2C
#define FFT_BACK_FFT cufftExecC2R
#define FFT_PLAN_FORW_FLAG CUFFT_R2C
#define FFT_PLAN_BACK_FLAG CUFFT_C2R
#endif

#ifdef P3M_GPU_REAL_DOUBLE
#define REAL_TYPE double
#define FFT_TYPE_COMPLEX cufftDoubleComplex
#define FFT_FORW_FFT cufftExecD2Z
#define FFT_BACK_FFT cufftExecZ2D
#define FFT_PLAN_FORW_FLAG CUFFT_D2Z
#define FFT_PLAN_BACK_FLAG CUFFT_Z2D
#endif

#include "p3m_gpu_cuda.cuh"

#include "cuda/utils.cuh"
#include "system/System.hpp"

#include <utils/math/bspline.hpp>
#include <utils/math/int_pow.hpp>
#include <utils/math/sinc.hpp>
#include <utils/math/sqr.hpp>

#include <cuda.h>
#include <cufft.h>

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <stdexcept>

#if defined(OMPI_MPI_H) || defined(_MPI_H)
#error CU-file includes mpi.h! This should not happen!
#endif

using Utils::int_pow;
using Utils::sqr;

struct P3MGpuData {
  /** Charge mesh */
  FFT_TYPE_COMPLEX *charge_mesh;
  /** Force meshes */
  FFT_TYPE_COMPLEX *force_mesh_x;
  FFT_TYPE_COMPLEX *force_mesh_y;
  FFT_TYPE_COMPLEX *force_mesh_z;
  /** Influence Function */
  REAL_TYPE *G_hat;
  /** Charge assignment order */
  int cao;
  /** Total number of mesh points (including padding) */
  int mesh_size;
  /** Ewald parameter */
  REAL_TYPE alpha;
  /** Number of particles */
  unsigned int n_part;
  /** Box size */
  REAL_TYPE box[3];
  /** Mesh dimensions */
  int mesh[3];
  /** Padded size */
  int mesh_z_padded;
  /** Inverse mesh spacing */
  REAL_TYPE hi[3];
  /** Position shift */
  REAL_TYPE pos_shift;
};

struct P3MGpuFftPlan {
  /** Forward FFT plan */
  cufftHandle forw_plan;
  /** Backward FFT plan */
  cufftHandle back_plan;
};

struct P3MGpuParams {
  P3MGpuData p3m_gpu_data;
  P3MGpuFftPlan p3m_fft;
  bool is_initialized = false;

  ~P3MGpuParams() { free_device_memory(); }

  void free_device_memory() {
    auto const free_device_pointer = [](auto *&ptr) {
      if (ptr != nullptr) {
        cuda_safe_mem(cudaFree(reinterpret_cast<void *>(ptr)));
        ptr = nullptr;
      }
    };
    free_device_pointer(p3m_gpu_data.charge_mesh);
    free_device_pointer(p3m_gpu_data.force_mesh_x);
    free_device_pointer(p3m_gpu_data.force_mesh_y);
    free_device_pointer(p3m_gpu_data.force_mesh_z);
    free_device_pointer(p3m_gpu_data.G_hat);
    cufftDestroy(p3m_fft.forw_plan);
    cufftDestroy(p3m_fft.back_plan);
    is_initialized = false;
  }
};

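/** @brief Aliasing sums needed for the optimal influence function.
 *
 *  The sums run over +/- @ref P3M_BRILLOUIN Brillouin zones in each
 *  direction. The German identifiers stem from the original implementation:
 *  @p Zaehler ("numerator") accumulates the vector-valued numerator and
 *  @p Nenner ("denominator") the scalar denominator used in
 *  @ref calculate_influence_function_device.
 */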
template <int cao>
__device__ static void Aliasing_sums_ik(const P3MGpuData p, int NX, int NY,
                                        int NZ, REAL_TYPE *Zaehler,
                                        REAL_TYPE *Nenner) {
  REAL_TYPE S1, S2, S3;
  REAL_TYPE zwi;
  int MX, MY, MZ;
  REAL_TYPE NMX, NMY, NMZ;
  REAL_TYPE NM2;
  REAL_TYPE TE;
  REAL_TYPE Leni[3];
  REAL_TYPE Meshi[3];
  for (int i = 0; i < 3; ++i) {
    Leni[i] = 1.0f / p.box[i];
    Meshi[i] = 1.0f / static_cast<REAL_TYPE>(p.mesh[i]);
  }

  Zaehler[0] = Zaehler[1] = Zaehler[2] = *Nenner = 0.0;

  for (MX = -P3M_BRILLOUIN; MX <= P3M_BRILLOUIN; MX++) {
    NMX = static_cast<REAL_TYPE>(((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX) +
                                 p.mesh[0] * MX);
    S1 = int_pow<2 * cao>(Utils::sinc(Meshi[0] * NMX));
    for (MY = -P3M_BRILLOUIN; MY <= P3M_BRILLOUIN; MY++) {
      NMY = static_cast<REAL_TYPE>(
          ((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY) + p.mesh[1] * MY);
      S2 = S1 * int_pow<2 * cao>(Utils::sinc(Meshi[1] * NMY));
      for (MZ = -P3M_BRILLOUIN; MZ <= P3M_BRILLOUIN; MZ++) {
        NMZ = static_cast<REAL_TYPE>(
            ((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ) + p.mesh[2] * MZ);
        S3 = S2 * int_pow<2 * cao>(Utils::sinc(Meshi[2] * NMZ));

        NM2 = sqr(NMX * Leni[0]) + sqr(NMY * Leni[1]) + sqr(NMZ * Leni[2]);
        *Nenner += S3;

        TE = exp(-sqr(Utils::pi<REAL_TYPE>() / (p.alpha)) * NM2);
        zwi = S3 * TE / NM2;
        Zaehler[0] += NMX * zwi * Leni[0];
        Zaehler[1] += NMY * zwi * Leni[1];
        Zaehler[2] += NMZ * zwi * Leni[2];
      }
    }
  }
}

/* Calculate influence function */
template <int cao>
__global__ void calculate_influence_function_device(const P3MGpuData p) {

  const auto NX = static_cast<int>(blockDim.x * blockIdx.x + threadIdx.x);
  const auto NY = static_cast<int>(blockDim.y * blockIdx.y + threadIdx.y);
  const auto NZ = static_cast<int>(blockDim.z * blockIdx.z + threadIdx.z);
  REAL_TYPE Dnx, Dny, Dnz;
  REAL_TYPE Zaehler[3] = {0.0, 0.0, 0.0}, Nenner = 0.0;
  REAL_TYPE zwi;
  auto index = 0;
  REAL_TYPE Leni[3];
  for (int i = 0; i < 3; ++i) {
    Leni[i] = 1.0f / p.box[i];
  }

  if ((NX >= p.mesh[0]) || (NY >= p.mesh[1]) || (NZ >= (p.mesh[2] / 2 + 1)))
    return;

  index = NX * p.mesh[1] * (p.mesh[2] / 2 + 1) + NY * (p.mesh[2] / 2 + 1) + NZ;

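  /* Exclude the k = 0 mode and the modes whose indices all sit on multiples
     of the Nyquist index (mesh/2): the ik scheme yields no well-defined
     contribution there, so G_hat is simply set to zero. */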
  if (((NX == 0) && (NY == 0) && (NZ == 0)) ||
      ((NX % (p.mesh[0] / 2) == 0) && (NY % (p.mesh[1] / 2) == 0) &&
       (NZ % (p.mesh[2] / 2) == 0))) {
    p.G_hat[index] = 0;
  } else {
    Aliasing_sums_ik<cao>(p, NX, NY, NZ, Zaehler, &Nenner);

    Dnx = static_cast<REAL_TYPE>((NX > p.mesh[0] / 2) ? NX - p.mesh[0] : NX);
    Dny = static_cast<REAL_TYPE>((NY > p.mesh[1] / 2) ? NY - p.mesh[1] : NY);
    Dnz = static_cast<REAL_TYPE>((NZ > p.mesh[2] / 2) ? NZ - p.mesh[2] : NZ);

    zwi = Dnx * Zaehler[0] * Leni[0] + Dny * Zaehler[1] * Leni[1] +
          Dnz * Zaehler[2] * Leni[2];
    zwi /= ((sqr(Dnx * Leni[0]) + sqr(Dny * Leni[1]) + sqr(Dnz * Leni[2])) *
            sqr(Nenner));
    p.G_hat[index] = 2 * zwi / Utils::pi<REAL_TYPE>();
  }
}

#ifdef P3M_GPU_REAL_DOUBLE
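/* Software fallback for double-precision atomicAdd() based on the
 * atomicCAS() loop from the CUDA C programming guide, for devices
 * without native double-precision atomics. */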
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

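/* Index helpers for the two mesh layouts: the real-space mesh is padded in z
 * to mesh_z_padded = 2 * (mesh[2] / 2 + 1) entries so the in-place R2C/C2R
 * transforms fit, while the reciprocal (complex) mesh has mesh[2] / 2 + 1
 * entries in z. */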
namespace {
__device__ inline auto linear_index_r(P3MGpuData const &p, int i, int j,
                                      int k) {
  return static_cast<unsigned int>(p.mesh[1] * p.mesh_z_padded * i +
                                   p.mesh_z_padded * j + k);
}

__device__ inline auto linear_index_k(P3MGpuData const &p, int i, int j,
                                      int k) {
  return static_cast<unsigned int>(p.mesh[1] * (p.mesh[2] / 2 + 1) * i +
                                   (p.mesh[2] / 2 + 1) * j + k);
}
} // namespace

__global__ void apply_diff_op(const P3MGpuData p) {
  auto const linear_index = linear_index_k(p, static_cast<int>(blockIdx.x),
                                           static_cast<int>(blockIdx.y),
                                           static_cast<int>(threadIdx.x));

  auto const bidx = static_cast<int>(blockIdx.x);
  auto const bidy = static_cast<int>(blockIdx.y);
  auto const nx = (bidx > p.mesh[0] / 2) ? bidx - p.mesh[0] : bidx;
  auto const ny = (bidy > p.mesh[1] / 2) ? bidy - p.mesh[1] : bidy;
  auto const nz = static_cast<int>(threadIdx.x);

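  /* Multiply the transformed charge density by 2 * pi * i, i.e.
     (re, im) -> 2 * pi * (-im, re); the per-component factors n / L below
     complete the ik-differentiation in Fourier space. */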
  const FFT_TYPE_COMPLEX meshw = p.charge_mesh[linear_index];
  FFT_TYPE_COMPLEX buf;
  buf.x = -2.0f * Utils::pi<float>() * meshw.y;
  buf.y = 2.0f * Utils::pi<float>() * meshw.x;

  p.force_mesh_x[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.x / p.box[0];
  p.force_mesh_x[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nx) * buf.y / p.box[0];

  p.force_mesh_y[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.x / p.box[1];
  p.force_mesh_y[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(ny) * buf.y / p.box[1];

  p.force_mesh_z[linear_index].x =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.x / p.box[2];
  p.force_mesh_z[linear_index].y =
      static_cast<decltype(FFT_TYPE_COMPLEX::x)>(nz) * buf.y / p.box[2];
}

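/* Fold a mesh index back into [0, mesh); the callers only ever run at most
   one mesh length out of bounds (nearest mesh point plus a cao offset). */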
__device__ inline int wrap_index(const int ind, const int mesh) {
  if (ind < 0)
    return ind + mesh;
  if (ind >= mesh)
    return ind - mesh;
  return ind;
}

__global__ void apply_influence_function(const P3MGpuData p) {
  auto const linear_index = linear_index_k(p, static_cast<int>(blockIdx.x),
                                           static_cast<int>(blockIdx.y),
                                           static_cast<int>(threadIdx.x));

  p.charge_mesh[linear_index].x *= p.G_hat[linear_index];
  p.charge_mesh[linear_index].y *= p.G_hat[linear_index];
}

template <int cao, bool shared>
__global__ void assign_charge_kernel(P3MGpuData const params,
                                     float const *const __restrict__ part_pos,
                                     float const *const __restrict__ part_q,
                                     unsigned int const parts_per_block) {
  auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
  auto const cao_id_x =
      threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
  /* id of the particle */
  auto const id =
      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
  if (id >= params.n_part)
    return;
  /* position relative to the closest grid point */
  REAL_TYPE m_pos[3];
  /* index of the nearest mesh point */
  int nmp_x, nmp_y, nmp_z;

  auto *charge_mesh = (REAL_TYPE *)params.charge_mesh;

  m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift;
  m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift;
  m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift;

  nmp_x = static_cast<int>(floorf(m_pos[0] + 0.5f));
  nmp_y = static_cast<int>(floorf(m_pos[1] + 0.5f));
  nmp_z = static_cast<int>(floorf(m_pos[2] + 0.5f));

  m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
  m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
  m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);

  nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
  nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
  nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);

  auto const index = linear_index_r(params, nmp_x, nmp_y, nmp_z);

  extern __shared__ float weights[];

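  /* In the shared-memory variant, the per-axis B-spline weights of each
     particle are computed once (by the threads with threadIdx.y < 3u and
     threadIdx.z == 0u) and, after the barrier, reused by all cao^3 threads
     assigned to that particle. */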
  if (shared) {
    auto const offset = static_cast<unsigned int>(cao) * part_in_block;
    if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
      weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
          Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
    }

    __syncthreads();

    auto const c = weights[3u * offset + 3u * cao_id_x] *
                   weights[3u * offset + 3u * threadIdx.y + 1u] *
                   weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id];
    atomicAdd(&(charge_mesh[index]), c);

  } else {
    auto const c =
        Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) * part_q[id] *
        Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
        Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]);
    atomicAdd(&(charge_mesh[index]), c);
  }
}

void assign_charges(P3MGpuData const &params,
                    float const *const __restrict__ part_pos,
                    float const *const __restrict__ part_q) {
  auto const cao = static_cast<unsigned int>(params.cao);
  auto const cao3 = int_pow<3>(cao);
  unsigned int parts_per_block = 1u, n_blocks = 1u;

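  /* Each particle occupies a cao x cao x cao brick of threads, so pack as
     many particles per block as fit under the 1024-threads-per-block CUDA
     limit, then spread the blocks over a (possibly 2D) grid. */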
  while ((parts_per_block + 1u) * cao3 <= 1024u) {
    parts_per_block++;
  }
  if ((params.n_part % parts_per_block) == 0u)
    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
  else
    n_blocks = params.n_part / parts_per_block + 1u;

  dim3 block(parts_per_block * cao, cao, cao);
  dim3 grid(n_blocks, 1u, 1u);
  while (grid.x > 65536u) {
    grid.y++;
    if ((n_blocks % grid.y) == 0u)
      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
    else
      grid.x = n_blocks / grid.y + 1u;
  }

  auto const data_length =
      3 * static_cast<std::size_t>(parts_per_block * cao) * sizeof(REAL_TYPE);
  switch (cao) {
  case 1:
    (assign_charge_kernel<1, false>)<<<grid, block, 0, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 2:
    (assign_charge_kernel<2, false>)<<<grid, block, 0, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 3:
    (assign_charge_kernel<3, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 4:
    (assign_charge_kernel<4, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 5:
    (assign_charge_kernel<5, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 6:
    (assign_charge_kernel<6, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  case 7:
    (assign_charge_kernel<7, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, parts_per_block);
    break;
  default:
    break;
  }
  cuda_check_errors_exit(block, grid, "assign_charge", __FILE__, __LINE__);
}

template <int cao, bool shared>
__global__ void assign_forces_kernel(P3MGpuData const params,
                                     float const *const __restrict__ part_pos,
                                     float const *const __restrict__ part_q,
                                     float *const __restrict__ part_f,
                                     REAL_TYPE prefactor,
                                     unsigned int const parts_per_block) {
  auto const part_in_block = threadIdx.x / static_cast<unsigned int>(cao);
  auto const cao_id_x =
      threadIdx.x - part_in_block * static_cast<unsigned int>(cao);
  /* id of the particle */
  auto const id =
      parts_per_block * (blockIdx.x * gridDim.y + blockIdx.y) + part_in_block;
  if (id >= params.n_part)
    return;
  /* position relative to the closest grid point */
  REAL_TYPE m_pos[3];
  /* index of the nearest mesh point */
  int nmp_x, nmp_y, nmp_z;

  m_pos[0] = part_pos[3 * id + 0] * params.hi[0] - params.pos_shift;
  m_pos[1] = part_pos[3 * id + 1] * params.hi[1] - params.pos_shift;
  m_pos[2] = part_pos[3 * id + 2] * params.hi[2] - params.pos_shift;

  nmp_x = static_cast<int>(floorf(m_pos[0] + REAL_TYPE{0.5}));
  nmp_y = static_cast<int>(floorf(m_pos[1] + REAL_TYPE{0.5}));
  nmp_z = static_cast<int>(floorf(m_pos[2] + REAL_TYPE{0.5}));

  m_pos[0] -= static_cast<REAL_TYPE>(nmp_x);
  m_pos[1] -= static_cast<REAL_TYPE>(nmp_y);
  m_pos[2] -= static_cast<REAL_TYPE>(nmp_z);

  nmp_x = wrap_index(nmp_x + static_cast<int>(cao_id_x), params.mesh[0]);
  nmp_y = wrap_index(nmp_y + static_cast<int>(threadIdx.y), params.mesh[1]);
  nmp_z = wrap_index(nmp_z + static_cast<int>(threadIdx.z), params.mesh[2]);

  auto const index = linear_index_r(params, nmp_x, nmp_y, nmp_z);

  extern __shared__ float weights[];

  REAL_TYPE c;
  if (shared) {
    auto const offset = static_cast<unsigned int>(cao) * part_in_block;
    if ((threadIdx.y < 3u) && (threadIdx.z == 0u)) {
      weights[3u * offset + 3u * cao_id_x + threadIdx.y] =
          Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[threadIdx.y]);
    }

    __syncthreads();

    c = -prefactor * weights[3u * offset + 3u * cao_id_x] *
        weights[3u * offset + 3u * threadIdx.y + 1u] *
        weights[3u * offset + 3u * threadIdx.z + 2u] * part_q[id];
  } else {
    c = -prefactor * part_q[id] *
        Utils::bspline<cao>(static_cast<int>(cao_id_x), m_pos[0]) *
        Utils::bspline<cao>(static_cast<int>(threadIdx.y), m_pos[1]) *
        Utils::bspline<cao>(static_cast<int>(threadIdx.z), m_pos[2]);
  }

  const REAL_TYPE *force_mesh_x = (REAL_TYPE *)params.force_mesh_x;
  const REAL_TYPE *force_mesh_y = (REAL_TYPE *)params.force_mesh_y;
  const REAL_TYPE *force_mesh_z = (REAL_TYPE *)params.force_mesh_z;

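  /* Each of the particle's cao^3 threads adds its weighted share of the
     three (now real-space) force-mesh values to the particle force. */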
  atomicAdd(&(part_f[3u * id + 0u]), c * force_mesh_x[index]);
  atomicAdd(&(part_f[3u * id + 1u]), c * force_mesh_y[index]);
  atomicAdd(&(part_f[3u * id + 2u]), c * force_mesh_z[index]);
}

void assign_forces(P3MGpuData const &params,
                   float const *const __restrict__ part_pos,
                   float const *const __restrict__ part_q,
                   float *const __restrict__ part_f,
                   REAL_TYPE const prefactor) {
  auto const cao = params.cao;
  auto const cao3 = int_pow<3>(cao);
  unsigned int parts_per_block = 1u, n_blocks = 1u;

  while ((parts_per_block + 1u) * static_cast<unsigned int>(cao3) <= 1024u) {
    parts_per_block++;
  }

  if ((params.n_part % parts_per_block) == 0u)
    n_blocks = std::max<unsigned>(1u, params.n_part / parts_per_block);
  else
    n_blocks = params.n_part / parts_per_block + 1u;

  dim3 block(parts_per_block * static_cast<unsigned int>(cao),
             static_cast<unsigned int>(cao), static_cast<unsigned int>(cao));
  dim3 grid(n_blocks, 1u, 1u);
  while (grid.x > 65536u) {
    grid.y++;
    if (n_blocks % grid.y == 0u)
      grid.x = std::max<unsigned>(1u, n_blocks / grid.y);
    else
      grid.x = n_blocks / grid.y + 1u;
  }

  /* Switch for the assignment templates; the shared-memory version is only
   * faster for cao > 2. */
  auto const data_length =
      3u *
      static_cast<std::size_t>(parts_per_block *
                               static_cast<unsigned int>(cao)) *
      sizeof(float);
  switch (cao) {
  case 1:
    (assign_forces_kernel<1, false>)<<<grid, block, 0, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 2:
    (assign_forces_kernel<2, false>)<<<grid, block, 0, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 3:
    (assign_forces_kernel<3, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 4:
    (assign_forces_kernel<4, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 5:
    (assign_forces_kernel<5, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 6:
    (assign_forces_kernel<6, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  case 7:
    (assign_forces_kernel<7, true>)<<<grid, block, data_length, nullptr>>>(
        params, part_pos, part_q, part_f, prefactor, parts_per_block);
    break;
  default:
    break;
  }
  cuda_check_errors_exit(block, grid, "assign_forces", __FILE__, __LINE__);
}

/* Init the internal data structures of the P3M GPU.
 * Mainly allocation on the device and influence function calculation.
 * Be advised: this needs mesh^3 * 5 * sizeof(REAL_TYPE) of device memory.
 * We use real-to-complex FFTs, so the size of the reciprocal mesh
 * is (cuFFT convention) Nx x Ny x [ Nz / 2 + 1 ].
 */
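/* For example, by the estimate above a 128^3 mesh in single precision
 * occupies about 128^3 * 5 * sizeof(float) B = 40 MiB of device memory. */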
void p3m_gpu_init(std::shared_ptr<P3MGpuParams> &data, int cao,
                  const int mesh[3], double alpha, Utils::Vector3d const &box_l,
                  unsigned n_part) {
  if (mesh[0] == -1 && mesh[1] == -1 && mesh[2] == -1)
    throw std::runtime_error("P3M: invalid mesh size");

  if (not data) {
    data = std::make_shared<P3MGpuParams>();
  }

  auto &p3m_gpu_data = data->p3m_gpu_data;
  bool do_reinit = false, mesh_changed = false;
  p3m_gpu_data.n_part = n_part;

  if (not data->is_initialized or p3m_gpu_data.alpha != alpha) {
    p3m_gpu_data.alpha = static_cast<REAL_TYPE>(alpha);
    do_reinit = true;
  }

  if (not data->is_initialized or p3m_gpu_data.cao != cao) {
    p3m_gpu_data.cao = cao;
    // NOLINTNEXTLINE(bugprone-integer-division)
    p3m_gpu_data.pos_shift = static_cast<REAL_TYPE>((p3m_gpu_data.cao - 1) / 2);
    do_reinit = true;
  }

  if (not data->is_initialized or (p3m_gpu_data.mesh[0] != mesh[0]) or
      (p3m_gpu_data.mesh[1] != mesh[1]) or (p3m_gpu_data.mesh[2] != mesh[2])) {
    std::copy(mesh, mesh + 3, p3m_gpu_data.mesh);
    mesh_changed = true;
    do_reinit = true;
  }

  if (not data->is_initialized or (p3m_gpu_data.box[0] != box_l[0]) or
      (p3m_gpu_data.box[1] != box_l[1]) or (p3m_gpu_data.box[2] != box_l[2])) {
    std::copy(box_l.begin(), box_l.end(), p3m_gpu_data.box);
    do_reinit = true;
  }

  p3m_gpu_data.mesh_z_padded = (mesh[2] / 2 + 1) * 2;
  p3m_gpu_data.mesh_size = mesh[0] * mesh[1] * p3m_gpu_data.mesh_z_padded;

  for (int i = 0; i < 3; i++) {
    p3m_gpu_data.hi[i] =
        static_cast<REAL_TYPE>(p3m_gpu_data.mesh[i]) / p3m_gpu_data.box[i];
  }

  if (data->is_initialized and mesh_changed) {
    data->free_device_memory();
    data->is_initialized = false;
  }

  if (not data->is_initialized and p3m_gpu_data.mesh_size > 0) {
    /* Size of the complex mesh Nx * Ny * ( Nz / 2 + 1 ) */
    auto const cmesh_size =
        static_cast<std::size_t>(p3m_gpu_data.mesh[0]) *
        static_cast<std::size_t>(p3m_gpu_data.mesh[1]) *
        static_cast<std::size_t>(p3m_gpu_data.mesh[2] / 2 + 1);
    auto const mesh_len = cmesh_size * sizeof(FFT_TYPE_COMPLEX);
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.charge_mesh), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_x), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_y), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.force_mesh_z), mesh_len));
    cuda_safe_mem(cudaMalloc((void **)&(p3m_gpu_data.G_hat),
                             cmesh_size * sizeof(REAL_TYPE)));

    if (cufftPlan3d(&(data->p3m_fft.forw_plan), mesh[0], mesh[1], mesh[2],
                    FFT_PLAN_FORW_FLAG) != CUFFT_SUCCESS or
        cufftPlan3d(&(data->p3m_fft.back_plan), mesh[0], mesh[1], mesh[2],
                    FFT_PLAN_BACK_FLAG) != CUFFT_SUCCESS) {
      throw std::runtime_error("Unable to create fft plan");
    }
  }

  if ((do_reinit or not data->is_initialized) and p3m_gpu_data.mesh_size > 0) {
    dim3 grid(1, 1, 1);
    dim3 block(1, 1, 1);
    block.x = static_cast<unsigned>(512 / mesh[0] + 1);
    block.y = static_cast<unsigned>(mesh[1]);
    block.z = 1;
    grid.x = static_cast<unsigned>(mesh[0]) / block.x + 1;
    grid.z = static_cast<unsigned>(mesh[2]) / 2 + 1;

    switch (p3m_gpu_data.cao) {
    case 1:
      KERNELCALL(calculate_influence_function_device<1>, grid, block,
                 p3m_gpu_data);
      break;
    case 2:
      KERNELCALL(calculate_influence_function_device<2>, grid, block,
                 p3m_gpu_data);
      break;
    case 3:
      KERNELCALL(calculate_influence_function_device<3>, grid, block,
                 p3m_gpu_data);
      break;
    case 4:
      KERNELCALL(calculate_influence_function_device<4>, grid, block,
                 p3m_gpu_data);
      break;
    case 5:
      KERNELCALL(calculate_influence_function_device<5>, grid, block,
                 p3m_gpu_data);
      break;
    case 6:
      KERNELCALL(calculate_influence_function_device<6>, grid, block,
                 p3m_gpu_data);
      break;
    case 7:
      KERNELCALL(calculate_influence_function_device<7>, grid, block,
                 p3m_gpu_data);
      break;
    }
  }
  if (p3m_gpu_data.mesh_size > 0)
    data->is_initialized = true;
}
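
/* Illustrative call sequence (a sketch; the caller-side names are
 * hypothetical):
 *
 *   std::shared_ptr<P3MGpuParams> p3m_data;          // empty on first use
 *   int const mesh[3] = {64, 64, 64};
 *   p3m_gpu_init(p3m_data, 5, mesh, 1.0, {10., 10., 10.}, n_part);
 *   p3m_gpu_add_farfield_force(*p3m_data, gpu_particle_data, prefactor,
 *                              n_part);
 *
 * Repeated calls to p3m_gpu_init() are cheap: device buffers and FFT plans
 * are only re-created when the mesh actually changes. */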

/**
 * \brief The long-range part of the P3M algorithm.
 */
void p3m_gpu_add_farfield_force(P3MGpuParams &data, GpuParticleData &gpu,
                                double prefactor, unsigned n_part) {
  auto &p3m_gpu_data = data.p3m_gpu_data;
  p3m_gpu_data.n_part = n_part;

  if (p3m_gpu_data.n_part == 0u)
    return;

  auto const positions_device = gpu.get_particle_positions_device();
  auto const charges_device = gpu.get_particle_charges_device();
  auto const forces_device = gpu.get_particle_forces_device();

  dim3 gridConv(static_cast<unsigned>(p3m_gpu_data.mesh[0]),
                static_cast<unsigned>(p3m_gpu_data.mesh[1]), 1u);
  dim3 threadsConv(static_cast<unsigned>(p3m_gpu_data.mesh[2] / 2 + 1), 1u, 1u);

  auto const volume =
      Utils::product(Utils::Vector3<REAL_TYPE>(p3m_gpu_data.box));
  auto const pref = static_cast<REAL_TYPE>(prefactor) / (volume * REAL_TYPE{2});

  cuda_safe_mem(cudaMemset(p3m_gpu_data.charge_mesh, 0,
                           static_cast<std::size_t>(p3m_gpu_data.mesh_size) *
                               sizeof(REAL_TYPE)));

  /* Interpolate the charges to the mesh */
  assign_charges(p3m_gpu_data, positions_device, charges_device);

  /* Do forward FFT of the charge mesh */
  if (FFT_FORW_FFT(data.p3m_fft.forw_plan,
                   (REAL_TYPE *)p3m_gpu_data.charge_mesh,
                   p3m_gpu_data.charge_mesh) != CUFFT_SUCCESS) {
    fprintf(stderr, "CUFFT error: Forward FFT failed\n");
    return;
  }

  /* Do convolution */
  KERNELCALL(apply_influence_function, gridConv, threadsConv, p3m_gpu_data);

  /* Take derivative */
  KERNELCALL(apply_diff_op, gridConv, threadsConv, p3m_gpu_data);

  /* Transform the components of the electric field back */
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_x,
               (REAL_TYPE *)p3m_gpu_data.force_mesh_x);
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_y,
               (REAL_TYPE *)p3m_gpu_data.force_mesh_y);
  FFT_BACK_FFT(data.p3m_fft.back_plan, p3m_gpu_data.force_mesh_z,
               (REAL_TYPE *)p3m_gpu_data.force_mesh_z);

  /* Assign the forces from the mesh back to the particles */
  assign_forces(p3m_gpu_data, positions_device, charges_device, forces_device,
                pref);
}

#endif // ELECTROSTATICS