Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
2. Sort field cell index
3. Parallel prefix sum
4. Rearrange particle arrays
Parameter
----------
fld : a Field object
Contains the list of InterpolationGrid objects with
the field values as well as the prefix sum.
"""
# Shortcut for interpolation grids
grid = fld.interp
# Get the threads per block and the blocks per grid
dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
dim_grid_2d_flat, dim_block_2d_flat = \
cuda_tpb_bpg_1d( self.prefix_sum.shape[0] )
# ------------------------
# Sorting of the particles
# ------------------------
# Get the cell index of each particle
# (defined by iz_lower and ir_lower)
get_cell_idx_per_particle[dim_grid_1d, dim_block_1d](
self.cell_idx,
self.sorted_idx,
self.x, self.y, self.z,
grid[0].invdz, grid[0].zmin, grid[0].Nz,
grid[0].invdr, grid[0].rmin, grid[0].Nr)
# Sort the cell index array and modify the sorted_idx array
# accordingly. The value of the sorted_idx array corresponds
# to the index of the sorted particle in the other particle
# arrays.
Sort the particles by performing the following steps:
1. Get fied cell index
2. Sort field cell index
3. Parallel prefix sum
4. Rearrange particle arrays
Parameter
----------
fld : a Field object
Contains the list of InterpolationGrid objects with
the field values as well as the prefix sum.
"""
# Shortcut for interpolation grids
grid = fld.interp
# Get the threads per block and the blocks per grid
dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
dim_grid_2d_flat, dim_block_2d_flat = \
cuda_tpb_bpg_1d( self.prefix_sum.shape[0] )
# ------------------------
# Sorting of the particles
# ------------------------
# Get the cell index of each particle
# (defined by iz_lower and ir_lower)
get_cell_idx_per_particle[dim_grid_1d, dim_block_1d](
self.cell_idx,
self.sorted_idx,
self.x, self.y, self.z,
grid[0].invdz, grid[0].zmin, grid[0].Nz,
grid[0].invdr, grid[0].rmin, grid[0].Nr)
# Sort the cell index array and modify the sorted_idx array
# accordingly. The value of the sorted_idx array corresponds
use_cuda = self.use_cuda
# Create temporary arrays (on CPU or GPU, depending on `use_cuda`)
nscatter_per_batch = allocate_empty(N_batch, use_cuda, dtype=np.int64)
nscatter_per_elec = allocate_empty(elec.Ntot, use_cuda, dtype=np.int64)
photon_n = allocate_empty(elec.Ntot, use_cuda, dtype=np.float64)
# Prepare random numbers
if self.use_cuda:
seed = np.random.randint( 256 )
random_states = create_xoroshiro128p_states( N_batch, seed )
# For each electron, calculate the local density of photons
# *in the frame of the simulation*
if use_cuda:
bpg, tpg = cuda_tpb_bpg_1d( elec.Ntot )
get_photon_density_gaussian_cuda[ bpg, tpg ]( photon_n, elec.Ntot,
elec.x, elec.y, elec.z, c*t, self.photon_n_lab_peak,
self.inv_laser_waist2, self.inv_laser_ctau2,
self.laser_initial_z0, self.gamma_boost, self.beta_boost )
else:
get_photon_density_gaussian_numba( photon_n, elec.Ntot,
elec.x, elec.y, elec.z, c*t, self.photon_n_lab_peak,
self.inv_laser_waist2, self.inv_laser_ctau2,
self.laser_initial_z0, self.gamma_boost, self.beta_boost )
# Determine the electrons that scatter, and count them in each batch
# (one thread per batch on GPU; parallel loop over batches on CPU)
if use_cuda:
batch_grid_1d, batch_block_1d = cuda_tpb_bpg_1d( N_batch )
determine_scatterings_cuda[ batch_grid_1d, batch_block_1d ](
N_batch, self.batch_size, elec.Ntot,
float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
arrays of shape (n_float,Nptcl) and (n_int,Nptcl) where Nptcl
is the number of particles that are received to the left
proc and right proc respectively, and where n_float and n_int
are the number of float and integer quantities respectively
These arrays are always on the CPU (since they were used for MPI)
"""
# Get the new number of particles
old_Ntot = species.Ntot
n_left = float_recv_left.shape[1]
n_right = float_recv_right.shape[1]
new_Ntot = old_Ntot + n_left + n_right
# Get the threads per block and the blocks per grid
n_left_grid, n_left_block = cuda_tpb_bpg_1d( n_left )
n_right_grid, n_right_block = cuda_tpb_bpg_1d( n_right )
n_old_grid, n_old_block = cuda_tpb_bpg_1d( old_Ntot )
# Iterate over particle attributes
# Build list of float attributes to copy
attr_list = [ (species,'x'), (species,'y'), (species,'z'), \
(species,'ux'), (species,'uy'), (species,'uz'), \
(species,'inv_gamma'), (species,'w') ]
if species.ionizer is not None:
attr_list += [ (species.ionizer, 'w_times_level') ]
# Loop through the float quantities
for i_attr in range( len(attr_list) ):
# Copy the proper buffers to the GPU
left_buffer = cuda.to_device( float_recv_left[i_attr] )
right_buffer = cuda.to_device( float_recv_right[i_attr] )
# Initialize the new particle array
Parameters:
-----------
dt: float, seconds
The timestep that should be used for the push
(This can be typically be half of the simulation timestep)
x_push, y_push, z_push: float, dimensionless
Multiplying coefficient for the momenta in x, y and z
e.g. if x_push=1., the particles are pushed forward in x
if x_push=-1., the particles are pushed backward in x
"""
# GPU (CUDA) version
if self.use_cuda:
# Get the threads per block and the blocks per grid
dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
# Call the CUDA Kernel for push in x
push_x_gpu[dim_grid_1d, dim_block_1d](
self.x, self.y, self.z,
self.ux, self.uy, self.uz,
self.inv_gamma, dt, x_push, y_push, z_push )
# The particle array is unsorted after the push in x
self.sorted = False
# CPU version
else:
push_x_numba( self.x, self.y, self.z,
self.ux, self.uy, self.uz,
self.inv_gamma, self.Ntot,
dt, x_push, y_push, z_push )
small-size array into the full-size one)
grid: a list of InterpolationGrid objects
Contains the full-size array rho
"""
Nm = len(grid)
if type(grid[0].rho) is np.ndarray:
# The large-size array rho is on the CPU
for m in range( Nm ):
grid[m].rho[ iz_min:iz_min+2 ] += self.rho_buffer[m]
else:
# The large-size array rho is on the GPU
# Copy the small-size buffer to the GPU
cuda.to_device( self.rho_buffer, to=self.d_rho_buffer )
# On the GPU: add the small-size buffers to the large-size array
dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( grid[0].Nr, TPB=64 )
for m in range( Nm ):
add_rho_to_gpu_array[dim_grid_1d, dim_block_1d]( iz_min,
self.d_rho_buffer, grid[m].rho, m )
grid : a list of InterpolationGrid objects
(one InterpolationGrid object per azimuthal mode)
Contains the field values on the interpolation grid
"""
# Skip gathering for neutral particles (e.g. photons)
if self.q == 0:
return
# Number of modes
Nm = len(grid)
# GPU (CUDA) version
if self.use_cuda:
# Get the threads per block and the blocks per grid
dim_grid_1d, dim_block_1d = \
cuda_tpb_bpg_1d( self.Ntot, TPB=self.gather_tpb )
# Call the CUDA Kernel for the gathering of E and B Fields
if self.particle_shape == 'linear':
if Nm == 2:
# Optimized version for 2 modes
gather_field_gpu_linear[dim_grid_1d, dim_block_1d](
self.x, self.y, self.z,
grid[0].invdz, grid[0].zmin, grid[0].Nz,
grid[0].invdr, grid[0].rmin, grid[0].Nr,
grid[0].Er, grid[0].Et, grid[0].Ez,
grid[1].Er, grid[1].Et, grid[1].Ez,
grid[0].Br, grid[0].Bt, grid[0].Bz,
grid[1].Br, grid[1].Bt, grid[1].Bz,
self.Ex, self.Ey, self.Ez,
self.Bx, self.By, self.Bz)
else:
# Generic version for arbitrary number of modes
float_recv_left, float_recv_right, uint_recv_left, uint_recv_right:
arrays of shape (n_float,Nptcl) and (n_int,Nptcl) where Nptcl
is the number of particles that are received to the left
proc and right proc respectively, and where n_float and n_int
are the number of float and integer quantities respectively
These arrays are always on the CPU (since they were used for MPI)
"""
# Get the new number of particles
old_Ntot = species.Ntot
n_left = float_recv_left.shape[1]
n_right = float_recv_right.shape[1]
new_Ntot = old_Ntot + n_left + n_right
# Get the threads per block and the blocks per grid
n_left_grid, n_left_block = cuda_tpb_bpg_1d( n_left )
n_right_grid, n_right_block = cuda_tpb_bpg_1d( n_right )
n_old_grid, n_old_block = cuda_tpb_bpg_1d( old_Ntot )
# Iterate over particle attributes
# Build list of float attributes to copy
attr_list = [ (species,'x'), (species,'y'), (species,'z'), \
(species,'ux'), (species,'uy'), (species,'uz'), \
(species,'inv_gamma'), (species,'w') ]
if species.ionizer is not None:
attr_list += [ (species.ionizer, 'w_times_level') ]
# Loop through the float quantities
for i_attr in range( len(attr_list) ):
# Copy the proper buffers to the GPU
left_buffer = cuda.to_device( float_recv_left[i_attr] )
right_buffer = cuda.to_device( float_recv_right[i_attr] )
# Initialize the new particle array
particle_array = cuda.device_array( (new_Ntot,), dtype=np.float64)
Parameters
----------
species: an fbpic Particles object
use_cuda: bool
If True, the new arrays are device arrays, and copying is done on GPU.
If False, the arrays are on CPU, and copying is done on CPU.
old_Ntot, new_Ntot: int
Size of the old and new arrays (with old_Ntot < new_Ntot)
"""
# Check if the data is on the GPU
data_on_gpu = (type(species.w) is not np.ndarray)
# On GPU, use one thread per particle
if data_on_gpu:
ptcl_grid_1d, ptcl_block_1d = cuda_tpb_bpg_1d( old_Ntot )
# Iterate over particle attributes and copy the old particles
for attr in ['x', 'y', 'z', 'ux', 'uy', 'uz', 'w', 'inv_gamma',
'Ex', 'Ey', 'Ez', 'Bx', 'By', 'Bz']:
old_array = getattr(species, attr)
new_array = allocate_empty( new_Ntot, data_on_gpu, dtype=np.float64 )
if data_on_gpu:
copy_particle_data_cuda[ ptcl_grid_1d, ptcl_block_1d ](
old_Ntot, old_array, new_array )
else:
copy_particle_data_numba( old_Ntot, old_array, new_array )
setattr( species, attr, new_array )
# Copy the tracking id, if needed
if species.tracker is not None:
old_array = species.tracker.id
new_array = allocate_empty( new_Ntot, use_cuda, dtype=np.uint64 )
n_int = species.n_integer_quantities
if left_proc is not None:
float_send_left = np.empty((n_float, N_send_l), dtype=np.float64)
uint_send_left = np.empty((n_int, N_send_l), dtype=np.uint64)
else:
float_send_left = np.empty((n_float, 0), dtype=np.float64)
uint_send_left = np.empty((n_int, 0), dtype=np.uint64)
if right_proc is not None:
float_send_right = np.empty((n_float, N_send_r), dtype=np.float64)
uint_send_right = np.empty((n_int, N_send_r), dtype=np.uint64)
else:
float_send_right = np.empty((n_float, 0), dtype=np.float64)
uint_send_right = np.empty((n_int, 0), dtype=np.uint64)
# Get the threads per block and the blocks per grid
dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( species.Ntot )
# Float quantities:
# Build list of float attributes to copy
attr_list = [ (species,'x'), (species,'y'), (species,'z'),
(species,'ux'), (species,'uy'), (species,'uz'),
(species,'inv_gamma'), (species,'w') ]
if species.ionizer is not None:
attr_list.append( (species.ionizer,'w_times_level') )
# Loop through the float attributes
for i_attr in range(n_float):
# Initialize 3 buffer arrays on the GPU (need to be initialized
# inside the loop, as `copy_to_host` invalidates these arrays)
left_buffer = cuda.device_array((N_send_l,), dtype=np.float64)
right_buffer = cuda.device_array((N_send_r,), dtype=np.float64)
stay_buffer = cuda.device_array((new_Ntot,), dtype=np.float64)
# Check that the buffers are still on GPU
# (safeguard against automatic memory management)