CUDA Flocking Simulation

Credits

• University of Pennsylvania CIS 565 2020 Project 1
• "Efficient Neighbor Searching for Agent-based
Simulation on GPU"

Boid Simulation

Separation Pseudocode

``````def seperation(boid: Boid, boids: Boid[]) {
c: Vec3 = 0

for (b : boids) {
if b != boid and distance(b, boid) < seperation_distance
c -= (neighbor.position - boid.position)
}

return c * seperation_scale
}``````

Alignment Pseudocode

``````def alignment(boid: Boid, boids: Boid[]) {
perceived_velocity: Vec3 = 0
neighbors_count = 0

for (b : boids) {
if b != boid and distance(b, boid) < alignment_distance
perceived_velocity += b.velocity - boid.velocity
++neighbors_count
}

if (neighbors_count >= 0) perceived_velocity /= neighbors_count

return perceived_velocity * alignment_scale
}``````

Cohesion Pseudocode

``````def cohesion(boid: Boid, boids: Boid[]) {
center_of_mass: Vec3 = 0
neighbors_count = 0

for (b : boids) {
if b != boid and distance(b, boid) < cohesion_distance
center_of_mass += b.position
++neighbors_count
}

if (neighbors_count > 0) {
center_of_mass /= neighbors_count
return (center_of_mass - boid.position) * cohesion_scale
}

return 0
}``````

Naive Implementation: Pseudocode

``````def step_naive(pos, vel1, vel2) {
for parallel (boid : boids) {
new_vel = vel1[boid];
for (others : boids) {
// Accumulate for new_vel
}
vel2[boid] = new_vel;
}

update_pos(pos, vel2);

swap(vel1, vel2);
}``````

Naive Implementation: Pseudocode

``````def step_naive(pos, vel1, vel2) {
for parallel (boid : boids) {
new_vel = vel1[boid];
for (others : boids) {
// Accumulate for new_vel
}
vel2[boid] = new_vel;
}

update_pos(pos, vel2);

swap(vel1, vel2);
}``````

Explicit Euler

Naive Implementation: Pseudocode

``````def step_naive(pos, vel1, vel2) {
for parallel (boid : boids) {
new_vel = vel1[boid];
for (others : boids) {
// Accumulate for new_vel
}
vel2[boid] = new_vel;
}

update_pos(pos, vel2);

swap(vel1, vel2);
}``````

Buffer Ping-Ponging

Uniform Grid

Sort:

Thrust Library to Rescue

``````thrust::sort_by_key(grid_indices,
grid_indices + objects_count,
boid_indices);``````

Sort:

Sort:

Indirection:

Uniform Grid: Pseudocode

``````def step_uniform_grid(pos, vel1, vel2) {
indices, grid_indices = compute_indices();

sort_by_key(grid_indices, indices);

grid_first, grid_last = identify_first_last(grid_indices);

for parallel (boid : boids) {
neighbor_grid_cells = calculate_neighbor_grid_cells(boid);

new_vel = vel1[boid];
for (cell : neighbor_grid_cells) {
for (neighbor from grid_first[cell] to grid_last[cell]) {
neighbor_pos = pos[indices[neighbor]];
neighbor_vel = vel[indices[neighbor]];
// Accumulate for new_vel
}
}
vel2[boid] = new_vel;
}

update_pos(pos, vel2);
swap(vel1, vel2);
}``````

Cutting the middleman

``````thrust::copy(pos, pos + objects_count, pos_sorted);
thrust::copy(vel, vel + objects_count, vel_sorted);
thrust::sort_by_key(grid_indices,
grid_indices + objects_count,
pos_sorted,
vel_sorted);``````

Cutting the middleman

``````thrust::copy(pos, pos + objects_count, pos_sorted);
thrust::copy(vel, vel + objects_count, vel_sorted);
thrust::sort_by_key(grid_indices,
grid_indices + objects_count,
pos_sorted,
vel_sorted);``````

Cutting the middleman

``````thrust::sort_by_key(grid_indices,
grid_indices + objects_count,
indices);
thrust::gather(indices, indices + objects_count, pos, pos_sorted);
thrust::gather(indices, indices + objects_count, vel, vel_sorted);``````

Coherent Grid: Pseudocode

``````def step_coherent_grid(pos, vel1, vel2) {
indices, grid_indices = compute_indices();

pos_sorted, vel_sorted = pos, vel1
sort_by_key(grid_indices, pos_sorted, vel_sorted);

grid_first, grid_last = identify_first_last(grid_indices);

for parallel (boid : boids) {
neighbor_grid_cells = calculate_neighbor_grid_cells(boid);

new_vel = vel1[boid];
for (cell : neighbor_grid_cells) {
for (neighbor from grid_first[cell] to grid_last[cell]) {
neighbor_pos = pos_sorted[neighbor];
neighbor_vel = vel_sorted[neighbor];
// Accumulate for new_vel
}
}
vel2[boid] = new_vel;
}

update_pos(pos, vel2);
swap(vel1, vel2);
swap(pos, pos_sorted);
}``````

Shared-memory Optimization

Slow

CUDA Memory

Slow

Faster (Read-Only on Device)

CUDA Memory

Slow

Faster (Read-Only on Device)

Fast

Shared Grid Pseudocode

``````def step_shared_grid(pos, vel1, vel2) {
// same

for parallel (warp: warps) {
while (not process all neighbors) {

// calculate all neighbor grid cells in a wrap

// copy some boids in neighbor cells to shared memory based on capacity

for parallel boids in wrap {
for (neighbor from shared_first to shared_last) {
// Accumulate for new_vel
}
vel2[boid] = new_vel;
}
}
}

// same
}``````