Skip to content

Commit

Permalink
Add GpuArrayBuffer and BatchedUniformBuffer (#8204)
Browse files Browse the repository at this point in the history
# Objective

- Add a type for uploading a Rust `Vec<T>` to a GPU `array<T>`.
- Makes progress towards #89.

## Solution

- Port @superdump's `BatchedUniformBuffer` to bevy main, as a fallback
for WebGL2, which doesn't support storage buffers.
- Rather than getting an `array<T>` in a shader, you get an `array<T,
N>`, and have to rebind every N elements via dynamic offsets.
- Add `GpuArrayBuffer` to abstract over
`StorageBuffer<Vec<T>>`/`BatchedUniformBuffer`.

## Future Work
Add a shader macro (or a similar mechanism) that automatically abstracts over
the following:
#8204 (review)

---

## Changelog
* Added `GpuArrayBuffer`, `GpuComponentArrayBufferPlugin`,
`GpuArrayBufferable`, and `GpuArrayBufferIndex` types.
* Added `DynamicUniformBuffer::new_with_alignment()`.

---------

Co-authored-by: Robert Swain <[email protected]>
Co-authored-by: François <[email protected]>
Co-authored-by: Teodor Tanasoaia <[email protected]>
Co-authored-by: IceSentry <[email protected]>
Co-authored-by: Vincent <[email protected]>
Co-authored-by: robtfm <[email protected]>
  • Loading branch information
7 people authored Jul 21, 2023
1 parent 264195e commit ad011d0
Show file tree
Hide file tree
Showing 8 changed files with 359 additions and 1 deletion.
55 changes: 55 additions & 0 deletions crates/bevy_render/src/gpu_component_array_buffer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use crate::{
render_resource::{GpuArrayBuffer, GpuArrayBufferable},
renderer::{RenderDevice, RenderQueue},
Render, RenderApp, RenderSet,
};
use bevy_app::{App, Plugin};
use bevy_ecs::{
prelude::{Component, Entity},
schedule::IntoSystemConfigs,
system::{Commands, Query, Res, ResMut},
};
use std::marker::PhantomData;

/// This plugin prepares the components of the corresponding type for the GPU
/// by storing them in a [`GpuArrayBuffer`].
///
/// Building the plugin inserts a `GpuArrayBuffer<C>` resource into the render
/// sub-app (when one exists) and schedules a system in `RenderSet::Prepare`
/// that refills the buffer from every queried `C` component and inserts (or
/// spawns) the matching `GpuArrayBufferIndex<C>` for each entity.
///
/// The `PhantomData<C>` field is private and only records the component type;
/// construct the plugin through its `Default` implementation.
pub struct GpuComponentArrayBufferPlugin<C: Component + GpuArrayBufferable>(PhantomData<C>);

impl<C: Component + GpuArrayBufferable> Plugin for GpuComponentArrayBufferPlugin<C> {
    /// Registers the `GpuArrayBuffer<C>` resource and its prepare system on the
    /// render sub-app.
    ///
    /// Nothing is registered when the app has no [`RenderApp`] sub-app.
    fn build(&self, app: &mut App) {
        if let Ok(render_app) = app.get_sub_app_mut(RenderApp) {
            // The buffer picks its storage/uniform flavour from the limits of the
            // render device already present in the render world.
            let buffer = GpuArrayBuffer::<C>::new(render_app.world.resource::<RenderDevice>());

            render_app.insert_resource(buffer).add_systems(
                Render,
                prepare_gpu_component_array_buffers::<C>.in_set(RenderSet::Prepare),
            );
        }
    }
}

impl<C: Component + GpuArrayBufferable> Default for GpuComponentArrayBufferPlugin<C> {
    /// Creates the plugin for component type `C`; the value carries no runtime state.
    fn default() -> Self {
        Self(PhantomData)
    }
}

/// Rebuilds the `GpuArrayBuffer<C>` from scratch and uploads it.
///
/// Every queried `C` is cloned into the freshly cleared buffer, and the
/// [`GpuArrayBufferIndex`](crate::render_resource::GpuArrayBufferIndex) returned
/// for it is queued for insertion on the same entity. Once all components have
/// been pushed, the buffer contents are written to the GPU.
fn prepare_gpu_component_array_buffers<C: Component + GpuArrayBufferable>(
    mut commands: Commands,
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
    mut gpu_array_buffer: ResMut<GpuArrayBuffer<C>>,
    components: Query<(Entity, &C)>,
) {
    // Start from an empty buffer; indices handed out previously are replaced below.
    gpu_array_buffer.clear();

    // The pairs are gathered into an owned Vec before being handed to `commands`.
    let mut indexed_entities = Vec::new();
    for (entity, component) in components.iter() {
        let index = gpu_array_buffer.push(component.clone());
        indexed_entities.push((entity, index));
    }
    commands.insert_or_spawn_batch(indexed_entities);

    gpu_array_buffer.write_buffer(&render_device, &render_queue);
}
1 change: 1 addition & 0 deletions crates/bevy_render/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub mod extract_component;
mod extract_param;
pub mod extract_resource;
pub mod globals;
pub mod gpu_component_array_buffer;
pub mod mesh;
pub mod pipelined_rendering;
pub mod primitives;
Expand Down
152 changes: 152 additions & 0 deletions crates/bevy_render/src/render_resource/batched_uniform_buffer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
use super::{GpuArrayBufferIndex, GpuArrayBufferable};
use crate::{
render_resource::DynamicUniformBuffer,
renderer::{RenderDevice, RenderQueue},
};
use encase::{
private::{ArrayMetadata, BufferMut, Metadata, RuntimeSizedArray, WriteInto, Writer},
ShaderType,
};
use std::{marker::PhantomData, num::NonZeroU64};
use wgpu::{BindingResource, Limits};

// Upper bound, in bytes, applied to the device's
// `max_uniform_buffer_binding_size` when computing the batch size.
//
// 1MB (`1 << 20`), else we will make really large arrays on macOS, which
// reports a very large `max_uniform_buffer_binding_size`. On macOS this ends up
// being the minimum size of the uniform buffer as well as the size of each
// chunk of data at a dynamic offset.
#[cfg(any(not(feature = "webgl"), not(target_arch = "wasm32")))]
const MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE: u32 = 1 << 20;

// WebGL2 variant of the same bound: 4KB (`1 << 12`).
//
// WebGL2 quirk: using uniform buffers larger than 4KB will cause extremely
// long shader compilation times, so the limit needs to be lower on WebGL2.
// This is due to older shader compilers/GPUs that don't support dynamically
// indexing uniform buffers, and instead emulate it with large switch statements
// over buffer indices that take a long time to compile.
#[cfg(all(feature = "webgl", target_arch = "wasm32"))]
const MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE: u32 = 1 << 12;

/// Similar to [`DynamicUniformBuffer`], except every N elements (depending on size)
/// are grouped into a batch as an `array<T, N>` in WGSL.
///
/// This reduces the number of rebindings required due to having to pass dynamic
/// offsets to bind group commands, and if indices into the array can be passed
/// in via other means, it enables batching of draw commands.
pub struct BatchedUniformBuffer<T: GpuArrayBufferable> {
// Batches of fixed-size arrays of T are written to this buffer so that
// each batch in a fixed-size array can be bound at a dynamic offset.
uniforms: DynamicUniformBuffer<MaxCapacityArray<Vec<T>>>,
// A batch of T are gathered into this `MaxCapacityArray` until it is full,
// then it is written into the `DynamicUniformBuffer`, cleared, and new T
// are gathered here, and so on for each batch.
temp: MaxCapacityArray<Vec<T>>,
// Dynamic offset reported for elements of the batch currently being
// gathered. Starts at 0 and advances on every flush by the batch size
// rounded up to `dynamic_offset_alignment`.
current_offset: u32,
// Alignment used when advancing `current_offset`; taken from the device
// limit `min_uniform_buffer_offset_alignment` at construction.
dynamic_offset_alignment: u32,
}

impl<T: GpuArrayBufferable> BatchedUniformBuffer<T> {
    /// Returns how many `T` are grouped into one batch for the given limits.
    ///
    /// This is the device's `max_uniform_buffer_binding_size`, capped at
    /// `MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE`, divided (rounding down) by
    /// `T::min_size()`.
    pub fn batch_size(limits: &Limits) -> usize {
        let binding_bytes = limits
            .max_uniform_buffer_binding_size
            .min(MAX_REASONABLE_UNIFORM_BUFFER_BINDING_SIZE) as u64;
        let element_bytes = T::min_size().get();
        (binding_bytes / element_bytes) as usize
    }

    /// Creates an empty buffer whose batch capacity and dynamic offset alignment
    /// are derived from `limits`.
    pub fn new(limits: &Limits) -> Self {
        let dynamic_offset_alignment = limits.min_uniform_buffer_offset_alignment;
        let batch_capacity = Self::batch_size(limits);

        Self {
            uniforms: DynamicUniformBuffer::new_with_alignment(dynamic_offset_alignment as u64),
            temp: MaxCapacityArray(Vec::with_capacity(batch_capacity), batch_capacity),
            current_offset: 0,
            dynamic_offset_alignment,
        }
    }

    /// Returns the size reported by the batch array, which is what a single
    /// binding covers.
    #[inline]
    pub fn size(&self) -> NonZeroU64 {
        self.temp.size()
    }

    /// Drops all queued data and resets the dynamic offset to zero.
    pub fn clear(&mut self) {
        self.temp.0.clear();
        self.current_offset = 0;
        self.uniforms.clear();
    }

    /// Appends `component` to the current batch and returns where it will live.
    ///
    /// The returned index is the element's position inside its batch, and the
    /// dynamic offset is that of the batch being gathered at the time of the
    /// call. A batch that reaches capacity is flushed immediately.
    pub fn push(&mut self, component: T) -> GpuArrayBufferIndex<T> {
        // Capture the position before pushing: a flush below resets the batch
        // and advances the offset.
        let index = self.temp.0.len() as u32;
        let dynamic_offset = Some(self.current_offset);

        self.temp.0.push(component);
        let batch_is_full = self.temp.0.len() == self.temp.1;
        if batch_is_full {
            self.flush();
        }

        GpuArrayBufferIndex {
            index,
            dynamic_offset,
            element_type: PhantomData,
        }
    }

    /// Moves the current batch into the dynamic uniform buffer.
    ///
    /// A clone of the batch is pushed, the dynamic offset advances by the batch
    /// size rounded up to the dynamic offset alignment, and the batch is
    /// emptied. No emptiness check happens here; `write_buffer` performs one
    /// before calling this.
    pub fn flush(&mut self) {
        let batch_bytes = self.temp.size().get();
        let alignment = self.dynamic_offset_alignment as u64;

        self.uniforms.push(self.temp.clone());
        self.current_offset += align_to_next(batch_bytes, alignment) as u32;
        self.temp.0.clear();
    }

    /// Flushes a partially filled batch, if any, then uploads everything through
    /// the underlying dynamic uniform buffer.
    pub fn write_buffer(&mut self, device: &RenderDevice, queue: &RenderQueue) {
        let has_pending_elements = !self.temp.0.is_empty();
        if has_pending_elements {
            self.flush();
        }
        self.uniforms.write_buffer(device, queue);
    }

    /// Returns the binding of the underlying buffer, with a buffer binding's size
    /// overridden to the size of one batch.
    #[inline]
    pub fn binding(&self) -> Option<BindingResource> {
        let mut resource = self.uniforms.binding();
        if let Some(BindingResource::Buffer(buffer_binding)) = resource.as_mut() {
            // MaxCapacityArray is runtime-sized so can't use T::min_size()
            buffer_binding.size = Some(self.size());
        }
        resource
    }
}

/// Rounds `value` up to the nearest multiple of `alignment`.
///
/// `alignment` must be a non-zero power of two; this is checked in debug
/// builds. A `value` that is already a multiple of `alignment`, including `0`,
/// is returned unchanged.
#[inline]
fn align_to_next(value: u64, alignment: u64) -> u64 {
    // `is_power_of_two` is false for 0, so a zero alignment is reported by the
    // assertion itself rather than by an arithmetic underflow inside it.
    debug_assert!(
        alignment.is_power_of_two(),
        "alignment must be a non-zero power of two, got {}",
        alignment
    );
    // Add-then-mask never subtracts from `value`, so `value == 0` cannot
    // underflow and correctly maps to 0.
    let mask = alignment - 1;
    (value + mask) & !mask
}

// ----------------------------------------------------------------------------
// MaxCapacityArray was implemented by Teodor Tanasoaia for encase. It was
// copied here as it was not yet included in an encase release and it is
// unclear if it is the correct long-term solution for encase.

// Wrapper pairing an array-like value (field `.0`) with a maximum capacity in
// elements (field `.1`). Its `ShaderType::size` is computed from the capacity
// rather than from the number of elements currently stored.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
struct MaxCapacityArray<T>(T, usize);

impl<T> ShaderType for MaxCapacityArray<T>
where
    T: ShaderType<ExtraMetadata = ArrayMetadata>,
{
    type ExtraMetadata = ArrayMetadata;

    // The wrapper reuses the metadata of the wrapped array type unchanged.
    const METADATA: Metadata<Self::ExtraMetadata> = T::METADATA;

    /// Returns the size of a full-capacity array: the metadata stride multiplied
    /// by the capacity, independent of how many elements are currently stored.
    /// A capacity of zero is treated as one.
    fn size(&self) -> ::core::num::NonZeroU64 {
        let capacity = self.1.max(1) as u64;
        Self::METADATA.stride().mul(capacity).0
    }
}

impl<T> WriteInto for MaxCapacityArray<T>
where
T: WriteInto + RuntimeSizedArray,
{
fn write_into<B: BufferMut>(&self, writer: &mut Writer<B>) {
debug_assert!(self.0.len() <= self.1);
self.0.write_into(writer);
}
}
2 changes: 2 additions & 0 deletions crates/bevy_render/src/render_resource/buffer_vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ use wgpu::BufferUsages;
/// from system RAM to VRAM.
///
/// Other options for storing GPU-accessible data are:
/// * [`StorageBuffer`](crate::render_resource::StorageBuffer)
/// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
/// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
/// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
/// * [`BufferVec`](crate::render_resource::BufferVec)
/// * [`Texture`](crate::render_resource::Texture)
pub struct BufferVec<T: Pod> {
Expand Down
129 changes: 129 additions & 0 deletions crates/bevy_render/src/render_resource/gpu_array_buffer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
use super::StorageBuffer;
use crate::{
render_resource::batched_uniform_buffer::BatchedUniformBuffer,
renderer::{RenderDevice, RenderQueue},
};
use bevy_ecs::{prelude::Component, system::Resource};
use encase::{private::WriteInto, ShaderSize, ShaderType};
use std::{marker::PhantomData, mem};
use wgpu::{BindGroupLayoutEntry, BindingResource, BindingType, BufferBindingType, ShaderStages};

/// Trait for types able to go in a [`GpuArrayBuffer`].
///
/// This is a marker trait with no methods of its own: it bundles the
/// `ShaderType + ShaderSize + WriteInto + Clone` bounds, and the blanket
/// implementation below provides it for every type that satisfies them.
pub trait GpuArrayBufferable: ShaderType + ShaderSize + WriteInto + Clone {}
impl<T: ShaderType + ShaderSize + WriteInto + Clone> GpuArrayBufferable for T {}

/// Stores an array of elements to be transferred to the GPU and made accessible to shaders as a read-only array.
///
/// On platforms that support storage buffers, this is equivalent to [`StorageBuffer<Vec<T>>`].
/// Otherwise, this falls back to a dynamic offset uniform buffer with the largest
/// array of T that fits within a uniform buffer binding (within reasonable limits).
///
/// Other options for storing GPU-accessible data are:
/// * [`StorageBuffer`](crate::render_resource::StorageBuffer)
/// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
/// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
/// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
/// * [`BufferVec`](crate::render_resource::BufferVec)
/// * [`Texture`](crate::render_resource::Texture)
#[derive(Resource)]
pub enum GpuArrayBuffer<T: GpuArrayBufferable> {
/// Fallback chosen when the device reports
/// `max_storage_buffers_per_shader_stage == 0`: elements are batched into
/// fixed-size arrays bound at dynamic offsets.
Uniform(BatchedUniformBuffer<T>),
/// A storage buffer paired with the `Vec<T>` that collects pushed values;
/// `write_buffer` moves that `Vec` into the storage buffer before uploading.
Storage((StorageBuffer<Vec<T>>, Vec<T>)),
}

impl<T: GpuArrayBufferable> GpuArrayBuffer<T> {
    /// Creates an empty buffer, selecting the variant from the device limits.
    ///
    /// A device reporting zero storage buffers per shader stage gets the batched
    /// uniform fallback; every other device gets the storage-buffer variant.
    pub fn new(device: &RenderDevice) -> Self {
        let limits = device.limits();
        match limits.max_storage_buffers_per_shader_stage {
            0 => Self::Uniform(BatchedUniformBuffer::new(&limits)),
            _ => Self::Storage((StorageBuffer::default(), Vec::new())),
        }
    }

    /// Removes all queued elements.
    pub fn clear(&mut self) {
        match self {
            Self::Uniform(batched) => batched.clear(),
            Self::Storage((_, staged)) => staged.clear(),
        }
    }

    /// Queues `value` and returns the index a shader uses to reach it.
    ///
    /// For the storage variant the index is the element's position in the array
    /// and no dynamic offset is set.
    pub fn push(&mut self, value: T) -> GpuArrayBufferIndex<T> {
        match self {
            Self::Uniform(batched) => batched.push(value),
            Self::Storage((_, staged)) => {
                let position = staged.len() as u32;
                staged.push(value);
                GpuArrayBufferIndex {
                    index: position,
                    dynamic_offset: None,
                    element_type: PhantomData,
                }
            }
        }
    }

    /// Uploads the queued elements to the GPU.
    ///
    /// For the storage variant the queued `Vec` is moved into the storage buffer
    /// first, which leaves the queue empty.
    pub fn write_buffer(&mut self, device: &RenderDevice, queue: &RenderQueue) {
        match self {
            Self::Uniform(batched) => batched.write_buffer(device, queue),
            Self::Storage((storage, staged)) => {
                let values = mem::take(staged);
                storage.set(values);
                storage.write_buffer(device, queue);
            }
        }
    }

    /// Builds the bind group layout entry matching the variant that `new` would
    /// pick for `device`.
    pub fn binding_layout(
        binding: u32,
        visibility: ShaderStages,
        device: &RenderDevice,
    ) -> BindGroupLayoutEntry {
        let storage_buffers_available = device.limits().max_storage_buffers_per_shader_stage > 0;

        let ty = if storage_buffers_available {
            BindingType::Buffer {
                ty: BufferBindingType::Storage { read_only: true },
                has_dynamic_offset: false,
                min_binding_size: Some(T::min_size()),
            }
        } else {
            BindingType::Buffer {
                ty: BufferBindingType::Uniform,
                has_dynamic_offset: true,
                // BatchedUniformBuffer uses a MaxCapacityArray that is runtime-sized, so we use
                // None here and let wgpu figure out the size.
                min_binding_size: None,
            }
        };

        BindGroupLayoutEntry {
            binding,
            visibility,
            ty,
            count: None,
        }
    }

    /// Returns the binding resource of the underlying buffer, if it has one.
    pub fn binding(&self) -> Option<BindingResource> {
        match self {
            Self::Uniform(batched) => batched.binding(),
            Self::Storage((storage, _)) => storage.binding(),
        }
    }

    /// Returns the number of elements per batch when the uniform fallback applies
    /// to `device`, and `None` when storage buffers are available.
    pub fn batch_size(device: &RenderDevice) -> Option<u32> {
        let limits = device.limits();
        let uniform_fallback = limits.max_storage_buffers_per_shader_stage == 0;
        uniform_fallback.then(|| BatchedUniformBuffer::<T>::batch_size(&limits) as u32)
    }
}

/// An index into a [`GpuArrayBuffer`] for a given element.
///
/// Values of this type are returned by `GpuArrayBuffer::push`.
#[derive(Component)]
pub struct GpuArrayBufferIndex<T: GpuArrayBufferable> {
/// The index to use in a shader into the array.
pub index: u32,
/// The dynamic offset to use when setting the bind group in a pass.
/// Only used on platforms that don't support storage buffers;
/// `None` when the element was pushed into the storage-buffer variant.
pub dynamic_offset: Option<u32>,
/// Zero-sized marker tying the index to the element type `T`.
pub element_type: PhantomData<T>,
}
3 changes: 3 additions & 0 deletions crates/bevy_render/src/render_resource/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
mod batched_uniform_buffer;
mod bind_group;
mod bind_group_layout;
mod buffer;
mod buffer_vec;
mod gpu_array_buffer;
mod pipeline;
mod pipeline_cache;
mod pipeline_specializer;
Expand All @@ -15,6 +17,7 @@ pub use bind_group::*;
pub use bind_group_layout::*;
pub use buffer::*;
pub use buffer_vec::*;
pub use gpu_array_buffer::*;
pub use pipeline::*;
pub use pipeline_cache::*;
pub use pipeline_specializer::*;
Expand Down
2 changes: 2 additions & 0 deletions crates/bevy_render/src/render_resource/storage_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use wgpu::{util::BufferInitDescriptor, BindingResource, BufferBinding, BufferUsa
/// * [`DynamicStorageBuffer`](crate::render_resource::DynamicStorageBuffer)
/// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
/// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
/// * [`BufferVec`](crate::render_resource::BufferVec)
/// * [`Texture`](crate::render_resource::Texture)
///
Expand Down Expand Up @@ -154,6 +155,7 @@ impl<T: ShaderType + WriteInto> StorageBuffer<T> {
/// * [`StorageBuffer`](crate::render_resource::StorageBuffer)
/// * [`UniformBuffer`](crate::render_resource::UniformBuffer)
/// * [`DynamicUniformBuffer`](crate::render_resource::DynamicUniformBuffer)
/// * [`GpuArrayBuffer`](crate::render_resource::GpuArrayBuffer)
/// * [`BufferVec`](crate::render_resource::BufferVec)
/// * [`Texture`](crate::render_resource::Texture)
///
Expand Down
Loading

0 comments on commit ad011d0

Please sign in to comment.