pub trait ComputeLayer<F: Field>: 'static + Sync {
type DevMem: ComputeMemory<F>;
type Exec;
type KernelExec: KernelExecutor<F, ExprEval = Self::ExprEval>;
type OpValue;
type ExprEval: Sync;
Show 16 methods
// Required methods
fn host_alloc(&self, n: usize) -> impl AsMut<[F]> + '_;
fn copy_h2d(
&self,
src: &[F],
dst: &mut FSliceMut<'_, F, Self>,
) -> Result<(), Error>;
fn copy_d2h(
&self,
src: FSlice<'_, F, Self>,
dst: &mut [F],
) -> Result<(), Error>;
fn copy_d2d(
&self,
src: FSlice<'_, F, Self>,
dst: &mut FSliceMut<'_, F, Self>,
) -> Result<(), Error>;
fn execute(
&self,
f: impl FnOnce(&mut Self::Exec) -> Result<Vec<Self::OpValue>, Error>,
) -> Result<Vec<F>, Error>;
fn compile_expr(
&self,
expr: &ArithCircuit<F>,
) -> Result<Self::ExprEval, Error>;
fn accumulate_kernels(
&self,
exec: &mut Self::Exec,
map: impl Sync + for<'a> Fn(&'a mut Self::KernelExec, usize, Vec<KernelBuffer<'a, F, <Self::KernelExec as KernelExecutor<F>>::Mem>>) -> Result<Vec<<Self::KernelExec as KernelExecutor<F>>::Value>, Error>,
mem_maps: Vec<KernelMemMap<'_, F, Self::DevMem>>,
) -> Result<Vec<Self::OpValue>, Error>;
fn inner_product(
&self,
exec: &mut Self::Exec,
a_in: SubfieldSlice<'_, F, Self::DevMem>,
b_in: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>,
) -> Result<Self::OpValue, Error>;
fn tensor_expand(
&self,
exec: &mut Self::Exec,
log_n: usize,
coordinates: &[F],
data: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>,
) -> Result<(), Error>;
fn fold_left<'a>(
&'a self,
exec: &'a mut Self::Exec,
mat: SubfieldSlice<'_, F, Self::DevMem>,
vec: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>,
out: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>,
) -> Result<(), Error>;
fn fold_right<'a>(
&'a self,
exec: &'a mut Self::Exec,
mat: SubfieldSlice<'_, F, Self::DevMem>,
vec: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>,
out: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>,
) -> Result<(), Error>;
fn fri_fold<FSub>(
&self,
exec: &mut Self::Exec,
ntt: &(impl AdditiveNTT<FSub> + Sync),
log_len: usize,
log_batch_size: usize,
challenges: &[F],
data_in: FSlice<'_, F, Self>,
data_out: &mut FSliceMut<'_, F, Self>,
) -> Result<(), Error>
where FSub: BinaryField,
F: ExtensionField<FSub>;
fn extrapolate_line(
&self,
exec: &mut Self::Exec,
evals_0: &mut FSliceMut<'_, F, Self>,
evals_1: FSlice<'_, F, Self>,
z: F,
) -> Result<(), Error>;
fn compute_composite(
&self,
exec: &mut Self::Exec,
inputs: &SlicesBatch<FSlice<'_, F, Self>>,
output: &mut FSliceMut<'_, F, Self>,
composition: &Self::ExprEval,
) -> Result<(), Error>;
// Provided methods
fn join<Out1, Out2>(
&self,
exec: &mut Self::Exec,
op1: impl FnOnce(&mut Self::Exec) -> Result<Out1, Error>,
op2: impl FnOnce(&mut Self::Exec) -> Result<Out2, Error>,
) -> Result<(Out1, Out2), Error> { ... }
fn map<Out, I: ExactSizeIterator>(
&self,
exec: &mut Self::Exec,
iter: I,
map: impl Fn(&mut Self::Exec, I::Item) -> Result<Out, Error>,
) -> Result<Vec<Out>, Error> { ... }
}
Expand description
A hardware abstraction layer (HAL) for compute operations.
Required Associated Types§
Sourcetype DevMem: ComputeMemory<F>
type DevMem: ComputeMemory<F>
The device memory.
Sourcetype KernelExec: KernelExecutor<F, ExprEval = Self::ExprEval>
type KernelExec: KernelExecutor<F, ExprEval = Self::ExprEval>
The executor that can execute operations on a kernel-level granularity (i.e., a single core).
Required Methods§
Sourcefn host_alloc(&self, n: usize) -> impl AsMut<[F]> + '_
fn host_alloc(&self, n: usize) -> impl AsMut<[F]> + '_
Allocates a slice of memory on the host that is prepared for transfers to/from the device.
Depending on the compute layer, this may perform steps beyond just allocating memory. For example, it may allocate huge pages or map the allocated memory to the IOMMU.
The returned buffer is lifetime bound to the compute layer, allowing return types to have drop methods referencing data in the compute layer.
Sourcefn copy_h2d(
&self,
src: &[F],
dst: &mut FSliceMut<'_, F, Self>,
) -> Result<(), Error>
fn copy_h2d( &self, src: &[F], dst: &mut FSliceMut<'_, F, Self>, ) -> Result<(), Error>
Copy data from the host to the device.
§Preconditions
- `src` and `dst` must have the same length.
- `src` must be a slice of a buffer returned by `Self::host_alloc`.
Sourcefn copy_d2h(&self, src: FSlice<'_, F, Self>, dst: &mut [F]) -> Result<(), Error>
fn copy_d2h(&self, src: FSlice<'_, F, Self>, dst: &mut [F]) -> Result<(), Error>
Copy data from the device to the host.
§Preconditions
- `src` and `dst` must have the same length.
- `dst` must be a slice of a buffer returned by `Self::host_alloc`.
Sourcefn copy_d2d(
&self,
src: FSlice<'_, F, Self>,
dst: &mut FSliceMut<'_, F, Self>,
) -> Result<(), Error>
fn copy_d2d( &self, src: FSlice<'_, F, Self>, dst: &mut FSliceMut<'_, F, Self>, ) -> Result<(), Error>
Sourcefn execute(
&self,
f: impl FnOnce(&mut Self::Exec) -> Result<Vec<Self::OpValue>, Error>,
) -> Result<Vec<F>, Error>
fn execute( &self, f: impl FnOnce(&mut Self::Exec) -> Result<Vec<Self::OpValue>, Error>, ) -> Result<Vec<F>, Error>
Executes an operation.
A HAL operation is an abstract function that runs with an executor reference.
Sourcefn compile_expr(&self, expr: &ArithCircuit<F>) -> Result<Self::ExprEval, Error>
fn compile_expr(&self, expr: &ArithCircuit<F>) -> Result<Self::ExprEval, Error>
Compiles an arithmetic expression to the evaluator.
Sourcefn accumulate_kernels(
&self,
exec: &mut Self::Exec,
map: impl Sync + for<'a> Fn(&'a mut Self::KernelExec, usize, Vec<KernelBuffer<'a, F, <Self::KernelExec as KernelExecutor<F>>::Mem>>) -> Result<Vec<<Self::KernelExec as KernelExecutor<F>>::Value>, Error>,
mem_maps: Vec<KernelMemMap<'_, F, Self::DevMem>>,
) -> Result<Vec<Self::OpValue>, Error>
fn accumulate_kernels( &self, exec: &mut Self::Exec, map: impl Sync + for<'a> Fn(&'a mut Self::KernelExec, usize, Vec<KernelBuffer<'a, F, <Self::KernelExec as KernelExecutor<F>>::Mem>>) -> Result<Vec<<Self::KernelExec as KernelExecutor<F>>::Value>, Error>, mem_maps: Vec<KernelMemMap<'_, F, Self::DevMem>>, ) -> Result<Vec<Self::OpValue>, Error>
Launch many kernels in parallel and accumulate the scalar results with field addition.
This method provides low-level access to schedule parallel kernel executions on the compute
platform. A kernel is a program that executes synchronously in one thread, with access to
local memory buffers. When the environment launches a kernel, it sets up the kernel’s local
memory according to the memory mapping specifications provided by the `mem_maps` parameter.
The mapped buffers have type `KernelBuffer`, and they may be read-write or read-only.
When the kernel exits, it returns a small number of computed values as field elements. The
vector of returned scalars is accumulated via binary field addition across all kernels and
returned from the call.
This method is fairly general but also designed to fit the specific needs of the sumcheck protocol. That motivates the choice that returned values are small-length vectors that are accumulated with field addition.
§Buffer chunking
The kernel local memory buffers are thought of as slices of a larger buffer, which may or
may not exist somewhere else. Each kernel operates on a chunk of the larger buffers. For
example, the `KernelMemMap::Chunked` mapping specifies that each kernel operates on a
read-only chunk of a buffer in global device memory. The `KernelMemMap::Local` mapping
specifies that a kernel operates on a local scratchpad initialized with zeros and discarded
at the end of kernel execution (sort of like /dev/null).
This ComputeLayer
object can decide how many kernels to launch and thus how large
each kernel’s buffer chunks are. The number of chunks must be a power of two. This
information is provided to the kernel specification closure as an argument.
§Kernel specification
The kernel logic is constructed within a closure, which is the `map` parameter. The closure
has three parameters:
- `kernel_exec` - the kernel execution environment.
- `log_chunks` - the binary logarithm of the number of kernels that are launched.
- `buffers` - a vector of kernel-local buffers.
The closure must respect certain assumptions:
- The kernel closure control flow is identical on each invocation when `log_chunks` is unchanged.

`ComputeLayer` implementations are free to call the specification closure multiple times,
for example with different values for `log_chunks`.
§Arguments
- `map` - the kernel specification closure. See the "Kernel specification" section above.
- `mem_maps` - the memory mappings for the kernel-local buffers.
Sourcefn inner_product(
&self,
exec: &mut Self::Exec,
a_in: SubfieldSlice<'_, F, Self::DevMem>,
b_in: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>,
) -> Result<Self::OpValue, Error>
fn inner_product( &self, exec: &mut Self::Exec, a_in: SubfieldSlice<'_, F, Self::DevMem>, b_in: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>, ) -> Result<Self::OpValue, Error>
Returns the inner product of a vector of subfield elements with big field elements.
§Arguments
- `a_in` - the first input slice of subfield elements.
- `b_in` - the second input slice of `F` elements.
§Throws
- if the tower level of `a_in` is greater than `F::TOWER_LEVEL`
- unless `a_in` and `b_in` contain the same number of elements, and the number is a power of two
§Returns
Returns the inner product of `a_in` and `b_in`.
Sourcefn tensor_expand(
&self,
exec: &mut Self::Exec,
log_n: usize,
coordinates: &[F],
data: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>,
) -> Result<(), Error>
fn tensor_expand( &self, exec: &mut Self::Exec, log_n: usize, coordinates: &[F], data: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>, ) -> Result<(), Error>
Computes the iterative tensor product of the input with the given coordinates.
This operation modifies the data buffer in place.
§Mathematical Definition
This operation accepts parameters
- $n \in \mathbb{N}$ (`log_n`),
- $k \in \mathbb{N}$ (`coordinates.len()`),
- $v \in L^{2^n}$ (`data[..1 << log_n]`),
- $r \in L^k$ (`coordinates`),
and computes the vector
$$ v \otimes (1 - r_0, r_0) \otimes \ldots \otimes (1 - r_{k-1}, r_{k-1}) $$
§Throws
- unless `2**(log_n + coordinates.len())` equals `data.len()`
Sourcefn fold_left<'a>(
&'a self,
exec: &'a mut Self::Exec,
mat: SubfieldSlice<'_, F, Self::DevMem>,
vec: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>,
out: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>,
) -> Result<(), Error>
fn fold_left<'a>( &'a self, exec: &'a mut Self::Exec, mat: SubfieldSlice<'_, F, Self::DevMem>, vec: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>, out: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>, ) -> Result<(), Error>
Computes left matrix-vector multiplication of a subfield matrix with a big field vector.
§Mathematical Definition
This operation accepts
- $n \in \mathbb{N}$ (`out.len()`),
- $m \in \mathbb{N}$ (`vec.len()`),
- $M \in K^{n \times m}$ (`mat`),
- $v \in K^m$ (`vec`),
and computes the vector $Mv$.
§Args
- `mat` - a slice of elements from a subfield of `F`.
- `vec` - a slice of `F` elements.
- `out` - a buffer for the output vector of `F` elements.
§Throws
- Returns an error if `mat.len()` does not equal `vec.len() * out.len()`.
- Returns an error if `mat` is not a subfield of `F`.
Sourcefn fold_right<'a>(
&'a self,
exec: &'a mut Self::Exec,
mat: SubfieldSlice<'_, F, Self::DevMem>,
vec: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>,
out: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>,
) -> Result<(), Error>
fn fold_right<'a>( &'a self, exec: &'a mut Self::Exec, mat: SubfieldSlice<'_, F, Self::DevMem>, vec: <Self::DevMem as ComputeMemory<F>>::FSlice<'_>, out: &mut <Self::DevMem as ComputeMemory<F>>::FSliceMut<'_>, ) -> Result<(), Error>
Computes right matrix-vector multiplication of a subfield matrix with a big field vector.
§Mathematical Definition
This operation accepts
- $n \in \mathbb{N}$ (`vec.len()`),
- $m \in \mathbb{N}$ (`out.len()`),
- $M \in K^{n \times m}$ (`mat`),
- $v \in K^n$ (`vec`),
and computes the vector $((v')M)'$. The prime denotes a transpose.
§Args
- `mat` - a slice of elements from a subfield of `F`.
- `vec` - a slice of `F` elements.
- `out` - a buffer for the output vector of `F` elements.
§Throws
- Returns an error if `mat.len()` does not equal `vec.len() * out.len()`.
- Returns an error if `mat` is not a subfield of `F`.
Sourcefn fri_fold<FSub>(
&self,
exec: &mut Self::Exec,
ntt: &(impl AdditiveNTT<FSub> + Sync),
log_len: usize,
log_batch_size: usize,
challenges: &[F],
data_in: FSlice<'_, F, Self>,
data_out: &mut FSliceMut<'_, F, Self>,
) -> Result<(), Error>where
FSub: BinaryField,
F: ExtensionField<FSub>,
fn fri_fold<FSub>(
&self,
exec: &mut Self::Exec,
ntt: &(impl AdditiveNTT<FSub> + Sync),
log_len: usize,
log_batch_size: usize,
challenges: &[F],
data_in: FSlice<'_, F, Self>,
data_out: &mut FSliceMut<'_, F, Self>,
) -> Result<(), Error>where
FSub: BinaryField,
F: ExtensionField<FSub>,
FRI-fold the interleaved codeword using the given challenges.
The FRI-fold operation folds a length $2^{n+b+\eta}$ vector of field elements into a length $2^n$ vector of field elements. $n$ is the log block length of the code, $b$ is the log batch size, and $b + \eta$ is the number of challenge elements. The operation has the following mathematical structure:
1. Split the challenge vector into two parts: $c_0$ with length $b$ and $c_1$ with length $\eta$.
2. Low fold the input data with the tensor expansion of $c_0$.
3. Apply $\eta$ layers of the inverse additive NTT to the data.
4. Low fold the input data with the tensor expansion of $c_1$.
The algorithm to perform steps 3 and 4 can be combined into a linear amount of work, whereas step 3 on its own would require $\eta$ independent linear passes.
See DP24, Section 4.2 for more details.
This operation writes the result out-of-place into an output buffer.
§Arguments
- `ntt` - the NTT instance, used to look up the twiddle values.
- `log_len` - $n + \eta$, the binary logarithm of the code length.
- `log_batch_size` - $b$, the binary logarithm of the interleaved code batch size.
- `challenges` - the folding challenges, with length $b + \eta$.
- `data_in` - an input vector, with length $2^{n + b + \eta}$.
- `data_out` - an output buffer, with length $2^n$.
Sourcefn extrapolate_line(
&self,
exec: &mut Self::Exec,
evals_0: &mut FSliceMut<'_, F, Self>,
evals_1: FSlice<'_, F, Self>,
z: F,
) -> Result<(), Error>
fn extrapolate_line( &self, exec: &mut Self::Exec, evals_0: &mut FSliceMut<'_, F, Self>, evals_1: FSlice<'_, F, Self>, z: F, ) -> Result<(), Error>
Extrapolates a line between a vector of evaluations at 0 and evaluations at 1.
Given two values $y_0, y_1$, this operation computes the value $y_z = y_0 + (y_1 - y_0) z$, which is the value of the line that interpolates $(0, y_0), (1, y_1)$ at $z$. This computes this operation in parallel over two vectors of big field elements of equal sizes.
The operation writes the result back in-place into the `evals_0` buffer.
§Args
- `evals_0` - this is both an input and output buffer. As an input, it is populated with the values $y_0$, which are the line's values at 0.
- `evals_1` - an input buffer with the values $y_1$, which are the line's values at 1.
- `z` - the scalar evaluation point.
§Throws
- if `evals_0` and `evals_1` are not equal sizes.
- if the sizes of `evals_0` and `evals_1` are not powers of two.
Sourcefn compute_composite(
&self,
exec: &mut Self::Exec,
inputs: &SlicesBatch<FSlice<'_, F, Self>>,
output: &mut FSliceMut<'_, F, Self>,
composition: &Self::ExprEval,
) -> Result<(), Error>
fn compute_composite( &self, exec: &mut Self::Exec, inputs: &SlicesBatch<FSlice<'_, F, Self>>, output: &mut FSliceMut<'_, F, Self>, composition: &Self::ExprEval, ) -> Result<(), Error>
Computes the elementwise application of a compiled arithmetic expression to multiple input slices.
This operation applies the composition expression to each row of input values, where a row consists of one element from each input slice at the same index position. The results are stored in the output slice.
§Mathematical Definition
Given:
- Multiple input slices $P_0, \ldots, P_{m-1}$, each of length $2^n$ elements
- A composition function $C(X_0, \ldots, X_{m-1})$
- An output slice $P_{\text{out}}$ of length $2^n$ elements
This operation computes:
$$ P_{\text{out}}[i] = C(P_0[i], \ldots, P_{m-1}[i]) \quad \forall i \in \{0, \ldots, 2^n - 1\} $$
§Arguments
- `exec` - The execution environment.
- `inputs` - A slice of input slices, where each slice contains field elements.
- `output` - A mutable output slice where the results will be stored.
- `composition` - The compiled arithmetic expression to apply.
§Throws
- Returns an error if any input or output slice has a length that is not a power of two.
- Returns an error if the input and output slices do not all have the same length.
Provided Methods§
Dyn Compatibility§
This trait is not dyn compatible.
In older versions of Rust, dyn compatibility was called "object safety", so this trait is not object safe.