pub trait QuantMethod: Send + Sync + Debug + QuantizedSerde {
    // Required methods
    fn new(method: QuantMethodConfig) -> Result<Self>
    where
        Self: Sized;
    fn dequantize_w(&self) -> Result<Tensor>;
    fn forward(&self, a: &Tensor) -> Result<Tensor>;
    fn quantized_act_type(&self) -> Option<DType>;
    fn dtype_and_device(&self) -> (DType, Device);
    fn add_delta_w(&self, delta: &Tensor) -> Result<Arc<dyn QuantMethod>>;
    fn apply_isq(
        self: Arc<Self>,
        dtype: Option<IsqType>,
        device: Device,
        n_quantized: &AtomicUsize,
        imatrix_weight: Option<Vec<f32>>,
    ) -> Result<Arc<dyn QuantMethod>>;
    fn maybe_to_gguf_quant(self: Arc<Self>) -> Result<Arc<dyn QuantMethod>>;
    fn get_bias_mut(&mut self) -> Option<&mut Tensor>;
    fn get_max_isq_cpu_threads(&self, dtype: IsqType) -> Option<NonZeroUsize>;

    // Provided methods
    fn forward_autocast(&self, a: &Tensor) -> Result<Tensor> { ... }
    fn forward_via_half(&self, a: &Tensor) -> Result<Tensor> { ... }
    fn unquant_weight_bias(&self) -> Option<(Tensor, Option<Tensor>)> { ... }
    fn begin_track_stats(&mut self) -> Result<()> { ... }
    fn end_track_stats(&self) -> Result<Tensor> { ... }
}
Quantized method for a quantized matmul.
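Example (a sketch, not from the crate's documentation): assuming the candle_core tensor types the signatures above suggest, a QuantMethod trait object can serve as the projection step of a model layer. The helper name is hypothetical.

use std::sync::Arc;
use candle_core::{Result, Tensor};

// Run an activation through any quantized linear layer.
// The layer itself holds the (possibly quantized) weight matrix.
fn project(layer: &Arc<dyn QuantMethod>, hidden: &Tensor) -> Result<Tensor> {
    // forward_autocast handles any required activation-dtype conversion.
    layer.forward_autocast(hidden)
}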
Required Methods
fn new(method: QuantMethodConfig) -> Result<Self>
where
    Self: Sized
fn dequantize_w(&self) -> Result<Tensor>
Dequantize the weights into a full-precision tensor.
fn forward(&self, a: &Tensor) -> Result<Tensor>
Compute the matmul of self and a. self should contain the weights.
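Example (a sketch): the weights-in-self convention means forward(a) should agree, up to quantization error, with an explicit matmul against the dequantized weights. The (out_features, in_features) weight layout and 2-D activation are assumptions.

use candle_core::{Result, Tensor};

// Compare forward() with a reference matmul on the dequantized weights.
fn check_forward(layer: &dyn QuantMethod, a: &Tensor) -> Result<()> {
    let quant_out = layer.forward(a)?;
    let w = layer.dequantize_w()?;            // full-precision copy of the weights
    let a_full = a.to_dtype(w.dtype())?;      // match the weight dtype
    let ref_out = a_full.matmul(&w.t()?)?;    // assumes an (out, in) weight layout
    let err = (quant_out.to_dtype(w.dtype())? - ref_out)?.abs()?.mean_all()?;
    println!("mean abs quantization error: {err}");
    Ok(())
}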
fn quantized_act_type(&self) -> Option<DType>
If a quantized method, return the activation dtype.
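A sketch of how a caller might honor this, mirroring what forward_autocast is documented to do below:

use candle_core::{Result, Tensor};

// Cast the input to the required activation dtype, if the layer names one.
fn cast_for_layer(layer: &dyn QuantMethod, a: &Tensor) -> Result<Tensor> {
    match layer.quantized_act_type() {
        Some(act) if a.dtype() != act => a.to_dtype(act),
        _ => Ok(a.clone()),
    }
}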
fn dtype_and_device(&self) -> (DType, Device)
Weight dtype and device.
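For example (a sketch; the shape is arbitrary), this is enough to allocate an input the layer can consume directly:

use candle_core::{Result, Tensor};

// Allocate a zero activation matching the layer's weight dtype and device.
fn zeros_input(layer: &dyn QuantMethod, in_features: usize) -> Result<Tensor> {
    let (dtype, device) = layer.dtype_and_device();
    Tensor::zeros((1, in_features), dtype, &device)
}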
fn add_delta_w(&self, delta: &Tensor) -> Result<Arc<dyn QuantMethod>>
Add a delta weight from LoRA to the weights. This should be prescaled with alpha.
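A sketch of merging a LoRA adapter. The alpha / rank scaling rule and the (out, rank) x (rank, in) shapes are conventional LoRA assumptions, not from this crate's docs.

use std::sync::Arc;
use candle_core::{Result, Tensor};

// Merge a LoRA adapter into the base weights, prescaling the delta by alpha.
fn merge_lora(
    layer: &Arc<dyn QuantMethod>,
    lora_a: &Tensor, // assumed (rank, in_features)
    lora_b: &Tensor, // assumed (out_features, rank)
    alpha: f64,
    rank: usize,
) -> Result<Arc<dyn QuantMethod>> {
    let scale = alpha / rank as f64;
    let delta = lora_b.matmul(lora_a)?.affine(scale, 0.0)?;
    layer.add_delta_w(&delta)
}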
fn apply_isq(
    self: Arc<Self>,
    dtype: Option<IsqType>,
    device: Device,
    n_quantized: &AtomicUsize,
    imatrix_weight: Option<Vec<f32>>,
) -> Result<Arc<dyn QuantMethod>>
Apply in-situ quantization (ISQ) to this layer, returning the quantized layer on the given device. n_quantized counts the layers quantized so far; imatrix_weight optionally provides importance-matrix weights to guide quantization.
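A sketch of quantizing a single layer. IsqType::Q4K is only an example variant, and the shared counter would normally be reused across all layers for progress reporting.

use std::sync::{atomic::AtomicUsize, Arc};
use candle_core::{Device, Result};

// Requantize one layer to an example 4-bit type on the CPU, without an imatrix.
fn quantize_layer(layer: Arc<dyn QuantMethod>) -> Result<Arc<dyn QuantMethod>> {
    let n_quantized = AtomicUsize::new(0);
    layer.apply_isq(Some(IsqType::Q4K), Device::Cpu, &n_quantized, None)
}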
fn maybe_to_gguf_quant(self: Arc<Self>) -> Result<Arc<dyn QuantMethod>>
Convert to an equivalent GGUF quantization, if applicable.
fn get_bias_mut(&mut self) -> Option<&mut Tensor>
Get a mutable reference to the bias, if there is one.
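For instance (a sketch), resetting the bias in place:

use candle_core::Result;

// Zero the bias in place, if the layer has one.
fn zero_bias(layer: &mut dyn QuantMethod) -> Result<()> {
    if let Some(bias) = layer.get_bias_mut() {
        *bias = bias.zeros_like()?;
    }
    Ok(())
}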
fn get_max_isq_cpu_threads(&self, dtype: IsqType) -> Option<NonZeroUsize>
If applicable, the maximum number of CPU threads to use when applying ISQ with the given dtype.
Provided Methods
fn forward_autocast(&self, a: &Tensor) -> Result<Tensor>
Compute the matmul of self and a. self should contain the weights. Automatically casts a to the required quantization activation type and back.
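A sketch (the 4096-wide input and CPU device are arbitrary): an f32 input works even when the quantized kernels want a half-precision activation, and per the doc above the result comes back in the input's dtype.

use candle_core::{DType, Device, Result, Tensor};

// Feed an f32 activation through a layer whose kernels may want f16.
fn run_autocast(layer: &dyn QuantMethod) -> Result<Tensor> {
    let x = Tensor::randn(0f32, 1f32, (1, 4096), &Device::Cpu)?;
    let y = layer.forward_autocast(&x)?;
    assert_eq!(y.dtype(), DType::F32); // cast back after the matmul
    Ok(y)
}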
fn forward_via_half(&self, a: &Tensor) -> Result<Tensor>
Compute the matmul of self and a. self should contain the weights. This may go via half precision if it is supported.
fn unquant_weight_bias(&self) -> Option<(Tensor, Option<Tensor>)>
If available, return the unquantized weight and optional bias.
fn begin_track_stats(&mut self) -> Result<()>
Begin tracking stats into an ImatrixLayerStats.
fn end_track_stats(&self) -> Result<Tensor>
End tracking stats into an ImatrixLayerStats. Returns the computed imatrix.
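A sketch of a calibration pass tying these together with apply_isq, assuming forward records activation statistics while tracking is enabled and that the returned imatrix is one-dimensional:

use candle_core::{Result, Tensor};

// Collect an importance matrix (imatrix) from calibration activations.
fn collect_imatrix(
    layer: &mut dyn QuantMethod,
    calibration: &[Tensor],
) -> Result<Vec<f32>> {
    layer.begin_track_stats()?;
    for batch in calibration {
        let _ = layer.forward(batch)?; // stats accumulate as a side effect
    }
    let imatrix = layer.end_track_stats()?;
    // Assumed to be shaped for apply_isq's imatrix_weight parameter.
    imatrix.to_vec1::<f32>()
}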