Trait TokenizerEnv
pub trait TokenizerEnv: Send {
// Required methods
fn tok_trie(&self) -> &TokTrie;
fn tokenize_bytes(&self, s: &[u8]) -> Vec<u32>;
// Provided methods
fn tokenize_bytes_marker(&self, s: &[u8]) -> (Vec<u32>, usize) { ... }
fn tokenize(&self, s: &str) -> Vec<u32> { ... }
fn tokenize_special(&self, s: &str) -> Vec<u32> { ... }
fn eos_token(&self) -> u32 { ... }
fn tokenize_is_canonical(&self) -> bool { ... }
}
Required Methods§
fn tokenize_bytes(&self, s: &[u8]) -> Vec<u32>
fn tokenize_bytes(&self, s: &[u8]) -> Vec<u32>
Tokenize a given byte sequence. It may or may not interpret <|special_tokens|> as special.
Provided Methods§
fn tokenize_bytes_marker(&self, s: &[u8]) -> (Vec<u32>, usize)
fn tokenize_bytes_marker(&self, s: &[u8]) -> (Vec<u32>, usize)
Tokenize a given byte sequence. It will interpret text starting with SPECIAL_TOKEN_MARKER as special tokens. Returns the tokens, and the number of leading tokens that should never be re-tokenized (because they were specified using the special token marker).
fn tokenize(&self, s: &str) -> Vec<u32>
fn tokenize(&self, s: &str) -> Vec<u32>
Tokenize a string coming from the user. It may or may not interpret <|special_tokens|> as special.
fn tokenize_special(&self, s: &str) -> Vec<u32>
fn tokenize_special(&self, s: &str) -> Vec<u32>
Tokenize a string. It will interpret <|special_tokens|> as special.
fn tokenize_is_canonical(&self) -> bool
fn tokenize_is_canonical(&self) -> bool
If this returns true, this tokenizer always returns canonical tokenizations and can be used for forcing tokens. Non-canonical tokenizers will typically just use TokTrie::greedy_tokenize().