Trait TokenizerEnv
pub trait TokenizerEnv: Send {
// Required methods
fn tok_trie(&self) -> &TokTrie;
fn tokenize_bytes(&self, s: &[u8]) -> Vec<u32>;
// Provided methods
fn tokenize_bytes_marker(&self, s: &[u8]) -> Vec<u32> { ... }
fn tokenize(&self, s: &str) -> Vec<u32> { ... }
fn tokenize_special(&self, s: &str) -> Vec<u32> { ... }
fn eos_token(&self) -> u32 { ... }
fn tokenize_is_canonical(&self) -> bool { ... }
}
Required Methods
fn tokenize_bytes(&self, s: &[u8]) -> Vec<u32>
Tokenize a given byte sequence. It may or may not interpret <|special_tokens|> as special.
Provided Methods
fn tokenize_bytes_marker(&self, s: &[u8]) -> Vec<u32>
Tokenize a given byte sequence. It will interpret text starting with SPECIAL_TOKEN_MARKER as special tokens.
fn tokenize(&self, s: &str) -> Vec<u32>
Tokenize a string coming from the user. It may or may not interpret <|special_tokens|> as special.
fn tokenize_special(&self, s: &str) -> Vec<u32>
Tokenize a string. It will interpret <|special_tokens|> as special.
fn tokenize_is_canonical(&self) -> bool
If this returns true, this tokenizer always returns canonical tokenizations and can be used for forcing tokens. Non-canonical tokenizers will typically just use TokTrie::greedy_tokenize().