//! Text tokenization for full-text indexing and querying. //! //! The tokenizer is intentionally simple and deterministic so that the same //! configuration produces identical token streams at index time and query //! time: //! //! * Splits on any non-alphanumeric character (Unicode-aware via //! [`char::is_alphanumeric`]). `"don't"` therefore tokenizes to //! `["don", "t"]`. //! * Optional lowercasing (Unicode-aware, on by default). //! * Optional diacritic folding for common Latin characters //! (`café` → `cafe`). Folding is applied **after** lowercasing, so the //! fold table only needs lowercase entries; if you disable lowercasing, //! uppercase accented characters pass through unfolded. //! * Optional stopword removal and min/max token length filtering. Filtered //! tokens still advance the position counter, so positional gaps are //! preserved. use std::collections::HashSet; /// A single token produced by the tokenizer. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Token { /// The normalized token text. pub text: String, /// Zero-based position of the token within the input field. Positions /// advance for every raw token, including ones removed by stopword or /// length filtering. pub position: u32, } /// Configuration for [`Tokenizer`]. #[derive(Debug, Clone)] pub struct TokenizerConfig { /// Lowercase tokens (Unicode-aware). Default: `true`. pub lowercase: bool, /// Fold common Latin diacritics to ASCII (`é` → `e`, `ß` → `ss`). /// Default: `true`. pub fold_diacritics: bool, /// Tokens shorter than this many characters are dropped. Default: `1`. pub min_token_len: usize, /// Tokens longer than this many characters are dropped (not truncated). /// Default: `64`. pub max_token_len: usize, /// Stopwords to drop. Compared **after** normalization, so entries should /// be lowercase when `lowercase` is enabled. Default: empty. 
pub stopwords: HashSet, } impl Default for TokenizerConfig { fn default() -> Self { TokenizerConfig { lowercase: true, fold_diacritics: true, min_token_len: 1, max_token_len: 64, stopwords: HashSet::new(), } } } /// A deterministic, configuration-driven tokenizer. #[derive(Debug, Clone)] pub struct Tokenizer { config: TokenizerConfig, } impl Default for Tokenizer { fn default() -> Self { Tokenizer::new(TokenizerConfig::default()) } } impl Tokenizer { /// Create a tokenizer with the given configuration. pub fn new(config: TokenizerConfig) -> Self { Tokenizer { config } } /// Access the active configuration. pub fn config(&self) -> &TokenizerConfig { &self.config } /// Tokenize `text` into a vector of normalized tokens with positions. pub fn tokenize(&self, text: &str) -> Vec { let mut out = Vec::new(); let mut current = String::new(); let mut position: u32 = 0; for ch in text.chars() { if ch.is_alphanumeric() { if self.config.lowercase { for lc in ch.to_lowercase() { self.push_normalized(lc, &mut current); } } else { self.push_normalized(ch, &mut current); } } else if !current.is_empty() { self.flush(&mut current, &mut position, &mut out); } } if !current.is_empty() { self.flush(&mut current, &mut position, &mut out); } out } fn push_normalized(&self, c: char, buf: &mut String) { if self.config.fold_diacritics { fold_into(c, buf); } else { buf.push(c); } } fn flush(&self, current: &mut String, position: &mut u32, out: &mut Vec) { let token = std::mem::take(current); let pos = *position; // Positions always advance, even for filtered tokens. *position += 1; let char_len = token.chars().count(); if char_len < self.config.min_token_len || char_len > self.config.max_token_len { return; } if self.config.stopwords.contains(&token) { return; } out.push(Token { text: token, position: pos, }); } } /// Fold a single (already lowercased) character into `buf`, mapping common /// Latin diacritics to their ASCII base form. Characters without an entry are /// pushed unchanged. 
fn fold_into(c: char, buf: &mut String) {
    // The table only lists lowercase letters: callers lowercase first when
    // lowercasing is enabled, and uppercase accented letters are otherwise
    // meant to pass through untouched.
    match c {
        'à' | 'á' | 'â' | 'ã' | 'ä' | 'å' | 'ā' | 'ă' | 'ą' => buf.push('a'),
        'ç' | 'ć' | 'č' => buf.push('c'),
        'è' | 'é' | 'ê' | 'ë' | 'ē' | 'ė' | 'ę' => buf.push('e'),
        'ì' | 'í' | 'î' | 'ï' | 'ī' | 'į' => buf.push('i'),
        'ñ' | 'ń' => buf.push('n'),
        'ò' | 'ó' | 'ô' | 'õ' | 'ö' | 'ø' | 'ō' => buf.push('o'),
        'ù' | 'ú' | 'û' | 'ü' | 'ū' => buf.push('u'),
        'ý' | 'ÿ' => buf.push('y'),
        'š' | 'ś' => buf.push('s'),
        'ž' | 'ź' | 'ż' => buf.push('z'),
        'đ' => buf.push('d'),
        'ł' => buf.push('l'),
        // Ligatures and sharp s expand to two ASCII letters.
        'ß' => buf.push_str("ss"),
        'æ' => buf.push_str("ae"),
        'œ' => buf.push_str("oe"),
        // Anything else (including non-Latin scripts) is kept as-is.
        _ => buf.push(c),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Project a token slice onto its text for compact assertions.
    fn texts(tokens: &[Token]) -> Vec<&str> {
        tokens.iter().map(|t| t.text.as_str()).collect()
    }

    #[test]
    fn basic_split_and_lowercase() {
        let t = Tokenizer::default();
        let tokens = t.tokenize("Hello, World! Rust2024");
        assert_eq!(texts(&tokens), vec!["hello", "world", "rust2024"]);
        assert_eq!(
            tokens.iter().map(|t| t.position).collect::<Vec<_>>(),
            vec![0, 1, 2]
        );
    }

    #[test]
    fn apostrophes_split() {
        let t = Tokenizer::default();
        assert_eq!(texts(&t.tokenize("don't")), vec!["don", "t"]);
    }

    #[test]
    fn diacritic_folding() {
        let t = Tokenizer::default();
        assert_eq!(
            texts(&t.tokenize("Café Köln œuvre straße")),
            vec!["cafe", "koln", "oeuvre", "strasse"]
        );
    }

    #[test]
    fn folding_can_be_disabled() {
        let t = Tokenizer::new(TokenizerConfig {
            fold_diacritics: false,
            ..TokenizerConfig::default()
        });
        assert_eq!(texts(&t.tokenize("Café")), vec!["café"]);
    }

    #[test]
    fn folding_without_lowercasing_skips_uppercase_accents() {
        // The fold table has lowercase entries only, so `É` survives while
        // `é` is still folded.
        let t = Tokenizer::new(TokenizerConfig {
            lowercase: false,
            ..TokenizerConfig::default()
        });
        assert_eq!(texts(&t.tokenize("École Café")), vec!["École", "Cafe"]);
    }

    #[test]
    fn stopwords_preserve_positions() {
        let mut stop = HashSet::new();
        stop.insert("world".to_string());
        let t = Tokenizer::new(TokenizerConfig {
            stopwords: stop,
            ..TokenizerConfig::default()
        });
        let tokens = t.tokenize("hello world again");
        assert_eq!(texts(&tokens), vec!["hello", "again"]);
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].position, 2);
    }

    #[test]
    fn stopwords_match_folded_form() {
        // Stopwords are compared after normalization, so the folded
        // spelling is the one that matches.
        let mut stop = HashSet::new();
        stop.insert("cafe".to_string());
        let t = Tokenizer::new(TokenizerConfig {
            stopwords: stop,
            ..TokenizerConfig::default()
        });
        let tokens = t.tokenize("Café au lait");
        assert_eq!(texts(&tokens), vec!["au", "lait"]);
        assert_eq!(tokens[0].position, 1);
        assert_eq!(tokens[1].position, 2);
    }

    #[test]
    fn length_filters() {
        let t = Tokenizer::new(TokenizerConfig {
            min_token_len: 2,
            max_token_len: 5,
            ..TokenizerConfig::default()
        });
        let tokens = t.tokenize("a ab abcde abcdef");
        assert_eq!(texts(&tokens), vec!["ab", "abcde"]);
        // Positions still advance for dropped tokens.
        assert_eq!(tokens[0].position, 1);
        assert_eq!(tokens[1].position, 2);
    }

    #[test]
    fn length_is_measured_after_folding() {
        // `ß` folds to `ss`, which satisfies a two-character minimum.
        let t = Tokenizer::new(TokenizerConfig {
            min_token_len: 2,
            ..TokenizerConfig::default()
        });
        let tokens = t.tokenize("a ß");
        assert_eq!(texts(&tokens), vec!["ss"]);
        assert_eq!(tokens[0].position, 1);
    }

    #[test]
    fn empty_and_punctuation_only() {
        let t = Tokenizer::default();
        assert!(t.tokenize("").is_empty());
        assert!(t.tokenize("... --- !!!").is_empty());
    }
}