//! Persistent inverted index segments. //! //! An [`InvertedSegment`] is an immutable, self-contained full-text index over //! one storage segment's documents (addressed by their local `u32` doc //! ordinals, the same ordinal space used by the filter doc-sets). Segments are //! serialized to a single blob that lives in object storage under the //! namespace's index prefix and is cached on local disk; on load, the term //! dictionary and per-field statistics are materialized in memory while //! posting lists stay as compact bytes and are decoded lazily per query term. //! //! ## On-disk layout (version 1) //! //! ```text //! magic "SHTX" (4 bytes) //! version u32 LE //! num_fields u16 LE //! per field: name_len u16 LE, name (utf8), doc_count u32 LE, total_len u64 LE //! num_docs u32 LE -- size of the doc-ordinal space //! doc lengths: num_fields * num_docs uvarints (field-major) //! num_terms u32 LE //! per term (sorted by term bytes): //! term_len u16 LE, term (utf8), entry_count u16 LE //! per entry: field u16 LE, doc_freq u32 LE, offset u32 LE, len u32 LE //! blob_len u32 LE //! blob postings bytes; per posting: uvarint(doc delta), uvarint(tf) //! ``` //! //! Doc ids inside a posting list are delta-encoded in ascending order; the //! first delta is the absolute doc id. use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap}; use std::fmt; use crate::wire::{self, Reader, WireError}; use super::tokenizer::Tokenizer; /// Magic bytes identifying an inverted text segment. pub const SEGMENT_MAGIC: [u8; 4] = *b"SHTX"; /// Current segment format version. pub const SEGMENT_VERSION: u32 = 1; /// Errors produced by the full-text engine. #[derive(Debug)] pub enum TextError { /// Low-level decode failure (truncation, varint overflow). Wire(WireError), /// The blob does not start with the segment magic. BadMagic, /// The blob has an unknown format version. UnsupportedVersion(u32), /// A field name was not registered with the builder/segment. UnknownField(String), /// The same field name was registered twice. DuplicateField(String), /// A non-UTF8 string was found while decoding. InvalidUtf8, /// Structural corruption with a human-readable description. Corrupt(String), } impl fmt::Display for TextError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { TextError::Wire(e) => write!(f, "wire decode error: {e}"), TextError::BadMagic => write!(f, "not an inverted text segment (bad magic)"), TextError::UnsupportedVersion(v) => { write!(f, "unsupported inverted segment version {v}") } TextError::UnknownField(name) => write!(f, "unknown text field: {name}"), TextError::DuplicateField(name) => write!(f, "duplicate text field: {name}"), TextError::InvalidUtf8 => write!(f, "invalid utf8 in segment"), TextError::Corrupt(msg) => write!(f, "corrupt inverted segment: {msg}"), } } } impl std::error::Error for TextError {} impl From for TextError { fn from(e: WireError) -> Self { TextError::Wire(e) } } /// Per-field statistics stored in the segment header. #[derive(Debug, Clone, PartialEq, Eq)] pub struct FieldInfo { /// Field name. pub name: String, /// Number of documents that contain at least one token in this field. pub doc_count: u32, /// Total number of tokens indexed in this field across all documents. pub total_len: u64, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] struct TermEntry { field: u16, doc_freq: u32, offset: u32, len: u32, } // --------------------------------------------------------------------------- // Builder // --------------------------------------------------------------------------- /// Builds an [`InvertedSegment`] from documents. /// /// Documents are addressed by their local `u32` ordinal. Calling /// [`InvertedIndexBuilder::add_document`] twice with the same ordinal appends /// the new text (term frequencies and field lengths accumulate). #[derive(Debug)] pub struct InvertedIndexBuilder { tokenizer: Tokenizer, field_names: Vec, /// term -> field -> doc -> tf postings: BTreeMap>>, /// per field: doc -> token count doc_lens: Vec>, field_doc_counts: Vec, field_total_lens: Vec, max_doc: Option, } impl InvertedIndexBuilder { /// Create a builder over the given field names. Field ids are assigned in /// the order given (`fields[i]` gets id `i as u16`). pub fn new(tokenizer: Tokenizer, fields: &[&str]) -> Result { let mut seen = HashMap::new(); for (i, name) in fields.iter().enumerate() { if seen.insert(name.to_string(), i).is_some() { return Err(TextError::DuplicateField(name.to_string())); } } let n = fields.len(); Ok(InvertedIndexBuilder { tokenizer, field_names: fields.iter().map(|s| s.to_string()).collect(), postings: BTreeMap::new(), doc_lens: vec![HashMap::new(); n], field_doc_counts: vec![0; n], field_total_lens: vec![0; n], max_doc: None, }) } /// Number of documents (distinct ordinals) seen so far is not tracked; /// this returns the ordinal-space size implied by the maximum doc id. pub fn num_docs(&self) -> u32 { self.max_doc.map(|d| d + 1).unwrap_or(0) } /// Index `fields` (name, text) for document ordinal `doc`. pub fn add_document(&mut self, doc: u32, fields: &[(&str, &str)]) -> Result<(), TextError> { self.max_doc = Some(self.max_doc.map_or(doc, |m| m.max(doc))); for (field_name, text) in fields { let fid = self .field_names .iter() .position(|n| n == field_name) .ok_or_else(|| TextError::UnknownField(field_name.to_string()))?; let tokens = self.tokenizer.tokenize(text); let len = tokens.len() as u32; if len == 0 { continue; } match self.doc_lens[fid].entry(doc) { Entry::Occupied(mut e) => *e.get_mut() += len, Entry::Vacant(e) => { e.insert(len); self.field_doc_counts[fid] += 1; } } self.field_total_lens[fid] += len as u64; let fid = fid as u16; for tok in tokens { *self .postings .entry(tok.text) .or_default() .entry(fid) .or_default() .entry(doc) .or_insert(0) += 1; } } Ok(()) } /// Finish building and produce an immutable segment. pub fn build(self) -> InvertedSegment { let num_docs = self.num_docs(); let num_fields = self.field_names.len(); let mut doc_lens = vec![vec![0u32; num_docs as usize]; num_fields]; for (f, lens) in self.doc_lens.iter().enumerate() { for (&doc, &len) in lens { doc_lens[f][doc as usize] = len; } } let mut blob: Vec = Vec::new(); let mut terms: BTreeMap> = BTreeMap::new(); for (term, by_field) in self.postings { let mut entries = Vec::with_capacity(by_field.len()); for (&field, docs) in &by_field { let mut posting: Vec<(u32, u32)> = docs.iter().map(|(&d, &tf)| (d, tf)).collect(); posting.sort_unstable_by_key(|p| p.0); let offset = blob.len() as u32; let mut prev = 0u32; let mut first = true; for (doc, tf) in &posting { let delta = if first { *doc } else { *doc - prev }; wire::put_uvarint(&mut blob, delta as u64); wire::put_uvarint(&mut blob, *tf as u64); prev = *doc; first = false; } entries.push(TermEntry { field, doc_freq: posting.len() as u32, offset, len: blob.len() as u32 - offset, }); } terms.insert(term, entries); } let fields = self .field_names .into_iter() .enumerate() .map(|(i, name)| FieldInfo { name, doc_count: self.field_doc_counts[i], total_len: self.field_total_lens[i], }) .collect(); InvertedSegment { fields, num_docs, doc_lens, terms, postings_blob: blob, } } } // --------------------------------------------------------------------------- // Segment // --------------------------------------------------------------------------- /// An immutable full-text index over one segment's documents. #[derive(Debug)] pub struct InvertedSegment { fields: Vec, num_docs: u32, /// Per field, dense doc-length array of size `num_docs`. doc_lens: Vec>, terms: BTreeMap>, postings_blob: Vec, } impl InvertedSegment { /// Number of fields in this segment. pub fn num_fields(&self) -> usize { self.fields.len() } /// Size of the doc-ordinal space covered by this segment. pub fn num_docs(&self) -> u32 { self.num_docs } /// Number of distinct terms in the dictionary. pub fn term_count(&self) -> usize { self.terms.len() } /// Iterate over all dictionary terms in sorted order. pub fn terms(&self) -> impl Iterator { self.terms.keys().map(|s| s.as_str()) } /// Resolve a field name to its id. pub fn field_id(&self, name: &str) -> Option { self.fields .iter() .position(|f| f.name == name) .map(|i| i as u16) } /// Statistics for a field. pub fn field_stats(&self, field: u16) -> Option<&FieldInfo> { self.fields.get(field as usize) } /// Token length of `doc` in `field` (0 if absent or out of range). pub fn doc_len(&self, field: u16, doc: u32) -> u32 { self.doc_lens .get(field as usize) .and_then(|v| v.get(doc as usize)) .copied() .unwrap_or(0) } /// Average token length of `field` over documents that contain it. pub fn avg_field_len(&self, field: u16) -> f32 { match self.field_stats(field) { Some(s) if s.doc_count > 0 => s.total_len as f32 / s.doc_count as f32, _ => 0.0, } } /// Number of documents containing `term` in `field`. pub fn doc_freq(&self, term: &str, field: u16) -> u32 { self.terms .get(term) .and_then(|es| es.iter().find(|e| e.field == field)) .map(|e| e.doc_freq) .unwrap_or(0) } /// Lazily decode the posting list for `(term, field)`. pub fn postings(&self, term: &str, field: u16) -> Option> { let entry = self .terms .get(term)? .iter() .find(|e| e.field == field)?; let start = entry.offset as usize; let end = start + entry.len as usize; let slice = &self.postings_blob[start..end]; Some(PostingsCursor { reader: Reader::new(slice), remaining: entry.doc_freq, prev: 0, first: true, }) } /// Serialize the segment into a single blob. pub fn to_bytes(&self) -> Vec { let mut out = Vec::new(); out.extend_from_slice(&SEGMENT_MAGIC); wire::put_u32(&mut out, SEGMENT_VERSION); wire::put_u16(&mut out, self.fields.len() as u16); for f in &self.fields { wire::put_u16(&mut out, f.name.len() as u16); out.extend_from_slice(f.name.as_bytes()); wire::put_u32(&mut out, f.doc_count); wire::put_u64(&mut out, f.total_len); } wire::put_u32(&mut out, self.num_docs); for field_lens in &self.doc_lens { for &len in field_lens { wire::put_uvarint(&mut out, len as u64); } } wire::put_u32(&mut out, self.terms.len() as u32); for (term, entries) in &self.terms { wire::put_u16(&mut out, term.len() as u16); out.extend_from_slice(term.as_bytes()); wire::put_u16(&mut out, entries.len() as u16); for e in entries { wire::put_u16(&mut out, e.field); wire::put_u32(&mut out, e.doc_freq); wire::put_u32(&mut out, e.offset); wire::put_u32(&mut out, e.len); } } wire::put_u32(&mut out, self.postings_blob.len() as u32); out.extend_from_slice(&self.postings_blob); out } /// Deserialize a segment, validating structure and bounds. Returns an /// error (never panics) on truncated or corrupt input. pub fn from_bytes(bytes: &[u8]) -> Result { let mut r = Reader::new(bytes); let magic = r.take(4)?; if magic != SEGMENT_MAGIC { return Err(TextError::BadMagic); } let version = r.read_u32()?; if version != SEGMENT_VERSION { return Err(TextError::UnsupportedVersion(version)); } let num_fields = r.read_u16()? as usize; let mut fields = Vec::new(); for _ in 0..num_fields { let name_len = r.read_u16()? as usize; let name_bytes = r.take(name_len)?; let name = String::from_utf8(name_bytes.to_vec()).map_err(|_| TextError::InvalidUtf8)?; let doc_count = r.read_u32()?; let total_len = r.read_u64()?; fields.push(FieldInfo { name, doc_count, total_len, }); } let num_docs = r.read_u32()?; // Each doc length is at least one byte; reject impossible sizes // before allocating. let needed = num_docs as u64 * num_fields as u64; if needed > r.remaining() as u64 { return Err(TextError::Corrupt(format!( "doc-length table needs at least {needed} bytes, {} remain", r.remaining() ))); } let mut doc_lens = Vec::with_capacity(num_fields); for _ in 0..num_fields { let mut lens = Vec::with_capacity(num_docs as usize); for _ in 0..num_docs { lens.push(r.read_uvarint_u32()?); } doc_lens.push(lens); } let num_terms = r.read_u32()?; let mut terms = BTreeMap::new(); for _ in 0..num_terms { let term_len = r.read_u16()? as usize; let term_bytes = r.take(term_len)?; let term = String::from_utf8(term_bytes.to_vec()).map_err(|_| TextError::InvalidUtf8)?; let entry_count = r.read_u16()? as usize; let mut entries = Vec::new(); for _ in 0..entry_count { let field = r.read_u16()?; if field as usize >= num_fields { return Err(TextError::Corrupt(format!( "term '{term}' references field {field}, only {num_fields} declared" ))); } let doc_freq = r.read_u32()?; let offset = r.read_u32()?; let len = r.read_u32()?; entries.push(TermEntry { field, doc_freq, offset, len, }); } terms.insert(term, entries); } let blob_len = r.read_u32()? as usize; let blob = r.take(blob_len)?.to_vec(); // Validate posting ranges against the blob. for (term, entries) in &terms { for e in entries { let end = e.offset as u64 + e.len as u64; if end > blob.len() as u64 { return Err(TextError::Corrupt(format!( "posting range for term '{term}' out of bounds" ))); } } } Ok(InvertedSegment { fields, num_docs, doc_lens, terms, postings_blob: blob, }) } } /// Lazily decodes a posting list. Yields `(doc, term_frequency)` pairs in /// ascending doc order. #[derive(Debug)] pub struct PostingsCursor<'a> { reader: Reader<'a>, remaining: u32, prev: u32, first: bool, } impl Iterator for PostingsCursor<'_> { type Item = (u32, u32); fn next(&mut self) -> Option<(u32, u32)> { if self.remaining == 0 { return None; } // Posting ranges are bounds-validated at load time, so decode errors // here indicate corruption inside a valid range; we stop iteration // rather than panic. let delta = self.reader.read_uvarint_u32().ok()?; let tf = self.reader.read_uvarint_u32().ok()?; let doc = if self.first { delta } else { self.prev + delta }; self.first = false; self.prev = doc; self.remaining -= 1; Some((doc, tf)) } } #[cfg(test)] mod tests { use super::*; fn segment() -> InvertedSegment { let mut b = InvertedIndexBuilder::new(Tokenizer::default(), &["title", "body"]).unwrap(); b.add_document(0, &[("title", "Rust guide"), ("body", "rust database storage")]) .unwrap(); b.add_document(2, &[("body", "database engine engine")]) .unwrap(); b.build() } #[test] fn builder_stats() { let seg = segment(); assert_eq!(seg.num_docs(), 3); // ordinal space covers doc 0..=2 assert_eq!(seg.num_fields(), 2); let title = seg.field_stats(seg.field_id("title").unwrap()).unwrap(); assert_eq!(title.doc_count, 1); assert_eq!(title.total_len, 2); let body = seg.field_stats(seg.field_id("body").unwrap()).unwrap(); assert_eq!(body.doc_count, 2); assert_eq!(body.total_len, 6); assert_eq!(seg.doc_len(1, 0), 3); assert_eq!(seg.doc_len(1, 1), 0); // doc 1 never indexed assert_eq!(seg.doc_len(1, 2), 3); } #[test] fn postings_decode() { let seg = segment(); let body = seg.field_id("body").unwrap(); let postings: Vec<_> = seg.postings("database", body).unwrap().collect(); assert_eq!(postings, vec![(0, 1), (2, 1)]); let postings: Vec<_> = seg.postings("engine", body).unwrap().collect(); assert_eq!(postings, vec![(2, 2)]); assert!(seg.postings("missing", body).is_none()); assert_eq!(seg.doc_freq("database", body), 2); assert_eq!(seg.doc_freq("database", seg.field_id("title").unwrap()), 0); } #[test] fn roundtrip_preserves_everything() { let seg = segment(); let bytes = seg.to_bytes(); let seg2 = InvertedSegment::from_bytes(&bytes).unwrap(); assert_eq!(seg2.num_docs(), seg.num_docs()); assert_eq!(seg2.term_count(), seg.term_count()); for term in seg.terms() { for field in 0..seg.num_fields() as u16 { assert_eq!(seg.doc_freq(term, field), seg2.doc_freq(term, field)); let a: Option> = seg.postings(term, field).map(|c| c.collect()); let b: Option> = seg2.postings(term, field).map(|c| c.collect()); assert_eq!(a, b); } } for field in 0..seg.num_fields() as u16 { for doc in 0..seg.num_docs() { assert_eq!(seg.doc_len(field, doc), seg2.doc_len(field, doc)); } assert_eq!(seg.field_stats(field), seg2.field_stats(field)); } } #[test] fn duplicate_field_rejected() { let err = InvertedIndexBuilder::new(Tokenizer::default(), &["a", "a"]).unwrap_err(); assert!(matches!(err, TextError::DuplicateField(_))); } #[test] fn unknown_field_rejected() { let mut b = InvertedIndexBuilder::new(Tokenizer::default(), &["body"]).unwrap(); let err = b.add_document(0, &[("nope", "text")]).unwrap_err(); assert!(matches!(err, TextError::UnknownField(_))); } #[test] fn empty_segment() { let b = InvertedIndexBuilder::new(Tokenizer::default(), &["body"]).unwrap(); let seg = b.build(); assert_eq!(seg.num_docs(), 0); assert_eq!(seg.term_count(), 0); let seg2 = InvertedSegment::from_bytes(&seg.to_bytes()).unwrap(); assert_eq!(seg2.num_docs(), 0); } }