"""Deterministic, dependency-free demo embeddings.

The Shoal database is embedding-provider agnostic; it stores and searches
whatever vectors you give it. For a self-contained demo that runs offline,
we use a simple feature-hashing embedding: each token is hashed into one of
``DIM`` buckets with a +/- sign, the bucket counts are accumulated, and the
vector is L2-normalised.

This is NOT a semantic embedding model — it behaves like a bag-of-words
signature — but it is deterministic, fast, vendor-free, and good enough to
demonstrate vector search, hybrid fusion, and filters end to end. Swap in a
real embedding provider (OpenAI, Cohere, sentence-transformers, ...) by
replacing :func:`embed`.
"""

from __future__ import annotations

import hashlib
import math
import re

# Number of hash buckets, and therefore the length of every vector that
# embed() returns.
DIM = 64

# Matches maximal runs of lowercase ASCII letters and digits. tokenize()
# lowercases its input before applying this pattern, so every other character
# acts as a token separator.
_TOKEN = re.compile(r"[a-z0-9]+")


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase ASCII alphanumeric tokens.

    The input is lowercased first, then every maximal run of ``a-z`` / ``0-9``
    characters becomes one token. All other characters (whitespace,
    punctuation, non-ASCII letters) only separate tokens and are dropped.

    Returns the tokens in order of appearance; an empty list when *text*
    contains no matching characters.
    """
    lowered = text.lower()
    tokens = _TOKEN.findall(lowered)
    return tokens


def embed(text: str) -> list[float]:
    """Return a deterministic feature-hashing embedding of *text*.

    Each token from :func:`tokenize` is hashed with MD5. The first four
    digest bytes (big-endian) select one of ``DIM`` buckets, and the parity
    of the fifth byte decides whether the token adds ``+1`` or ``-1`` to that
    bucket. The accumulated vector is then scaled to unit L2 length.

    Returns a list of ``DIM`` floats. When the accumulated vector has zero
    norm (no tokens, or all contributions cancel out) the all-zero vector is
    returned unnormalised.
    """
    buckets = [0.0] * DIM
    for token in tokenize(text):
        raw = hashlib.md5(token.encode("utf-8")).digest()
        slot = int.from_bytes(raw[:4], "big") % DIM
        # The sign comes from a byte not used for the bucket index, so the
        # two choices are derived from different parts of the digest.
        if raw[4] % 2 == 0:
            buckets[slot] += 1.0
        else:
            buckets[slot] -= 1.0

    length = math.sqrt(sum(component * component for component in buckets))
    if length > 0.0:
        return [component / length for component in buckets]
    # Zero norm: nothing to scale, and dividing would fail.
    return buckets