"""Preference extraction from notes, plus higher-order preferences from
routine claims.

Layer 1 (evidence -> claim) reads first-person statements in notes:

* "I love/like/enjoy X"            -> ``preference.likes``
* "my favourite C is X"            -> ``preference.likes`` (with category)
* "I hate/dislike/can't stand X"   -> ``preference.dislikes``
* "I'm allergic to / I avoid X"    -> ``preference.avoids``

Layer 2 (claim -> claim) maps ``routine.weekly`` activities through a small
activity lexicon into ``preference.activity`` -- e.g. a weekly yoga routine
implies the user practices yoga.  Its input is the routine *claim*, so
refuting the routine cascades into the preference.
"""
from __future__ import annotations

import re
from collections import defaultdict
from typing import Dict, List, Optional

from mnema.derive.confidence import clamp, discount, support_curve, temporal_decay
from mnema.derive.derivers.base import (
    CandidateClaim,
    DerivationContext,
    Deriver,
    DeriverInfo,
    normalize_text,
)

# Support-curve parameters for explicit first-person statements; passed to
# ``support_curve`` as ``midpoint`` / ``steepness``.
EXPLICIT_SUPPORT_MIDPOINT = 0.5
EXPLICIT_SUPPORT_STEEPNESS = 2.0
# Confidence ceilings: likes and dislikes share one ceiling, avoids use the
# higher one.
LIKE_CEILING = 0.85
AVOID_CEILING = 0.92
# Temporal decay: the first GRACE_DAYS of a statement's age are not counted,
# the remainder is decayed with this half-life.
HALF_LIFE_DAYS = 365.0
GRACE_DAYS = 60.0
# Discount applied to a routine claim's confidence when it is turned into a
# ``preference.activity`` claim.
ACTIVITY_DISCOUNT = 0.85

_SENTENCE_SPLIT = re.compile(r"[.!?;\n]+")
# Everything from the first conjunction onwards is dropped from a phrase.
_TRAILER = re.compile(r"\b(?:because|since|when|but|and|so|which)\b.*$")
_ARTICLES = ("the ", "a ", "an ", "some ", "my ")
_PRONOUNS = {"it", "that", "this", "them", "those", "these", "him", "her"}

# Contractions may be typed with the ASCII apostrophe or the typographic one
# (U+2019); accept both so "I’m allergic to ..." and "can’t stand ..." are
# not silently skipped.
_APOSTROPHE = r"['\u2019]"

_PHRASE = r"(?P<phrase>[a-z0-9][\w '\-]{1,50})"
_LIKE = re.compile(r"\bi (?:really |absolutely |just )?(?:love|like|enjoy|adore) " + _PHRASE)
_FAV = re.compile(r"\bmy favou?rite (?P<category>[\w ]{1,30}?) is " + _PHRASE)
# Accepts the same intensifiers as _LIKE.
_DISLIKE = re.compile(
    r"\bi (?:really |absolutely |just )?"
    r"(?:hate|dislike|can" + _APOSTROPHE + r"?t stand|cannot stand|loathe) "
    + _PHRASE
)
_AVOID = re.compile(
    r"\bi(?:" + _APOSTROPHE + r"m| am)? "
    r"(?:allergic to|avoid|can" + _APOSTROPHE + r"?t eat|cannot eat|stay away from) "
    + _PHRASE
)

#: token (as it appears in routine activity text) -> canonical activity name
#:
#: Lookups are exact-token matches, so every inflection that should be
#: recognised needs its own entry (e.g. both "jog" and "jogging").
ACTIVITY_LEXICON: Dict[str, str] = {
    "yoga": "yoga",
    "gym": "strength training",
    "weights": "strength training",
    "run": "running",
    "running": "running",
    "jog": "running",
    "jogging": "running",
    "climb": "climbing",
    "climbing": "climbing",
    "bouldering": "climbing",
    "swim": "swimming",
    "swimming": "swimming",
    "cycle": "cycling",
    "cycling": "cycling",
    "bike": "cycling",
    "biking": "cycling",
    "spin": "cycling",
    "tennis": "tennis",
    "soccer": "soccer",
    "football": "soccer",
    "pilates": "pilates",
    "meditation": "meditation",
    "hike": "hiking",
    "hiking": "hiking",
}


def _clean_phrase(raw: str) -> Optional[str]:
    """Reduce a captured phrase to a usable item name, or reject it.

    The trailing clause (from the first ``_TRAILER`` conjunction onwards) is
    removed, surrounding punctuation is stripped, the text is passed through
    ``normalize_text`` and at most one leading article from ``_ARTICLES`` is
    dropped.

    Returns ``None`` when the result is shorter than two characters, is a
    bare pronoun, or has more than five words.
    """
    without_trailer = _TRAILER.sub("", raw)
    candidate = normalize_text(without_trailer.strip(" \t'\"-,"))

    # Only the first matching article (in tuple order) is removed.
    leading = next((art for art in _ARTICLES if candidate.startswith(art)), None)
    if leading is not None:
        candidate = candidate[len(leading):]
    candidate = candidate.strip()

    too_short = len(candidate) < 2
    too_long = len(candidate.split()) > 5
    if too_short or candidate in _PRONOUNS or too_long:
        return None
    return candidate


def _note_text(payload: dict) -> str:
    """Return the note's text from the first populated text-like key.

    ``text``, ``body`` and ``content`` are tried in that order; the first
    truthy value is returned as a string, or ``""`` when none is set.
    """
    for key in ("text", "body", "content"):
        candidate = payload.get(key)
        if candidate:
            return str(candidate)
    return ""


class PreferencesDeriver(Deriver):
    """Derive preference claims from notes and from weekly-routine claims.

    Layer 1 scans ``note`` evidence for first-person statements and emits
    ``preference.likes`` / ``preference.dislikes`` / ``preference.avoids``.
    Layer 2 maps ``routine.weekly`` claims through ``ACTIVITY_LEXICON`` and
    emits ``preference.activity``.
    """

    info = DeriverInfo(
        deriver_id="mnema.preferences",
        version="1.0.0",
        consumes_evidence=("note",),
        consumes_predicates=("routine.weekly",),
        produces_predicates=(
            "preference.likes",
            "preference.dislikes",
            "preference.avoids",
            "preference.activity",
        ),
    )

    #: predicate -> verb used in the human-readable claim summary
    _VERBS = {
        "preference.likes": "likes",
        "preference.dislikes": "dislikes",
        "preference.avoids": "avoids",
    }

    def derive(self, ctx: DerivationContext) -> List[CandidateClaim]:
        """Return note-derived claims followed by routine-derived claims."""
        return self._from_notes(ctx) + self._from_routines(ctx)

    # ------------------------------------------------------------------ #
    def _from_notes(self, ctx: DerivationContext) -> List[CandidateClaim]:
        """Build like/dislike/avoid claims from first-person note statements.

        Every note is lower-cased, split into sentences and scanned with the
        statement patterns.  Matches are tallied per ``(predicate, phrase)``;
        each tally becomes one claim whose confidence is the support curve of
        the mention count, decayed by the age of the most recent mention
        beyond ``GRACE_DAYS``.  Claims are returned sorted by
        ``(predicate, phrase)``.
        """
        # (predicate, phrase) -> accumulator
        tally: Dict[tuple, dict] = defaultdict(
            lambda: {
                "count": 0,
                "evidence_ids": set(),
                "quotes": [],
                "category": None,
                "allergy": False,
                "last_seen": None,
            }
        )

        for ev in ctx.evidence("note"):
            text = _note_text(ev.payload)
            if not text:
                continue
            for sentence in _SENTENCE_SPLIT.split(text.lower()):
                sentence = sentence.strip()
                if not sentence:
                    continue
                for predicate, match in self._scan_sentence(sentence):
                    phrase = _clean_phrase(match.group("phrase"))
                    if phrase is None:
                        continue
                    entry = tally[(predicate, phrase)]
                    entry["count"] += 1
                    entry["evidence_ids"].add(ev.evidence_id)
                    # Keep at most three truncated example sentences.
                    if len(entry["quotes"]) < 3:
                        entry["quotes"].append(sentence[:120])
                    if predicate == "preference.avoids" and "allergic" in match.group(0):
                        entry["allergy"] = True
                    # Only the "my favourite C is X" pattern has a category group.
                    category = match.groupdict().get("category")
                    if category:
                        entry["category"] = normalize_text(category)
                    # Track the newest timestamp.  Evidence without a
                    # timestamp is skipped here so that ``None`` is never
                    # compared against a real timestamp (a TypeError).
                    seen = ev.observed_at
                    if seen is not None and (
                        entry["last_seen"] is None or seen > entry["last_seen"]
                    ):
                        entry["last_seen"] = seen

        out: List[CandidateClaim] = []
        for (predicate, phrase) in sorted(tally):
            entry = tally[(predicate, phrase)]
            n = entry["count"]
            ceiling = AVOID_CEILING if predicate == "preference.avoids" else LIKE_CEILING
            base = support_curve(
                n,
                midpoint=EXPLICIT_SUPPORT_MIDPOINT,
                steepness=EXPLICIT_SUPPORT_STEEPNESS,
                ceiling=ceiling,
            )
            # No usable timestamp -> treated as fresh (no decay).
            age = 0.0
            if entry["last_seen"]:
                age = max(0.0, ctx.age_days(entry["last_seen"]) - GRACE_DAYS)
            conf = clamp(temporal_decay(base, age, HALF_LIFE_DAYS))

            value = {
                "item": phrase,
                "mentions": n,
                "quotes": list(entry["quotes"]),
                "last_seen": entry["last_seen"],
            }
            if entry["category"]:
                value["category"] = entry["category"]
            if entry["allergy"]:
                value["reason"] = "allergy"

            verb = self._VERBS[predicate]
            out.append(
                CandidateClaim(
                    subject="user",
                    predicate=predicate,
                    identity={"item": phrase},
                    value=value,
                    confidence=conf,
                    inputs=sorted(entry["evidence_ids"]),
                    summary=(
                        f"The user {verb} '{phrase}' -- stated in their own "
                        f"words {n} time(s) in notes."
                    ),
                    reasoning=[
                        f"First-person statement(s) matched in notes ({n}x), "
                        f"e.g. \"{entry['quotes'][0]}\".",
                        "Explicit self-statements start at high confidence "
                        f"(ceiling {ceiling}) and decay slowly "
                        f"(half-life {HALF_LIFE_DAYS:.0f} days).",
                    ],
                    confidence_account={
                        "method": "support_curve(mentions) * temporal_decay",
                        "mentions": n,
                        "support_midpoint": EXPLICIT_SUPPORT_MIDPOINT,
                        "support_steepness": EXPLICIT_SUPPORT_STEEPNESS,
                        "support_ceiling": ceiling,
                        "base": round(base, 6),
                        "age_days_past_grace": round(age, 3),
                        "half_life_days": HALF_LIFE_DAYS,
                        "result": round(conf, 6),
                    },
                )
            )
        return out

    @staticmethod
    def _scan_sentence(sentence: str):
        """Yield ``(predicate, match)`` for every statement pattern hit.

        Patterns are tried in a fixed order (like, favourite, dislike,
        avoid); each yields all of its non-overlapping matches in
        ``sentence``.
        """
        for predicate, pattern in (
            ("preference.likes", _LIKE),
            ("preference.likes", _FAV),
            ("preference.dislikes", _DISLIKE),
            ("preference.avoids", _AVOID),
        ):
            for match in pattern.finditer(sentence):
                yield predicate, match

    # ------------------------------------------------------------------ #
    def _from_routines(self, ctx: DerivationContext) -> List[CandidateClaim]:
        """Build ``preference.activity`` claims from ``routine.weekly`` claims.

        Each token of a routine's activity text is looked up in
        ``ACTIVITY_LEXICON``.  For every canonical activity the routine claim
        with the highest confidence is kept (on a tie the one encountered
        first wins) and its confidence is discounted by
        ``ACTIVITY_DISCOUNT``.  Claims are returned sorted by activity name.
        """
        # activity -> best supporting routine claim
        best: Dict[str, object] = {}
        for claim in ctx.claims("routine.weekly"):
            activity_text = str(claim.value.get("activity", ""))
            for token in normalize_text(activity_text).split():
                canonical = ACTIVITY_LEXICON.get(token)
                if canonical is None:
                    continue
                prev = best.get(canonical)
                if prev is None or claim.confidence > prev.confidence:
                    best[canonical] = claim

        out: List[CandidateClaim] = []
        for canonical in sorted(best):
            claim = best[canonical]
            conf = clamp(discount(claim.confidence, ACTIVITY_DISCOUNT))
            out.append(
                CandidateClaim(
                    subject="user",
                    predicate="preference.activity",
                    identity={"activity": canonical},
                    value={
                        "activity": canonical,
                        "based_on": claim.claim_id,
                        "routine": {
                            "activity": claim.value.get("activity"),
                            "weekday": claim.value.get("weekday"),
                            "start_time_local": claim.value.get("start_time_local"),
                        },
                    },
                    confidence=conf,
                    # The input is the routine *claim*, not raw evidence.
                    inputs=[claim.claim_id],
                    summary=(
                        f"The user practices {canonical}: inferred from their "
                        f"weekly '{claim.value.get('activity')}' routine."
                    ),
                    reasoning=[
                        f"Derived from claim {claim.claim_id} (routine.weekly "
                        f"'{claim.value.get('activity')}' on "
                        f"{claim.value.get('weekday')}s, confidence "
                        f"{claim.confidence:.4f}).",
                        f"Activity lexicon maps the routine to '{canonical}'.",
                        "If the routine claim is refuted or invalidated, this "
                        "preference is mechanically invalidated with it.",
                    ],
                    confidence_account={
                        "method": "input_confidence * activity_discount",
                        "input_claim": claim.claim_id,
                        "input_confidence": round(claim.confidence, 6),
                        "activity_discount": ACTIVITY_DISCOUNT,
                        "result": round(conf, 6),
                    },
                )
            )
        return out