"""Preference extraction from notes, plus higher-order preferences from routine claims. Layer 1 (evidence -> claim) reads first-person statements in notes: * "I love/like/enjoy X" -> ``preference.likes`` * "my favourite C is X" -> ``preference.likes`` (with category) * "I hate/dislike/can't stand X" -> ``preference.dislikes`` * "I'm allergic to / I avoid X" -> ``preference.avoids`` Layer 2 (claim -> claim) maps ``routine.weekly`` activities through a small activity lexicon into ``preference.activity`` -- e.g. a weekly yoga routine implies the user practices yoga. Its input is the routine *claim*, so refuting the routine cascades into the preference. """ from __future__ import annotations import re from collections import defaultdict from typing import Dict, List, Optional from mnema.derive.confidence import clamp, discount, support_curve, temporal_decay from mnema.derive.derivers.base import ( CandidateClaim, DerivationContext, Deriver, DeriverInfo, normalize_text, ) EXPLICIT_SUPPORT_MIDPOINT = 0.5 EXPLICIT_SUPPORT_STEEPNESS = 2.0 LIKE_CEILING = 0.85 AVOID_CEILING = 0.92 HALF_LIFE_DAYS = 365.0 GRACE_DAYS = 60.0 ACTIVITY_DISCOUNT = 0.85 _SENTENCE_SPLIT = re.compile(r"[.!?;\n]+") _TRAILER = re.compile(r"\b(?:because|since|when|but|and|so|which)\b.*$") _ARTICLES = ("the ", "a ", "an ", "some ", "my ") _PRONOUNS = {"it", "that", "this", "them", "those", "these", "him", "her"} _PHRASE = r"(?P[a-z0-9][\w '\-]{1,50})" _LIKE = re.compile(r"\bi (?:really |absolutely |just )?(?:love|like|enjoy|adore) " + _PHRASE) _FAV = re.compile(r"\bmy favou?rite (?P[\w ]{1,30}?) is " + _PHRASE) _DISLIKE = re.compile( r"\bi (?:really |absolutely )?(?:hate|dislike|can'?t stand|cannot stand|loathe) " + _PHRASE ) _AVOID = re.compile( r"\bi(?:'m| am)? (?:allergic to|avoid|can'?t eat|cannot eat|stay away from) " + _PHRASE ) #: token (as it appears in routine activity text) -> canonical activity name ACTIVITY_LEXICON: Dict[str, str] = { "yoga": "yoga", "gym": "strength training", "weights": "strength training", "run": "running", "running": "running", "jog": "running", "climb": "climbing", "climbing": "climbing", "bouldering": "climbing", "swim": "swimming", "swimming": "swimming", "cycling": "cycling", "bike": "cycling", "spin": "cycling", "tennis": "tennis", "soccer": "soccer", "football": "soccer", "pilates": "pilates", "meditation": "meditation", "hike": "hiking", "hiking": "hiking", } def _clean_phrase(raw: str) -> Optional[str]: phrase = _TRAILER.sub("", raw).strip(" \t'\"-,") phrase = normalize_text(phrase) for art in _ARTICLES: if phrase.startswith(art): phrase = phrase[len(art):] break phrase = phrase.strip() if len(phrase) < 2 or phrase in _PRONOUNS or len(phrase.split()) > 5: return None return phrase def _note_text(payload: dict) -> str: return str(payload.get("text") or payload.get("body") or payload.get("content") or "") class PreferencesDeriver(Deriver): info = DeriverInfo( deriver_id="mnema.preferences", version="1.0.0", consumes_evidence=("note",), consumes_predicates=("routine.weekly",), produces_predicates=( "preference.likes", "preference.dislikes", "preference.avoids", "preference.activity", ), ) def derive(self, ctx: DerivationContext) -> List[CandidateClaim]: return self._from_notes(ctx) + self._from_routines(ctx) # ------------------------------------------------------------------ # def _from_notes(self, ctx: DerivationContext) -> List[CandidateClaim]: # (predicate, phrase) -> accumulator tally: Dict[tuple, dict] = defaultdict( lambda: { "count": 0, "evidence_ids": set(), "quotes": [], "category": None, "allergy": False, "last_seen": None, } ) for ev in ctx.evidence("note"): text = _note_text(ev.payload) if not text: continue for sentence in _SENTENCE_SPLIT.split(text.lower()): sentence = sentence.strip() if not sentence: continue for predicate, match in self._scan_sentence(sentence): phrase = _clean_phrase(match.group("phrase")) if phrase is None: continue entry = tally[(predicate, phrase)] entry["count"] += 1 entry["evidence_ids"].add(ev.evidence_id) if len(entry["quotes"]) < 3: entry["quotes"].append(sentence[:120]) if predicate == "preference.avoids" and "allergic" in match.group(0): entry["allergy"] = True if "category" in match.groupdict() and match.group("category"): entry["category"] = normalize_text(match.group("category")) if entry["last_seen"] is None or ev.observed_at > entry["last_seen"]: entry["last_seen"] = ev.observed_at out: List[CandidateClaim] = [] for (predicate, phrase) in sorted(tally): entry = tally[(predicate, phrase)] n = entry["count"] ceiling = AVOID_CEILING if predicate == "preference.avoids" else LIKE_CEILING base = support_curve( n, midpoint=EXPLICIT_SUPPORT_MIDPOINT, steepness=EXPLICIT_SUPPORT_STEEPNESS, ceiling=ceiling, ) age = 0.0 if entry["last_seen"]: age = max(0.0, ctx.age_days(entry["last_seen"]) - GRACE_DAYS) conf = clamp(temporal_decay(base, age, HALF_LIFE_DAYS)) value = { "item": phrase, "mentions": n, "quotes": list(entry["quotes"]), "last_seen": entry["last_seen"], } if entry["category"]: value["category"] = entry["category"] if entry["allergy"]: value["reason"] = "allergy" verb = { "preference.likes": "likes", "preference.dislikes": "dislikes", "preference.avoids": "avoids", }[predicate] out.append( CandidateClaim( subject="user", predicate=predicate, identity={"item": phrase}, value=value, confidence=conf, inputs=sorted(entry["evidence_ids"]), summary=( f"The user {verb} '{phrase}' -- stated in their own " f"words {n} time(s) in notes." ), reasoning=[ f"First-person statement(s) matched in notes ({n}x), " f"e.g. \"{entry['quotes'][0]}\".", "Explicit self-statements start at high confidence " f"(ceiling {ceiling}) and decay slowly " f"(half-life {HALF_LIFE_DAYS:.0f} days).", ], confidence_account={ "method": "support_curve(mentions) * temporal_decay", "mentions": n, "support_midpoint": EXPLICIT_SUPPORT_MIDPOINT, "support_steepness": EXPLICIT_SUPPORT_STEEPNESS, "support_ceiling": ceiling, "base": round(base, 6), "age_days_past_grace": round(age, 3), "half_life_days": HALF_LIFE_DAYS, "result": round(conf, 6), }, ) ) return out @staticmethod def _scan_sentence(sentence: str): for predicate, pattern in ( ("preference.likes", _LIKE), ("preference.likes", _FAV), ("preference.dislikes", _DISLIKE), ("preference.avoids", _AVOID), ): for match in pattern.finditer(sentence): yield predicate, match # ------------------------------------------------------------------ # def _from_routines(self, ctx: DerivationContext) -> List[CandidateClaim]: # activity -> best supporting routine claim best: Dict[str, object] = {} for claim in ctx.claims("routine.weekly"): activity_text = str(claim.value.get("activity", "")) for token in normalize_text(activity_text).split(): canonical = ACTIVITY_LEXICON.get(token) if canonical is None: continue prev = best.get(canonical) if prev is None or claim.confidence > prev.confidence: best[canonical] = claim out: List[CandidateClaim] = [] for canonical in sorted(best): claim = best[canonical] conf = clamp(discount(claim.confidence, ACTIVITY_DISCOUNT)) out.append( CandidateClaim( subject="user", predicate="preference.activity", identity={"activity": canonical}, value={ "activity": canonical, "based_on": claim.claim_id, "routine": { "activity": claim.value.get("activity"), "weekday": claim.value.get("weekday"), "start_time_local": claim.value.get("start_time_local"), }, }, confidence=conf, inputs=[claim.claim_id], summary=( f"The user practices {canonical}: inferred from their " f"weekly '{claim.value.get('activity')}' routine." ), reasoning=[ f"Derived from claim {claim.claim_id} (routine.weekly " f"'{claim.value.get('activity')}' on " f"{claim.value.get('weekday')}s, confidence " f"{claim.confidence:.4f}).", f"Activity lexicon maps the routine to '{canonical}'.", "If the routine claim is refuted or invalidated, this " "preference is mechanically invalidated with it.", ], confidence_account={ "method": "input_confidence * activity_discount", "input_claim": claim.claim_id, "input_confidence": round(claim.confidence, 6), "activity_discount": ACTIVITY_DISCOUNT, "result": round(conf, 6), }, ) ) return out