"""Notes import adapter: Markdown / plain-text note files to evidence records. Understands the conventions most note apps export to: * optional ``---`` front matter with simple ``key: value`` pairs (``title``, ``date``/``created``, ``tags: [a, b]`` or comma lists), * a first-level ``# Heading`` as the title fallback, * inline ``#hashtags`` in the body. No YAML dependency: front matter is parsed with a deliberately small ``key: value`` reader, and anything it cannot read is left in the body text. External ids are the file's path relative to the ingest root, so re-exports of the same notebook map onto the same evidence. """ from __future__ import annotations import re from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from . import EvidenceRecord __all__ = ["load", "parse_note"] _NOTE_SUFFIXES = (".md", ".markdown", ".txt") _TAG_RE = re.compile(r"(?:^|[\s(\[])#([A-Za-z][A-Za-z0-9_-]{1,40})\b") _HEADING_RE = re.compile(r"^#\s+(.+?)\s*$", re.MULTILINE) _DATE_RE = re.compile(r"^(\d{4}-\d{2}-\d{2})(?:[T ](\d{2}:\d{2}(?::\d{2})?))?") _WORD_RE = re.compile(r"\w+", re.UNICODE) def _parse_front_matter(text: str) -> Tuple[Dict[str, Any], str]: """Split optional ``---`` front matter from the note body.""" lines = text.split("\n") if not lines or lines[0].strip() != "---": return {}, text meta: Dict[str, Any] = {} for index in range(1, len(lines)): stripped = lines[index].strip() if stripped in ("---", "..."): return meta, "\n".join(lines[index + 1 :]) if ":" not in stripped or stripped.startswith("#"): continue key, value = stripped.split(":", 1) key = key.strip().lower() value = value.strip() if key == "tags": tags = [ t.strip().lstrip("#").lower() for t in re.split(r"[,\s]+", value.strip("[]")) if t.strip().lstrip("#") ] meta[key] = tags elif value: meta[key] = value.strip("\"'") # No closing fence: treat the whole file as body. return {}, text def _normalize_date(value: Optional[str]) -> Optional[str]: if not isinstance(value, str): return None match = _DATE_RE.match(value.strip()) if not match: return None date, time = match.groups() if time: if len(time) == 5: time += ":00" return f"{date}T{time}" return date def parse_note(text: str, fallback_title: str) -> Dict[str, Any]: """Parse one note file's text into a JSON-safe payload.""" meta, body = _parse_front_matter(text) body = body.strip("\n") title = meta.get("title") if not title: heading = _HEADING_RE.search(body) title = heading.group(1) if heading else fallback_title tags = set(meta.get("tags") or []) tags.update(tag.lower() for tag in _TAG_RE.findall(body)) created = _normalize_date( meta.get("date") or meta.get("created") or meta.get("created_at") ) payload: Dict[str, Any] = { "title": str(title), "text": body, "tags": sorted(tags), "word_count": len(_WORD_RE.findall(body)), } if created: payload["created"] = created extra_meta = { key: value for key, value in meta.items() if key not in ("title", "tags", "date", "created", "created_at") } if extra_meta: payload["meta"] = extra_meta return payload def load(path: str) -> List[EvidenceRecord]: """Load one note file, or every note file under a directory tree.""" root = Path(path) if root.is_dir(): files = sorted( p for p in root.rglob("*") if p.is_file() and p.suffix.lower() in _NOTE_SUFFIXES ) base = root elif root.is_file(): files = [root] base = root.parent else: raise FileNotFoundError(f"notes source not found: {path}") records: List[EvidenceRecord] = [] for file in files: text = file.read_text(encoding="utf-8", errors="replace") payload = parse_note(text, fallback_title=file.stem.replace("-", " ").replace("_", " ")) relative = file.relative_to(base).as_posix() records.append( EvidenceRecord( source=f"notes:{base.name}", kind="note.document", external_id=relative, payload=payload, observed_at=payload.get("created"), ) ) return records