"""Notes adapter: Markdown / plain-text files -> ``note.document`` evidence. Handles ``.md``, ``.markdown``, and ``.txt`` files. For Markdown it extracts a small amount of useful structure without pulling in a parser dependency: * YAML-style front matter between leading ``---`` fences -- a restricted dialect: ``key: value`` scalars, inline ``[a, b]`` lists, and indented ``- item`` block lists. Values are kept as strings (no type coercion; interpretation is derivation's job). * the title: front matter ``title``, else the first ``# `` heading, else the filename stem; * headings, ``#hashtags``, ``[[wikilinks]]``, and ``[text](url)`` links; * the full body text (this is the user's own local data; evidence is supposed to be faithful) plus a word count. ``external_id`` is the path relative to the import root, so editing a note produces a superseding evidence op rather than a duplicate. ``observed_at`` prefers front matter ``updated``/``date``, falling back to the file's mtime in UTC. """ from __future__ import annotations import re from datetime import datetime, timezone from pathlib import Path from typing import Iterator, Optional from pmp.adapters.base import Adapter, EvidenceItem, prune, register from pmp.errors import AdapterError _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$", re.MULTILINE) _HASHTAG_RE = re.compile(r"(? str: value = value.strip() if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): return value[1:-1] return value def parse_front_matter(text: str) -> tuple[dict, str]: """Split off and parse front matter; returns (metadata, body). Only a restricted YAML dialect is supported (see module docstring). Lines that don't fit the dialect are skipped rather than failing the whole import -- front matter is auxiliary metadata, not the payload. """ lines = text.split("\n") if not lines or lines[0].strip() != "---": return {}, text end = None for i in range(1, len(lines)): if lines[i].strip() in ("---", "..."): end = i break if end is None: return {}, text meta: dict[str, object] = {} current_list_key: Optional[str] = None for raw in lines[1:end]: if not raw.strip() or raw.lstrip().startswith("#"): continue item = _LIST_ITEM_RE.match(raw) if item and current_list_key is not None: meta.setdefault(current_list_key, []) target = meta[current_list_key] if isinstance(target, list): target.append(_strip_quotes(item.group(1))) continue if ":" in raw: key, _, value = raw.partition(":") key = key.strip() value = value.strip() if not key: continue if value == "": meta[key] = [] current_list_key = key elif value.startswith("[") and value.endswith("]"): inner = value[1:-1].strip() meta[key] = ( [_strip_quotes(v) for v in inner.split(",") if v.strip()] if inner else [] ) current_list_key = None else: meta[key] = _strip_quotes(value) current_list_key = None body = "\n".join(lines[end + 1:]) return meta, body def _mtime_iso(path: Path) -> str: ts = datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc) return ts.isoformat(timespec="seconds").replace("+00:00", "Z") @register class NotesAdapter(Adapter): """Imports markdown and plain-text notes as ``note.document`` evidence.""" name = "notes" version = "1.0.0" content_types = ("note.document",) SUFFIXES = (".md", ".markdown", ".txt") def parse(self, source: Path) -> Iterator[EvidenceItem]: source = Path(source) root = source if source.is_dir() else source.parent for path in self.iter_files(source, self.SUFFIXES): try: text = path.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError) as exc: raise AdapterError(f"cannot read {path}: {exc}") from exc is_markdown = path.suffix.lower() in (".md", ".markdown") rel_path = path.relative_to(root).as_posix() if is_markdown: meta, body = parse_front_matter(text) else: meta, body = {}, text headings = [ {"level": len(h), "text": t} for h, t in _HEADING_RE.findall(body) ] if is_markdown else [] title = None if isinstance(meta.get("title"), str): title = meta["title"] elif headings and headings[0]["level"] == 1: title = headings[0]["text"] else: first_line = next((ln.strip() for ln in body.splitlines() if ln.strip()), "") title = ( first_line[:120] if not is_markdown and first_line else path.stem ) tags: list[str] = [] fm_tags = meta.get("tags") if isinstance(fm_tags, list): tags.extend(str(t).lstrip("#") for t in fm_tags) elif isinstance(fm_tags, str): tags.extend( t.strip().lstrip("#") for t in fm_tags.split(",") if t.strip() ) tags.extend(_HASHTAG_RE.findall(body)) seen: set[str] = set() tags = [t for t in tags if t and not (t in seen or seen.add(t))] wikilinks = _WIKILINK_RE.findall(body) if is_markdown else [] links = ( [{"text": t, "url": u} for t, u in _MDLINK_RE.findall(body)] if is_markdown else [] ) # Front matter values must be canonical-safe: coerce to str/list[str]. front_matter = { k: ([str(x) for x in v] if isinstance(v, list) else str(v)) for k, v in meta.items() } mtime = _mtime_iso(path) observed = None for key in ("updated", "modified", "date", "created"): if isinstance(meta.get(key), str) and meta[key]: observed = str(meta[key]) break observed = observed or mtime content = prune( { "path": rel_path, "title": title, "format": "markdown" if is_markdown else "text", "text": body, "front_matter": front_matter, "tags": tags, "headings": headings, "wikilinks": wikilinks, "links": links, "word_count": len(_WORD_RE.findall(body)), "modified_at": mtime, } ) yield EvidenceItem( content_type="note.document", content=content, external_id=rel_path, observed_at=observed, origin=path.resolve().as_uri(), )