# betterbot/learning.py

"""Learning extraction — identifies facts worth persisting from conversation turns."""

from __future__ import annotations

import re
from typing import Any


# A terminator only ends a fact when it is NOT glued to a following word
# character. This keeps dots/commas inside e-mail addresses, version numbers,
# file names and "1,000"-style numbers from cutting the fact short.
_NOT_MID_TOKEN = r"(?!\w)"
_END_SENTENCE = r"[.!?]" + _NOT_MID_TOKEN
_END_CLAUSE = r"[.,!?]" + _NOT_MID_TOKEN
_END_STATEMENT = r"[.,]" + _NOT_MID_TOKEN


def _compile_all(patterns, flags=re.IGNORECASE):
    """Compile every regex source in *patterns* with *flags*."""
    return tuple(re.compile(source, flags) for source in patterns)


# Each rule is (category, longer_than, shorter_than, compiled_patterns); a match
# is kept only when ``longer_than < len(fact) < shorter_than``.
# Patterns start with ``\b`` so keywords cannot match inside other words
# (e.g. "use" inside "because").
_GLOBAL_RULES = (
    # Explicit preference statements.
    (
        "preference",
        10,
        200,
        _compile_all(
            [
                r"\b(?:I (?:prefer|like|want|always|never|hate|don't like|can't stand)\s+.+?)"
                + _END_SENTENCE,
                r"\b(?:my (?:favorite|preferred|default|usual)\s+.+?(?:is|are)\s+.+?)"
                + _END_SENTENCE,
                r"\b(?:I'm (?:a|an)\s+.+?)(?:\s+(?:at|who|that|working))?"
                + _END_CLAUSE,
            ]
        ),
    ),
    # Job/role mentions.
    (
        "profile",
        10,
        200,
        _compile_all(
            [
                r"\b(?:I (?:work|am working)\s+(?:at|for|on|with)\s+.+?)"
                + _END_CLAUSE,
                r"\b(?:I'm\s+(?:a|an)\s+\w+(?:\s+\w+)?\s+(?:engineer|developer|designer|manager|analyst|researcher|founder|student))",
                r"\b(?:my (?:role|title|position|job)\s+(?:is|as)\s+.+?)"
                + _END_CLAUSE,
            ]
        ),
    ),
    # Contact/identity info.
    (
        "identity",
        10,
        200,
        _compile_all(
            [
                r"\b(?:my (?:email|github|twitter|handle|username|name|phone|number)\s+(?:is)\s+.+?)"
                + _END_CLAUSE,
            ]
        ),
    ),
    # Family / relationships / personal facts. The bodies already exclude
    # ".", "!" and "?", so the plain terminator is kept for these.
    (
        "personal",
        10,
        300,
        _compile_all(
            [
                r"\b(?:I (?:have|am married to|live with)\s+(?:a\s+)?(?:wife|husband|partner|spouse)\s+\w+[^.!?]*)[.!?]",
                r"\b(?:my (?:wife|husband|partner|spouse|daughter|son|child|kid|mother|father|parent|sibling|brother|sister)(?:'s\s+\w+)?\s+(?:is|was|name(?:d| is)?|born)\s+[^.!?]+)[.!?]",
                r"\b(?:I (?:have|have got)\s+(?:two|three|four|five|\d+)\s+(?:daughters?|sons?|kids?|children)[^.!?]*)[.!?]",
            ]
        )
        # "<Name> born <year>": compiled case-sensitively, otherwise the
        # capitalised-name requirement is void and "was born 1990" matches.
        + _compile_all([r"\b[A-Z][a-z]+\s+born\s+\d{4}"], flags=0),
    ),
    # Tone/personality instructions (e.g. "be more concise", "use bullet points").
    (
        "tone",
        5,
        200,
        _compile_all(
            [
                r"\b(?:(?:be|use|respond|reply|answer|speak|write)\s+(?:more\s+)?(?:concise|brief|detailed|verbose|short|formal|casual|friendly|professional|terse|bullet|markdown|code))[.!?]?",
                r"\b(?:don't (?:use|add|include|give)\s+.+?(?:explanation|comment|context|preamble|prefix))[.!?]?",
            ]
        ),
    ),
)

_PROJECT_RULES = (
    # Architecture/discovery statements.
    (
        "architecture",
        15,
        250,
        _compile_all(
            [
                r"\b(?:the\s+\w[\w-]*(?:\s+\w[\w-]*)?\s+(?:uses|runs on|depends on|is backed by|is configured with|requires)\s+.+?)"
                + _END_STATEMENT,
            ]
        ),
    ),
    # Bug explanations / recurring failure patterns.
    (
        "bug_pattern",
        15,
        250,
        _compile_all(
            [
                r"\b(?:(?:root cause|the issue|the problem|the bug)\s+(?:is|was)\s+.+?)"
                + _END_STATEMENT,
                r"\b(?:this\s+(?:happens|occurs)\s+(?:because|due to|when)\s+.+?)"
                + _END_STATEMENT,
            ]
        ),
    ),
    # Deployment patterns.
    (
        "deployment",
        15,
        250,
        _compile_all(
            [
                # No leading \b here so "redeployed to ..." still matches.
                r"(?:deployed?\s+(?:via|through|using|to)\s+.+?)" + _END_STATEMENT,
                r"\b(?:the\s+(?:deploy|ci|pipeline|action)\s+(?:uses|runs|triggers)\s+.+?)"
                + _END_STATEMENT,
            ]
        ),
    ),
)


def _collect_facts(
    text: str,
    rules: tuple[tuple[str, int, int, tuple[re.Pattern[str], ...]], ...],
) -> list[tuple[str, str]]:
    """Run every rule over *text* and return unique (fact, category) pairs in match order."""
    # A dict is used as an insertion-ordered set so repeated statements are
    # reported once while the rule/pattern/position ordering is preserved.
    found: dict[tuple[str, str], None] = {}
    for category, longer_than, shorter_than, patterns in rules:
        for pattern in patterns:
            for match in pattern.finditer(text):
                fact = match.group(0).strip()
                if longer_than < len(fact) < shorter_than:
                    found.setdefault((fact, category), None)
    return list(found)


def extract_learnings_from_turn(
    user_message: str,
    assistant_message: str,
) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
    """Extract project-scoped and global learnings from a completed turn.

    Strategy: rule-based regex extraction (zero cost, no extra LLM call).

    - Global facts are matched against ``user_message``: preferences
      ("preference"), job/role ("profile"), contact info ("identity"),
      family facts ("personal") and style instructions ("tone").
    - Project facts are matched against ``assistant_message``: architecture
      notes ("architecture"), bug explanations ("bug_pattern") and deployment
      notes ("deployment").

    A fact is the matched text, including its trailing punctuation when the
    pattern has a terminator. Sentence terminators are ignored when directly
    followed by a word character, so e-mail addresses, version numbers and
    file names stay intact. Exact duplicate (fact, category) pairs are
    reported once.

    Args:
        user_message: Text the user sent in this turn.
        assistant_message: Text the assistant replied with in this turn.

    Returns:
        ``(project_learnings, global_learnings)`` where each item is
        ``(fact, category)``, ordered by rule, then pattern, then position.
    """
    project = _collect_facts(assistant_message, _PROJECT_RULES)
    global_facts = _collect_facts(user_message, _GLOBAL_RULES)
    return project, global_facts


def format_learnings_for_prompt(
    project_learnings: list[dict[str, Any]],
    global_learnings: list[dict[str, Any]],
) -> str | None:
    """Format learnings into a section to append to the system prompt.

    Returns None if there are no learnings to inject.
    """
    sections: list[str] = []

    if global_learnings:
        lines = ["## User Preferences & Profile"]
        for item in global_learnings[-15:]:  # Keep prompt concise
            lines.append(f"- {item['fact']}")
        sections.append("\n".join(lines))

    if project_learnings:
        lines = ["## Project Learnings"]
        for item in project_learnings[-15:]:
            lines.append(f"- {item['fact']}")
        sections.append("\n".join(lines))

    return "\n\n".join(sections) if sections else None