"""Learning extraction — identifies facts worth persisting from conversation turns.""" from __future__ import annotations import re from typing import Any def extract_learnings_from_turn( user_message: str, assistant_message: str, ) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]: """Extract project-scoped and global learnings from a completed turn. Returns: (project_learnings, global_learnings) where each item is (fact, category). Strategy: rule-based extraction (zero cost, no extra LLM call). - Detect user-stated preferences, personality cues, contact info → global - Detect technical discoveries, patterns, architecture notes → project """ project: list[tuple[str, str]] = [] global_facts: list[tuple[str, str]] = [] # ── Global learnings (user preferences, personality) ────────────── # Explicit preference statements pref_patterns = [ r"(?:I (?:prefer|like|want|always|never|hate|don't like|can't stand)\s+.+?)[.!?]", r"(?:my (?:favorite|preferred|default|usual)\s+.+?(?:is|are)\s+.+?)[.!?]", r"(?:I'm (?:a|an)\s+.+?)(?:\s+(?:at|who|that|working))?[.,!?]", ] for pattern in pref_patterns: for match in re.finditer(pattern, user_message, re.IGNORECASE): fact = match.group(0).strip() if 10 < len(fact) < 200: global_facts.append((fact, "preference")) # Job/role mentions job_patterns = [ r"(?:I (?:work|am working)\s+(?:at|for|on|with)\s+.+?)[.,!?]", r"(?:I'm\s+(?:a|an)\s+\w+(?:\s+\w+)?\s+(?:engineer|developer|designer|manager|analyst|researcher|founder|student))", r"(?:my (?:role|title|position|job)\s+(?:is|as)\s+.+?)[.,!?]", ] for pattern in job_patterns: for match in re.finditer(pattern, user_message, re.IGNORECASE): fact = match.group(0).strip() if 10 < len(fact) < 200: global_facts.append((fact, "profile")) # Contact/identity info identity_patterns = [ r"(?:my (?:email|github|twitter|handle|username|name|phone|number)\s+(?:is)\s+.+?)[.,!?]", ] for pattern in identity_patterns: for match in re.finditer(pattern, user_message, re.IGNORECASE): fact = match.group(0).strip() if 10 < len(fact) < 200: global_facts.append((fact, "identity")) # Family / relationships / personal facts family_patterns = [ r"(?:I (?:have|am married to|live with)\s+(?:a\s+)?(?:wife|husband|partner|spouse)\s+\w+[^.!?]*)[.!?]", r"(?:my (?:wife|husband|partner|spouse|daughter|son|child|kid|mother|father|parent|sibling|brother|sister)(?:'s\s+\w+)?\s+(?:is|was|name(?:d| is)?|born)\s+[^.!?]+)[.!?]", r"(?:I (?:have|have got)\s+(?:two|three|four|five|\d+)\s+(?:daughters?|sons?|kids?|children)[^.!?]*)[.!?]", r"[A-Z][a-z]+\s+born\s+\d{4}", ] for pattern in family_patterns: for match in re.finditer(pattern, user_message, re.IGNORECASE): fact = match.group(0).strip() if 10 < len(fact) < 300: global_facts.append((fact, "personal")) # Tone/personality instructions (e.g., "be more concise", "use bullet points") tone_patterns = [ r"(?:(?:be|use|respond|reply|answer|speak|write)\s+(?:more\s+)?(?:concise|brief|detailed|verbose|short|formal|casual|friendly|professional|terse|bullet|markdown|code))[.!?]?", r"(?:don't (?:use|add|include|give)\s+.+?(?:explanation|comment|context|preamble|prefix))[.!?]?", ] for pattern in tone_patterns: for match in re.finditer(pattern, user_message, re.IGNORECASE): fact = match.group(0).strip() if 5 < len(fact) < 200: global_facts.append((fact, "tone")) # ── Project learnings (technical facts, patterns) ───────────────── # Architecture/discovery from assistant responses arch_patterns = [ r"(?:the\s+\w[\w-]*(?:\s+\w[\w-]*)?\s+(?:uses|runs on|depends on|is backed by|is configured with|requires)\s+.+?)[.,]", ] for pattern in arch_patterns: for match in re.finditer(pattern, assistant_message, re.IGNORECASE): fact = match.group(0).strip() if 15 < len(fact) < 250: project.append((fact, "architecture")) # Bug/pattern from assistant bug_patterns = [ r"(?:(?:root cause|the issue|the problem|the bug)\s+(?:is|was)\s+.+?)[.,]", r"(?:this\s+(?:happens|occurs)\s+(?:because|due to|when)\s+.+?)[.,]", ] for pattern in bug_patterns: for match in re.finditer(pattern, assistant_message, re.IGNORECASE): fact = match.group(0).strip() if 15 < len(fact) < 250: project.append((fact, "bug_pattern")) # Deployment patterns from assistant deploy_patterns = [ r"(?:deployed?\s+(?:via|through|using|to)\s+.+?)[.,]", r"(?:the\s+(?:deploy|ci|pipeline|action)\s+(?:uses|runs|triggers)\s+.+?)[.,]", ] for pattern in deploy_patterns: for match in re.finditer(pattern, assistant_message, re.IGNORECASE): fact = match.group(0).strip() if 15 < len(fact) < 250: project.append((fact, "deployment")) return project, global_facts def format_learnings_for_prompt( project_learnings: list[dict[str, Any]], global_learnings: list[dict[str, Any]], ) -> str | None: """Format learnings into a section to append to the system prompt. Returns None if there are no learnings to inject. """ sections: list[str] = [] if global_learnings: lines = ["## User Preferences & Profile"] for item in global_learnings[-15:]: # Keep prompt concise lines.append(f"- {item['fact']}") sections.append("\n".join(lines)) if project_learnings: lines = ["## Project Learnings"] for item in project_learnings[-15:]: lines.append(f"- {item['fact']}") sections.append("\n".join(lines)) return "\n\n".join(sections) if sections else None