Skip to content

distill

Full name: tenets.cli.commands.distill

distill

Distill command - extract relevant context from codebase.

Classes

Functions

distill

Python
distill(prompt: str = typer.Argument(..., help='Your query or task (can be text or URL to GitHub issue, etc.)'), path: Path = typer.Argument(Path(), help='Path to analyze (directory or files)'), format: str = typer.Option('markdown', '--format', '-f', help='Output format: markdown, xml, json, html'), output: Optional[Path] = typer.Option(None, '--output', '-o', help='Save output to file instead of stdout'), mode: str = typer.Option('balanced', '--mode', '-m', help='Analysis mode: fast (keywords only), balanced (default), thorough (deep analysis)'), model: Optional[str] = typer.Option(None, '--model', help='Target LLM model for token counting (e.g., gpt-4o, claude-3-opus)'), max_tokens: Optional[int] = typer.Option(None, '--max-tokens', help='Maximum tokens for context (overrides model default)'), include: Optional[str] = typer.Option(None, '--include', '-i', help="Include file patterns (e.g., '*.py,*.js')"), exclude: Optional[str] = typer.Option(None, '--exclude', '-e', help="Exclude file patterns (e.g., 'test_*,*.backup')"), include_tests: bool = typer.Option(False, '--include-tests', help='Include test files (overrides default exclusion)'), exclude_tests: bool = typer.Option(False, '--exclude-tests', help='Explicitly exclude test files (even for test-related prompts)'), include_minified: bool = typer.Option(False, '--include-minified', help='Include minified/built files (*.min.js, dist/, etc.) normally excluded'), no_git: bool = typer.Option(False, '--no-git', help='Disable git context inclusion'), full: bool = typer.Option(False, '--full', help='Include full content for all ranked files within token budget (no summarization)'), condense: bool = typer.Option(False, '--condense', help='Condense whitespace (collapse large blank runs, trim trailing spaces) before counting tokens'), remove_comments: bool = typer.Option(False, '--remove-comments', help='Strip comments (heuristic, language-aware) before counting tokens'), docstring_weight: Optional[float] = typer.Option(None, '--docstring-weight', min=0.0, max=1.0, help='Weight for including docstrings in summaries (0=never, 0.5=balanced, 1.0=always)'), no_summarize_imports: bool = typer.Option(False, '--no-summarize-imports', help='Disable import summarization (show all imports verbatim)'), session: Optional[str] = typer.Option(None, '--session', '-s', help='Use session for stateful context building'), estimate_cost: bool = typer.Option(False, '--estimate-cost', help='Show token usage and cost estimate'), show_stats: bool = typer.Option(False, '--stats', help='Show statistics about context generation'), verbose: bool = typer.Option(False, '--verbose', '-v', help='Show detailed debug information including keyword matching'), copy: bool = typer.Option(False, '--copy', help='Copy distilled context to clipboard (also enabled automatically if config.output.copy_on_distill)'))

Distill relevant context from your codebase for any prompt.

This command extracts and aggregates the most relevant files, documentation, and git history based on your query, optimizing for LLM token limits.

Examples:

Text Only
# Basic usage
tenets distill "implement OAuth2 authentication"

# From a GitHub issue
tenets distill https://github.com/org/repo/issues/123

# Specific path with options
tenets distill "add caching layer" ./src --mode thorough --max-tokens 50000

# Filter by file types
tenets distill "review API" --include "*.py,*.yaml" --exclude "test_*"

# Save to file with cost estimate
tenets distill "debug login" -o context.md --model gpt-4o --estimate-cost
Source code in tenets/cli/commands/distill.py
Python
def distill(
    prompt: str = typer.Argument(
        ..., help="Your query or task (can be text or URL to GitHub issue, etc.)"
    ),
    path: Path = typer.Argument(Path(), help="Path to analyze (directory or files)"),
    # Output options
    format: str = typer.Option(
        "markdown", "--format", "-f", help="Output format: markdown, xml, json, html"
    ),
    output: Optional[Path] = typer.Option(
        None, "--output", "-o", help="Save output to file instead of stdout"
    ),
    # Analysis options
    mode: str = typer.Option(
        "balanced",
        "--mode",
        "-m",
        help="Analysis mode: fast (keywords only), balanced (default), thorough (deep analysis)",
    ),
    model: Optional[str] = typer.Option(
        None, "--model", help="Target LLM model for token counting (e.g., gpt-4o, claude-3-opus)"
    ),
    max_tokens: Optional[int] = typer.Option(
        None, "--max-tokens", help="Maximum tokens for context (overrides model default)"
    ),
    # Filtering
    include: Optional[str] = typer.Option(
        None, "--include", "-i", help="Include file patterns (e.g., '*.py,*.js')"
    ),
    exclude: Optional[str] = typer.Option(
        None, "--exclude", "-e", help="Exclude file patterns (e.g., 'test_*,*.backup')"
    ),
    include_tests: bool = typer.Option(
        False, "--include-tests", help="Include test files (overrides default exclusion)"
    ),
    exclude_tests: bool = typer.Option(
        False,
        "--exclude-tests",
        help="Explicitly exclude test files (even for test-related prompts)",
    ),
    include_minified: bool = typer.Option(
        False,
        "--include-minified",
        help="Include minified/built files (*.min.js, dist/, etc.) normally excluded",
    ),
    # Features
    no_git: bool = typer.Option(False, "--no-git", help="Disable git context inclusion"),
    full: bool = typer.Option(
        False,
        "--full",
        help="Include full content for all ranked files within token budget (no summarization)",
    ),
    condense: bool = typer.Option(
        False,
        "--condense",
        help="Condense whitespace (collapse large blank runs, trim trailing spaces) before counting tokens",
    ),
    remove_comments: bool = typer.Option(
        False,
        "--remove-comments",
        help="Strip comments (heuristic, language-aware) before counting tokens",
    ),
    docstring_weight: Optional[float] = typer.Option(
        None,
        "--docstring-weight",
        min=0.0,
        max=1.0,
        help="Weight for including docstrings in summaries (0=never, 0.5=balanced, 1.0=always)",
    ),
    no_summarize_imports: bool = typer.Option(
        False,
        "--no-summarize-imports",
        help="Disable import summarization (show all imports verbatim)",
    ),
    session: Optional[str] = typer.Option(
        None, "--session", "-s", help="Use session for stateful context building"
    ),
    # Info options
    estimate_cost: bool = typer.Option(
        False, "--estimate-cost", help="Show token usage and cost estimate"
    ),
    show_stats: bool = typer.Option(
        False, "--stats", help="Show statistics about context generation"
    ),
    verbose: bool = typer.Option(
        False, "--verbose", "-v", help="Show detailed debug information including keyword matching"
    ),
    copy: bool = typer.Option(
        False,
        "--copy",
        help="Copy distilled context to clipboard (also enabled automatically if config.output.copy_on_distill)",
    ),
    # Context options
):
    """
    Distill relevant context from your codebase for any prompt.

    This command extracts and aggregates the most relevant files, documentation,
    and git history based on your query, optimizing for LLM token limits.

    Examples:

        # Basic usage
        tenets distill "implement OAuth2 authentication"

        # From a GitHub issue
        tenets distill https://github.com/org/repo/issues/123

        # Specific path with options
        tenets distill "add caching layer" ./src --mode thorough --max-tokens 50000

        # Filter by file types
        tenets distill "review API" --include "*.py,*.yaml" --exclude "test_*"

        # Save to file with cost estimate
        tenets distill "debug login" -o context.md --model gpt-4o --estimate-cost
    """
    # NOTE: Typer uses the docstring above as this command's help text, so it is
    # left untouched and implementation notes live in comments instead.
    #
    # Flow: read the global "quiet" flag from the click context, run
    # ``Tenets().distill`` (behind a spinner unless quiet), optionally dump
    # verbose metadata, emit the result (file / auto-named file / stdout),
    # then handle clipboard copy, cost estimate, empty-result hints and stats.
    # Any exception is reported on the console and converted to exit code 1.

    # Bind ``click`` before its first use. A function-scope ``import click``
    # placed later in the body makes the name local to the whole function, so
    # an earlier ``click.get_current_context`` call would raise
    # UnboundLocalError (silently swallowed), leaving ``quiet`` always False.
    import click

    # Read shared CLI state from the click context; only a dict is trusted.
    state = {}
    try:
        _ctx = click.get_current_context(silent=True)
        ctx_obj = getattr(_ctx, "obj", None)
        if isinstance(ctx_obj, dict):
            state = ctx_obj
    except Exception:
        # Best effort: without a usable context fall back to defaults.
        state = {}
    # ``verbose`` comes straight from the parameter; only ``quiet`` is global.
    quiet = bool(state.get("quiet", False))

    # Normalised copy for local comparisons so "JSON" and "json" agree
    # everywhere; the value handed to ``tenets.distill`` is left as given.
    fmt = format.lower()

    # Suppress timer output for JSON/HTML on stdout so it cannot mix with the payload.
    suppress_timer = (not output) and fmt in ("json", "html")
    timer = CommandTimer(console, quiet or suppress_timer)

    try:
        timer.start("Initializing tenets...")
        tenets = Tenets()

        include_patterns = _distill_split_patterns(include)
        exclude_patterns = _distill_split_patterns(exclude)

        # Priority: --exclude-tests > --include-tests > automatic detection (None).
        if exclude_tests:
            test_inclusion = False
        elif include_tests:
            test_inclusion = True
        else:
            test_inclusion = None

        distill_kwargs = {
            "prompt": prompt,
            "files": path,
            "format": format,
            "model": model,
            "max_tokens": max_tokens,
            "mode": mode,
            "include_git": not no_git,
            "session_name": session,
            "include_patterns": include_patterns,
            "exclude_patterns": exclude_patterns,
            "full": full,
            "condense": condense,
            "remove_comments": remove_comments,
            "include_tests": test_inclusion,
            "docstring_weight": docstring_weight,
            "summarize_imports": not no_summarize_imports,
        }

        if quiet:
            # No progress spinner in quiet mode.
            result = tenets.distill(**distill_kwargs)
        else:
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                console=console,
                transient=True,
            ) as progress:
                progress.add_task(f"Distilling context for: {prompt[:50]}...", total=None)
                result = tenets.distill(**distill_kwargs)

        # Metadata is optional on the result; anything that is not a dict is ignored.
        raw_meta = getattr(result, "metadata", {})
        metadata = raw_meta if isinstance(raw_meta, dict) else {}

        if verbose and not quiet:
            _distill_print_verbose(metadata)

        files_included = metadata.get("files_included", 0)
        files_analyzed = metadata.get("files_analyzed", 0)
        try:
            token_count = int(getattr(result, "token_count", 0))
        except (TypeError, ValueError):
            token_count = 0

        # ``on_tty``: a human is watching the terminal. ``interactive``
        # additionally requires that the context itself goes to stdout.
        on_tty = (not quiet) and sys.stdout.isatty()
        interactive = (output is None) and on_tty

        if fmt == "json":
            output_text = json.dumps(result.to_dict(), indent=2)
        else:
            output_text = result.context

        timing_result = timer.stop("Context distillation complete")

        # Show a concise summary before content in interactive mode.
        if interactive:
            git_display = "disabled" if no_git else "enabled (ranking only)"
            max_tokens_display = str(max_tokens) if max_tokens else "model default"
            # Escape user-supplied strings so stray brackets are not parsed as Rich markup.
            prompt_text = escape(str(prompt)[:80])
            path_text = escape(str(path))
            include_display = escape(",".join(include_patterns) if include_patterns else "(none)")
            exclude_display = escape(",".join(exclude_patterns) if exclude_patterns else "(none)")
            session_display = escape(session or "(none)")
            console.print(
                Panel(
                    f"[bold]Prompt[/bold]: {prompt_text}\n"
                    f"Path: {path_text}\n"
                    f"Mode: {metadata.get('mode', 'unknown')}  •  Format: {format}\n"
                    f"Full: {metadata.get('full_mode', full)}  •  Condense: {metadata.get('condense', condense)}  •  Remove Comments: {metadata.get('remove_comments', remove_comments)}\n"
                    f"Files: {files_included}/{files_analyzed}  •  Tokens: {token_count:,} / {max_tokens_display}\n"
                    f"Include: {include_display}\n"
                    f"Exclude: {exclude_display}\n"
                    f"Git: {git_display}  •  Session: {session_display}\n"
                    f"[dim]Time: {timing_result.formatted_duration}[/dim]",
                    title="Tenets Context",
                    border_style="green",
                )
            )

        # Output result
        if output:
            output.write_text(output_text, encoding="utf-8")
            if not quiet:
                console.print(
                    f"[green]✓[/green] Context saved to {escape(str(output))} [dim]({timing_result.formatted_duration})[/dim]"
                )
                # Gate on ``on_tty`` rather than ``interactive``: the latter is
                # always False once ``output`` is set, which made this offer
                # unreachable.
                if fmt == "html" and on_tty:
                    _distill_offer_browser(output)
        elif fmt in ("json", "xml", "html"):
            if interactive:
                # Structured formats are unpleasant in a terminal: save to an
                # auto-named file in the current directory instead.
                default_file = _distill_default_filename(prompt, fmt)
                default_file.write_text(output_text, encoding="utf-8")
                console.print(
                    f"[green]✓[/green] {format.upper()} context saved to {escape(str(default_file))} [dim]({timing_result.formatted_duration})[/dim]"
                )

                if fmt == "html":
                    if not _distill_offer_browser(default_file):
                        console.print(
                            "[cyan]💡 Tip:[/cyan] Open the file in a browser or use --output to specify a different path"
                        )
                elif click.confirm(
                    f"\nWould you like to open the folder containing the {format.upper()} file?",
                    default=False,
                ):
                    folder = default_file.parent.resolve()
                    _distill_open_folder(folder)
                    console.print(f"[green]✓[/green] Opened folder: {folder}")
            else:
                # Non-interactive mode: print raw output for piping.
                print(output_text)
        else:
            # Draw clear context boundaries in interactive TTY only.
            if interactive:
                console.rule("Context")
            print(output_text)
            if interactive:
                console.rule("End")

        # Clipboard copy (after output so piping still works).
        do_copy = copy
        try:
            # Best-effort config lookup; the Tenets instance may not expose config.
            cfg = getattr(tenets, "config", None)
            if cfg and getattr(getattr(cfg, "output", None), "copy_on_distill", False):
                do_copy = True
        except Exception:
            # A broken config object must not prevent the command from finishing.
            pass
        if do_copy:
            copied = _distill_copy_to_clipboard(output_text)
            if not quiet:
                if copied:
                    console.print(
                        f"[cyan]📋 Context copied to clipboard[/cyan] [dim]({timing_result.formatted_duration} total)[/dim]"
                    )
                else:
                    console.print(
                        "[yellow]Warning:[/yellow] Unable to copy to clipboard (missing pyperclip/xclip/pbcopy)."
                    )

        # Show cost estimation if requested (needs a model to price against).
        if estimate_cost and model:
            cost_info = tenets.estimate_cost(result, model)

            if not quiet:
                console.print(
                    Panel(
                        f"[bold]Token Usage[/bold]\n"
                        f"Context tokens: {cost_info['input_tokens']:,}\n"
                        f"Est. response: {cost_info['output_tokens']:,}\n"
                        f"Total tokens: {cost_info['input_tokens'] + cost_info['output_tokens']:,}\n\n"
                        f"[bold]Cost Estimate[/bold]\n"
                        f"Context cost: ${cost_info['input_cost']:.4f}\n"
                        f"Response cost: ${cost_info['output_cost']:.4f}\n"
                        f"Total cost: ${cost_info['total_cost']:.4f}",
                        title=f"💰 Cost Estimate for {model}",
                        border_style="yellow",
                    )
                )

        # If no files included, provide actionable suggestions. Avoid contaminating JSON stdout.
        if files_included == 0 and fmt != "json" and output is None:
            if interactive:
                console.print(
                    Panel(
                        "No files were included in the context.\n\n"
                        "Try: \n"
                        "• Increase --max-tokens\n"
                        "• Relax filters: remove or adjust --include/--exclude\n"
                        "• Use --mode thorough for deeper analysis\n"
                        "• Run with --verbose to see why files were skipped\n"
                        "• Add --stats to view generation metrics",
                        title="Suggestions",
                        border_style="red",
                    )
                )
            else:
                # Plain output for non-interactive (piped) environments.
                print("No files were included in the context.")
                print("Suggestions")
                print("- Increase --max-tokens")
                print("- Relax filters: remove or adjust --include/--exclude")
                print("- Use --mode thorough for deeper analysis")
                print("- Run with --verbose to see why files were skipped")
                print("- Add --stats to view generation metrics")

        # Show statistics if requested.
        if show_stats and not quiet:
            console.print(
                Panel(
                    f"[bold]Distillation Statistics[/bold]\n"
                    f"Mode: {metadata.get('mode', 'unknown')}\n"
                    f"Files found: {files_analyzed}\n"
                    f"Files included: {files_included}\n"
                    f"Token usage: {token_count:,} / {max_tokens or 'model default'}\n"
                    f"Analysis time: {metadata.get('analysis_time', '?')}s\n"
                    f"Total time: [green]{timing_result.formatted_duration}[/green]",
                    title="📊 Statistics",
                    border_style="blue",
                )
            )

    except Exception as e:
        # Stop the timer if it is still running so the elapsed time can be reported.
        if timer.start_time and not timer.end_time:
            timing_result = timer.stop("Operation failed")
            if not quiet:
                console.print(f"[dim]Failed after {timing_result.formatted_duration}[/dim]")

        # Escape dynamic error text to avoid Rich markup parsing issues (e.g., stray [ or ]).
        console.print(f"[red]Error:[/red] {escape(str(e))}")
        if verbose:
            console.print_exception()
        raise typer.Exit(1) from e


def _distill_split_patterns(raw: Optional[str]) -> Optional[list]:
    """Split a comma-separated pattern option into a list, or ``None`` if empty.

    Whitespace around each entry is stripped and empty entries are dropped, so
    ``"*.py, *.js"`` yields ``["*.py", "*.js"]``.
    """
    if not raw:
        return None
    pieces = [piece.strip() for piece in raw.split(",")]
    return [piece for piece in pieces if piece] or None


def _distill_default_filename(prompt: str, ext: str) -> Path:
    """Build ``distill_<prompt-snippet>_<timestamp>.<ext>`` for auto-saved output."""
    import re
    from datetime import datetime

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    snippet = prompt[:50] if prompt else "context"
    # Keep word characters, dashes and whitespace; drop everything else.
    snippet = re.sub(r"[^\w\-\s]", "", snippet)
    # Collapse any whitespace (including tabs/newlines) and underscore runs.
    snippet = re.sub(r"[\s_]+", "_", snippet)[:30].strip("_") or "context"
    return Path(f"distill_{snippet}_{timestamp}.{ext}")


def _distill_offer_browser(html_path: Path) -> bool:
    """Ask whether to open *html_path* in a browser; return True if it was opened."""
    import webbrowser

    import click

    if not click.confirm("\nWould you like to open it in your browser now?", default=False):
        return False
    # A file URI needs an absolute path.
    webbrowser.open(html_path.resolve().as_uri())
    console.print("[green]✓[/green] Opened in browser")
    return True


def _distill_open_folder(folder: Path) -> None:
    """Open *folder* with the platform's file manager (startfile / open / xdg-open)."""
    import platform
    import subprocess

    system = platform.system()
    if system == "Windows":
        import os

        os.startfile(folder)
    elif system == "Darwin":  # macOS
        subprocess.run(["open", str(folder)], check=False)
    else:  # Linux and anything else
        subprocess.run(["xdg-open", str(folder)], check=False)


def _distill_copy_to_clipboard(text: str) -> bool:
    """Copy *text* to the clipboard; return True on success.

    Tries ``pyperclip`` first, then falls back to a platform command
    (``clip``, ``pbcopy``, ``xclip`` or ``wl-copy``). Never raises.
    """
    try:  # pragma: no cover - environment dependent
        if pyperclip is None:
            raise RuntimeError("no pyperclip")
        pyperclip.copy(text)  # type: ignore[attr-defined]
        return True
    except Exception:
        # Fall through to the command-line fallbacks below.
        pass

    try:
        import platform
        import shutil
        import subprocess

        plat = platform.system().lower()
        if "windows" in plat:
            command = ["clip"]
        elif "darwin" in plat and shutil.which("pbcopy"):
            command = ["pbcopy"]
        elif shutil.which("xclip"):
            command = ["xclip", "-selection", "clipboard"]
        elif shutil.which("wl-copy"):
            command = ["wl-copy"]
        else:
            return False
        proc = subprocess.Popen(command, stdin=subprocess.PIPE)
        proc.communicate(input=text.encode("utf-8"))
        return proc.returncode == 0
    except Exception:
        return False


def _distill_print_verbose(metadata: dict) -> None:
    """Print the parsing, NLP, ranking and aggregation sections found in *metadata*."""
    from itertools import islice

    console.print("\n[yellow]═══ Verbose Debug Information ═══[/yellow]")

    # Prompt parsing details
    if "prompt_context" in metadata:
        pc = metadata["prompt_context"]
        console.print("\n[cyan]Prompt Parsing:[/cyan]")
        console.print(f"  Task Type: {pc.get('task_type', 'unknown')}")
        console.print(f"  Intent: {pc.get('intent', 'unknown')}")
        console.print(f"  Keywords: {pc.get('keywords', [])}")
        console.print(f"  Synonyms: {pc.get('synonyms', [])}")
        console.print(f"  Entities: {pc.get('entities', [])}")

    # NLP normalization details (at most five examples per list)
    if "nlp_normalization" in metadata:
        nn = metadata["nlp_normalization"]
        console.print("\n[cyan]NLP Normalization:[/cyan]")
        kw = nn.get("keywords", {})
        console.print(
            f"  Keywords normalized: {kw.get('original_total', 0)} -> {kw.get('total', 0)}"
        )
        for key, info in islice(kw.get("normalized", {}).items(), 5):
            console.print(
                f"    - {key}: steps={info.get('steps', [])}, variants={info.get('variants', [])}"
            )
        ent = nn.get("entities", {})
        console.print(
            f"  Entities recognized: {ent.get('total', 0)} (variation counts: top {min(5, len(ent.get('variation_counts', {})))} shown)"
        )
        for name, cnt in islice(ent.get("variation_counts", {}).items(), 5):
            console.print(f"    - {name}: {cnt} variants")

    # Ranking details
    if "ranking_details" in metadata:
        rd = metadata["ranking_details"]
        console.print("\n[cyan]Ranking Details:[/cyan]")
        console.print(f"  Algorithm: {rd.get('algorithm', 'unknown')}")
        console.print(f"  Threshold: {rd.get('threshold', 0.1)}")
        console.print(f"  Files Ranked: {rd.get('files_ranked', 0)}")
        console.print(f"  Files Above Threshold: {rd.get('files_above_threshold', 0)}")

        if "top_files" in rd:
            console.print("\n[cyan]Top Ranked Files:[/cyan]")
            for i, file_info in enumerate(rd["top_files"][:10], 1):
                console.print(f"  {i}. {file_info['path']} (score: {file_info['score']:.3f})")
                if "match_details" in file_info:
                    md = file_info["match_details"]
                    console.print(f"      Keywords matched: {md.get('keywords_matched', [])}")
                    console.print(f"      Semantic score: {md.get('semantic_score', 0):.3f}")

    # Aggregation details
    if "aggregation_details" in metadata:
        ad = metadata["aggregation_details"]
        console.print("\n[cyan]Aggregation Details:[/cyan]")
        console.print(f"  Strategy: {ad.get('strategy', 'unknown')}")
        console.print(f"  Min Relevance: {ad.get('min_relevance', 0)}")
        console.print(f"  Files Considered: {ad.get('files_considered', 0)}")
        console.print(f"  Files Rejected: {ad.get('files_rejected', 0)}")
        if "rejection_reasons" in ad:
            console.print("\n  [yellow]Rejection Reasons:[/yellow]")
            for reason, count in ad["rejection_reasons"].items():
                console.print(f"    {reason}: {count} files")

    console.print("\n[yellow]═════════════════════════════[/yellow]\n")