
generic_analyzer

Full name: tenets.core.analysis.implementations.generic_analyzer


Generic code analyzer for unsupported file types.

This module provides basic analysis capabilities for files that don't have a specific language analyzer. It performs text-based analysis and pattern matching to extract basic information, and it adds context-aware documentation analysis for smart summarization based on prompt/query relevance.

Classes

GenericAnalyzer

Python
GenericAnalyzer()

Bases: LanguageAnalyzer

Generic analyzer for unsupported file types.

Provides basic analysis for text-based files, including:

- Line and character counting
- Basic pattern matching for imports/includes
- Simple complexity estimation
- Keyword extraction
- Configuration file parsing (JSON, YAML, XML, etc.)

This analyzer serves as a fallback for files without specific language support and can handle various text formats.

Initialize the generic analyzer with logger.

Source code in tenets/core/analysis/implementations/generic_analyzer.py
Python
def __init__(self):
    """Initialize the generic analyzer with logger."""
    self.logger = get_logger(__name__)
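
As a quick orientation, here is a minimal sketch of constructing the analyzer and running one of its methods; the import path follows the full module name shown above, while the sample content and file name are purely illustrative.

Python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

analyzer = GenericAnalyzer()  # no arguments; the constructor only attaches a logger

# The public methods documented below take the raw text plus the file's Path,
# so the caller reads the file and passes the content in.
sample = "# deployment notes\nretries = 3\n"
metrics = analyzer.calculate_complexity(sample, Path("deploy.conf"))
print(metrics.line_count, metrics.comment_lines)  # 2 lines, 1 comment line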
Functions
extract_imports
Python
extract_imports(content: str, file_path: Path) -> List[ImportInfo]

Extract potential imports/includes from generic text.

Looks for common import patterns across various languages and configuration files.

PARAMETER    DESCRIPTION
content      File content
             TYPE: str
file_path    Path to the file being analyzed
             TYPE: Path

RETURNS             DESCRIPTION
List[ImportInfo]    List of ImportInfo objects with detected imports

Source code in tenets/core/analysis/implementations/generic_analyzer.py
Python
def extract_imports(self, content: str, file_path: Path) -> List[ImportInfo]:
    """Extract potential imports/includes from generic text.

    Looks for common import patterns across various languages
    and configuration files.

    Args:
        content: File content
        file_path: Path to the file being analyzed

    Returns:
        List of ImportInfo objects with detected imports
    """
    imports = []
    lines = content.split("\n")

    # Common import/include patterns
    patterns = [
        # Include patterns (C-style, various scripting languages)
        (r"^\s*#include\s+<([^>]+)>", "include"),  # angle includes
        (r'^\s*#include\s+"([^"]+)"', "include"),  # quote includes
        (r"^\s*include\s+[\'\"]([^\'\"]+)[\'\"]", "include"),
        # CMake include()
        (r"^\s*include\s*\(\s*([^)\s]+)\s*\)", "include"),
        # Import patterns (various languages)
        (r'^\s*import\s+[\'"]([^\'"]+)[\'"]', "import"),  # import "module"
        (r"^\s*import\s+([A-Za-z_][\w\.]*)\b", "import"),  # import os
        (r'^\s*from\s+[\'"]([^\'"]+)[\'"]', "from"),  # from "mod"
        (r"^\s*from\s+([A-Za-z_][\w\.]*)\s+import\b", "from"),  # from pkg import X
        (r'^\s*require\s+[\'"]([^\'"]+)[\'"]', "require"),
        # PHP/Perl and JS style use statements
        (r"^\s*use\s+([\\\w:]+);?", "use"),  # use Data::Dumper; or use Foo\Bar;
        # Load/source patterns (shell scripts)
        (r'^\s*source\s+[\'"]?([^\'"]+)[\'"]?', "source"),
        (r'^\s*\.[ \t]+[\'"]?([^\'"]+)[\'"]?', "source"),
        # Configuration file references
        (r'[\'"]?(?:file|path|src|href|url)[\'"]?\s*[:=]\s*[\'"]([^\'"]+)[\'"]', "reference"),
    ]

    captured_modules: set[str] = set()

    for i, line in enumerate(lines, 1):
        # Skip comments (generic comment patterns) but keep C preprocessor includes
        if (
            line.strip().startswith("#") and not re.match(r"^\s*#include\b", line)
        ) or line.strip().startswith("//"):
            continue

        for pattern, import_type in patterns:
            match = re.search(pattern, line, re.IGNORECASE)
            if match:
                module = match.group(1)
                imports.append(
                    ImportInfo(
                        module=module,
                        line=i,
                        type=import_type,
                        is_relative=self._is_relative_path(module),
                    )
                )
                captured_modules.add(module)
                break

        # Special case: 'use strict;' (JavaScript directive)
        if re.match(r"^\s*use\s+strict\s*;?\s*$", line):
            imports.append(ImportInfo(module="strict", line=i, type="use", is_relative=False))
            captured_modules.add("strict")

    # Special handling for specific file types
    if file_path.suffix.lower() in [".json", ".yaml", ".yml"]:
        imports.extend(self._extract_config_dependencies(content, file_path))

    # Detect standalone file references like config.yml in logs
    file_ref_pattern = re.compile(
        r"\b([\w./-]+\.(?:ya?ml|json|conf|cfg|ini|xml|toml|log|txt|sh))\b"
    )
    for i, line in enumerate(lines, 1):
        for m in file_ref_pattern.finditer(line):
            module = m.group(1)
            if module not in captured_modules:
                imports.append(
                    ImportInfo(
                        module=module,
                        line=i,
                        type="reference",
                        is_relative=self._is_relative_path(module),
                    )
                )
                captured_modules.add(module)

    return imports
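
A small, hypothetical sketch of what the patterns above pick up from a shell-style snippet; the file name and content are invented, and results for other inputs depend on which of the listed patterns fire.

Python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

snippet = (
    "#include <stdio.h>\n"             # C-style angle include
    "source ./scripts/profile\n"       # shell "source" of another script
    "# settings live in config.yml\n"  # comment line, but config.yml is a standalone file reference
)
for imp in GenericAnalyzer().extract_imports(snippet, Path("setup.sh")):
    print(imp.line, imp.type, imp.module, imp.is_relative)
# Expect an "include" entry for stdio.h, a "source" entry for ./scripts/profile,
# and a "reference" entry for config.yml from the standalone-filename pass.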
extract_exports
Python
extract_exports(content: str, file_path: Path) -> List[Dict[str, Any]]

Extract potential exports from generic text.

Looks for common export patterns and definitions.

PARAMETER    DESCRIPTION
content      File content
             TYPE: str
file_path    Path to the file being analyzed
             TYPE: Path

RETURNS                 DESCRIPTION
List[Dict[str, Any]]    List of potential exported symbols

Source code in tenets/core/analysis/implementations/generic_analyzer.py
Python
def extract_exports(self, content: str, file_path: Path) -> List[Dict[str, Any]]:
    """Extract potential exports from generic text.

    Looks for common export patterns and definitions.

    Args:
        content: File content
        file_path: Path to the file being analyzed

    Returns:
        List of potential exported symbols
    """
    exports = []

    # Common export/definition patterns
    patterns = [
        # Function-like definitions
        (r"^(?:function|def|func|sub|proc)\s+(\w+)", "function"),
        (r"^(\w+)\s*\(\)\s*\{", "function"),
        # Class-like definitions
        (r"^(?:class|struct|type|interface)\s+(\w+)", "class"),
        # Variable/constant definitions
        (r"^(?:export\s+)?(?:const|let|var|val)\s+(\w+)\s*=", "variable"),
        (r'^(\w+)\s*=\s*[\'"]?[^\'"\n]+[\'"]?', "assignment"),
        # Export statements
        (r"^export\s+(\w+)", "export"),
        (r"^module\.exports\.(\w+)", "export"),
    ]

    for pattern, export_type in patterns:
        for match in re.finditer(pattern, content, re.MULTILINE):
            name = match.group(1)
            exports.append(
                {
                    "name": name,
                    "type": export_type,
                    "line": content[: match.start()].count("\n") + 1,
                }
            )

    # For configuration files, extract top-level keys
    if file_path.suffix.lower() in [".json", ".yaml", ".yml", ".toml", ".ini"]:
        exports.extend(self._extract_config_keys(content, file_path))

    return exports
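
A hedged sketch of the kind of symbols the export patterns surface from a plain text file; the names and content are invented for illustration.

Python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

text = (
    "def build_index(path):\n"
    "    return path\n"
    "export API_URL\n"
    "MAX_RETRIES = 10\n"
)
for sym in GenericAnalyzer().extract_exports(text, Path("tools.txt")):
    print(sym["line"], sym["type"], sym["name"])
# build_index is reported as a "function", API_URL as an "export", and
# MAX_RETRIES as an "assignment"; for .json/.yaml/.toml/.ini files the
# top-level configuration keys would be appended as well.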
calculate_complexity
Python
calculate_complexity(content: str, file_path: Path) -> ComplexityMetrics

Calculate basic complexity metrics for generic text.

Provides simplified complexity estimation based on:

- Line count and length
- Nesting depth (indentation/braces)
- Decision keywords
- File type specific metrics

PARAMETER    DESCRIPTION
content      File content
             TYPE: str
file_path    Path to the file being analyzed
             TYPE: Path

RETURNS              DESCRIPTION
ComplexityMetrics    ComplexityMetrics object with basic metrics

Source code in tenets/core/analysis/implementations/generic_analyzer.py
Python
def calculate_complexity(self, content: str, file_path: Path) -> ComplexityMetrics:
    """Calculate basic complexity metrics for generic text.

    Provides simplified complexity estimation based on:
    - Line count and length
    - Nesting depth (indentation/braces)
    - Decision keywords
    - File type specific metrics

    Args:
        content: File content
        file_path: Path to the file being analyzed

    Returns:
        ComplexityMetrics object with basic metrics
    """
    metrics = ComplexityMetrics()

    # Basic line metrics
    lines = content.split("\n")
    # Trim leading/trailing empty lines for line count to match human expectations/tests
    start = 0
    end = len(lines)
    while start < end and lines[start].strip() == "":
        start += 1
    while end > start and lines[end - 1].strip() == "":
        end -= 1
    trimmed_lines = lines[start:end]

    # Preserve historical/test expectation: an entirely empty file counts as 1 line (logical line),
    # while code_lines will be 0. Non-empty (after trimming) counts actual trimmed lines.
    if not trimmed_lines:
        metrics.line_count = 1
    else:
        metrics.line_count = len(trimmed_lines)
    # Character count: count characters, and if file doesn't end with newline, count implicit final EOL
    metrics.character_count = len(content) + (0 if content.endswith("\n") else 1)

    # Count comment lines (generic patterns)
    comment_patterns = [
        r"^\s*#",  # Hash comments
        r"^\s*//",  # Double slash comments
        r"^\s*/\*",  # Block comment start
        r"^\s*\*",  # Block comment continuation
        r"^\s*<!--",  # HTML/XML comments
        r"^\s*;",  # Semicolon comments (INI, assembly)
        r"^\s*--",  # SQL/Lua comments
        r"^\s*%",  # LaTeX/MATLAB comments
    ]

    comment_lines = 0
    for line in trimmed_lines:
        if any(re.match(pattern, line) for pattern in comment_patterns):
            comment_lines += 1

    # Compute code lines as total lines minus comment lines (consistent with tests)
    # For empty file (line_count==1 but no trimmed lines), code_lines should be 0
    if not trimmed_lines:
        metrics.code_lines = 0
    else:
        metrics.code_lines = metrics.line_count - comment_lines

    metrics.comment_lines = comment_lines
    metrics.comment_ratio = comment_lines / metrics.line_count if metrics.line_count > 0 else 0

    # Estimate cyclomatic complexity (decision points)
    decision_keywords = [
        r"\bif\b",
        r"\belse\b",
        r"\belif\b",
        r"\belsif\b",
        r"\bfor\b",
        r"\bwhile\b",
        r"\bdo\b",
        r"\bcase\b",
        r"\bwhen\b",
        r"\btry\b",
        r"\bcatch\b",
        r"\bexcept\b",
        r"\bunless\b",
        r"\buntil\b",
        r"\bswitch\b",
        r"\b\?\s*[^:]+\s*:",
        r"\|\|",
        r"&&",
        r"\band\b",
        r"\bor\b",
    ]

    complexity = 1  # Base complexity
    for keyword in decision_keywords:
        complexity += len(re.findall(keyword, content, re.IGNORECASE))

    metrics.cyclomatic = min(complexity, 50)  # Cap at 50 for generic files

    # Estimate nesting depth
    max_depth = 0
    current_depth = 0

    for line in lines:
        # Track braces
        current_depth += line.count("{") - line.count("}")
        current_depth += line.count("(") - line.count(")")
        current_depth += line.count("[") - line.count("]")
        max_depth = max(max_depth, current_depth)

        # Reset if negative (mismatched brackets)
        current_depth = max(current_depth, 0)

    # Also check indentation depth
    indent_depth = self._calculate_max_indent(lines)
    # Combine and cap at 10
    metrics.max_depth = min(max(max_depth, indent_depth), 10)

    # File type specific metrics
    file_type = self._detect_file_type(file_path)

    if file_type == "configuration":
        # For config files, count keys/sections
        metrics.key_count = len(re.findall(r"^\s*[\w\-\.]+\s*[:=]", content, re.MULTILINE))
        metrics.section_count = len(re.findall(r"^\s*\[[\w\-\.]+\]", content, re.MULTILINE))

    elif file_type == "markup":
        # For markup files, count tags
        metrics.tag_count = len(re.findall(r"<\w+", content))
        metrics.header_count = len(re.findall(r"^#{1,6}\s+", content, re.MULTILINE))

    elif file_type == "data":
        # For data files, estimate structure
        if file_path.suffix.lower() == ".csv":
            lines_sample = lines[:10] if len(lines) > 10 else lines
            if lines_sample:
                # Estimate columns
                metrics.column_count = len(lines_sample[0].split(","))
                metrics.row_count = len(lines) - 1  # Exclude header

    # Calculate a simple maintainability index
    if metrics.code_lines > 0:
        # Simplified calculation
        maintainability = 100

        # Penalize high complexity
        maintainability -= min(30, complexity * 0.5)

        # Penalize deep nesting
        maintainability -= min(20, metrics.max_depth * 2)

        # Reward comments
        maintainability += min(10, metrics.comment_ratio * 30)

        # Penalize very long files
        if metrics.line_count > 1000:
            maintainability -= 10
        elif metrics.line_count > 500:
            maintainability -= 5

        metrics.maintainability_index = max(0, min(100, maintainability))

    return metrics
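
To make the metric composition concrete, a small worked sketch on an invented shell fragment: the cyclomatic estimate is the base value of 1 plus one hit each for the if, for, do and && decision patterns, giving 5, while nesting depth, comment ratio and the maintainability index follow from the same scan. Treat the exact maintainability number as indicative only, since it also depends on the indentation helper.

Python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

script = """# nightly cleanup
for f in *.log; do
  if [ -s "$f" ] && grep -q ERROR "$f"; then
    mv "$f" archive/
  fi
done
"""
m = GenericAnalyzer().calculate_complexity(script, Path("cleanup.sh"))
print(m.line_count, m.comment_lines)         # 6 lines, 1 comment line
print(m.cyclomatic)                          # 1 + if + for + do + && = 5
print(m.max_depth, m.maintainability_index)  # both depend on the indentation analysis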
extract_structure
Python
extract_structure(content: str, file_path: Path) -> CodeStructure

Extract basic structure from generic text.

Attempts to identify structural elements using pattern matching and indentation analysis.

PARAMETER    DESCRIPTION
content      File content
             TYPE: str
file_path    Path to the file being analyzed
             TYPE: Path

RETURNS          DESCRIPTION
CodeStructure    CodeStructure object with detected elements

Source code in tenets/core/analysis/implementations/generic_analyzer.py
Python
def extract_structure(self, content: str, file_path: Path) -> CodeStructure:
    """Extract basic structure from generic text.

    Attempts to identify structural elements using pattern matching
    and indentation analysis.

    Args:
        content: File content
        file_path: Path to the file being analyzed

    Returns:
        CodeStructure object with detected elements
    """
    structure = CodeStructure()

    # Detect file type category
    file_type = self._detect_file_type(file_path)
    structure.file_type = file_type

    # Detect common YAML-based frameworks/configs
    try:
        if file_path.suffix.lower() in [".yaml", ".yml"]:
            # Initialize modules collection if not present
            if not hasattr(structure, "modules"):
                structure.modules = []

            if self._is_docker_compose_file(file_path, content):
                structure.framework = "docker-compose"
                for svc in self._extract_compose_services(content):
                    structure.modules.append({"type": "service", **svc})
            elif self._looks_like_kubernetes_yaml(content):
                structure.framework = "kubernetes"
                for res in self._extract_k8s_resources(content):
                    structure.modules.append({"type": "resource", **res})
            else:
                # Helm/Kustomize/GitHub Actions quick hints
                name = file_path.name.lower()
                if name == "chart.yaml":
                    structure.framework = "helm"
                elif name == "values.yaml":
                    structure.framework = getattr(structure, "framework", None) or "helm"
                elif name == "kustomization.yaml":
                    structure.framework = "kustomize"
                elif ".github" in str(file_path).replace("\\", "/") and "/workflows/" in str(
                    file_path
                ).replace("\\", "/"):
                    structure.framework = "github-actions"
    except Exception:
        # Never fail generic structure on heuristics
        pass

    # Extract functions (various patterns)
    function_patterns = [
        r"^(?:async\s+)?(?:function|def|func|sub|proc)\s+(\w+)",
        r"^(\w+)\s*\(\)\s*\{",
        r"^(\w+)\s*:\s*function",
        r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>",
    ]

    for pattern in function_patterns:
        for match in re.finditer(pattern, content, re.MULTILINE):
            func_name = match.group(1)
            structure.functions.append(
                FunctionInfo(name=func_name, line=content[: match.start()].count("\n") + 1)
            )

    # Extract classes/types
    class_patterns = [
        r"^(?:export\s+)?(?:class|struct|type|interface|enum)\s+(\w+)",
        r"^(\w+)\s*=\s*class\s*\{",
    ]

    for pattern in class_patterns:
        for match in re.finditer(pattern, content, re.MULTILINE):
            class_name = match.group(1)
            structure.classes.append(
                ClassInfo(name=class_name, line=content[: match.start()].count("\n") + 1)
            )

    # Extract sections (markdown headers, etc.)
    if file_type in ["markdown", "documentation", "markup"]:
        section_pattern = r"^(#{1,6})\s+(.+)$"
        for match in re.finditer(section_pattern, content, re.MULTILINE):
            level = len(match.group(1))
            title = match.group(2)
            structure.sections.append(
                {
                    "title": title,
                    "level": level,
                    "line": content[: match.start()].count("\n") + 1,
                }
            )

    # Extract variables/constants
    var_patterns = [
        r"^(?:const|let|var|val)\s+(\w+)",
        r"^(\w+)\s*[:=]\s*[^=]",
        r"^export\s+(\w+)",
    ]

    for pattern in var_patterns:
        for match in re.finditer(pattern, content, re.MULTILINE):
            var_name = match.group(1)
            structure.variables.append(
                {
                    "name": var_name,
                    "line": content[: match.start()].count("\n") + 1,
                    "type": "variable",
                }
            )

    # Detect constants (UPPERCASE variables)
    const_pattern = r"^([A-Z][A-Z0-9_]*)\s*[:=]"
    for match in re.finditer(const_pattern, content, re.MULTILINE):
        structure.constants.append(match.group(1))

    # Extract TODO/FIXME comments
    todo_pattern = r"(?:#|//|/\*|\*)\s*(TODO|FIXME|HACK|NOTE|XXX|BUG):\s*(.+)"
    for match in re.finditer(todo_pattern, content, re.IGNORECASE):
        structure.todos.append(
            {
                "type": match.group(1).upper(),
                "message": match.group(2).strip(),
                "line": content[: match.start()].count("\n") + 1,
            }
        )

    # Count blocks (based on indentation or braces)
    structure.block_count = content.count("{")
    structure.indent_levels = self._analyze_indentation(content)

    return structure
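
A brief sketch of the structural elements recovered from a Markdown snippet, assuming the internal file-type detection (_detect_file_type) classifies .md files under the documentation/markup group; the content is invented for illustration. For YAML inputs the same method also applies the docker-compose/Kubernetes/Helm heuristics shown above.

Python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

doc = (
    "# Deployment Guide\n"
    "\n"
    "Roll back by redeploying the previous image tag.\n"
    "\n"
    "## Health checks\n"
    "// TODO: describe the readiness probe\n"
)
structure = GenericAnalyzer().extract_structure(doc, Path("DEPLOY.md"))
print([(s["title"], s["level"]) for s in structure.sections])
# [('Deployment Guide', 1), ('Health checks', 2)] if .md is treated as documentation
print([(t["type"], t["message"]) for t in structure.todos])
# [('TODO', 'describe the readiness probe')]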
extract_context_relevant_sections
Python
extract_context_relevant_sections(content: str, file_path: Path, prompt_keywords: List[str], search_depth: int = 2, min_confidence: float = 0.6, max_sections: int = 10) -> Dict[str, Any]

Extract sections of documentation that reference prompt keywords/concepts.

This method identifies and extracts the most relevant parts of documentation files based on direct references and semantic similarity to prompt keywords.

PARAMETER          DESCRIPTION
content            File content
                   TYPE: str
file_path          Path to the file being analyzed
                   TYPE: Path
prompt_keywords    Keywords/phrases from the user's prompt
                   TYPE: List[str]
search_depth       How deep to search (1=direct, 2=semantic, 3=deep analysis)
                   TYPE: int  DEFAULT: 2
min_confidence     Minimum confidence threshold for relevance (0.0-1.0)
                   TYPE: float  DEFAULT: 0.6
max_sections       Maximum number of contextual sections to preserve
                   TYPE: int  DEFAULT: 10

RETURNS           DESCRIPTION
Dict[str, Any]    Dictionary containing relevant sections with metadata

Source code in tenets/core/analysis/implementations/generic_analyzer.py
Python
def extract_context_relevant_sections(
    self,
    content: str,
    file_path: Path,
    prompt_keywords: List[str],
    search_depth: int = 2,
    min_confidence: float = 0.6,
    max_sections: int = 10,
) -> Dict[str, Any]:
    """Extract sections of documentation that reference prompt keywords/concepts.

    This method identifies and extracts the most relevant parts of documentation
    files based on direct references and semantic similarity to prompt keywords.

    Args:
        content: File content
        file_path: Path to the file being analyzed
        prompt_keywords: Keywords/phrases from the user's prompt
        search_depth: How deep to search (1=direct, 2=semantic, 3=deep analysis)
        min_confidence: Minimum confidence threshold for relevance (0.0-1.0)
        max_sections: Maximum number of contextual sections to preserve

    Returns:
        Dictionary containing relevant sections with metadata
    """
    if not prompt_keywords:
        return {
            "relevant_sections": [],
            "metadata": {"total_sections": 0, "matched_sections": 0},
        }

    file_type = self._detect_file_type(file_path)

    # Extract sections based on file type
    sections = self._extract_document_sections(content, file_path, file_type)

    # Score sections based on relevance to prompt keywords
    scored_sections = []
    for section in sections:
        score, matches = self._calculate_section_relevance(
            section, prompt_keywords, search_depth
        )

        if score >= min_confidence:
            scored_sections.append(
                {
                    **section,
                    "relevance_score": score,
                    "keyword_matches": matches,
                    "context_type": self._determine_context_type(section, matches),
                }
            )

    # Sort by relevance and limit to max_sections
    scored_sections.sort(key=lambda x: x["relevance_score"], reverse=True)
    relevant_sections = scored_sections[:max_sections]

    # Extract code examples and references within relevant sections
    for section in relevant_sections:
        section["code_examples"] = self._extract_code_examples_from_section(section)
        section["api_references"] = self._extract_api_references_from_section(section)
        section["config_references"] = self._extract_config_references_from_section(section)

    metadata = {
        "total_sections": len(sections),
        "matched_sections": len(scored_sections),
        "relevant_sections": len(relevant_sections),
        "file_type": file_type,
        "search_depth": search_depth,
        "min_confidence": min_confidence,
        "avg_relevance_score": (
            sum(s["relevance_score"] for s in relevant_sections) / len(relevant_sections)
            if relevant_sections
            else 0.0
        ),
    }

    return {"relevant_sections": relevant_sections, "metadata": metadata}

Functions