generic_analyzer
Full name: tenets.core.analysis.implementations.generic_analyzer
Generic code analyzer for unsupported file types.
This module provides basic analysis capabilities for files that don't have a specific language analyzer. It performs text-based analysis and pattern matching to extract basic information, and it is enhanced with context-aware documentation analysis for smart summarization based on prompt/query relevance.
Classes
GenericAnalyzer
Bases: LanguageAnalyzer
Generic analyzer for unsupported file types.
Provides basic analysis for text-based files including:

- Line and character counting
- Basic pattern matching for imports/includes
- Simple complexity estimation
- Keyword extraction
- Configuration file parsing (JSON, YAML, XML, etc.)
This analyzer serves as a fallback for files without specific language support and can handle various text formats.
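The class is typically handed raw file content together with its path, and each method returns a structured result. A minimal usage sketch (illustrative only; it assumes GenericAnalyzer can be constructed without arguments, as the initializer description below suggests, and the file path is invented for the example):

```python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

analyzer = GenericAnalyzer()
path = Path("deploy/settings.conf")  # invented example path
content = path.read_text(encoding="utf-8", errors="replace")

imports = analyzer.extract_imports(content, path)        # List[ImportInfo]
exports = analyzer.extract_exports(content, path)        # List[Dict[str, Any]]
metrics = analyzer.calculate_complexity(content, path)   # ComplexityMetrics
structure = analyzer.extract_structure(content, path)    # CodeStructure

print(metrics.line_count, metrics.cyclomatic, len(imports), len(exports))
```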
Initialize the generic analyzer with a logger.
Source code in tenets/core/analysis/implementations/generic_analyzer.py
Functions
extract_imports
Extract potential imports/includes from generic text.
Looks for common import patterns across various languages and configuration files.
| PARAMETER | DESCRIPTION | TYPE |
|---|---|---|
| content | File content | str |
| file_path | Path to the file being analyzed | Path |

| RETURNS | DESCRIPTION |
|---|---|
| List[ImportInfo] | List of ImportInfo objects with detected imports |
Source code in tenets/core/analysis/implementations/generic_analyzer.py
def extract_imports(self, content: str, file_path: Path) -> List[ImportInfo]:
"""Extract potential imports/includes from generic text.
Looks for common import patterns across various languages
and configuration files.
Args:
content: File content
file_path: Path to the file being analyzed
Returns:
List of ImportInfo objects with detected imports
"""
imports = []
lines = content.split("\n")
# Common import/include patterns
patterns = [
# Include patterns (C-style, various scripting languages)
(r"^\s*#include\s+<([^>]+)>", "include"), # angle includes
(r'^\s*#include\s+"([^"]+)"', "include"), # quote includes
(r"^\s*include\s+[\'\"]([^\'\"]+)[\'\"]", "include"),
# CMake include()
(r"^\s*include\s*\(\s*([^)\s]+)\s*\)", "include"),
# Import patterns (various languages)
(r'^\s*import\s+[\'"]([^\'"]+)[\'"]', "import"), # import "module"
(r"^\s*import\s+([A-Za-z_][\w\.]*)\b", "import"), # import os
(r'^\s*from\s+[\'"]([^\'"]+)[\'"]', "from"), # from "mod"
(r"^\s*from\s+([A-Za-z_][\w\.]*)\s+import\b", "from"), # from pkg import X
(r'^\s*require\s+[\'"]([^\'"]+)[\'"]', "require"),
# PHP/Perl and JS style use statements
(r"^\s*use\s+([\\\w:]+);?", "use"), # use Data::Dumper; or use Foo\Bar;
# Load/source patterns (shell scripts)
(r'^\s*source\s+[\'"]?([^\'"]+)[\'"]?', "source"),
(r'^\s*\.[ \t]+[\'"]?([^\'"]+)[\'"]?', "source"),
# Configuration file references
(r'[\'"]?(?:file|path|src|href|url)[\'"]?\s*[:=]\s*[\'"]([^\'"]+)[\'"]', "reference"),
]
captured_modules: set[str] = set()
for i, line in enumerate(lines, 1):
# Skip comments (generic comment patterns) but keep C preprocessor includes
if (
line.strip().startswith("#") and not re.match(r"^\s*#include\b", line)
) or line.strip().startswith("//"):
continue
for pattern, import_type in patterns:
match = re.search(pattern, line, re.IGNORECASE)
if match:
module = match.group(1)
imports.append(
ImportInfo(
module=module,
line=i,
type=import_type,
is_relative=self._is_relative_path(module),
)
)
captured_modules.add(module)
break
# Special case: 'use strict;' (JavaScript directive)
if re.match(r"^\s*use\s+strict\s*;?\s*$", line):
imports.append(ImportInfo(module="strict", line=i, type="use", is_relative=False))
captured_modules.add("strict")
# Special handling for specific file types
if file_path.suffix.lower() in [".json", ".yaml", ".yml"]:
imports.extend(self._extract_config_dependencies(content, file_path))
# Detect standalone file references like config.yml in logs
file_ref_pattern = re.compile(
r"\b([\w./-]+\.(?:ya?ml|json|conf|cfg|ini|xml|toml|log|txt|sh))\b"
)
for i, line in enumerate(lines, 1):
for m in file_ref_pattern.finditer(line):
module = m.group(1)
if module not in captured_modules:
imports.append(
ImportInfo(
module=module,
line=i,
type="reference",
is_relative=self._is_relative_path(module),
)
)
captured_modules.add(module)
return imports
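A short sketch of how this behaves on mixed content (the snippet and file name are invented; exact matches depend on the heuristics above):

```python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

# Invented sample mixing include, source, and reference styles.
snippet = """\
#include <stdio.h>
source ./env.sh
url = "https://example.com/api"
# overrides live in config.yml
"""

for imp in GenericAnalyzer().extract_imports(snippet, Path("startup.sh")):
    print(imp.line, imp.type, imp.module, imp.is_relative)
# Expect roughly: an "include" for stdio.h, a "source" for ./env.sh, and
# "reference" entries for the URL value and the standalone config.yml mention.
```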
extract_exports
Extract potential exports from generic text.
Looks for common export patterns and definitions.
| PARAMETER | DESCRIPTION | TYPE |
|---|---|---|
| content | File content | str |
| file_path | Path to the file being analyzed | Path |

| RETURNS | DESCRIPTION |
|---|---|
| List[Dict[str, Any]] | List of potential exported symbols |
Source code in tenets/core/analysis/implementations/generic_analyzer.py
def extract_exports(self, content: str, file_path: Path) -> List[Dict[str, Any]]:
"""Extract potential exports from generic text.
Looks for common export patterns and definitions.
Args:
content: File content
file_path: Path to the file being analyzed
Returns:
List of potential exported symbols
"""
exports = []
# Common export/definition patterns
patterns = [
# Function-like definitions
(r"^(?:function|def|func|sub|proc)\s+(\w+)", "function"),
(r"^(\w+)\s*\(\)\s*\{", "function"),
# Class-like definitions
(r"^(?:class|struct|type|interface)\s+(\w+)", "class"),
# Variable/constant definitions
(r"^(?:export\s+)?(?:const|let|var|val)\s+(\w+)\s*=", "variable"),
(r'^(\w+)\s*=\s*[\'"]?[^\'"\n]+[\'"]?', "assignment"),
# Export statements
(r"^export\s+(\w+)", "export"),
(r"^module\.exports\.(\w+)", "export"),
]
for pattern, export_type in patterns:
for match in re.finditer(pattern, content, re.MULTILINE):
name = match.group(1)
exports.append(
{
"name": name,
"type": export_type,
"line": content[: match.start()].count("\n") + 1,
}
)
# For configuration files, extract top-level keys
if file_path.suffix.lower() in [".json", ".yaml", ".yml", ".toml", ".ini"]:
exports.extend(self._extract_config_keys(content, file_path))
return exports
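For example (invented content; results depend on the pattern heuristics above):

```python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

# Invented sample mixing a definition, an export, and an assignment.
text = """\
function start() {
export API_URL
MAX_RETRIES = 3
"""

for symbol in GenericAnalyzer().extract_exports(text, Path("run.sh")):
    print(symbol["type"], symbol["name"], symbol["line"])
# Expect roughly: a "function" for start, an "export" for API_URL, and an
# "assignment" for MAX_RETRIES (ordering follows the pattern list, not line order).
```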
calculate_complexity
Calculate basic complexity metrics for generic text.
Provides simplified complexity estimation based on:

- Line count and length
- Nesting depth (indentation/braces)
- Decision keywords
- File type specific metrics
| PARAMETER | DESCRIPTION | TYPE |
|---|---|---|
| content | File content | str |
| file_path | Path to the file being analyzed | Path |

| RETURNS | DESCRIPTION |
|---|---|
| ComplexityMetrics | ComplexityMetrics object with basic metrics |
Source code in tenets/core/analysis/implementations/generic_analyzer.py
def calculate_complexity(self, content: str, file_path: Path) -> ComplexityMetrics:
"""Calculate basic complexity metrics for generic text.
Provides simplified complexity estimation based on:
- Line count and length
- Nesting depth (indentation/braces)
- Decision keywords
- File type specific metrics
Args:
content: File content
file_path: Path to the file being analyzed
Returns:
ComplexityMetrics object with basic metrics
"""
metrics = ComplexityMetrics()
# Basic line metrics
lines = content.split("\n")
# Trim leading/trailing empty lines for line count to match human expectations/tests
start = 0
end = len(lines)
while start < end and lines[start].strip() == "":
start += 1
while end > start and lines[end - 1].strip() == "":
end -= 1
trimmed_lines = lines[start:end]
# Preserve historical/test expectation: an entirely empty file counts as 1 line (logical line),
# while code_lines will be 0. Non-empty (after trimming) counts actual trimmed lines.
if not trimmed_lines:
metrics.line_count = 1
else:
metrics.line_count = len(trimmed_lines)
# Character count: count characters, and if file doesn't end with newline, count implicit final EOL
metrics.character_count = len(content) + (0 if content.endswith("\n") else 1)
# Count comment lines (generic patterns)
comment_patterns = [
r"^\s*#", # Hash comments
r"^\s*//", # Double slash comments
r"^\s*/\*", # Block comment start
r"^\s*\*", # Block comment continuation
r"^\s*<!--", # HTML/XML comments
r"^\s*;", # Semicolon comments (INI, assembly)
r"^\s*--", # SQL/Lua comments
r"^\s*%", # LaTeX/MATLAB comments
]
comment_lines = 0
for line in trimmed_lines:
if any(re.match(pattern, line) for pattern in comment_patterns):
comment_lines += 1
# Compute code lines as total lines minus comment lines (consistent with tests)
# For empty file (line_count==1 but no trimmed lines), code_lines should be 0
if not trimmed_lines:
metrics.code_lines = 0
else:
metrics.code_lines = metrics.line_count - comment_lines
metrics.comment_lines = comment_lines
metrics.comment_ratio = comment_lines / metrics.line_count if metrics.line_count > 0 else 0
# Estimate cyclomatic complexity (decision points)
decision_keywords = [
r"\bif\b",
r"\belse\b",
r"\belif\b",
r"\belsif\b",
r"\bfor\b",
r"\bwhile\b",
r"\bdo\b",
r"\bcase\b",
r"\bwhen\b",
r"\btry\b",
r"\bcatch\b",
r"\bexcept\b",
r"\bunless\b",
r"\buntil\b",
r"\bswitch\b",
r"\b\?\s*[^:]+\s*:",
r"\|\|",
r"&&",
r"\band\b",
r"\bor\b",
]
complexity = 1 # Base complexity
for keyword in decision_keywords:
complexity += len(re.findall(keyword, content, re.IGNORECASE))
metrics.cyclomatic = min(complexity, 50) # Cap at 50 for generic files
# Estimate nesting depth
max_depth = 0
current_depth = 0
for line in lines:
# Track braces
current_depth += line.count("{") - line.count("}")
current_depth += line.count("(") - line.count(")")
current_depth += line.count("[") - line.count("]")
max_depth = max(max_depth, current_depth)
# Reset if negative (mismatched brackets)
current_depth = max(current_depth, 0)
# Also check indentation depth
indent_depth = self._calculate_max_indent(lines)
# Combine and cap at 10
metrics.max_depth = min(max(max_depth, indent_depth), 10)
# File type specific metrics
file_type = self._detect_file_type(file_path)
if file_type == "configuration":
# For config files, count keys/sections
metrics.key_count = len(re.findall(r"^\s*[\w\-\.]+\s*[:=]", content, re.MULTILINE))
metrics.section_count = len(re.findall(r"^\s*\[[\w\-\.]+\]", content, re.MULTILINE))
elif file_type == "markup":
# For markup files, count tags
metrics.tag_count = len(re.findall(r"<\w+", content))
metrics.header_count = len(re.findall(r"^#{1,6}\s+", content, re.MULTILINE))
elif file_type == "data":
# For data files, estimate structure
if file_path.suffix.lower() == ".csv":
lines_sample = lines[:10] if len(lines) > 10 else lines
if lines_sample:
# Estimate columns
metrics.column_count = len(lines_sample[0].split(","))
metrics.row_count = len(lines) - 1 # Exclude header
# Calculate a simple maintainability index
if metrics.code_lines > 0:
# Simplified calculation
maintainability = 100
# Penalize high complexity
maintainability -= min(30, complexity * 0.5)
# Penalize deep nesting
maintainability -= min(20, metrics.max_depth * 2)
# Reward comments
maintainability += min(10, metrics.comment_ratio * 30)
# Penalize very long files
if metrics.line_count > 1000:
maintainability -= 10
elif metrics.line_count > 500:
maintainability -= 5
metrics.maintainability_index = max(0, min(100, maintainability))
return metrics
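A quick sketch of reading the resulting metrics (the snippet is invented; the exact numbers depend on the heuristics above):

```python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

# Invented shell-style sample: one comment line, one branch.
sample = """\
# retry helper
if [ -f config.yml ]; then
  echo "found"
fi
"""

metrics = GenericAnalyzer().calculate_complexity(sample, Path("check.sh"))
print(metrics.line_count, metrics.code_lines, metrics.comment_lines)
print(metrics.cyclomatic, metrics.max_depth, round(metrics.comment_ratio, 2))
print(metrics.maintainability_index)
```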
extract_structure
Extract basic structure from generic text.
Attempts to identify structural elements using pattern matching and indentation analysis.
| PARAMETER | DESCRIPTION | TYPE |
|---|---|---|
| content | File content | str |
| file_path | Path to the file being analyzed | Path |

| RETURNS | DESCRIPTION |
|---|---|
| CodeStructure | CodeStructure object with detected elements |
Source code in tenets/core/analysis/implementations/generic_analyzer.py
def extract_structure(self, content: str, file_path: Path) -> CodeStructure:
"""Extract basic structure from generic text.
Attempts to identify structural elements using pattern matching
and indentation analysis.
Args:
content: File content
file_path: Path to the file being analyzed
Returns:
CodeStructure object with detected elements
"""
structure = CodeStructure()
# Detect file type category
file_type = self._detect_file_type(file_path)
structure.file_type = file_type
# Detect common YAML-based frameworks/configs
try:
if file_path.suffix.lower() in [".yaml", ".yml"]:
# Initialize modules collection if not present
if not hasattr(structure, "modules"):
structure.modules = []
if self._is_docker_compose_file(file_path, content):
structure.framework = "docker-compose"
for svc in self._extract_compose_services(content):
structure.modules.append({"type": "service", **svc})
elif self._looks_like_kubernetes_yaml(content):
structure.framework = "kubernetes"
for res in self._extract_k8s_resources(content):
structure.modules.append({"type": "resource", **res})
else:
# Helm/Kustomize/GitHub Actions quick hints
name = file_path.name.lower()
if name == "chart.yaml":
structure.framework = "helm"
elif name == "values.yaml":
structure.framework = getattr(structure, "framework", None) or "helm"
elif name == "kustomization.yaml":
structure.framework = "kustomize"
elif ".github" in str(file_path).replace("\\", "/") and "/workflows/" in str(
file_path
).replace("\\", "/"):
structure.framework = "github-actions"
except Exception:
# Never fail generic structure on heuristics
pass
# Extract functions (various patterns)
function_patterns = [
r"^(?:async\s+)?(?:function|def|func|sub|proc)\s+(\w+)",
r"^(\w+)\s*\(\)\s*\{",
r"^(\w+)\s*:\s*function",
r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>",
]
for pattern in function_patterns:
for match in re.finditer(pattern, content, re.MULTILINE):
func_name = match.group(1)
structure.functions.append(
FunctionInfo(name=func_name, line=content[: match.start()].count("\n") + 1)
)
# Extract classes/types
class_patterns = [
r"^(?:export\s+)?(?:class|struct|type|interface|enum)\s+(\w+)",
r"^(\w+)\s*=\s*class\s*\{",
]
for pattern in class_patterns:
for match in re.finditer(pattern, content, re.MULTILINE):
class_name = match.group(1)
structure.classes.append(
ClassInfo(name=class_name, line=content[: match.start()].count("\n") + 1)
)
# Extract sections (markdown headers, etc.)
if file_type in ["markdown", "documentation", "markup"]:
section_pattern = r"^(#{1,6})\s+(.+)$"
for match in re.finditer(section_pattern, content, re.MULTILINE):
level = len(match.group(1))
title = match.group(2)
structure.sections.append(
{
"title": title,
"level": level,
"line": content[: match.start()].count("\n") + 1,
}
)
# Extract variables/constants
var_patterns = [
r"^(?:const|let|var|val)\s+(\w+)",
r"^(\w+)\s*[:=]\s*[^=]",
r"^export\s+(\w+)",
]
for pattern in var_patterns:
for match in re.finditer(pattern, content, re.MULTILINE):
var_name = match.group(1)
structure.variables.append(
{
"name": var_name,
"line": content[: match.start()].count("\n") + 1,
"type": "variable",
}
)
# Detect constants (UPPERCASE variables)
const_pattern = r"^([A-Z][A-Z0-9_]*)\s*[:=]"
for match in re.finditer(const_pattern, content, re.MULTILINE):
structure.constants.append(match.group(1))
# Extract TODO/FIXME comments
todo_pattern = r"(?:#|//|/\*|\*)\s*(TODO|FIXME|HACK|NOTE|XXX|BUG):\s*(.+)"
for match in re.finditer(todo_pattern, content, re.IGNORECASE):
structure.todos.append(
{
"type": match.group(1).upper(),
"message": match.group(2).strip(),
"line": content[: match.start()].count("\n") + 1,
}
)
# Count blocks (based on indentation or braces)
structure.block_count = content.count("{")
structure.indent_levels = self._analyze_indentation(content)
return structure
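For instance, on a small Markdown file (invented content; this assumes _detect_file_type classifies .md files under the markdown/documentation category, so headers are collected as sections):

```python
from pathlib import Path

from tenets.core.analysis.implementations.generic_analyzer import GenericAnalyzer

# Invented documentation sample with headers and a TODO comment marker.
doc = """\
# Overview
## Setup
# TODO: document the install flags
"""

structure = GenericAnalyzer().extract_structure(doc, Path("README.md"))
print(structure.file_type)
print([(s["level"], s["title"]) for s in structure.sections])
print(structure.todos)  # e.g. [{'type': 'TODO', 'message': ..., 'line': ...}]
print(structure.block_count, structure.indent_levels)
```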
extract_context_relevant_sections
extract_context_relevant_sections(content: str, file_path: Path, prompt_keywords: List[str], search_depth: int = 2, min_confidence: float = 0.6, max_sections: int = 10) -> Dict[str, Any]
Extract sections of documentation that reference prompt keywords/concepts.
This method identifies and extracts the most relevant parts of documentation files based on direct references and semantic similarity to prompt keywords.
| PARAMETER | DESCRIPTION | TYPE |
|---|---|---|
| content | File content | str |
| file_path | Path to the file being analyzed | Path |
| prompt_keywords | Keywords/phrases from the user's prompt | List[str] |
| search_depth | How deep to search (1=direct, 2=semantic, 3=deep analysis) | int |
| min_confidence | Minimum confidence threshold for relevance (0.0-1.0) | float |
| max_sections | Maximum number of contextual sections to preserve | int |

| RETURNS | DESCRIPTION |
|---|---|
| Dict[str, Any] | Dictionary containing relevant sections with metadata |
Source code in tenets/core/analysis/implementations/generic_analyzer.py
def extract_context_relevant_sections(
self,
content: str,
file_path: Path,
prompt_keywords: List[str],
search_depth: int = 2,
min_confidence: float = 0.6,
max_sections: int = 10,
) -> Dict[str, Any]:
"""Extract sections of documentation that reference prompt keywords/concepts.
This method identifies and extracts the most relevant parts of documentation
files based on direct references and semantic similarity to prompt keywords.
Args:
content: File content
file_path: Path to the file being analyzed
prompt_keywords: Keywords/phrases from the user's prompt
search_depth: How deep to search (1=direct, 2=semantic, 3=deep analysis)
min_confidence: Minimum confidence threshold for relevance (0.0-1.0)
max_sections: Maximum number of contextual sections to preserve
Returns:
Dictionary containing relevant sections with metadata
"""
if not prompt_keywords:
return {
"relevant_sections": [],
"metadata": {"total_sections": 0, "matched_sections": 0},
}
file_type = self._detect_file_type(file_path)
# Extract sections based on file type
sections = self._extract_document_sections(content, file_path, file_type)
# Score sections based on relevance to prompt keywords
scored_sections = []
for section in sections:
score, matches = self._calculate_section_relevance(
section, prompt_keywords, search_depth
)
if score >= min_confidence:
scored_sections.append(
{
**section,
"relevance_score": score,
"keyword_matches": matches,
"context_type": self._determine_context_type(section, matches),
}
)
# Sort by relevance and limit to max_sections
scored_sections.sort(key=lambda x: x["relevance_score"], reverse=True)
relevant_sections = scored_sections[:max_sections]
# Extract code examples and references within relevant sections
for section in relevant_sections:
section["code_examples"] = self._extract_code_examples_from_section(section)
section["api_references"] = self._extract_api_references_from_section(section)
section["config_references"] = self._extract_config_references_from_section(section)
metadata = {
"total_sections": len(sections),
"matched_sections": len(scored_sections),
"relevant_sections": len(relevant_sections),
"file_type": file_type,
"search_depth": search_depth,
"min_confidence": min_confidence,
"avg_relevance_score": (
sum(s["relevance_score"] for s in relevant_sections) / len(relevant_sections)
if relevant_sections
else 0.0
),
}
return {"relevant_sections": relevant_sections, "metadata": metadata}