strategies
¶
Full name: tenets.core.summarizer.strategies
strategies¶
Summarization strategies with NLP integration.
This module provides various summarization strategies that leverage the centralized NLP components for improved text processing and analysis.
Strategies:
- ExtractiveStrategy: Selects important sentences using NLP keyword extraction
- CompressiveStrategy: Removes redundancy using NLP tokenization
- TextRankStrategy: Graph-based ranking with NLP preprocessing
- TransformerStrategy: Neural summarization (requires ML)
- NLPEnhancedStrategy: Advanced strategy using all NLP features
Classes¶
SummarizationStrategy¶
Bases: ABC
Abstract base class for summarization strategies.
Functions¶
summarize (abstractmethod)
¶
summarize(text: str, target_ratio: float = 0.3, max_length: Optional[int] = None, min_length: Optional[int] = None) -> str
Summarize text.
PARAMETER | DESCRIPTION |
---|---|
text | Input text TYPE: str |
target_ratio | Target compression ratio TYPE: float DEFAULT: 0.3 |
max_length | Maximum summary length TYPE: Optional[int] DEFAULT: None |
min_length | Minimum summary length TYPE: Optional[int] DEFAULT: None |
RETURNS | DESCRIPTION |
---|---|
str | Summarized text |
Source code in tenets/core/summarizer/strategies.py
@abstractmethod
def summarize(
    self,
    text: str,
    target_ratio: float = 0.3,
    max_length: Optional[int] = None,
    min_length: Optional[int] = None,
) -> str:
    """Produce a summary of ``text``.

    This is the abstract declaration of the strategy interface; it carries
    no behavior of its own and concrete strategies provide the body.

    Args:
        text: Input text
        target_ratio: Target compression ratio
        max_length: Maximum summary length
        min_length: Minimum summary length

    Returns:
        Summarized text
    """
ExtractiveStrategy¶
Bases: SummarizationStrategy
Extractive summarization using NLP components.
Selects the most important sentences based on keyword density, position, and optionally semantic similarity. Uses centralized NLP components for improved sentence scoring.
Initialize extractive strategy.
PARAMETER | DESCRIPTION |
---|---|
use_nlp | Whether to use NLP components for enhanced extraction TYPE: bool DEFAULT: True |
Source code in tenets/core/summarizer/strategies.py
def __init__(self, use_nlp: bool = True):
    """Set up the extractive strategy.

    Args:
        use_nlp: Whether to use NLP components for enhanced extraction.
            The request is only honored when NLP_AVAILABLE is truthy.
    """
    self.logger = get_logger(__name__)
    self.use_nlp = use_nlp and NLP_AVAILABLE

    # Basic mode: no NLP helpers are created on the instance.
    if not self.use_nlp:
        self.logger.info("ExtractiveStrategy using basic extraction")
        return

    # NLP mode: the keyword extractor uses the "prompt" stopword set
    # (the aggressive one, suited to summarization).
    self.keyword_extractor = KeywordExtractor(use_stopwords=True, stopword_set="prompt")
    self.tokenizer = TextTokenizer(use_stopwords=True)
    self.logger.info("ExtractiveStrategy using NLP components")
Functions¶
summarize¶
summarize(text: str, target_ratio: float = 0.3, max_length: Optional[int] = None, min_length: Optional[int] = None) -> str
Extract important sentences to create summary.
PARAMETER | DESCRIPTION |
---|---|
text | Input text TYPE: str |
target_ratio | Target compression ratio TYPE: float DEFAULT: 0.3 |
max_length | Maximum summary length TYPE: Optional[int] DEFAULT: None |
min_length | Minimum summary length TYPE: Optional[int] DEFAULT: None |
RETURNS | DESCRIPTION |
---|---|
str | Extractive summary |
Source code in tenets/core/summarizer/strategies.py
def summarize(
    self,
    text: str,
    target_ratio: float = 0.3,
    max_length: Optional[int] = None,
    min_length: Optional[int] = None,
) -> str:
    """Build a summary out of the highest-scoring sentences of ``text``.

    Args:
        text: Input text
        target_ratio: Target compression ratio
        max_length: Maximum summary length
        min_length: Minimum summary length

    Returns:
        Extractive summary (the selected sentences joined by single
        spaces), or ``text`` unchanged when no sentences are found.
    """
    candidates = self._split_sentences(text)
    if not candidates:
        return text

    # The NLP scorer also receives the full text; the basic scorer only
    # sees the sentence list.
    sentence_scores = (
        self._score_sentences_nlp(candidates, text)
        if self.use_nlp
        else self._score_sentences_basic(candidates)
    )

    # Character budget: a fraction of the input, optionally capped.
    budget = int(len(text) * target_ratio)
    if max_length:
        budget = min(budget, max_length)

    return " ".join(self._select_sentences(candidates, sentence_scores, budget, min_length))
CompressiveStrategy¶
Bases: SummarizationStrategy
Compressive summarization using NLP tokenization.
Removes redundant words and phrases while maintaining meaning. Uses NLP tokenizer for better word processing.
Initialize compressive strategy.
PARAMETER | DESCRIPTION |
---|---|
use_nlp | Whether to use NLP components TYPE: bool DEFAULT: True |
Source code in tenets/core/summarizer/strategies.py
def __init__(self, use_nlp: bool = True):
    """Set up the compressive strategy.

    Args:
        use_nlp: Whether to use NLP components. The request is only
            honored when NLP_AVAILABLE is truthy.
    """
    self.logger = get_logger(__name__)
    self.use_nlp = use_nlp and NLP_AVAILABLE

    # Without NLP nothing else is initialized (and nothing is logged).
    if not self.use_nlp:
        return

    self.tokenizer = TextTokenizer(use_stopwords=True)
    manager = StopwordManager()
    self.stopword_manager = manager
    self.stopwords = manager.get_set("prompt")
    self.logger.info("CompressiveStrategy using NLP components")
Functions¶
summarize¶
summarize(text: str, target_ratio: float = 0.3, max_length: Optional[int] = None, min_length: Optional[int] = None) -> str
Compress text by removing redundancy.
PARAMETER | DESCRIPTION |
---|---|
text | Input text TYPE: str |
target_ratio | Target compression ratio TYPE: float DEFAULT: 0.3 |
max_length | Maximum summary length TYPE: Optional[int] DEFAULT: None |
min_length | Minimum summary length TYPE: Optional[int] DEFAULT: None |
RETURNS | DESCRIPTION |
---|---|
str | Compressed text |
Source code in tenets/core/summarizer/strategies.py
def summarize(
    self,
    text: str,
    target_ratio: float = 0.3,
    max_length: Optional[int] = None,
    min_length: Optional[int] = None,
) -> str:
    """Compress text by removing redundancy.

    Sentences are processed in document order. Each one is compressed
    against the set of concepts already seen, and processing stops once
    the accumulated length reaches the target. If the result is shorter
    than ``min_length``, the sentences that were not yet processed are
    appended uncompressed until the minimum is met.

    Args:
        text: Input text
        target_ratio: Target compression ratio
        max_length: Maximum summary length
        min_length: Minimum summary length

    Returns:
        Compressed text (sentence fragments joined by single spaces;
        the terminators used for splitting are not re-inserted).
    """
    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]

    target_length = int(len(text) * target_ratio)
    if max_length:
        target_length = min(target_length, max_length)

    compressed = []
    seen_concepts = set()
    current_length = 0
    # Number of source sentences consumed so far. This is tracked
    # separately from len(compressed) because sentences that compress to
    # nothing are consumed without being added to the output.
    consumed = 0

    for consumed, sentence in enumerate(sentences, start=1):
        if self.use_nlp:
            compressed_sent = self._compress_sentence_nlp(sentence, seen_concepts)
        else:
            compressed_sent = self._compress_sentence_basic(sentence, seen_concepts)

        if not compressed_sent:
            continue

        compressed.append(compressed_sent)
        current_length += len(compressed_sent)

        # Remember what this sentence contributed so later sentences can
        # be compressed against it.
        if self.use_nlp:
            seen_concepts.update(self.tokenizer.tokenize(compressed_sent))
        else:
            seen_concepts.update(compressed_sent.lower().split())

        if current_length >= target_length:
            break

    result = " ".join(compressed)

    # Pad with not-yet-processed sentences when the summary is too short.
    # Slicing from ``consumed`` (not len(compressed)) avoids re-adding a
    # sentence that is already part of the summary.
    if min_length and len(result) < min_length:
        padded_length = len(result)
        for sentence in sentences[consumed:]:
            # +1 for the joining space, except for the very first item.
            padded_length += len(sentence) + (1 if compressed else 0)
            compressed.append(sentence)
            if padded_length >= min_length:
                break
        result = " ".join(compressed)

    return result
TextRankStrategy¶
Bases: SummarizationStrategy
TextRank summarization with NLP preprocessing.
Graph-based ranking algorithm that uses NLP components for better text preprocessing and similarity computation.
Initialize TextRank strategy.
PARAMETER | DESCRIPTION |
---|---|
use_nlp | Whether to use NLP components TYPE: bool DEFAULT: True |
Source code in tenets/core/summarizer/strategies.py
def __init__(self, use_nlp: bool = True):
    """Set up the TextRank strategy.

    Args:
        use_nlp: Whether to use NLP components. Only honored when both
            NLP_AVAILABLE and SKLEARN_AVAILABLE are truthy.

    Raises:
        ImportError: If SKLEARN_AVAILABLE is falsy.
    """
    self.logger = get_logger(__name__)
    nlp_enabled = use_nlp and NLP_AVAILABLE and SKLEARN_AVAILABLE
    self.use_nlp = nlp_enabled

    # scikit-learn is mandatory for this strategy regardless of use_nlp.
    if not SKLEARN_AVAILABLE:
        raise ImportError(
            "TextRank requires scikit-learn. Install with: pip install scikit-learn"
        )

    if not nlp_enabled:
        return

    self.tfidf_calc = TFIDFCalculator(use_stopwords=True)
    self.logger.info("TextRankStrategy using NLP components")
Functions¶
summarize¶
summarize(text: str, target_ratio: float = 0.3, max_length: Optional[int] = None, min_length: Optional[int] = None) -> str
Summarize using TextRank algorithm.
PARAMETER | DESCRIPTION |
---|---|
text | Input text TYPE: str |
target_ratio | Target compression ratio TYPE: float DEFAULT: 0.3 |
max_length | Maximum summary length TYPE: Optional[int] DEFAULT: None |
min_length | Minimum summary length TYPE: Optional[int] DEFAULT: None |
RETURNS | DESCRIPTION |
---|---|
str | TextRank summary |
Source code in tenets/core/summarizer/strategies.py
def summarize(
    self,
    text: str,
    target_ratio: float = 0.3,
    max_length: Optional[int] = None,
    min_length: Optional[int] = None,
) -> str:
    """Rank sentences with TextRank and keep the best ones.

    Args:
        text: Input text
        target_ratio: Target compression ratio
        max_length: Maximum summary length
        min_length: Minimum summary length

    Returns:
        TextRank summary: the chosen sentences in their original order,
        joined by single spaces. Texts with two or fewer sentences are
        returned unchanged.
    """
    sentences = [part.strip() for part in re.split(r"[.!?]+", text) if part.strip()]
    if len(sentences) <= 2:
        return text

    # Pairwise sentence similarity, then PageRank-style scoring.
    build_matrix = (
        self._build_similarity_matrix_nlp
        if self.use_nlp
        else self._build_similarity_matrix_sklearn
    )
    scores = self._calculate_scores(build_matrix(sentences))

    budget = int(len(text) * target_ratio)
    if max_length:
        budget = min(budget, max_length)

    # Highest score first; ties keep document order (stable sort).
    ranking = sorted(
        zip(scores, range(len(sentences))), key=lambda pair: pair[0], reverse=True
    )

    chosen = []
    used = 0
    for _, position in ranking:
        size = len(sentences[position])
        fits_budget = used + size <= budget
        below_minimum = min_length and used < min_length
        # Stop at the first sentence that neither fits the budget nor is
        # needed to reach the minimum length.
        if not fits_budget and not below_minimum:
            break
        chosen.append(position)
        used += size

    # Emit in document order rather than rank order.
    return " ".join(sentences[position] for position in sorted(chosen))
TransformerStrategy¶
Bases: SummarizationStrategy
Transformer-based neural summarization.
Uses pre-trained transformer models for high-quality abstractive summarization.
Initialize transformer strategy.
PARAMETER | DESCRIPTION |
---|---|
model_name | HuggingFace model name TYPE: str DEFAULT: 'facebook/bart-large-cnn' |
Source code in tenets/core/summarizer/strategies.py
def __init__(self, model_name: str = "facebook/bart-large-cnn"):
    """Set up the transformer strategy and initialize its model.

    Args:
        model_name: HuggingFace model name

    Raises:
        ImportError: If TRANSFORMERS_AVAILABLE is falsy.
    """
    self.logger = get_logger(__name__)

    if not TRANSFORMERS_AVAILABLE:
        raise ImportError("Transformers not available. Install with: pip install transformers")

    # Start with no pipeline object; _initialize_model() is called last,
    # once model_name is in place.
    self.summarizer = None
    self.model_name = model_name
    self._initialize_model()
Functions¶
summarize¶
summarize(text: str, target_ratio: float = 0.3, max_length: Optional[int] = None, min_length: Optional[int] = None) -> str
Summarize using transformer model.
PARAMETER | DESCRIPTION |
---|---|
text | Input text TYPE: str |
target_ratio | Target compression ratio TYPE: float DEFAULT: 0.3 |
max_length | Maximum summary length TYPE: Optional[int] DEFAULT: None |
min_length | Minimum summary length TYPE: Optional[int] DEFAULT: None |
RETURNS | DESCRIPTION |
---|---|
str | Neural summary |
Source code in tenets/core/summarizer/strategies.py
def summarize(
    self,
    text: str,
    target_ratio: float = 0.3,
    max_length: Optional[int] = None,
    min_length: Optional[int] = None,
) -> str:
    """Summarize using transformer model.

    Character-based length targets are converted to approximate token
    counts before calling the model. If the model call fails for any
    reason, the error is logged and an extractive summary is returned.

    Args:
        text: Input text
        target_ratio: Target compression ratio
        max_length: Maximum summary length
        min_length: Minimum summary length

    Returns:
        Neural summary, or an extractive summary if the model call fails.

    Raises:
        RuntimeError: If the model has not been initialized.
    """
    if not self.summarizer:
        raise RuntimeError("Transformer model not initialized")

    # Character targets.
    target_max = int(len(text) * target_ratio)
    if max_length:
        target_max = min(target_max, max_length)
    target_min = min_length or int(target_max * 0.5)

    # Convert to model tokens (roughly 1 token = 4 chars). The upper bound
    # is capped at 512 tokens and floored at 1 so very short inputs do not
    # request a zero-length summary.
    max_tokens = max(1, min(target_max // 4, 512))
    # The lower bound must never exceed the upper bound; without this
    # clamp, long inputs (where the 512 cap applies) or a min_length larger
    # than max_length would produce an infeasible min > max request.
    min_tokens = min(target_min // 4, max_tokens)

    try:
        result = self.summarizer(
            text, max_length=max_tokens, min_length=min_tokens, do_sample=False
        )
        return result[0]["summary_text"]
    except Exception as e:
        # Deliberate best-effort: any model failure degrades to extractive
        # summarization instead of propagating.
        self.logger.error(f"Transformer summarization failed: {e}")
        extractive = ExtractiveStrategy()
        return extractive.summarize(text, target_ratio, max_length, min_length)
NLPEnhancedStrategy¶
Bases: SummarizationStrategy
Advanced summarization using all NLP features.
Combines multiple NLP components for advanced extractive summarization with semantic understanding.
Initialize NLP-enhanced strategy.
Source code in tenets/core/summarizer/strategies.py
def __init__(self):
    """Set up the NLP-enhanced strategy.

    Creates the keyword extractor, tokenizer and TF-IDF calculator, then
    attempts to set up embedding-based semantic similarity. A failure in
    that last step is logged as a warning and embeddings are disabled.

    Raises:
        ImportError: If NLP_AVAILABLE is falsy.
    """
    self.logger = get_logger(__name__)

    if not NLP_AVAILABLE:
        raise ImportError("NLP components not available")

    # Core NLP helpers (always required).
    self.keyword_extractor = KeywordExtractor(
        use_yake=True,
        use_stopwords=True,
        stopword_set="prompt",
    )
    self.tokenizer = TextTokenizer(use_stopwords=True)
    self.tfidf_calc = TFIDFCalculator(use_stopwords=True)

    # Optional: embeddings for semantic similarity. Any error here is
    # treated as "embeddings unavailable" rather than a fatal problem.
    try:
        model = create_embedding_model()
        self.embedding_model = model
        self.semantic_sim = SemanticSimilarity(self.embedding_model)
        self.use_embeddings = True
        self.logger.info("NLPEnhancedStrategy using embeddings")
    except Exception as e:
        self.logger.warning(f"Embeddings not available: {e}")
        self.use_embeddings = False
Functions¶
summarize¶
summarize(text: str, target_ratio: float = 0.3, max_length: Optional[int] = None, min_length: Optional[int] = None) -> str
Summarize using comprehensive NLP analysis.
PARAMETER | DESCRIPTION |
---|---|
text | Input text TYPE: str |
target_ratio | Target compression ratio TYPE: float DEFAULT: 0.3 |
max_length | Maximum summary length TYPE: Optional[int] DEFAULT: None |
min_length | Minimum summary length TYPE: Optional[int] DEFAULT: None |
RETURNS | DESCRIPTION |
---|---|
str | NLP-enhanced summary |
Source code in tenets/core/summarizer/strategies.py
def summarize(
    self,
    text: str,
    target_ratio: float = 0.3,
    max_length: Optional[int] = None,
    min_length: Optional[int] = None,
) -> str:
    """Summarize using comprehensive NLP analysis.

    Each sentence receives a score combining keyword relevance, position,
    TF-IDF similarity to the whole text and, when embeddings are enabled,
    semantic similarity to the whole text. The scored sentences are then
    passed to ``_select_diverse_sentences``.

    Args:
        text: Input text
        target_ratio: Target compression ratio
        max_length: Maximum summary length
        min_length: Minimum summary length

    Returns:
        NLP-enhanced summary (selected sentences joined by single
        spaces), or ``text`` unchanged when no sentences are found.
    """
    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
    if not sentences:
        return text

    total = len(sentences)

    # Key concepts of the whole document, as a keyword -> score lookup.
    keywords = self.keyword_extractor.extract(text, max_keywords=20, include_scores=True)
    keyword_dict = dict(keywords) if keywords else {}

    scores = []
    for i, sentence in enumerate(sentences):
        score = 0.0

        # 1. Keyword relevance (weight 0.3): mean keyword score per token.
        tokens = self.tokenizer.tokenize(sentence)
        if tokens:
            keyword_score = sum(keyword_dict.get(t, 0) for t in tokens) / len(tokens)
            score += keyword_score * 0.3

        # 2. Position: first sentence gets 0.2, last gets 0.1, the rest
        #    decay linearly from just under 0.1.
        if i == 0:
            score += 0.2
        elif i == total - 1:
            score += 0.1
        else:
            score += (1.0 - i / total) * 0.1

        # 3. Register the sentence for TF-IDF; its score is added in the
        #    second pass below.
        # NOTE(review): self.tfidf_calc is shared across calls, so
        # documents registered by earlier summarize() calls may still be
        # present - confirm whether add_document overwrites or accumulates.
        self.tfidf_calc.add_document(f"sent_{i}", sentence)

        # 4. Semantic similarity to the document (weight 0.25), best
        #    effort: a failure skips this component for the sentence but
        #    is logged instead of being silently discarded.
        if self.use_embeddings:
            try:
                doc_sim = self.semantic_sim.compute(sentence, text)
                score += doc_sim * 0.25
            except Exception as e:
                self.logger.debug(f"Semantic similarity failed for sentence {i}: {e}")

        scores.append(score)

    # TF-IDF relevance (weight 0.25), computed only after every sentence
    # has been registered with the calculator.
    for i in range(total):
        tfidf_score = self.tfidf_calc.compute_similarity(text, f"sent_{i}")
        scores[i] += tfidf_score * 0.25

    # Character budget: a fraction of the input, optionally capped.
    target_length = int(len(text) * target_ratio)
    if max_length:
        target_length = min(target_length, max_length)

    selected = self._select_diverse_sentences(sentences, scores, target_length, min_length)
    return " ".join(selected)