ml_utils¶
Full name: tenets.core.nlp.ml_utils
Machine learning utilities for ranking.
This module provides ML-based ranking capabilities using NLP components. All embedding and similarity logic is handled by the NLP package to avoid duplication.
Classes¶
EmbeddingModel¶
EmbeddingModel(model_name: str = 'all-MiniLM-L6-v2', cache_dir: Optional[Path] = None, device: Optional[str] = None)
Wrapper for embedding models using NLP components.
Provides a unified interface for different embedding models with built-in caching and batch processing capabilities.
Initialize embedding model.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| model_name | str | Name of the model to load |
| cache_dir | Optional[Path] | Directory for caching embeddings |
| device | Optional[str] | Device to run on ('cpu', 'cuda', or None for auto) |
Source code in tenets/core/nlp/ml_utils.py
```python
def __init__(
    self,
    model_name: str = "all-MiniLM-L6-v2",
    cache_dir: Optional[Path] = None,
    device: Optional[str] = None,
):
    """Initialize embedding model.

    Args:
        model_name: Name of the model to load
        cache_dir: Directory for caching embeddings
        device: Device to run on ('cpu', 'cuda', or None for auto)
    """
    self.logger = get_logger(__name__)
    self.model_name = model_name
    self.cache_dir = cache_dir
    self.device = device
    self.model = None

    if not ML_AVAILABLE:
        self.logger.warning(
            "ML features not available. Install with: pip install sentence-transformers"
        )
        return

    # Load model using NLP package
    try:
        self.model = LocalEmbeddings(model_name=model_name, device=device, cache_dir=cache_dir)
        self.logger.info(f"Loaded embedding model: {model_name}")
    except Exception as e:
        self.logger.error(f"Failed to load embedding model: {e}")
```
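A minimal construction sketch based on the signature above; the cache directory is an illustrative path, not a library default:

```python
from pathlib import Path

from tenets.core.nlp.ml_utils import EmbeddingModel

# Defaults to all-MiniLM-L6-v2; degrades gracefully when ML extras are missing.
model = EmbeddingModel(
    model_name="all-MiniLM-L6-v2",
    cache_dir=Path(".tenets_cache"),  # hypothetical cache location
    device=None,  # auto-select CPU/CUDA
)
if model.model is None:
    # sentence-transformers is not installed; encode() will use the TF-IDF fallback
    print("ML extras unavailable")
```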
Functions¶
encode¶
encode(texts: Union[str, List[str]], batch_size: int = 32, show_progress: bool = False, use_cache: bool = True) -> Union[list, Any]
Encode texts to embeddings.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| texts | Union[str, List[str]] | Text or list of texts to encode |
| batch_size | int | Batch size for encoding |
| show_progress | bool | Show progress bar |
| use_cache | bool | Use cached embeddings if available |

| RETURNS | DESCRIPTION |
| --- | --- |
| Union[list, Any] | NumPy array of embeddings, or a fallback list |
Source code in tenets/core/nlp/ml_utils.py
```python
def encode(
    self,
    texts: Union[str, List[str]],
    batch_size: int = 32,
    show_progress: bool = False,
    use_cache: bool = True,
) -> Union[list, Any]:  # Returns list or numpy array
    """Encode texts to embeddings.

    Args:
        texts: Text or list of texts to encode
        batch_size: Batch size for encoding
        show_progress: Show progress bar
        use_cache: Use cached embeddings if available

    Returns:
        Numpy array of embeddings or fallback list
    """
    if not self.model:
        # Fallback to TF-IDF
        return self._tfidf_fallback(texts)

    return self.model.encode(texts, batch_size=batch_size, show_progress=show_progress)
```
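A short encoding sketch (the sample strings are made up). With ML extras installed the result is a NumPy array; otherwise the TF-IDF fallback returns a plain list:

```python
from tenets.core.nlp.ml_utils import EmbeddingModel

model = EmbeddingModel()  # defaults to all-MiniLM-L6-v2

# One embedding per input text; input order is preserved.
embeddings = model.encode(
    ["def parse(path): ...", "class Cache: ..."],
    batch_size=32,
    show_progress=False,
)
print(len(embeddings))  # 2
```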
NeuralReranker¶
NeuralReranker(model_name: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2')
Neural reranking model for improved ranking.
Uses cross-encoder models to rerank initial results for better accuracy. This is more accurate than bi-encoders but slower.
Initialize reranker.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| model_name | str | Cross-encoder model name |
Source code in tenets/core/nlp/ml_utils.py
```python
def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
    """Initialize reranker.

    Args:
        model_name: Cross-encoder model name
    """
    self.logger = get_logger(__name__)
    self.model_name = model_name
    self.model = None

    if not ML_AVAILABLE:
        self.logger.warning("Cross-encoder reranking not available without ML dependencies")
        return

    self._load_model()
```
Functions¶
rerank¶
rerank(query: str, documents: List[Tuple[str, float]], top_k: int = 10) -> List[Tuple[str, float]]
Rerank documents using cross-encoder.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| query | str | Query text |
| documents | List[Tuple[str, float]] | List of (document_text, initial_score) tuples |
| top_k | int | Number of top results to rerank |

| RETURNS | DESCRIPTION |
| --- | --- |
| List[Tuple[str, float]] | Reranked list of (document_text, score) tuples |
Source code in tenets/core/nlp/ml_utils.py
```python
def rerank(
    self, query: str, documents: List[Tuple[str, float]], top_k: int = 10
) -> List[Tuple[str, float]]:
    """Rerank documents using cross-encoder.

    Args:
        query: Query text
        documents: List of (document_text, initial_score) tuples
        top_k: Number of top results to rerank

    Returns:
        Reranked list of (document_text, score) tuples
    """
    if not self.model or not documents:
        return documents

    try:
        # Take top-K for reranking
        docs_to_rerank = documents[:top_k]
        remaining_docs = documents[top_k:]

        # Prepare pairs for cross-encoder
        pairs = [(query, doc[0]) for doc in docs_to_rerank]

        # Get reranking scores
        scores = self.model.predict(pairs)

        # Combine with original scores (weighted average)
        reranked = []
        for i, (doc_text, orig_score) in enumerate(docs_to_rerank):
            # Combine original and reranking scores
            combined_score = 0.3 * orig_score + 0.7 * scores[i]
            reranked.append((doc_text, combined_score))

        # Sort by new scores
        reranked.sort(key=lambda x: x[1], reverse=True)

        # Append remaining documents
        reranked.extend(remaining_docs)

        return reranked
    except Exception as e:
        self.logger.warning(f"Reranking failed: {e}")
        return documents
```
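A usage sketch with toy documents and scores. Note from the source above that only the top_k head is rescored (0.3 x original + 0.7 x cross-encoder) while the tail keeps its raw scores, so head and tail scores are not directly comparable:

```python
from tenets.core.nlp.ml_utils import NeuralReranker

reranker = NeuralReranker()  # cross-encoder/ms-marco-MiniLM-L-6-v2 by default

candidates = [
    ("Parses configuration files for the CLI.", 0.82),
    ("Handles HTTP retries with backoff.", 0.79),
    ("Loads and validates YAML config files.", 0.75),
]
# Without ML dependencies, rerank() returns the input unchanged.
reranked = reranker.rerank("config file parsing", candidates, top_k=3)
for text, score in reranked:
    print(f"{score:.3f}  {text}")
```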
Functions¶
cosine_similarity¶
cosine_similarity(vec1, vec2) -> float
Compute cosine similarity between two vectors.
| PARAMETER | DESCRIPTION |
| --- | --- |
| vec1 | First vector (can be a list, array, or dict for sparse vectors) |
| vec2 | Second vector (can be a list, array, or dict for sparse vectors) |

| RETURNS | DESCRIPTION |
| --- | --- |
| float | Cosine similarity (-1 to 1) |
Source code in tenets/core/nlp/similarity.py
```python
def cosine_similarity(vec1, vec2) -> float:
    """Compute cosine similarity between two vectors.

    Args:
        vec1: First vector (can be list, array, or dict for sparse vectors)
        vec2: Second vector (can be list, array, or dict for sparse vectors)

    Returns:
        Cosine similarity (-1 to 1)
    """
    # Check if inputs are sparse vectors (dicts)
    if isinstance(vec1, dict) and isinstance(vec2, dict):
        return sparse_cosine_similarity(vec1, vec2)

    # Handle different input types for dense vectors
    vec1 = np.asarray(vec1).flatten()
    vec2 = np.asarray(vec2).flatten()

    # Check dimensions
    if vec1.shape != vec2.shape:
        raise ValueError(f"Vectors must have same shape: {vec1.shape} != {vec2.shape}")

    # Compute cosine similarity
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)

    if norm1 == 0 or norm2 == 0:
        return 0.0

    similarity = dot_product / (norm1 * norm2)

    # Clamp to [-1, 1] to handle floating point errors
    return float(np.clip(similarity, -1.0, 1.0))
```
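A quick sketch of the two input forms the function accepts; the import path follows the source file noted above:

```python
from tenets.core.nlp.similarity import cosine_similarity

# Dense vectors: any same-shape array-likes.
print(cosine_similarity([1.0, 0.0, 1.0], [1.0, 1.0, 0.0]))  # 0.5

# Sparse vectors: term -> weight dicts are routed to sparse_cosine_similarity.
a = {"cache": 0.8, "parse": 0.2}
b = {"cache": 0.5, "http": 0.5}
print(cosine_similarity(a, b))
```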
load_embedding_model¶
load_embedding_model(model_name: Optional[str] = None, cache_dir: Optional[Path] = None, device: Optional[str] = None) -> Optional[EmbeddingModel]
Load an embedding model.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| model_name | Optional[str] | Model name (default: all-MiniLM-L6-v2) |
| cache_dir | Optional[Path] | Directory for caching |
| device | Optional[str] | Device to run on |

| RETURNS | DESCRIPTION |
| --- | --- |
| Optional[EmbeddingModel] | EmbeddingModel instance or None if unavailable |
Source code in tenets/core/nlp/ml_utils.py
```python
def load_embedding_model(
    model_name: Optional[str] = None, cache_dir: Optional[Path] = None, device: Optional[str] = None
) -> Optional[EmbeddingModel]:
    """Load an embedding model.

    Args:
        model_name: Model name (default: all-MiniLM-L6-v2)
        cache_dir: Directory for caching
        device: Device to run on

    Returns:
        EmbeddingModel instance or None if unavailable
    """
    logger = get_logger(__name__)

    if not ML_AVAILABLE:
        logger.warning("ML features not available. Install with: pip install sentence-transformers")
        return None

    try:
        model_name = model_name or "all-MiniLM-L6-v2"
        return EmbeddingModel(model_name, cache_dir, device)
    except Exception as e:
        logger.error(f"Failed to load embedding model: {e}")
        return None
```
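A minimal sketch of the factory; because it returns None without ML extras, callers should check before use:

```python
from tenets.core.nlp.ml_utils import load_embedding_model

model = load_embedding_model()  # defaults to all-MiniLM-L6-v2
if model is None:
    print("Install ML extras: pip install sentence-transformers")
else:
    vectors = model.encode(["hello world"])
```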
compute_similarity¶
compute_similarity(model: EmbeddingModel, text1: str, text2: str, cache: Optional[Dict[str, Any]] = None) -> float
Compute semantic similarity between two texts.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| model | EmbeddingModel | Embedding model |
| text1 | str | First text |
| text2 | str | Second text |
| cache | Optional[Dict[str, Any]] | Optional cache dictionary (unused; kept for API compatibility) |

| RETURNS | DESCRIPTION |
| --- | --- |
| float | Similarity score (0-1) |
Source code in tenets/core/nlp/ml_utils.py
```python
def compute_similarity(
    model: EmbeddingModel, text1: str, text2: str, cache: Optional[Dict[str, Any]] = None
) -> float:
    """Compute semantic similarity between two texts.

    Args:
        model: Embedding model
        text1: First text
        text2: Second text
        cache: Optional cache dictionary (unused, for API compatibility)

    Returns:
        Similarity score (0-1)
    """
    if not model or not model.model:
        return 0.0

    try:
        # Use NLP similarity computation
        similarity_calc = SemanticSimilarity(model.model)
        return similarity_calc.compute(text1, text2)
    except Exception as e:
        logger = get_logger(__name__)
        logger.warning(f"Similarity computation failed: {e}")
        return 0.0
```
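A pairwise similarity sketch (the example texts are illustrative):

```python
from tenets.core.nlp.ml_utils import compute_similarity, load_embedding_model

model = load_embedding_model()
# Returns 0.0 when the model (or its backing ML model) is unavailable.
score = compute_similarity(model, "read config from disk", "load settings file")
print(f"similarity: {score:.3f}")  # in [0, 1]
```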
batch_similarity¶
batch_similarity(model: EmbeddingModel, query: str, documents: List[str], batch_size: int = 32) -> List[float]
Compute similarity between query and multiple documents.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| model | EmbeddingModel | Embedding model |
| query | str | Query text |
| documents | List[str] | List of documents |
| batch_size | int | Batch size for encoding |

| RETURNS | DESCRIPTION |
| --- | --- |
| List[float] | List of similarity scores |
Source code in tenets/core/nlp/ml_utils.py
```python
def batch_similarity(
    model: EmbeddingModel, query: str, documents: List[str], batch_size: int = 32
) -> List[float]:
    """Compute similarity between query and multiple documents.

    Args:
        model: Embedding model
        query: Query text
        documents: List of documents
        batch_size: Batch size for encoding

    Returns:
        List of similarity scores
    """
    if not model or not model.model or not documents:
        return [0.0] * len(documents)

    try:
        # Use NLP batch similarity
        similarity_calc = SemanticSimilarity(model.model)
        results = similarity_calc.compute_batch(query, documents)

        # Convert to list of scores in original order
        score_dict = dict(results)
        return [score_dict.get(i, 0.0) for i in range(len(documents))]
    except Exception as e:
        logger = get_logger(__name__)
        logger.warning(f"Batch similarity computation failed: {e}")
        return [0.0] * len(documents)
```
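A one-query-to-many-documents sketch; scores come back in the original document order, so they can be zipped directly:

```python
from tenets.core.nlp.ml_utils import batch_similarity, load_embedding_model

model = load_embedding_model()
docs = ["HTTP client with retries", "YAML config loader", "LRU cache helper"]
scores = batch_similarity(model, "configuration parsing", docs, batch_size=32)
for doc, score in sorted(zip(docs, scores), key=lambda p: p[1], reverse=True):
    print(f"{score:.3f}  {doc}")
```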
check_ml_dependencies¶
check_ml_dependencies() -> Dict[str, bool]
Check which ML dependencies are available.
| RETURNS | DESCRIPTION |
| --- | --- |
| Dict[str, bool] | Dictionary of dependency availability |
Source code in tenets/core/nlp/ml_utils.py
```python
def check_ml_dependencies() -> Dict[str, bool]:
    """Check which ML dependencies are available.

    Returns:
        Dictionary of dependency availability
    """
    deps = {
        "sentence_transformers": ML_AVAILABLE,
        "torch": False,
        "transformers": False,
        "sklearn": False,
    }

    try:
        import torch
        deps["torch"] = True
    except ImportError:
        pass

    try:
        import transformers
        deps["transformers"] = True
    except ImportError:
        pass

    try:
        import sklearn
        deps["sklearn"] = True
    except ImportError:
        pass

    return deps
```
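A quick diagnostic sketch built on the checker above:

```python
from tenets.core.nlp.ml_utils import check_ml_dependencies

deps = check_ml_dependencies()
missing = [name for name, ok in deps.items() if not ok]
if missing:
    print(f"Missing optional ML dependencies: {', '.join(missing)}")
```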
get_available_models¶
get_available_models() -> List[str]
Get list of available embedding models.
| RETURNS | DESCRIPTION |
| --- | --- |
| List[str] | List of model names |
Source code in tenets/core/nlp/ml_utils.py
```python
def get_available_models() -> List[str]:
    """Get list of available embedding models.

    Returns:
        List of model names
    """
    models = []

    if ML_AVAILABLE:
        # Common small models
        models.extend(
            [
                "all-MiniLM-L6-v2",
                "all-MiniLM-L12-v2",
                "all-mpnet-base-v2",
                "multi-qa-MiniLM-L6-cos-v1",
                "paraphrase-MiniLM-L6-v2",
            ]
        )

    # Always available fallback
    models.append("tfidf")

    return models
```
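For example, listing what the current install can use; "tfidf" is always present as the fallback:

```python
from tenets.core.nlp.ml_utils import get_available_models

print(get_available_models())
# Without ML extras: ['tfidf']
# With ML extras: five sentence-transformers models, then 'tfidf'
```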
estimate_embedding_memory¶
estimate_embedding_memory(num_files: int, embedding_dim: int = 384) -> Dict[str, float]
Estimate memory requirements for embeddings.
| PARAMETER | TYPE | DESCRIPTION |
| --- | --- | --- |
| num_files | int | Number of files to embed |
| embedding_dim | int | Dimension of embeddings |

| RETURNS | DESCRIPTION |
| --- | --- |
| Dict[str, float] | Dictionary with memory estimates |
Source code in tenets/core/nlp/ml_utils.py
```python
def estimate_embedding_memory(num_files: int, embedding_dim: int = 384) -> Dict[str, float]:
    """Estimate memory requirements for embeddings.

    Args:
        num_files: Number of files to embed
        embedding_dim: Dimension of embeddings

    Returns:
        Dictionary with memory estimates
    """
    # Assume float32 (4 bytes per value)
    bytes_per_embedding = embedding_dim * 4
    total_bytes = num_files * bytes_per_embedding

    return {
        "per_file_mb": bytes_per_embedding / (1024 * 1024),
        "total_mb": total_bytes / (1024 * 1024),
        "total_gb": total_bytes / (1024 * 1024 * 1024),
    }
```
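A worked example with the default 384-dimensional float32 embeddings: each vector takes 384 x 4 = 1,536 bytes, so 100,000 files need roughly 146.5 MB:

```python
from tenets.core.nlp.ml_utils import estimate_embedding_memory

est = estimate_embedding_memory(num_files=100_000, embedding_dim=384)
print(f"{est['total_mb']:.1f} MB")  # ~146.5 MB (100,000 * 1,536 bytes)
```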