embeddings¶
Full name: tenets.core.nlp.embeddings
Embedding generation and management.
This module provides local embedding generation using sentence transformers. No external API calls are made - everything runs locally.
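A quick-start sketch using the module's factory function (documented below); a single string yields a 1-D vector, a list yields a 2-D array:

```python
from tenets.core.nlp.embeddings import create_embedding_model

# Uses LocalEmbeddings when sentence-transformers is installed,
# otherwise falls back to TF-IDF pseudo-embeddings.
model = create_embedding_model(prefer_local=True)

vec = model.encode("def parse(tokens): ...")            # 1-D array
batch = model.encode(["read a file", "open a socket"])  # 2-D array, one row per text
```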
Classes¶
EmbeddingModel¶
Base class for embedding models.
Initialize embedding model.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `model_name` | Name of the model to use. TYPE: `str` |
Source code in tenets/core/nlp/embeddings.py
Functions¶
encode¶
```python
encode(texts: Union[str, List[str]], batch_size: int = 32, show_progress: bool = False) -> np.ndarray
```
Encode texts to embeddings.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `texts` | Text or list of texts. TYPE: `Union[str, List[str]]` |
| `batch_size` | Batch size for encoding. TYPE: `int` DEFAULT: `32` |
| `show_progress` | Show progress bar. TYPE: `bool` DEFAULT: `False` |

| RETURNS | DESCRIPTION |
| --- | --- |
| `np.ndarray` | Numpy array of embeddings |
Source code in tenets/core/nlp/embeddings.py
```python
def encode(
    self, texts: Union[str, List[str]], batch_size: int = 32, show_progress: bool = False
) -> np.ndarray:
    """Encode texts to embeddings.

    Args:
        texts: Text or list of texts
        batch_size: Batch size for encoding
        show_progress: Show progress bar

    Returns:
        Numpy array of embeddings
    """
    raise NotImplementedError
```
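Concrete backends subclass `EmbeddingModel` and override `encode`. A minimal illustrative sketch (`HashingEmbeddings` is hypothetical, and it assumes the base `__init__` simply records `model_name`):

```python
import numpy as np
from typing import List, Union

class HashingEmbeddings(EmbeddingModel):
    """Toy backend: hash whitespace tokens into a fixed-size count vector."""

    def __init__(self, embedding_dim: int = 64):
        super().__init__("hashing")  # hypothetical model name
        self.embedding_dim = embedding_dim

    def encode(
        self, texts: Union[str, List[str]], batch_size: int = 32, show_progress: bool = False
    ) -> np.ndarray:
        if isinstance(texts, str):
            texts = [texts]
        out = np.zeros((len(texts), self.embedding_dim))
        for row, text in enumerate(texts):
            # Each token increments one bucket of the fixed-size vector.
            for token in text.split():
                out[row, hash(token) % self.embedding_dim] += 1.0
        return out
```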
get_embedding_dim¶
Return the dimension of the embeddings produced by this model.
LocalEmbeddings¶
```python
LocalEmbeddings(model_name: str = 'all-MiniLM-L6-v2', device: Optional[str] = None, cache_dir: Optional[Path] = None)
```
Bases: EmbeddingModel
Local embedding generation using sentence transformers.
This runs completely locally with no external API calls. Models are downloaded and cached by sentence-transformers.
Initialize local embeddings.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `model_name` | Sentence transformer model name. TYPE: `str` DEFAULT: `'all-MiniLM-L6-v2'` |
| `device` | Device to use ('cpu', 'cuda', or None for auto). TYPE: `Optional[str]` DEFAULT: `None` |
| `cache_dir` | Directory to cache models. TYPE: `Optional[Path]` DEFAULT: `None` |
Source code in tenets/core/nlp/embeddings.py
```python
def __init__(
    self,
    model_name: str = "all-MiniLM-L6-v2",
    device: Optional[str] = None,
    cache_dir: Optional[Path] = None,
):
    """Initialize local embeddings.

    Args:
        model_name: Sentence transformer model name
        device: Device to use ('cpu', 'cuda', or None for auto)
        cache_dir: Directory to cache models
    """
    super().__init__(model_name)

    if not SENTENCE_TRANSFORMERS_AVAILABLE:
        raise ImportError(
            "Sentence transformers not available. "
            "Install with: pip install sentence-transformers"
        )

    try:
        # Lazy import SentenceTransformer when actually needed
        global SentenceTransformer
        if SentenceTransformer is None:
            from sentence_transformers import SentenceTransformer

        # Determine device
        if device:
            self.device = device
        else:
            import torch

            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load model
        self.model = SentenceTransformer(
            model_name, device=self.device, cache_folder=str(cache_dir) if cache_dir else None
        )

        # Get actual embedding dimension
        self.embedding_dim = self.model.get_sentence_embedding_dimension()

        self.logger.info(
            f"Loaded {model_name} on {self.device}, embedding dim: {self.embedding_dim}"
        )
    except Exception as e:
        self.logger.error(f"Failed to load embedding model: {e}")
        raise
```
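For example, pinning the device and cache location explicitly (the cache path here is illustrative):

```python
from pathlib import Path

embedder = LocalEmbeddings(
    model_name="all-MiniLM-L6-v2",
    device="cpu",  # skip CUDA autodetection
    cache_dir=Path.home() / ".cache" / "tenets-models",  # hypothetical location
)
print(embedder.embedding_dim)  # 384 for all-MiniLM-L6-v2
```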
Functions¶
encode¶
```python
encode(texts: Union[str, List[str]], batch_size: int = 32, show_progress: bool = False, normalize: bool = True) -> np.ndarray
```
Encode texts to embeddings.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `texts` | Text or list of texts. TYPE: `Union[str, List[str]]` |
| `batch_size` | Batch size for encoding. TYPE: `int` DEFAULT: `32` |
| `show_progress` | Show progress bar. TYPE: `bool` DEFAULT: `False` |
| `normalize` | L2 normalize embeddings. TYPE: `bool` DEFAULT: `True` |

| RETURNS | DESCRIPTION |
| --- | --- |
| `np.ndarray` | Numpy array of embeddings |
Source code in tenets/core/nlp/embeddings.py
```python
def encode(
    self,
    texts: Union[str, List[str]],
    batch_size: int = 32,
    show_progress: bool = False,
    normalize: bool = True,
) -> np.ndarray:
    """Encode texts to embeddings.

    Args:
        texts: Text or list of texts
        batch_size: Batch size for encoding
        show_progress: Show progress bar
        normalize: L2 normalize embeddings

    Returns:
        Numpy array of embeddings
    """
    if not self.model:
        raise RuntimeError("Model not loaded")

    # Handle single text
    single_text = isinstance(texts, str)
    if single_text:
        texts = [texts]

    # Encode
    embeddings = self.model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=show_progress,
        convert_to_numpy=True,
        normalize_embeddings=normalize,
    )

    if single_text:
        return embeddings[0]
    return embeddings
```
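Since `normalize=True` L2-normalizes the output, cosine similarity between texts reduces to a plain dot product (continuing the `embedder` instance from the example above):

```python
import numpy as np

docs = ["open the file and read it", "close the database connection"]
query_vec = embedder.encode("read a file")  # 1-D, unit length
doc_vecs = embedder.encode(docs)            # 2-D, each row unit length

scores = doc_vecs @ query_vec               # cosine similarities
best = docs[int(np.argmax(scores))]
```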
encode_file¶

```python
encode_file(file_path: Path, chunk_size: int = 1000, overlap: int = 100) -> np.ndarray
```

Encode a file with chunking for long files.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `file_path` | Path to file. TYPE: `Path` |
| `chunk_size` | Characters per chunk. TYPE: `int` DEFAULT: `1000` |
| `overlap` | Overlap between chunks. TYPE: `int` DEFAULT: `100` |

| RETURNS | DESCRIPTION |
| --- | --- |
| `np.ndarray` | Mean pooled embedding for the file |
Source code in tenets/core/nlp/embeddings.py
```python
def encode_file(
    self, file_path: Path, chunk_size: int = 1000, overlap: int = 100
) -> np.ndarray:
    """Encode a file with chunking for long files.

    Args:
        file_path: Path to file
        chunk_size: Characters per chunk
        overlap: Overlap between chunks

    Returns:
        Mean pooled embedding for the file
    """
    try:
        with open(file_path, encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        self.logger.warning(f"Failed to read {file_path}: {e}")
        return np.zeros(self.embedding_dim)

    if not content:
        return np.zeros(self.embedding_dim)

    # Chunk the content
    chunks = []
    for i in range(0, len(content), chunk_size - overlap):
        chunk = content[i : i + chunk_size]
        if chunk:
            chunks.append(chunk)

    if not chunks:
        return np.zeros(self.embedding_dim)

    # Encode chunks
    chunk_embeddings = self.encode(chunks, show_progress=False)

    # Mean pooling
    return np.mean(chunk_embeddings, axis=0)
```
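One caveat when comparing file embeddings: the mean of unit-length chunk vectors is generally shorter than unit length, so renormalize before scoring. A sketch with illustrative paths:

```python
from pathlib import Path
import numpy as np

query_vec = embedder.encode("database connection pooling")
for path in [Path("src/db.py"), Path("src/http.py")]:  # hypothetical files
    file_vec = embedder.encode_file(path)
    # Guard against the zero vector returned for unreadable/empty files.
    norm = np.linalg.norm(file_vec)
    score = float(file_vec @ query_vec / norm) if norm else 0.0
    print(path, round(score, 3))
```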
FallbackEmbeddings¶
Bases: EmbeddingModel
Fallback embeddings using TF-IDF when ML dependencies are not available.
Initialize fallback embeddings.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `embedding_dim` | Dimension for embeddings. TYPE: `int` |
Source code in tenets/core/nlp/embeddings.py
Functions¶
encode¶
```python
encode(texts: Union[str, List[str]], batch_size: int = 32, show_progress: bool = False) -> np.ndarray
```
Generate pseudo-embeddings using TF-IDF.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `texts` | Text or list of texts. TYPE: `Union[str, List[str]]` |
| `batch_size` | Ignored. TYPE: `int` DEFAULT: `32` |
| `show_progress` | Ignored. TYPE: `bool` DEFAULT: `False` |

| RETURNS | DESCRIPTION |
| --- | --- |
| `np.ndarray` | Numpy array of pseudo-embeddings |
Source code in tenets/core/nlp/embeddings.py
```python
def encode(
    self, texts: Union[str, List[str]], batch_size: int = 32, show_progress: bool = False
) -> np.ndarray:
    """Generate pseudo-embeddings using TF-IDF.

    Args:
        texts: Text or list of texts
        batch_size: Ignored
        show_progress: Ignored

    Returns:
        Numpy array of pseudo-embeddings
    """
    single_text = isinstance(texts, str)
    if single_text:
        texts = [texts]

    # Fit TF-IDF on texts
    self.tfidf.fit(texts)
    vectors = self.tfidf.transform(texts)

    # Pad or truncate to embedding_dim
    embeddings = []
    for vec in vectors:
        if len(vec) < self.embedding_dim:
            # Pad with zeros
            padded = vec + [0.0] * (self.embedding_dim - len(vec))
            embeddings.append(padded)
        else:
            # Truncate
            embeddings.append(vec[: self.embedding_dim])

    embeddings = np.array(embeddings)

    if single_text:
        return embeddings[0]
    return embeddings
```
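Note that the TF-IDF model is refit on every call, so vectors are only comparable within a single `encode` call. A sketch (assuming the constructor's `embedding_dim` parameter has a default):

```python
fallback = FallbackEmbeddings()

# Comparable: both texts share one fitted vocabulary.
vecs = fallback.encode(["parse the config", "parse the file"])

# NOT directly comparable: each call fits its own vocabulary,
# so the same dimension may correspond to different terms.
a = fallback.encode("parse the config")
b = fallback.encode("parse the file")
```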
Functions¶
create_embedding_model¶
```python
create_embedding_model(prefer_local: bool = True, model_name: Optional[str] = None, **kwargs) -> EmbeddingModel
```
Create best available embedding model.
| PARAMETER | DESCRIPTION |
| --- | --- |
| `prefer_local` | Prefer local models over API-based. TYPE: `bool` DEFAULT: `True` |
| `model_name` | Specific model to use. TYPE: `Optional[str]` DEFAULT: `None` |
| `**kwargs` | Additional arguments for model. DEFAULT: `{}` |

| RETURNS | DESCRIPTION |
| --- | --- |
| `EmbeddingModel` | EmbeddingModel instance |
Source code in tenets/core/nlp/embeddings.py
```python
def create_embedding_model(
    prefer_local: bool = True, model_name: Optional[str] = None, **kwargs
) -> EmbeddingModel:
    """Create best available embedding model.

    Args:
        prefer_local: Prefer local models over API-based
        model_name: Specific model to use
        **kwargs: Additional arguments for model

    Returns:
        EmbeddingModel instance
    """
    logger = get_logger(__name__)

    # Try local embeddings first
    if prefer_local and SENTENCE_TRANSFORMERS_AVAILABLE:
        try:
            return LocalEmbeddings(model_name or "all-MiniLM-L6-v2", **kwargs)
        except Exception as e:
            logger.warning(f"Failed to create local embeddings: {e}")

    # Fallback to TF-IDF
    logger.info("Using TF-IDF fallback for embeddings")
    return FallbackEmbeddings()
```
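A typical call site; note that extra keyword arguments are forwarded to `LocalEmbeddings` only, since the TF-IDF fallback is constructed without them:

```python
model = create_embedding_model(
    prefer_local=True,
    model_name="all-MiniLM-L6-v2",
    device="cpu",  # forwarded to LocalEmbeddings via **kwargs
)
embeddings = model.encode(["alpha", "beta"], batch_size=16)
```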