similarity
¶
Full name: tenets.core.nlp.similarity
similarity¶
Similarity computation utilities.
This module provides various similarity metrics including cosine similarity and semantic similarity using embeddings.
Classes¶
SemanticSimilarity¶
Compute semantic similarity using embeddings.
Initialize semantic similarity.
PARAMETER | DESCRIPTION |
---|---|
model | Embedding model to use (creates default if None) |
cache_embeddings | Cache computed embeddings TYPE: |
Source code in tenets/core/nlp/similarity.py
def __init__(self, model: Optional[object] = None, cache_embeddings: bool = True):
"""Initialize semantic similarity.
Args:
model: Embedding model to use (creates default if None)
cache_embeddings: Cache computed embeddings
"""
self.logger = get_logger(__name__)
if model is None:
# Use module-level factory (patchable in tests)
self.model = create_embedding_model()
else:
self.model = model
self.cache_embeddings = cache_embeddings
self._cache = {} if cache_embeddings else None
Functions¶
compute¶
Compute semantic similarity between two texts.
PARAMETER | DESCRIPTION |
---|---|
text1 | First text TYPE: |
text2 | Second text TYPE: |
metric | Similarity metric ('cosine', 'euclidean', 'manhattan') TYPE: |
RETURNS | DESCRIPTION |
---|---|
float | Similarity score |
Source code in tenets/core/nlp/similarity.py
def compute(self, text1: str, text2: str, metric: str = "cosine") -> float:
"""Compute semantic similarity between two texts.
Args:
text1: First text
text2: Second text
metric: Similarity metric ('cosine', 'euclidean', 'manhattan')
Returns:
Similarity score
"""
# Get embeddings
emb1 = self._get_embedding(text1)
emb2 = self._get_embedding(text2)
# Compute similarity
if metric == "cosine":
return cosine_similarity(emb1, emb2)
elif metric == "euclidean":
# Convert distance to similarity
dist = euclidean_distance(emb1, emb2)
return 1.0 / (1.0 + dist)
elif metric == "manhattan":
# Convert distance to similarity
dist = manhattan_distance(emb1, emb2)
return 1.0 / (1.0 + dist)
else:
raise ValueError(f"Unknown metric: {metric}")
compute_batch¶
compute_batch(query: str, documents: List[str], metric: str = 'cosine', top_k: Optional[int] = None) -> List[Tuple[int, float]]
Compute similarity between query and multiple documents.
PARAMETER | DESCRIPTION |
---|---|
query | Query text TYPE: |
documents | List of documents |
metric | Similarity metric TYPE: |
top_k | Return only top K results |
RETURNS | DESCRIPTION |
---|---|
List[Tuple[int, float]] | List of (index, similarity) tuples sorted by similarity |
Source code in tenets/core/nlp/similarity.py
def compute_batch(
self, query: str, documents: List[str], metric: str = "cosine", top_k: Optional[int] = None
) -> List[Tuple[int, float]]:
"""Compute similarity between query and multiple documents.
Args:
query: Query text
documents: List of documents
metric: Similarity metric
top_k: Return only top K results
Returns:
List of (index, similarity) tuples sorted by similarity
"""
if not documents:
return []
# Get query embedding
query_emb = self._get_embedding(query)
query_emb = np.asarray(query_emb)
if query_emb.ndim > 1:
query_emb = query_emb[0]
# Get document embeddings (batch encode for efficiency)
doc_embeddings = self.model.encode(documents)
# Normalize possible single-vector or ndarray returns to list of 1D arrays
if isinstance(doc_embeddings, np.ndarray):
if doc_embeddings.ndim == 1:
doc_embeddings = [doc_embeddings]
elif doc_embeddings.ndim == 2:
doc_embeddings = [doc_embeddings[i] for i in range(doc_embeddings.shape[0])]
elif not isinstance(doc_embeddings, (list, tuple)):
doc_embeddings = [np.asarray(doc_embeddings)]
# Compute similarities
similarities = []
for i, doc_emb in enumerate(doc_embeddings):
# Ensure ndarray-like
doc_emb = np.asarray(doc_emb)
if metric == "cosine":
sim = cosine_similarity(query_emb, doc_emb)
elif metric == "euclidean":
dist = euclidean_distance(query_emb, doc_emb)
sim = 1.0 / (1.0 + dist)
elif metric == "manhattan":
dist = manhattan_distance(query_emb, doc_emb)
sim = 1.0 / (1.0 + dist)
else:
raise ValueError(f"Unknown metric: {metric}")
similarities.append((i, sim))
# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)
if top_k:
return similarities[:top_k]
return similarities
find_similar¶
find_similar(query: str, documents: List[str], threshold: float = 0.7, metric: str = 'cosine') -> List[Tuple[int, float]]
Find documents similar to query above threshold.
PARAMETER | DESCRIPTION |
---|---|
query | Query text TYPE: |
documents | List of documents |
threshold | Similarity threshold TYPE: |
metric | Similarity metric TYPE: |
RETURNS | DESCRIPTION |
---|---|
List[Tuple[int, float]] | List of (index, similarity) for documents above threshold |
Source code in tenets/core/nlp/similarity.py
def find_similar(
self, query: str, documents: List[str], threshold: float = 0.7, metric: str = "cosine"
) -> List[Tuple[int, float]]:
"""Find documents similar to query above threshold.
Args:
query: Query text
documents: List of documents
threshold: Similarity threshold
metric: Similarity metric
Returns:
List of (index, similarity) for documents above threshold
"""
similarities = self.compute_batch(query, documents, metric)
return [(i, sim) for i, sim in similarities if sim >= threshold]
clear_cache¶
Functions¶
cosine_similarity¶
Compute cosine similarity between two vectors.
PARAMETER | DESCRIPTION |
---|---|
vec1 | First vector (can be list, array, or dict for sparse vectors) |
vec2 | Second vector (can be list, array, or dict for sparse vectors) |
RETURNS | DESCRIPTION |
---|---|
float | Cosine similarity (-1 to 1) |
Source code in tenets/core/nlp/similarity.py
def cosine_similarity(vec1, vec2) -> float:
"""Compute cosine similarity between two vectors.
Args:
vec1: First vector (can be list, array, or dict for sparse vectors)
vec2: Second vector (can be list, array, or dict for sparse vectors)
Returns:
Cosine similarity (-1 to 1)
"""
# Check if inputs are sparse vectors (dicts)
if isinstance(vec1, dict) and isinstance(vec2, dict):
return sparse_cosine_similarity(vec1, vec2)
# Handle different input types for dense vectors
vec1 = np.asarray(vec1).flatten()
vec2 = np.asarray(vec2).flatten()
# Check dimensions
if vec1.shape != vec2.shape:
raise ValueError(f"Vectors must have same shape: {vec1.shape} != {vec2.shape}")
# Compute cosine similarity
dot_product = np.dot(vec1, vec2)
norm1 = np.linalg.norm(vec1)
norm2 = np.linalg.norm(vec2)
if norm1 == 0 or norm2 == 0:
return 0.0
similarity = dot_product / (norm1 * norm2)
# Clamp to [-1, 1] to handle floating point errors
return float(np.clip(similarity, -1.0, 1.0))
sparse_cosine_similarity¶
Compute cosine similarity between two sparse vectors.
Sparse vectors are represented as dictionaries mapping indices/keys to values. This is efficient for high-dimensional vectors with many zero values.
PARAMETER | DESCRIPTION |
---|---|
vec1 | First sparse vector as {key: value} dict TYPE: |
vec2 | Second sparse vector as {key: value} dict TYPE: |
RETURNS | DESCRIPTION |
---|---|
float | Cosine similarity (-1 to 1) |
Source code in tenets/core/nlp/similarity.py
def sparse_cosine_similarity(vec1: dict, vec2: dict) -> float:
"""Compute cosine similarity between two sparse vectors.
Sparse vectors are represented as dictionaries mapping indices/keys to values.
This is efficient for high-dimensional vectors with many zero values.
Args:
vec1: First sparse vector as {key: value} dict
vec2: Second sparse vector as {key: value} dict
Returns:
Cosine similarity (-1 to 1)
"""
# Compute dot product (only for common keys)
dot_product = sum(vec1.get(key, 0) * vec2.get(key, 0) for key in set(vec1) | set(vec2))
# Compute norms
norm1 = math.sqrt(sum(v**2 for v in vec1.values()))
norm2 = math.sqrt(sum(v**2 for v in vec2.values()))
if norm1 == 0 or norm2 == 0:
return 0.0
similarity = dot_product / (norm1 * norm2)
# Clamp to [-1, 1] to handle floating point errors
return max(-1.0, min(1.0, similarity))
euclidean_distance¶
Compute Euclidean distance between two vectors.
PARAMETER | DESCRIPTION |
---|---|
vec1 | First vector |
vec2 | Second vector |
RETURNS | DESCRIPTION |
---|---|
float | Euclidean distance (>= 0) |
Source code in tenets/core/nlp/similarity.py
def euclidean_distance(vec1, vec2) -> float:
"""Compute Euclidean distance between two vectors.
Args:
vec1: First vector
vec2: Second vector
Returns:
Euclidean distance (>= 0)
"""
vec1 = np.asarray(vec1).flatten()
vec2 = np.asarray(vec2).flatten()
if vec1.shape != vec2.shape:
raise ValueError(f"Vectors must have same shape: {vec1.shape} != {vec2.shape}")
return float(np.linalg.norm(vec1 - vec2))
manhattan_distance¶
Compute Manhattan (L1) distance between two vectors.
PARAMETER | DESCRIPTION |
---|---|
vec1 | First vector |
vec2 | Second vector |
RETURNS | DESCRIPTION |
---|---|
float | Manhattan distance (>= 0) |
Source code in tenets/core/nlp/similarity.py
def manhattan_distance(vec1, vec2) -> float:
"""Compute Manhattan (L1) distance between two vectors.
Args:
vec1: First vector
vec2: Second vector
Returns:
Manhattan distance (>= 0)
"""
vec1 = np.asarray(vec1).flatten()
vec2 = np.asarray(vec2).flatten()
if vec1.shape != vec2.shape:
raise ValueError(f"Vectors must have same shape: {vec1.shape} != {vec2.shape}")
return float(np.sum(np.abs(vec1 - vec2)))