
similarity

Full name: tenets.core.nlp.similarity

Similarity computation utilities.

This module provides similarity utilities: cosine similarity for dense and sparse vectors, Euclidean and Manhattan distances, and embedding-based semantic similarity.
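
A minimal usage sketch (assumes the package is importable; constructing SemanticSimilarity with no model builds a default embedding model, which may require optional ML dependencies):

Python
from tenets.core.nlp.similarity import SemanticSimilarity, cosine_similarity

# Plain vector math works with lists or numpy arrays.
print(cosine_similarity([1.0, 0.0], [1.0, 1.0]))  # ~0.7071

# Embedding-based similarity; a default model is created when none is given.
sim = SemanticSimilarity()
print(sim.compute("error handling", "exception management"))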

Classes

SemanticSimilarity

Python
SemanticSimilarity(model: Optional[object] = None, cache_embeddings: bool = True)

Compute semantic similarity using embeddings.

Initialize semantic similarity.

Parameters:

    model (Optional[object], default: None)
        Embedding model to use (creates default if None).

    cache_embeddings (bool, default: True)
        Cache computed embeddings.

Source code in tenets/core/nlp/similarity.py
Python
def __init__(self, model: Optional[object] = None, cache_embeddings: bool = True):
    """Initialize semantic similarity.

    Args:
        model: Embedding model to use (creates default if None)
        cache_embeddings: Cache computed embeddings
    """
    self.logger = get_logger(__name__)

    if model is None:
        # Use module-level factory (patchable in tests)
        self.model = create_embedding_model()
    else:
        self.model = model

    self.cache_embeddings = cache_embeddings
    self._cache = {} if cache_embeddings else None
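
Any object exposing an encode() method compatible with how compute_batch calls self.model.encode can be injected in place of the default model. A sketch with a toy stand-in encoder (the HashEmbedder class below is hypothetical, for illustration only):

Python
import numpy as np

from tenets.core.nlp.similarity import SemanticSimilarity

class HashEmbedder:
    """Toy encoder: deterministic pseudo-embeddings, not semantically meaningful."""

    def encode(self, texts):
        if isinstance(texts, str):  # accept a single string or a list of strings
            texts = [texts]
        seeds = [abs(hash(t)) % (2**32) for t in texts]
        return np.array([np.random.default_rng(s).random(8) for s in seeds])

sim = SemanticSimilarity(model=HashEmbedder(), cache_embeddings=True)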
Functions
compute
Python
compute(text1: str, text2: str, metric: str = 'cosine') -> float

Compute semantic similarity between two texts.

Parameters:

    text1 (str)
        First text.

    text2 (str)
        Second text.

    metric (str, default: 'cosine')
        Similarity metric ('cosine', 'euclidean', 'manhattan').

Returns:

    float
        Similarity score.

Source code in tenets/core/nlp/similarity.py
Python
def compute(self, text1: str, text2: str, metric: str = "cosine") -> float:
    """Compute semantic similarity between two texts.

    Args:
        text1: First text
        text2: Second text
        metric: Similarity metric ('cosine', 'euclidean', 'manhattan')

    Returns:
        Similarity score
    """
    # Get embeddings
    emb1 = self._get_embedding(text1)
    emb2 = self._get_embedding(text2)

    # Compute similarity
    if metric == "cosine":
        return cosine_similarity(emb1, emb2)
    elif metric == "euclidean":
        # Convert distance to similarity
        dist = euclidean_distance(emb1, emb2)
        return 1.0 / (1.0 + dist)
    elif metric == "manhattan":
        # Convert distance to similarity
        dist = manhattan_distance(emb1, emb2)
        return 1.0 / (1.0 + dist)
    else:
        raise ValueError(f"Unknown metric: {metric}")
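
A sketch comparing the three metrics on one text pair (assumes the default embedding model can be constructed). Distance metrics are folded into (0, 1] via 1 / (1 + distance), so all scores read as higher-is-more-similar:

Python
from tenets.core.nlp.similarity import SemanticSimilarity

sim = SemanticSimilarity()
for metric in ("cosine", "euclidean", "manhattan"):
    score = sim.compute("fast similarity search", "quick nearest-neighbour lookup", metric=metric)
    print(f"{metric}: {score:.4f}")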
compute_batch
Python
compute_batch(query: str, documents: List[str], metric: str = 'cosine', top_k: Optional[int] = None) -> List[Tuple[int, float]]

Compute similarity between query and multiple documents.

Parameters:

    query (str)
        Query text.

    documents (List[str])
        List of documents.

    metric (str, default: 'cosine')
        Similarity metric.

    top_k (Optional[int], default: None)
        Return only the top K results.

Returns:

    List[Tuple[int, float]]
        List of (index, similarity) tuples sorted by similarity in descending order.

Source code in tenets/core/nlp/similarity.py
Python
def compute_batch(
    self, query: str, documents: List[str], metric: str = "cosine", top_k: Optional[int] = None
) -> List[Tuple[int, float]]:
    """Compute similarity between query and multiple documents.

    Args:
        query: Query text
        documents: List of documents
        metric: Similarity metric
        top_k: Return only top K results

    Returns:
        List of (index, similarity) tuples sorted by similarity in descending order
    """
    if not documents:
        return []

    # Get query embedding
    query_emb = self._get_embedding(query)
    query_emb = np.asarray(query_emb)
    if query_emb.ndim > 1:
        query_emb = query_emb[0]

    # Get document embeddings (batch encode for efficiency)
    doc_embeddings = self.model.encode(documents)
    # Normalize possible single-vector or ndarray returns to list of 1D arrays
    if isinstance(doc_embeddings, np.ndarray):
        if doc_embeddings.ndim == 1:
            doc_embeddings = [doc_embeddings]
        elif doc_embeddings.ndim == 2:
            doc_embeddings = [doc_embeddings[i] for i in range(doc_embeddings.shape[0])]
    elif not isinstance(doc_embeddings, (list, tuple)):
        doc_embeddings = [np.asarray(doc_embeddings)]

    # Compute similarities
    similarities = []
    for i, doc_emb in enumerate(doc_embeddings):
        # Ensure ndarray-like
        doc_emb = np.asarray(doc_emb)
        if metric == "cosine":
            sim = cosine_similarity(query_emb, doc_emb)
        elif metric == "euclidean":
            dist = euclidean_distance(query_emb, doc_emb)
            sim = 1.0 / (1.0 + dist)
        elif metric == "manhattan":
            dist = manhattan_distance(query_emb, doc_emb)
            sim = 1.0 / (1.0 + dist)
        else:
            raise ValueError(f"Unknown metric: {metric}")

        similarities.append((i, sim))

    # Sort by similarity
    similarities.sort(key=lambda x: x[1], reverse=True)

    if top_k:
        return similarities[:top_k]

    return similarities
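
A sketch ranking a few documents against a query (again assuming a default embedding model; exact scores depend on the model):

Python
from tenets.core.nlp.similarity import SemanticSimilarity

sim = SemanticSimilarity()
docs = [
    "how to parse JSON in Python",
    "grilling the perfect steak",
    "reading JSON files with the json module",
]
# (index, similarity) pairs, best match first; top_k limits the result count.
for idx, score in sim.compute_batch("load JSON data", docs, top_k=2):
    print(idx, f"{score:.3f}", docs[idx])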
find_similar
Python
find_similar(query: str, documents: List[str], threshold: float = 0.7, metric: str = 'cosine') -> List[Tuple[int, float]]

Find documents whose similarity to the query is at or above the threshold.

Parameters:

    query (str)
        Query text.

    documents (List[str])
        List of documents.

    threshold (float, default: 0.7)
        Similarity threshold.

    metric (str, default: 'cosine')
        Similarity metric.

Returns:

    List[Tuple[int, float]]
        List of (index, similarity) tuples for documents scoring at or above the threshold.

Source code in tenets/core/nlp/similarity.py
Python
def find_similar(
    self, query: str, documents: List[str], threshold: float = 0.7, metric: str = "cosine"
) -> List[Tuple[int, float]]:
    """Find documents similar to query above threshold.

    Args:
        query: Query text
        documents: List of documents
        threshold: Similarity threshold
        metric: Similarity metric

    Returns:
        List of (index, similarity) for documents at or above the threshold
    """
    similarities = self.compute_batch(query, documents, metric)
    return [(i, sim) for i, sim in similarities if sim >= threshold]
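
The same ranking filtered by an absolute score instead of a count (a sketch; which documents clear the threshold depends on the embedding model):

Python
from tenets.core.nlp.similarity import SemanticSimilarity

sim = SemanticSimilarity()
docs = ["how to parse JSON in Python", "grilling the perfect steak"]
# Keep only documents scoring at or above 0.5 against the query.
print(sim.find_similar("load JSON data", docs, threshold=0.5))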
clear_cache
Python
clear_cache()

Clear embedding cache.

Source code in tenets/core/nlp/similarity.py
Python
def clear_cache(self):
    """Clear embedding cache."""
    if self._cache:
        self._cache.clear()

Functions

cosine_similarity

Python
cosine_similarity(vec1, vec2) -> float

Compute cosine similarity between two vectors.

Parameters:

    vec1
        First vector (can be a list, array, or dict for sparse vectors).

    vec2
        Second vector (can be a list, array, or dict for sparse vectors).

Returns:

    float
        Cosine similarity in [-1, 1].

Source code in tenets/core/nlp/similarity.py
Python
def cosine_similarity(vec1, vec2) -> float:
    """Compute cosine similarity between two vectors.

    Args:
        vec1: First vector (can be list, array, or dict for sparse vectors)
        vec2: Second vector (can be list, array, or dict for sparse vectors)

    Returns:
        Cosine similarity (-1 to 1)
    """
    # Check if inputs are sparse vectors (dicts)
    if isinstance(vec1, dict) and isinstance(vec2, dict):
        return sparse_cosine_similarity(vec1, vec2)

    # Handle different input types for dense vectors
    vec1 = np.asarray(vec1).flatten()
    vec2 = np.asarray(vec2).flatten()

    # Check dimensions
    if vec1.shape != vec2.shape:
        raise ValueError(f"Vectors must have same shape: {vec1.shape} != {vec2.shape}")

    # Compute cosine similarity
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)

    if norm1 == 0 or norm2 == 0:
        return 0.0

    similarity = dot_product / (norm1 * norm2)

    # Clamp to [-1, 1] to handle floating point errors
    return float(np.clip(similarity, -1.0, 1.0))
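
A self-contained sketch of the dense-vector path:

Python
import numpy as np

from tenets.core.nlp.similarity import cosine_similarity

a = np.array([1.0, 2.0, 3.0])
print(cosine_similarity(a, 2 * a))        # 1.0  (same direction)
print(cosine_similarity(a, -a))           # -1.0 (opposite direction)
print(cosine_similarity(a, np.zeros(3)))  # 0.0  (zero-vector convention)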

sparse_cosine_similarity

Python
sparse_cosine_similarity(vec1: dict, vec2: dict) -> float

Compute cosine similarity between two sparse vectors.

Sparse vectors are represented as dictionaries mapping indices/keys to values. This is efficient for high-dimensional vectors with many zero values.

Parameters:

    vec1 (dict)
        First sparse vector as a {key: value} dict.

    vec2 (dict)
        Second sparse vector as a {key: value} dict.

Returns:

    float
        Cosine similarity in [-1, 1].

Source code in tenets/core/nlp/similarity.py
Python
def sparse_cosine_similarity(vec1: dict, vec2: dict) -> float:
    """Compute cosine similarity between two sparse vectors.

    Sparse vectors are represented as dictionaries mapping indices/keys to values.
    This is efficient for high-dimensional vectors with many zero values.

    Args:
        vec1: First sparse vector as {key: value} dict
        vec2: Second sparse vector as {key: value} dict

    Returns:
        Cosine similarity (-1 to 1)
    """
    # Compute dot product over the union of keys (missing keys contribute 0)
    dot_product = sum(vec1.get(key, 0) * vec2.get(key, 0) for key in set(vec1) | set(vec2))

    # Compute norms
    norm1 = math.sqrt(sum(v**2 for v in vec1.values()))
    norm2 = math.sqrt(sum(v**2 for v in vec2.values()))

    if norm1 == 0 or norm2 == 0:
        return 0.0

    similarity = dot_product / (norm1 * norm2)

    # Clamp to [-1, 1] to handle floating point errors
    return max(-1.0, min(1.0, similarity))
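
A sketch with term-frequency-style sparse vectors, where only non-zero entries are stored:

Python
from tenets.core.nlp.similarity import sparse_cosine_similarity

doc1 = {"json": 3, "parse": 2, "python": 1}
doc2 = {"json": 2, "read": 1, "python": 2}

# Overlap on "json" and "python": dot = 3*2 + 1*2 = 8,
# norms = sqrt(14) and 3, so similarity = 8 / (3 * sqrt(14)) ~= 0.713.
print(sparse_cosine_similarity(doc1, doc2))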

euclidean_distance

Python
euclidean_distance(vec1, vec2) -> float

Compute Euclidean distance between two vectors.

Parameters:

    vec1
        First vector.

    vec2
        Second vector.

Returns:

    float
        Euclidean distance (>= 0).

Source code in tenets/core/nlp/similarity.py
Python
def euclidean_distance(vec1, vec2) -> float:
    """Compute Euclidean distance between two vectors.

    Args:
        vec1: First vector
        vec2: Second vector

    Returns:
        Euclidean distance (>= 0)
    """
    vec1 = np.asarray(vec1).flatten()
    vec2 = np.asarray(vec2).flatten()

    if vec1.shape != vec2.shape:
        raise ValueError(f"Vectors must have same shape: {vec1.shape} != {vec2.shape}")

    return float(np.linalg.norm(vec1 - vec2))
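
A quick sketch; SemanticSimilarity turns this distance into a similarity with 1 / (1 + d):

Python
import numpy as np

from tenets.core.nlp.similarity import euclidean_distance

p = np.array([0.0, 0.0])
q = np.array([3.0, 4.0])
d = euclidean_distance(p, q)
print(d)                # 5.0 (the 3-4-5 right triangle)
print(1.0 / (1.0 + d))  # ~0.1667, the similarity used by SemanticSimilarity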

manhattan_distance

Python
manhattan_distance(vec1, vec2) -> float

Compute Manhattan (L1) distance between two vectors.

Parameters:

    vec1
        First vector.

    vec2
        Second vector.

Returns:

    float
        Manhattan distance (>= 0).

Source code in tenets/core/nlp/similarity.py
Python
def manhattan_distance(vec1, vec2) -> float:
    """Compute Manhattan (L1) distance between two vectors.

    Args:
        vec1: First vector
        vec2: Second vector

    Returns:
        Manhattan distance (>= 0)
    """
    vec1 = np.asarray(vec1).flatten()
    vec2 = np.asarray(vec2).flatten()

    if vec1.shape != vec2.shape:
        raise ValueError(f"Vectors must have same shape: {vec1.shape} != {vec2.shape}")

    return float(np.sum(np.abs(vec1 - vec2)))
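
A sketch contrasting L1 with L2 on the same points; L1 sums absolute coordinate differences, so it is never smaller than the straight-line L2 distance:

Python
import numpy as np

from tenets.core.nlp.similarity import euclidean_distance, manhattan_distance

p = np.array([0.0, 0.0])
q = np.array([3.0, 4.0])
print(manhattan_distance(p, q))  # 7.0 (|3| + |4|)
print(euclidean_distance(p, q))  # 5.0 (Manhattan >= Euclidean)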