
cache

Full name: tenets.core.nlp.cache

Embedding cache management.

This module caches embedding vectors so that expensive embedding computations are not repeated.

Classes

EmbeddingCache

Python
EmbeddingCache(cache_dir: Path, max_memory_items: int = 1000, ttl_days: int = 30)

Cache for embedding vectors.

Uses a two-level cache:

1. Memory cache for hot embeddings
2. Disk cache for persistence

Initialize embedding cache.

PARAMETER        | TYPE | DEFAULT  | DESCRIPTION
-----------------|------|----------|--------------------------------------------
cache_dir        | Path | required | Directory for disk cache
max_memory_items | int  | 1000     | Maximum items in memory cache
ttl_days         | int  | 30       | Time to live for cached embeddings, in days

Source code in tenets/core/nlp/cache.py
Python
def __init__(self, cache_dir: Path, max_memory_items: int = 1000, ttl_days: int = 30):
    """Initialize embedding cache.

    Args:
        cache_dir: Directory for disk cache
        max_memory_items: Maximum items in memory cache
        ttl_days: Time to live for cached embeddings
    """
    self.logger = get_logger(__name__)
    self.cache_dir = Path(cache_dir)
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Memory cache
    self._memory_cache: Dict[str, np.ndarray] = {}
    self._access_order: list[str] = []
    self.max_memory_items = max_memory_items

    # Disk cache
    self.disk_cache = DiskCache(self.cache_dir, name="embeddings")
    self.ttl_seconds = ttl_days * 24 * 3600
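
A minimal usage sketch, assuming numpy is installed and the package is importable; the cache directory and model name are illustrative, and the random vector stands in for a real model call:

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"), max_memory_items=500, ttl_days=7)

text = "How do I configure the ranking pipeline?"
vec = cache.get(text, model_name="all-MiniLM-L6-v2")
if vec is None:
    # Stand-in for a real embedding model call
    vec = np.random.rand(384).astype(np.float32)
    cache.put(text, vec, model_name="all-MiniLM-L6-v2")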
Functions
get
Python
get(text: str, model_name: str = 'default') -> Optional[np.ndarray]

Get cached embedding.

PARAMETER  | TYPE | DEFAULT   | DESCRIPTION
-----------|------|-----------|--------------------------
text       | str  | required  | Text that was embedded
model_name | str  | 'default' | Model used for embedding

RETURNS           | DESCRIPTION
------------------|--------------------------
Optional[ndarray] | Cached embedding or None

Source code in tenets/core/nlp/cache.py
Python
def get(self, text: str, model_name: str = "default") -> Optional[np.ndarray]:
    """Get cached embedding.

    Args:
        text: Text that was embedded
        model_name: Model used for embedding

    Returns:
        Cached embedding or None
    """
    key = self._make_key(text, model_name)

    # Check memory cache
    if key in self._memory_cache:
        # Move to end (LRU)
        if key in self._access_order:
            self._access_order.remove(key)
        self._access_order.append(key)
        return self._memory_cache[key]

    # Check disk cache
    cached = self.disk_cache.get(key)
    if cached is not None:
        # Validate it's an embedding
        if isinstance(cached, np.ndarray):
            # Promote to memory cache
            self._add_to_memory(key, cached)
            return cached
        else:
            self.logger.warning(f"Invalid cached embedding for {key}")
            self.disk_cache.delete(key)

    return None
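
The memory tier behaves as an LRU: each hit moves the key to the end of the access order, and a disk hit is promoted back into memory. A small sketch of the promotion behavior (illustrative only, with setup repeated for self-containment):

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"))
cache.put("hello world", np.ones(8, dtype=np.float32))

cache.clear_memory()                       # empty the memory tier; the disk copy remains
assert cache.stats()["memory_items"] == 0

vec = cache.get("hello world")             # served from the disk tier...
assert vec is not None
assert cache.stats()["memory_items"] == 1  # ...and promoted back into memory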
put
Python
put(text: str, embedding: ndarray, model_name: str = 'default')

Cache an embedding.

PARAMETER  | TYPE    | DEFAULT   | DESCRIPTION
-----------|---------|-----------|------------------------
text       | str     | required  | Text that was embedded
embedding  | ndarray | required  | Embedding vector
model_name | str     | 'default' | Model used

Source code in tenets/core/nlp/cache.py
Python
def put(self, text: str, embedding: np.ndarray, model_name: str = "default"):
    """Cache an embedding.

    Args:
        text: Text that was embedded
        embedding: Embedding vector
        model_name: Model used
    """
    key = self._make_key(text, model_name)

    # Add to memory cache
    self._add_to_memory(key, embedding)

    # Add to disk cache
    self.disk_cache.put(
        key,
        embedding,
        ttl=self.ttl_seconds,
        metadata={"model": model_name, "dim": embedding.shape[0], "text_preview": text[:100]},
    )
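
Since the key passed to _make_key is built from both the text and model_name, the same text cached under two model names should yield two independent entries; a short sketch of that behavior:

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"))

a = np.zeros(4, dtype=np.float32)
b = np.ones(4, dtype=np.float32)
cache.put("same text", a, model_name="model-a")
cache.put("same text", b, model_name="model-b")

# Each model name gets its own entry for the same text
assert (cache.get("same text", model_name="model-a") == a).all()
assert (cache.get("same text", model_name="model-b") == b).all()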
get_batch
Python
get_batch(texts: list[str], model_name: str = 'default') -> Dict[str, Optional[np.ndarray]]

Get multiple cached embeddings.

PARAMETER  | TYPE      | DEFAULT   | DESCRIPTION
-----------|-----------|-----------|---------------
texts      | list[str] | required  | List of texts
model_name | str       | 'default' | Model used

RETURNS                      | DESCRIPTION
-----------------------------|---------------------------------------------------------
Dict[str, Optional[ndarray]] | Dict mapping text to embedding (or None if not cached)

Source code in tenets/core/nlp/cache.py
Python
def get_batch(
    self, texts: list[str], model_name: str = "default"
) -> Dict[str, Optional[np.ndarray]]:
    """Get multiple cached embeddings.

    Args:
        texts: List of texts
        model_name: Model used

    Returns:
        Dict mapping text to embedding (or None if not cached)
    """
    results = {}

    for text in texts:
        results[text] = self.get(text, model_name)

    return results
put_batch
Python
put_batch(embeddings: Dict[str, ndarray], model_name: str = 'default')

Cache multiple embeddings.

PARAMETER  | TYPE               | DEFAULT   | DESCRIPTION
-----------|--------------------|-----------|--------------------------------
embeddings | Dict[str, ndarray] | required  | Dict mapping text to embedding
model_name | str                | 'default' | Model used

Source code in tenets/core/nlp/cache.py
Python
def put_batch(self, embeddings: Dict[str, np.ndarray], model_name: str = "default"):
    """Cache multiple embeddings.

    Args:
        embeddings: Dict mapping text to embedding
        model_name: Model used
    """
    for text, embedding in embeddings.items():
        self.put(text, embedding, model_name)
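
A common pattern combines get_batch and put_batch: look up a whole batch, embed only the misses, and write the new vectors back. A sketch in which embed_many is a hypothetical stand-in for a real model call:

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache


def embed_many(texts: list[str]) -> list[np.ndarray]:
    """Hypothetical stand-in for a real embedding model."""
    return [np.random.rand(384).astype(np.float32) for _ in texts]


cache = EmbeddingCache(Path("/tmp/tenets-cache"))
texts = ["first query", "second query", "third query"]

cached = cache.get_batch(texts)
misses = [t for t, v in cached.items() if v is None]
if misses:
    fresh = dict(zip(misses, embed_many(misses)))
    cache.put_batch(fresh)  # persist the newly computed vectors
    cached.update(fresh)    # now every text maps to an embedding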
clear_memory
Python
clear_memory()

Clear memory cache.

Source code in tenets/core/nlp/cache.py
Python
def clear_memory(self):
    """Clear memory cache."""
    self._memory_cache.clear()
    self._access_order.clear()
clear_all
Python
clear_all()

Clear all caches.

Source code in tenets/core/nlp/cache.py
Python
def clear_all(self):
    """Clear all caches."""
    self.clear_memory()
    self.disk_cache.clear()
cleanup
Python
cleanup() -> int

Clean up old cache entries.

RETURNS | DESCRIPTION
--------|---------------------------
int     | Number of entries deleted

Source code in tenets/core/nlp/cache.py
Python
def cleanup(self) -> int:
    """Clean up old cache entries.

    Returns:
        Number of entries deleted
    """
    return self.disk_cache.cleanup(max_age_days=self.ttl_seconds // (24 * 3600))
stats
Python
stats() -> Dict[str, Any]

Get cache statistics.

RETURNS        | DESCRIPTION
---------------|------------------
Dict[str, Any] | Cache statistics

Source code in tenets/core/nlp/cache.py
Python
def stats(self) -> Dict[str, Any]:
    """Get cache statistics.

    Returns:
        Cache statistics
    """
    return {
        "memory_items": len(self._memory_cache),
        "memory_size_mb": sum(e.nbytes for e in self._memory_cache.values()) / (1024 * 1024),
        "access_order_length": len(self._access_order),
    }
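
Typical maintenance pairs cleanup() with stats(): cleanup() drops expired disk entries, while stats() reports only the in-memory tier (disk usage is managed by the underlying DiskCache). A sketch:

Python
from pathlib import Path

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"))

deleted = cache.cleanup()
print(f"expired entries removed: {deleted}")

s = cache.stats()
print(f"{s['memory_items']} vectors in memory ({s['memory_size_mb']:.2f} MB)")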
