
cache

Full name: tenets.core.nlp.cache

Embedding cache management.

This module caches embedding vectors so that expensive embedding computations are not repeated.

Classes

EmbeddingCache

Python
EmbeddingCache(cache_dir: Path, max_memory_items: int = 1000, ttl_days: int = 30)

Cache for embedding vectors.

Uses a two-level cache:

1. Memory cache for hot embeddings
2. Disk cache for persistence

Initialize embedding cache.

PARAMETER        | TYPE | DEFAULT  | DESCRIPTION
-----------------|------|----------|--------------------------------------------
cache_dir        | Path | required | Directory for disk cache
max_memory_items | int  | 1000     | Maximum items in memory cache
ttl_days         | int  | 30       | Time to live for cached embeddings, in days

Source code in tenets/core/nlp/cache.py
Python
def __init__(self, cache_dir: Path, max_memory_items: int = 1000, ttl_days: int = 30):
    """Initialize embedding cache.

    Args:
        cache_dir: Directory for disk cache
        max_memory_items: Maximum items in memory cache
        ttl_days: Time to live for cached embeddings
    """
    self.logger = get_logger(__name__)
    self.cache_dir = Path(cache_dir)
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Memory cache
    self._memory_cache: Dict[str, np.ndarray] = {}
    self._access_order: list[str] = []
    self.max_memory_items = max_memory_items

    # Disk cache
    self.disk_cache = DiskCache(self.cache_dir, name="embeddings")
    self.ttl_seconds = ttl_days * 24 * 3600
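
A minimal usage sketch, assuming numpy is installed and the package is importable; the cache directory and model name are illustrative, and the random vector stands in for a real model call:

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"), max_memory_items=500, ttl_days=7)

text = "How do I configure the ranking pipeline?"
vec = cache.get(text, model_name="all-MiniLM-L6-v2")
if vec is None:
    # Stand-in for a real embedding model call
    vec = np.random.rand(384).astype(np.float32)
    cache.put(text, vec, model_name="all-MiniLM-L6-v2")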
Functions
get
Python
get(text: str, model_name: str = 'default') -> Optional[np.ndarray]

Get cached embedding.

PARAMETER  | TYPE | DEFAULT   | DESCRIPTION
-----------|------|-----------|--------------------------
text       | str  | required  | Text that was embedded
model_name | str  | 'default' | Model used for embedding

RETURNS           | DESCRIPTION
------------------|--------------------------
Optional[ndarray] | Cached embedding or None

Source code in tenets/core/nlp/cache.py
Python
def get(self, text: str, model_name: str = "default") -> Optional[np.ndarray]:
    """Get cached embedding.

    Args:
        text: Text that was embedded
        model_name: Model used for embedding

    Returns:
        Cached embedding or None
    """
    key = self._make_key(text, model_name)

    # Check memory cache
    if key in self._memory_cache:
        # Move to end (LRU)
        if key in self._access_order:
            self._access_order.remove(key)
        self._access_order.append(key)
        return self._memory_cache[key]

    # Check disk cache
    cached = self.disk_cache.get(key)
    if cached is not None:
        # Validate it's an embedding
        if isinstance(cached, np.ndarray):
            # Promote to memory cache
            self._add_to_memory(key, cached)
            return cached
        else:
            self.logger.warning(f"Invalid cached embedding for {key}")
            self.disk_cache.delete(key)

    return None
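
The memory tier behaves as an LRU: each hit moves the key to the end of the access order, and a disk hit is promoted back into memory. A small sketch of the promotion behavior (illustrative only, with setup repeated for self-containment):

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"))
cache.put("hello world", np.ones(8, dtype=np.float32))

cache.clear_memory()                       # empty the memory tier; the disk copy remains
assert cache.stats()["memory_items"] == 0

vec = cache.get("hello world")             # served from the disk tier...
assert vec is not None
assert cache.stats()["memory_items"] == 1  # ...and promoted back into memory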
put
Python
put(text: str, embedding: ndarray, model_name: str = 'default')

Cache an embedding.

PARAMETER  | TYPE    | DEFAULT   | DESCRIPTION
-----------|---------|-----------|------------------------
text       | str     | required  | Text that was embedded
embedding  | ndarray | required  | Embedding vector
model_name | str     | 'default' | Model used

Source code in tenets/core/nlp/cache.py
Python
def put(self, text: str, embedding: np.ndarray, model_name: str = "default"):
    """Cache an embedding.

    Args:
        text: Text that was embedded
        embedding: Embedding vector
        model_name: Model used
    """
    key = self._make_key(text, model_name)

    # Add to memory cache
    self._add_to_memory(key, embedding)

    # Add to disk cache
    self.disk_cache.put(
        key,
        embedding,
        ttl=self.ttl_seconds,
        metadata={"model": model_name, "dim": embedding.shape[0], "text_preview": text[:100]},
    )
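
Since the key passed to _make_key is built from both the text and model_name, the same text cached under two model names should yield two independent entries; a short sketch of that behavior:

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"))

a = np.zeros(4, dtype=np.float32)
b = np.ones(4, dtype=np.float32)
cache.put("same text", a, model_name="model-a")
cache.put("same text", b, model_name="model-b")

# Each model name gets its own entry for the same text
assert (cache.get("same text", model_name="model-a") == a).all()
assert (cache.get("same text", model_name="model-b") == b).all()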
get_batch
Python
get_batch(texts: list[str], model_name: str = 'default') -> Dict[str, Optional[np.ndarray]]

Get multiple cached embeddings.

PARAMETER  | TYPE      | DEFAULT   | DESCRIPTION
-----------|-----------|-----------|---------------
texts      | list[str] | required  | List of texts
model_name | str       | 'default' | Model used

RETURNS                      | DESCRIPTION
-----------------------------|---------------------------------------------------------
Dict[str, Optional[ndarray]] | Dict mapping text to embedding (or None if not cached)

Source code in tenets/core/nlp/cache.py
Python
def get_batch(
    self, texts: list[str], model_name: str = "default"
) -> Dict[str, Optional[np.ndarray]]:
    """Get multiple cached embeddings.

    Args:
        texts: List of texts
        model_name: Model used

    Returns:
        Dict mapping text to embedding (or None if not cached)
    """
    results = {}

    for text in texts:
        results[text] = self.get(text, model_name)

    return results
put_batch
Python
put_batch(embeddings: Dict[str, ndarray], model_name: str = 'default')

Cache multiple embeddings.

PARAMETER  | TYPE               | DEFAULT   | DESCRIPTION
-----------|--------------------|-----------|--------------------------------
embeddings | Dict[str, ndarray] | required  | Dict mapping text to embedding
model_name | str                | 'default' | Model used

Source code in tenets/core/nlp/cache.py
Python
def put_batch(self, embeddings: Dict[str, np.ndarray], model_name: str = "default"):
    """Cache multiple embeddings.

    Args:
        embeddings: Dict mapping text to embedding
        model_name: Model used
    """
    for text, embedding in embeddings.items():
        self.put(text, embedding, model_name)
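
A common pattern combines get_batch and put_batch: look up a whole batch, embed only the misses, and write the new vectors back. A sketch in which embed_many is a hypothetical stand-in for a real model call:

Python
from pathlib import Path

import numpy as np

from tenets.core.nlp.cache import EmbeddingCache


def embed_many(texts: list[str]) -> list[np.ndarray]:
    """Hypothetical stand-in for a real embedding model."""
    return [np.random.rand(384).astype(np.float32) for _ in texts]


cache = EmbeddingCache(Path("/tmp/tenets-cache"))
texts = ["first query", "second query", "third query"]

cached = cache.get_batch(texts)
misses = [t for t, v in cached.items() if v is None]
if misses:
    fresh = dict(zip(misses, embed_many(misses)))
    cache.put_batch(fresh)  # persist the newly computed vectors
    cached.update(fresh)    # now every text maps to an embedding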
clear_memory
Python
clear_memory()

Clear memory cache.

Source code in tenets/core/nlp/cache.py
Python
def clear_memory(self):
    """Clear memory cache."""
    self._memory_cache.clear()
    self._access_order.clear()
clear_all
Python
clear_all()

Clear all caches.

Source code in tenets/core/nlp/cache.py
Python
def clear_all(self):
    """Clear all caches."""
    self.clear_memory()
    self.disk_cache.clear()
cleanup
Python
cleanup() -> int

Clean up old cache entries.

RETURNS | DESCRIPTION
--------|---------------------------
int     | Number of entries deleted

Source code in tenets/core/nlp/cache.py
Python
def cleanup(self) -> int:
    """Clean up old cache entries.

    Returns:
        Number of entries deleted
    """
    return self.disk_cache.cleanup(max_age_days=self.ttl_seconds // (24 * 3600))
stats
Python
stats() -> Dict[str, Any]

Get cache statistics.

RETURNS        | DESCRIPTION
---------------|------------------
Dict[str, Any] | Cache statistics

Source code in tenets/core/nlp/cache.py
Python
def stats(self) -> Dict[str, Any]:
    """Get cache statistics.

    Returns:
        Cache statistics
    """
    return {
        "memory_items": len(self._memory_cache),
        "memory_size_mb": sum(e.nbytes for e in self._memory_cache.values()) / (1024 * 1024),
        "access_order_length": len(self._access_order),
    }
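
Typical maintenance pairs cleanup() with stats(): cleanup() drops expired disk entries, while stats() reports only the in-memory tier (disk usage is managed by the underlying DiskCache). A sketch:

Python
from pathlib import Path

from tenets.core.nlp.cache import EmbeddingCache

cache = EmbeddingCache(Path("/tmp/tenets-cache"))

deleted = cache.cleanup()
print(f"expired entries removed: {deleted}")

s = cache.stats()
print(f"{s['memory_items']} vectors in memory ({s['memory_size_mb']:.2f} MB)")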
