Skip to content

entity_recognizer

Full name: tenets.core.prompt.entity_recognizer

entity_recognizer

Hybrid entity recognition system.

Combines fast regex-based extraction with optional NLP-based NER for improved accuracy. Includes confidence scoring and fuzzy matching.

Classes

Entity (dataclass)

Python
Entity(name: str, type: str, confidence: float, context: str = '', start_pos: int = -1, end_pos: int = -1, source: str = 'regex', metadata: Dict[str, Any] = dict())

Recognized entity with confidence and context.

EntityPatternMatcher

Python
EntityPatternMatcher(patterns_file: Optional[Path] = None)

Regex-based entity pattern matching.

Initialize with entity patterns.

PARAMETER DESCRIPTION
patterns_file

Path to entity patterns JSON file

TYPE: Optional[Path] DEFAULT: None

Source code in tenets/core/prompt/entity_recognizer.py
Python
def __init__(self, patterns_file: Optional[Path] = None):
    """Set up the regex matcher: logger, raw patterns, compiled patterns.

    Args:
        patterns_file: Optional path to a JSON file of entity patterns.
            The value is forwarded unchanged to ``_load_patterns``.
    """
    self.logger = get_logger(__name__)

    # Store the raw patterns before compiling. ``_compile_patterns`` takes
    # no arguments, so it presumably reads ``self.patterns`` (assumption --
    # its body is not visible here).
    loaded_patterns = self._load_patterns(patterns_file)
    self.patterns = loaded_patterns
    self.compiled_patterns = self._compile_patterns()
Functions
extract
Python
extract(text: str) -> List[Entity]

Extract entities using regex patterns.

PARAMETER DESCRIPTION
text

Text to extract entities from

TYPE:str

RETURNS DESCRIPTION
List[Entity]

List of extracted entities

Source code in tenets/core/prompt/entity_recognizer.py
Python
def extract(self, text: str) -> List[Entity]:
    """Run every compiled pattern over ``text`` and collect the hits.

    For each regex match the entity name is the first non-empty capture
    group; when the pattern has no groups at all, the whole match is used.
    Matches that yield no non-empty name are skipped. Reported positions
    are those of the whole match, and the context is the match widened by
    up to 50 characters on each side.

    Args:
        text: Text to extract entities from

    Returns:
        List of extracted entities, in pattern iteration order
    """
    found = []
    text_len = len(text)

    for kind, compiled in self.compiled_patterns.items():
        for regex, base_score, label in compiled:
            for hit in regex.finditer(text):
                captured = hit.groups()
                if captured:
                    # First truthy group wins; None if every group is empty.
                    name = next((grp for grp in captured if grp), None)
                else:
                    name = hit.group(0)

                if not name:
                    continue

                begin, finish = hit.start(), hit.end()

                # Confidence is computed before the context window is cut.
                score = self._calculate_confidence(
                    base_score, name, kind, text, begin, finish
                )
                window = text[max(0, begin - 50) : min(text_len, finish + 50)]

                found.append(
                    Entity(
                        name=name,
                        type=kind,
                        confidence=score,
                        context=window,
                        start_pos=begin,
                        end_pos=finish,
                        source="regex",
                        metadata={"pattern_description": label},
                    )
                )

    return found

NLPEntityRecognizer

Python
NLPEntityRecognizer(model_name: str = 'en_core_web_sm')

NLP-based named entity recognition using spaCy.

Initialize NLP entity recognizer.

PARAMETER DESCRIPTION
model_name

spaCy model to use

TYPE: str DEFAULT: 'en_core_web_sm'

Source code in tenets/core/prompt/entity_recognizer.py
Python
def __init__(self, model_name: str = "en_core_web_sm"):
    """Initialize NLP entity recognizer.

    Tries to load the requested spaCy model. Loading is best-effort: if
    spaCy is unavailable or the model cannot be loaded, the failure is
    logged and ``self.nlp`` stays ``None`` instead of raising.

    Args:
        model_name: spaCy model to use
    """
    self.logger = get_logger(__name__)
    self.nlp = None

    if not SPACY_AVAILABLE:
        return

    try:
        self.nlp = spacy.load(model_name)
    except Exception as e:
        # Deliberately broad: a missing or broken model must not prevent
        # construction of the recognizer.
        self.logger.warning(f"Failed to load spaCy model {model_name}: {e}")
        # Point at the model that actually failed, not a hard-coded default.
        self.logger.info(f"Install with: python -m spacy download {model_name}")
    else:
        self.logger.info(f"Loaded spaCy model: {model_name}")
Functions
extract
Python
extract(text: str) -> List[Entity]

Extract entities using NLP.

PARAMETER DESCRIPTION
text

Text to extract entities from

TYPE:str

RETURNS DESCRIPTION
List[Entity]

List of extracted entities

Source code in tenets/core/prompt/entity_recognizer.py
Python
def extract(self, text: str) -> List[Entity]:
    """Extract entities from ``text`` with the loaded spaCy pipeline.

    Two kinds of results are produced, in this order:

    1. Named entities from ``doc.ents`` (confidence 0.8), with the spaCy
       label translated to this module's type names and ``"other"`` used
       for unmapped labels.
    2. Noun chunks longer than three characters whose root is a noun or
       proper noun, reported as ``"concept"`` entities (confidence 0.6).

    Args:
        text: Text to extract entities from

    Returns:
        List of extracted entities; empty when no model is loaded
    """
    if not self.nlp:
        return []

    # spaCy label -> internal entity type.
    label_to_type = {
        "PERSON": "person",
        "ORG": "organization",
        "GPE": "location",
        "DATE": "date",
        "TIME": "time",
        "MONEY": "money",
        "PERCENT": "percent",
        "PRODUCT": "product",
        "EVENT": "event",
        "WORK_OF_ART": "project",
        "LAW": "regulation",
        "LANGUAGE": "language",
        "FAC": "facility",
    }
    text_len = len(text)

    def surrounding(begin, finish):
        # Span widened by up to 50 characters on each side.
        return text[max(0, begin - 50) : min(text_len, finish + 50)]

    parsed = self.nlp(text)

    # Named entities; spaCy entities are generally reliable, hence 0.8.
    results = [
        Entity(
            name=span.text,
            type=label_to_type.get(span.label_, "other"),
            confidence=0.8,
            context=surrounding(span.start_char, span.end_char),
            start_pos=span.start_char,
            end_pos=span.end_char,
            source="ner",
            metadata={"spacy_label": span.label_},
        )
        for span in parsed.ents
    ]

    # Noun chunks as lower-confidence candidates, skipping short chunks
    # and chunks not headed by a noun.
    for phrase in parsed.noun_chunks:
        if len(phrase.text) <= 3:
            continue
        if phrase.root.pos_ not in ("NOUN", "PROPN"):
            continue
        results.append(
            Entity(
                name=phrase.text,
                type="concept",
                confidence=0.6,
                context=surrounding(phrase.start_char, phrase.end_char),
                start_pos=phrase.start_char,
                end_pos=phrase.end_char,
                source="ner",
                metadata={"chunk_type": "noun_chunk"},
            )
        )

    return results

FuzzyEntityMatcher

Python
FuzzyEntityMatcher(known_entities: Optional[Dict[str, List[str]]] = None)

Fuzzy matching for entity recognition.

Initialize fuzzy matcher.

PARAMETER DESCRIPTION
known_entities

Dictionary of entity type -> list of known entity names

TYPE: Optional[Dict[str, List[str]]] DEFAULT: None

Source code in tenets/core/prompt/entity_recognizer.py
Python
def __init__(self, known_entities: Optional[Dict[str, List[str]]] = None):
    """Create the fuzzy matcher and choose its dictionary of known names.

    Args:
        known_entities: Mapping of entity type -> list of known entity
            names. Any falsy value (``None`` or an empty mapping) selects
            the built-in defaults from ``_get_default_known_entities``.
    """
    self.logger = get_logger(__name__)

    if known_entities:
        self.known_entities = known_entities
    else:
        self.known_entities = self._get_default_known_entities()
Functions
find_fuzzy_matches
Python
find_fuzzy_matches(text: str, threshold: float = 0.8) -> List[Entity]

Find fuzzy matches for known entities.

PARAMETER DESCRIPTION
text

Text to search in

TYPE:str

threshold

Similarity threshold (0-1)

TYPE: float DEFAULT: 0.8

RETURNS DESCRIPTION
List[Entity]

List of matched entities

Source code in tenets/core/prompt/entity_recognizer.py
Python
def find_fuzzy_matches(self, text: str, threshold: float = 0.8) -> List[Entity]:
    """Find exact and fuzzy matches for known entities.

    For every known name, a case-insensitive whole-word search is tried
    first; a hit yields one entity (first occurrence only) with confidence
    0.95 and no fuzzy search is done for that name. Otherwise each word of
    ``text`` is compared to the name with ``SequenceMatcher`` and every
    word whose ratio reaches ``threshold`` yields an entity with
    confidence ``similarity * 0.9``.

    All reported positions are offsets into ``text`` itself: the exact
    search runs on the original text, and fuzzy hits use the span of the
    word occurrence that was actually compared.

    Args:
        text: Text to search in
        threshold: Similarity threshold (0-1)

    Returns:
        List of matched entities
    """
    entities = []
    text_len = len(text)

    # Tokenize once, keeping each word's span so a fuzzy hit can report
    # the position of the occurrence that matched rather than the first
    # occurrence of the same word.
    words = [
        (m.group(0), m.group(0).lower(), m.start(), m.end())
        for m in re.finditer(r"\b\w+\b", text)
    ]

    for entity_type, known_names in self.known_entities.items():
        for known_name in known_names:
            known_lower = known_name.lower()

            # Exact match first (case-insensitive, word boundaries).
            # Searching the original text keeps offsets valid even when
            # lowercasing would change the string's length.
            # NOTE(review): ``\b`` anchors only work next to word
            # characters, so names that start or end with punctuation
            # (e.g. "C++") may never match exactly -- confirm intent.
            exact_pat = re.compile(r"\b" + re.escape(known_lower) + r"\b", re.IGNORECASE)
            m = exact_pat.search(text)
            if m:
                entities.append(
                    Entity(
                        name=known_name,
                        type=entity_type,
                        confidence=0.95,
                        context=text[max(0, m.start() - 50) : min(text_len, m.end() + 50)],
                        start_pos=m.start(),
                        end_pos=m.end(),
                        source="fuzzy",
                        metadata={"match_type": "exact"},
                    )
                )
                continue

            # No exact hit: compare every word of the text to the name.
            for word, word_lower, start, end in words:
                similarity = SequenceMatcher(None, word_lower, known_lower).ratio()
                if similarity < threshold:
                    continue

                entities.append(
                    Entity(
                        name=known_name,
                        type=entity_type,
                        confidence=similarity * 0.9,  # Slightly lower than exact match
                        context=text[max(0, start - 50) : min(text_len, end + 50)],
                        start_pos=start,
                        end_pos=end,
                        source="fuzzy",
                        metadata={
                            "match_type": "fuzzy",
                            "similarity": similarity,
                            "matched_text": word,
                        },
                    )
                )

    return entities

HybridEntityRecognizer

Python
HybridEntityRecognizer(use_nlp: bool = True, use_fuzzy: bool = True, patterns_file: Optional[Path] = None, spacy_model: str = 'en_core_web_sm', known_entities: Optional[Dict[str, List[str]]] = None)

Main entity recognizer combining all approaches.

Initialize hybrid entity recognizer.

PARAMETER DESCRIPTION
use_nlp

Whether to use NLP-based NER

TYPE: bool DEFAULT: True

use_fuzzy

Whether to use fuzzy matching

TYPE: bool DEFAULT: True

patterns_file

Path to entity patterns JSON

TYPE: Optional[Path] DEFAULT: None

spacy_model

spaCy model name

TYPE: str DEFAULT: 'en_core_web_sm'

known_entities

Known entities for fuzzy matching

TYPE: Optional[Dict[str, List[str]]] DEFAULT: None

Source code in tenets/core/prompt/entity_recognizer.py
Python
def __init__(
    self,
    use_nlp: bool = True,
    use_fuzzy: bool = True,
    patterns_file: Optional[Path] = None,
    spacy_model: str = "en_core_web_sm",
    known_entities: Optional[Dict[str, List[str]]] = None,
):
    """Wire up the individual recognizers used by the hybrid pipeline.

    The regex matcher and the keyword extractor are always created. The
    NLP recognizer is created only when requested *and* spaCy is
    available; the fuzzy matcher only when requested. Disabled components
    are stored as ``None``.

    Args:
        use_nlp: Whether to use NLP-based NER
        use_fuzzy: Whether to use fuzzy matching
        patterns_file: Path to entity patterns JSON
        spacy_model: spaCy model name
        known_entities: Known entities for fuzzy matching
    """
    self.logger = get_logger(__name__)

    # Always-on regex component.
    self.pattern_matcher = EntityPatternMatcher(patterns_file)

    # Optional components.
    nlp_enabled = use_nlp and SPACY_AVAILABLE
    self.nlp_recognizer = NLPEntityRecognizer(spacy_model) if nlp_enabled else None
    self.fuzzy_matcher = FuzzyEntityMatcher(known_entities) if use_fuzzy else None

    # Always-on keyword component.
    self.keyword_extractor = KeywordExtractor(use_stopwords=True, stopword_set="prompt")
Functions
recognize
Python
recognize(text: str, merge_overlapping: bool = True, min_confidence: float = 0.5) -> List[Entity]

Recognize entities using all available methods.

PARAMETER DESCRIPTION
text

Text to extract entities from

TYPE:str

merge_overlapping

Whether to merge overlapping entities

TYPE: bool DEFAULT: True

min_confidence

Minimum confidence threshold

TYPE: float DEFAULT: 0.5

RETURNS DESCRIPTION
List[Entity]

List of recognized entities

Source code in tenets/core/prompt/entity_recognizer.py
Python
def recognize(
    self, text: str, merge_overlapping: bool = True, min_confidence: float = 0.5
) -> List[Entity]:
    """Run every enabled recognizer over ``text`` and combine the results.

    Stages, in order: regex patterns, NLP NER (if configured), fuzzy
    matching (if configured), then keywords. A keyword is added as a
    ``"keyword"`` entity (confidence 0.6) only if it is not already a
    case-insensitive substring of some collected entity name and it can be
    located in the text. The combined list is filtered by confidence,
    optionally merged, and sorted by start position with higher confidence
    first among ties.

    Args:
        text: Text to extract entities from
        merge_overlapping: Whether to merge overlapping entities
        min_confidence: Minimum confidence threshold

    Returns:
        List of recognized entities
    """
    collected = []

    # Stage 1: regex patterns (fastest).
    from_regex = self.pattern_matcher.extract(text)
    collected.extend(from_regex)
    self.logger.debug(f"Regex extraction found {len(from_regex)} entities")

    # Stage 2: NLP-based NER, when a recognizer was configured.
    if self.nlp_recognizer:
        from_nlp = self.nlp_recognizer.extract(text)
        collected.extend(from_nlp)
        self.logger.debug(f"NLP extraction found {len(from_nlp)} entities")

    # Stage 3: fuzzy matching, when enabled.
    if self.fuzzy_matcher:
        from_fuzzy = self.fuzzy_matcher.find_fuzzy_matches(text)
        collected.extend(from_fuzzy)
        self.logger.debug(f"Fuzzy matching found {len(from_fuzzy)} entities")

    # Stage 4: keywords not already covered by an earlier entity.
    haystack = text.lower()
    for keyword in self.keyword_extractor.extract(text, max_keywords=20):
        needle = keyword.lower()

        # ``collected`` grows as keywords are added, so later keywords are
        # also checked against earlier keyword entities.
        if any(needle in known.name.lower() for known in collected):
            continue

        offset = haystack.find(needle)
        if offset < 0:
            continue

        stop = offset + len(keyword)
        collected.append(
            Entity(
                name=keyword,
                type="keyword",
                confidence=0.6,
                context=text[max(0, offset - 50) : min(len(text), stop + 50)],
                start_pos=offset,
                end_pos=stop,
                source="keyword",
                metadata={"extraction_method": "keyword"},
            )
        )

    # Drop low-confidence entities, then optionally merge overlaps.
    kept = [candidate for candidate in collected if candidate.confidence >= min_confidence]
    if merge_overlapping:
        kept = self._merge_overlapping_entities(kept)

    # Earliest first; among equal starts, most confident first.
    kept.sort(key=lambda item: (item.start_pos, -item.confidence))

    return kept
get_entity_summary
Python
get_entity_summary(entities: List[Entity]) -> Dict[str, Any]

Get summary statistics about recognized entities.

PARAMETER DESCRIPTION
entities

List of entities

TYPE:List[Entity]

RETURNS DESCRIPTION
Dict[str, Any]

Summary dictionary

Source code in tenets/core/prompt/entity_recognizer.py
Python
def get_entity_summary(self, entities: List[Entity]) -> Dict[str, Any]:
    """Aggregate simple statistics over a list of entities.

    Args:
        entities: List of entities

    Returns:
        Dictionary with these keys:
            total: number of entities
            by_type: count of entities per ``type``
            by_source: count of entities per ``source``
            avg_confidence: mean confidence (0.0 for an empty list)
            high_confidence: number of entities with confidence > 0.85
            unique_names: number of distinct names, compared
                case-insensitively
    """
    type_counts: Dict[str, int] = {}
    source_counts: Dict[str, int] = {}
    for item in entities:
        type_counts[item.type] = type_counts.get(item.type, 0) + 1
        source_counts[item.source] = source_counts.get(item.source, 0) + 1

    distinct_names = {item.name.lower() for item in entities}

    # Strictly greater than 0.85: the tests expect this stricter cut-off.
    strong = sum(1 for item in entities if item.confidence > 0.85)

    if entities:
        mean_confidence = sum(item.confidence for item in entities) / len(entities)
    else:
        mean_confidence = 0.0

    return {
        "total": len(entities),
        "by_type": type_counts,
        "by_source": source_counts,
        "avg_confidence": mean_confidence,
        "high_confidence": strong,
        "unique_names": len(distinct_names),
    }

Functions