entity_recognizer
¶
Full name: tenets.core.prompt.entity_recognizer
entity_recognizer¶
Hybrid entity recognition system.
Combines fast regex-based extraction with optional NLP-based NER for improved accuracy. Includes confidence scoring and fuzzy matching.
Classes¶
Entity (dataclass)
¶
Entity(name: str, type: str, confidence: float, context: str = '', start_pos: int = -1, end_pos: int = -1, source: str = 'regex', metadata: Dict[str, Any] = dict())
Recognized entity with confidence and context.
EntityPatternMatcher¶
Regex-based entity pattern matching.
Initialize with entity patterns.
PARAMETER | DESCRIPTION |
---|---|
patterns_file | Path to entity patterns JSON file |
Source code in tenets/core/prompt/entity_recognizer.py
Functions¶
extract¶
Extract entities using regex patterns.
PARAMETER | DESCRIPTION |
---|---|
text | Text to extract entities from. TYPE: str |
RETURNS | DESCRIPTION |
---|---|
List[Entity] | List of extracted entities |
Source code in tenets/core/prompt/entity_recognizer.py
def extract(self, text: str) -> List[Entity]:
    """Scan ``text`` with every compiled pattern and collect the hits.

    Args:
        text: Text to extract entities from

    Returns:
        One ``Entity`` per regex match that yields a non-empty name, in
        pattern-iteration order.
    """
    found = []
    text_length = len(text)

    for kind, pattern_specs in self.compiled_patterns.items():
        for regex, base_score, label in pattern_specs:
            for hit in regex.finditer(text):
                captured = hit.groups()
                if captured:
                    # Pattern has capture groups: the first non-empty one is the name.
                    name = next((part for part in captured if part), None)
                else:
                    # Pattern has no groups: fall back to the whole match.
                    name = hit.group(0)

                # Nothing usable was captured (e.g. an empty match).
                if not name:
                    continue

                begin = hit.start()
                finish = hit.end()

                # Let the matcher adjust the pattern's base score for this hit.
                score = self._calculate_confidence(
                    base_score, name, kind, text, begin, finish
                )

                # Keep up to 50 characters of text on either side of the match.
                window = text[max(0, begin - 50) : min(text_length, finish + 50)]

                found.append(
                    Entity(
                        name=name,
                        type=kind,
                        confidence=score,
                        context=window,
                        start_pos=begin,
                        end_pos=finish,
                        source="regex",
                        metadata={"pattern_description": label},
                    )
                )

    return found
NLPEntityRecognizer¶
NLP-based named entity recognition using spaCy.
Initialize NLP entity recognizer.
PARAMETER | DESCRIPTION |
---|---|
model_name | spaCy model to use. TYPE: str DEFAULT: 'en_core_web_sm' |
Source code in tenets/core/prompt/entity_recognizer.py
def __init__(self, model_name: str = "en_core_web_sm"):
    """Initialize NLP entity recognizer.

    Tries to load the requested spaCy model. Loading is best-effort: when
    spaCy is unavailable or the model cannot be loaded, ``self.nlp`` stays
    ``None`` and no exception is raised.

    Args:
        model_name: spaCy model to use
    """
    self.logger = get_logger(__name__)
    self.nlp = None

    if not SPACY_AVAILABLE:
        return

    try:
        self.nlp = spacy.load(model_name)
    except Exception as e:
        # Deliberately broad: any load failure just leaves NER disabled
        # instead of breaking construction.
        self.logger.warning(f"Failed to load spaCy model {model_name}: {e}")
        # Point at the model that actually failed, not only the default one.
        self.logger.info(f"Install with: python -m spacy download {model_name}")
    else:
        self.logger.info(f"Loaded spaCy model: {model_name}")
Functions¶
extract¶
Extract entities using NLP.
PARAMETER | DESCRIPTION |
---|---|
text | Text to extract entities from. TYPE: str |
RETURNS | DESCRIPTION |
---|---|
List[Entity] | List of extracted entities |
Source code in tenets/core/prompt/entity_recognizer.py
def extract(self, text: str) -> List[Entity]:
    """Run the loaded spaCy pipeline over ``text`` and convert its output.

    Named entities become ``Entity`` objects with a mapped type and a fixed
    confidence of 0.8; qualifying noun chunks are added afterwards as
    ``"concept"`` entities with confidence 0.6.

    Args:
        text: Text to extract entities from

    Returns:
        List of extracted entities (empty when no model is loaded)
    """
    if not self.nlp:
        return []

    def surrounding(start: int, end: int) -> str:
        # Up to 50 characters of text on either side of the span.
        return text[max(0, start - 50) : min(len(text), end + 50)]

    parsed = self.nlp(text)

    # spaCy label -> entity type used by this module; unknown labels map to "other".
    label_to_type = {
        "PERSON": "person",
        "ORG": "organization",
        "GPE": "location",
        "DATE": "date",
        "TIME": "time",
        "MONEY": "money",
        "PERCENT": "percent",
        "PRODUCT": "product",
        "EVENT": "event",
        "WORK_OF_ART": "project",
        "LAW": "regulation",
        "LANGUAGE": "language",
        "FAC": "facility",
    }

    results = []

    # Named entities reported by the model.
    for span in parsed.ents:
        results.append(
            Entity(
                name=span.text,
                type=label_to_type.get(span.label_, "other"),
                confidence=0.8,  # spaCy entities are generally reliable
                context=surrounding(span.start_char, span.end_char),
                start_pos=span.start_char,
                end_pos=span.end_char,
                source="ner",
                metadata={"spacy_label": span.label_},
            )
        )

    # Noun chunks as lower-confidence candidates.
    for phrase in parsed.noun_chunks:
        # Skip very short chunks and chunks whose root is not a (proper) noun.
        if len(phrase.text) <= 3 or phrase.root.pos_ not in ("NOUN", "PROPN"):
            continue
        results.append(
            Entity(
                name=phrase.text,
                type="concept",
                confidence=0.6,
                context=surrounding(phrase.start_char, phrase.end_char),
                start_pos=phrase.start_char,
                end_pos=phrase.end_char,
                source="ner",
                metadata={"chunk_type": "noun_chunk"},
            )
        )

    return results
FuzzyEntityMatcher¶
Fuzzy matching for entity recognition.
Initialize fuzzy matcher.
PARAMETER | DESCRIPTION |
---|---|
known_entities | Dictionary of entity type -> list of known entity names |
Source code in tenets/core/prompt/entity_recognizer.py
def __init__(self, known_entities: Optional[Dict[str, List[str]]] = None):
    """Set up the matcher with the names it should look for.

    Args:
        known_entities: Dictionary of entity type -> list of known entity
            names. When omitted or empty, the matcher's built-in defaults
            are used instead.
    """
    self.logger = get_logger(__name__)

    if known_entities:
        self.known_entities = known_entities
    else:
        # None and an empty mapping both fall back to the defaults.
        self.known_entities = self._get_default_known_entities()
Functions¶
find_fuzzy_matches¶
Find fuzzy matches for known entities.
PARAMETER | DESCRIPTION |
---|---|
text | Text to search in. TYPE: str |
threshold | Similarity threshold (0-1). TYPE: float DEFAULT: 0.8 |
RETURNS | DESCRIPTION |
---|---|
List[Entity] | List of matched entities |
Source code in tenets/core/prompt/entity_recognizer.py
def find_fuzzy_matches(self, text: str, threshold: float = 0.8) -> List[Entity]:
    """Find exact and fuzzy matches for known entities.

    For each known name, a case-insensitive whole-token match is tried
    first and reported with confidence 0.95 (first occurrence only). If
    there is none, every word of ``text`` is compared with the name using
    ``SequenceMatcher``; words whose ratio reaches ``threshold`` are
    reported with confidence ``ratio * 0.9``.

    All positions refer to the original ``text``. Each fuzzy hit carries
    the span of the word that produced it, so repeated words are reported
    at their own offsets.

    Args:
        text: Text to search in
        threshold: Similarity threshold (0-1)

    Returns:
        List of matched entities
    """
    entities = []
    text_len = len(text)

    # Tokenise once, keeping each word's own span, instead of re-splitting
    # the text for every known name and re-searching for each word.
    words = [
        (m.group(0), m.group(0).lower(), m.start(), m.end())
        for m in re.finditer(r"\b\w+\b", text)
    ]

    for entity_type, known_names in self.known_entities.items():
        for known_name in known_names:
            known_lower = known_name.lower()
            if not known_lower:
                # An empty name cannot be meaningfully matched.
                continue

            # Search the original text, not a lowercased copy: lowercasing
            # can change string length, which would shift the offsets.
            # Lookarounds are used instead of \b so that names with
            # non-word edges (e.g. "C++") can still match as whole tokens.
            m = re.search(
                r"(?<!\w)" + re.escape(known_lower) + r"(?!\w)",
                text,
                re.IGNORECASE,
            )
            if m:
                entities.append(
                    Entity(
                        name=known_name,
                        type=entity_type,
                        confidence=0.95,
                        context=text[max(0, m.start() - 50) : min(text_len, m.end() + 50)],
                        start_pos=m.start(),
                        end_pos=m.end(),
                        source="fuzzy",
                        metadata={"match_type": "exact"},
                    )
                )
                continue

            # No exact hit: compare the name against each word of the text.
            for word, word_lower, start, end in words:
                similarity = SequenceMatcher(None, word_lower, known_lower).ratio()
                if similarity < threshold:
                    continue
                entities.append(
                    Entity(
                        name=known_name,
                        type=entity_type,
                        confidence=similarity * 0.9,  # Slightly lower than exact match
                        context=text[max(0, start - 50) : min(text_len, end + 50)],
                        start_pos=start,
                        end_pos=end,
                        source="fuzzy",
                        metadata={
                            "match_type": "fuzzy",
                            "similarity": similarity,
                            "matched_text": word,
                        },
                    )
                )

    return entities
HybridEntityRecognizer¶
HybridEntityRecognizer(use_nlp: bool = True, use_fuzzy: bool = True, patterns_file: Optional[Path] = None, spacy_model: str = 'en_core_web_sm', known_entities: Optional[Dict[str, List[str]]] = None)
Main entity recognizer combining all approaches.
Initialize hybrid entity recognizer.
PARAMETER | DESCRIPTION |
---|---|
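use_nlp | Whether to use NLP-based NER. TYPE: bool DEFAULT: True |
use_fuzzy | Whether to use fuzzy matching. TYPE: bool DEFAULT: True |
patterns_file | Path to entity patterns JSON. TYPE: Optional[Path] DEFAULT: None |
spacy_model | spaCy model name. TYPE: str DEFAULT: 'en_core_web_sm' |
known_entities | Known entities for fuzzy matching. TYPE: Optional[Dict[str, List[str]]] DEFAULT: None |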
Source code in tenets/core/prompt/entity_recognizer.py
def __init__(
    self,
    use_nlp: bool = True,
    use_fuzzy: bool = True,
    patterns_file: Optional[Path] = None,
    spacy_model: str = "en_core_web_sm",
    known_entities: Optional[Dict[str, List[str]]] = None,
):
    """Wire together the recognizers that make up the hybrid pipeline.

    Args:
        use_nlp: Whether to use NLP-based NER
        use_fuzzy: Whether to use fuzzy matching
        patterns_file: Path to entity patterns JSON
        spacy_model: spaCy model name
        known_entities: Known entities for fuzzy matching
    """
    self.logger = get_logger(__name__)

    # The regex matcher is always part of the pipeline.
    self.pattern_matcher = EntityPatternMatcher(patterns_file)

    # NLP recognition needs both the caller's opt-in and spaCy being available.
    nlp_enabled = use_nlp and SPACY_AVAILABLE
    self.nlp_recognizer = NLPEntityRecognizer(spacy_model) if nlp_enabled else None

    # Fuzzy matching is optional.
    self.fuzzy_matcher = FuzzyEntityMatcher(known_entities) if use_fuzzy else None

    # Keyword extraction is always available.
    self.keyword_extractor = KeywordExtractor(use_stopwords=True, stopword_set="prompt")
Functions¶
recognize¶
recognize(text: str, merge_overlapping: bool = True, min_confidence: float = 0.5) -> List[Entity]
Recognize entities using all available methods.
PARAMETER | DESCRIPTION |
---|---|
text | Text to extract entities from. TYPE: str |
merge_overlapping | Whether to merge overlapping entities. TYPE: bool DEFAULT: True |
min_confidence | Minimum confidence threshold. TYPE: float DEFAULT: 0.5 |
RETURNS | DESCRIPTION |
---|---|
List[Entity] | List of recognized entities |
Source code in tenets/core/prompt/entity_recognizer.py
def recognize(
    self, text: str, merge_overlapping: bool = True, min_confidence: float = 0.5
) -> List[Entity]:
    """Recognize entities using all available methods.

    Runs regex extraction, then NLP and fuzzy matching when those
    components are configured, and finally adds extracted keywords that
    are not already contained in the name of a collected entity. The
    combined list is filtered by confidence, optionally merged, and sorted.

    Args:
        text: Text to extract entities from
        merge_overlapping: Whether to merge overlapping entities
        min_confidence: Minimum confidence threshold

    Returns:
        List of recognized entities, ordered by start position and then by
        descending confidence
    """
    all_entities = []

    # 1. Regex-based extraction (fastest)
    regex_entities = self.pattern_matcher.extract(text)
    all_entities.extend(regex_entities)
    self.logger.debug(f"Regex extraction found {len(regex_entities)} entities")

    # 2. NLP-based NER (if available)
    if self.nlp_recognizer:
        nlp_entities = self.nlp_recognizer.extract(text)
        all_entities.extend(nlp_entities)
        self.logger.debug(f"NLP extraction found {len(nlp_entities)} entities")

    # 3. Fuzzy matching (if enabled)
    if self.fuzzy_matcher:
        fuzzy_entities = self.fuzzy_matcher.find_fuzzy_matches(text)
        all_entities.extend(fuzzy_entities)
        self.logger.debug(f"Fuzzy matching found {len(fuzzy_entities)} entities")

    # 4. Extract keywords as potential entities
    keywords = self.keyword_extractor.extract(text, max_keywords=20)

    # Loop invariants, computed once rather than once per keyword.
    text_lower = text.lower()
    text_len = len(text)
    # Lowercased names of everything collected so far. Kept in sync with
    # all_entities below so later keywords are also checked against
    # keywords accepted earlier in this loop.
    covered_names = [e.name.lower() for e in all_entities]

    for keyword in keywords:
        keyword_lower = keyword.lower()

        # Skip keywords already contained in some collected entity name.
        if any(keyword_lower in name for name in covered_names):
            continue

        # Locate the first case-insensitive occurrence of the keyword.
        pos = text_lower.find(keyword_lower)
        if pos < 0:
            continue

        end = pos + len(keyword)
        all_entities.append(
            Entity(
                name=keyword,
                type="keyword",
                confidence=0.6,
                context=text[max(0, pos - 50) : min(text_len, end + 50)],
                start_pos=pos,
                end_pos=end,
                source="keyword",
                metadata={"extraction_method": "keyword"},
            )
        )
        covered_names.append(keyword_lower)

    # Filter by confidence
    filtered_entities = [e for e in all_entities if e.confidence >= min_confidence]

    # Merge overlapping entities if requested
    if merge_overlapping:
        filtered_entities = self._merge_overlapping_entities(filtered_entities)

    # Sort by position, then by descending confidence
    filtered_entities.sort(key=lambda e: (e.start_pos, -e.confidence))
    return filtered_entities
get_entity_summary¶
Get summary statistics about recognized entities.
PARAMETER | DESCRIPTION |
---|---|
entities | List of entities |
RETURNS | DESCRIPTION |
---|---|
Dict[str, Any] | Summary dictionary |
Source code in tenets/core/prompt/entity_recognizer.py
def get_entity_summary(self, entities: List[Entity]) -> Dict[str, Any]:
    """Summarize a list of recognized entities.

    Args:
        entities: List of entities

    Returns:
        Dictionary with the keys ``total``, ``by_type``, ``by_source``,
        ``avg_confidence``, ``high_confidence`` (entities with confidence
        above 0.85) and ``unique_names`` (count of distinct names,
        compared case-insensitively).
    """
    per_type = {}
    per_source = {}
    distinct_names = set()
    strong = 0

    for item in entities:
        per_type[item.type] = per_type.get(item.type, 0) + 1
        per_source[item.source] = per_source.get(item.source, 0) + 1
        distinct_names.add(item.name.lower())
        # Strictly greater than 0.85: the stricter high-confidence bar.
        if item.confidence > 0.85:
            strong += 1

    # Mean confidence; stays 0.0 for an empty list to avoid dividing by zero.
    mean_confidence = 0.0
    if entities:
        mean_confidence = sum(item.confidence for item in entities) / len(entities)

    return {
        "total": len(entities),
        "by_type": per_type,
        "by_source": per_source,
        "avg_confidence": mean_confidence,
        "high_confidence": strong,
        "unique_names": len(distinct_names),
    }