diff --git a/Halgorithem/__init__.py b/Halgorithem/__init__.py index 5d992c1..6929e15 100644 --- a/Halgorithem/__init__.py +++ b/Halgorithem/__init__.py @@ -1,3 +1,4 @@ from .core import Halgorithm +from .main import HalgorithemVerifier, verify, verify_urls -__all__ = ["Halgorithm"] +__all__ = ["Halgorithm", "HalgorithemVerifier", "verify", "verify_urls"] diff --git a/Halgorithem/checks/__init__.py b/Halgorithem/checks/__init__.py new file mode 100644 index 0000000..2b00b9d --- /dev/null +++ b/Halgorithem/checks/__init__.py @@ -0,0 +1 @@ +"""Higher-level verification checks for Halgorithem.""" diff --git a/Halgorithem/checks/atomic.py b/Halgorithem/checks/atomic.py new file mode 100644 index 0000000..eba9ffd --- /dev/null +++ b/Halgorithem/checks/atomic.py @@ -0,0 +1,91 @@ +import re +from functools import lru_cache + +from ..claim_extraction import split_atomic_claims +from ..models import AtomicCheck, AtomicClaim + + +@lru_cache(maxsize=8192) +def _tokens(text): + return frozenset(t.lower() for t in re.findall(r"\b[a-zA-Z][a-zA-Z'-]+\b", text or "") if len(t) > 2) + + +def _overlap(left, right): + if not left: + return 0.0 + return len(left & right) / len(left) + + +def _priority(verdict): + return {"CONTRADICT": 3, "ENTAIL": 2, "NEUTRAL": 1}.get(verdict, 0) + + +def _score_claim(verdict, confidence): + if verdict == "ENTAIL": + return confidence + if verdict == "CONTRADICT": + return -confidence + return 0.5 * confidence + + +def prepare_document_claims(document): + doc_claims = [] + for sentence in document: + for claim in split_atomic_claims(sentence.resolved_text) or [sentence.resolved_text]: + doc_claims.append((claim, _tokens(claim))) + return doc_claims + + +def atomic_claim_nli(processed_sentence, document, nli_model=None, doc_claims=None): + ai_text = getattr(processed_sentence, "resolved_text", processed_sentence) + ai_claims = split_atomic_claims(ai_text) or [ai_text] + doc_claims = doc_claims if doc_claims is not None else prepare_document_claims(document) + + if not doc_claims: + return AtomicCheck(status="no_document_claims") + + matched = [] + for claim in ai_claims: + claim_tokens = _tokens(claim) + best_claim, best_tokens = max(doc_claims, key=lambda pair: _overlap(claim_tokens, pair[1])) + matched.append((claim, best_claim, best_tokens)) + + if nli_model is not None: + nli_results = nli_model.predict_batch( + [best_claim for _, best_claim, _ in matched], + [claim for claim, _, _ in matched], + ) + else: + nli_results = [None] * len(matched) + + results = [] + for (claim, best_claim, best_tokens), nli in zip(matched, nli_results): + claim_tokens = _tokens(claim) + if not claim_tokens or not best_tokens: + results.append(AtomicClaim(claim=claim, verdict="NEUTRAL", confidence=0.0, evidence="")) + continue + if nli is None: + overlap = _overlap(claim_tokens, best_tokens) + verdict = "ENTAIL" if overlap >= 0.70 else "NEUTRAL" + confidence = overlap if verdict == "ENTAIL" else 0.50 + elif nli.label == "CONTRADICTION": + verdict = "CONTRADICT" + confidence = nli.score + elif nli.label == "ENTAILMENT": + verdict = "ENTAIL" + confidence = nli.score + else: + verdict = "NEUTRAL" + confidence = nli.score + results.append(AtomicClaim(claim=claim, verdict=verdict, confidence=confidence, evidence=best_claim)) + + entail = sum(1 for c in results if c.verdict == "ENTAIL") + contradict = sum(1 for c in results if c.verdict == "CONTRADICT") + total = len(results) + if total: + weighted_sum = sum(_score_claim(claim.verdict, claim.confidence) for claim in results) + score = weighted_sum / total + else: + score = None + results.sort(key=lambda c: (_priority(c.verdict), c.confidence), reverse=True) + return AtomicCheck(claims=results, score=score, status="ok") diff --git a/Halgorithem/checks/nli.py b/Halgorithem/checks/nli.py new file mode 100644 index 0000000..1cd8966 --- /dev/null +++ b/Halgorithem/checks/nli.py @@ -0,0 +1,101 @@ +import re +from functools import lru_cache + +from ..contradiction import find_contradiction +from ..models import NLICheck +from ..text_processing import extract_numbers, has_negation_mismatch, lemmatize_tokens + + +@lru_cache(maxsize=8192) +def _tokens(text): + return frozenset(t.lower() for t in re.findall(r"\b[a-zA-Z][a-zA-Z'-]+\b", text or "") if len(t) > 2) + + +@lru_cache(maxsize=8192) +def _content_lemmas(text): + return frozenset(lemma for lemma in lemmatize_tokens(text) if len(lemma) > 2) + + +class NLIModel: + model_quality = 0.75 + + def predict(self, premise, hypothesis): + return self.predict_batch([premise], [hypothesis])[0] + + def predict_batch(self, premises, hypotheses): + return [rule_nli(premise, hypothesis) for premise, hypothesis in zip(premises, hypotheses)] + + +def rule_nli(premise, hypothesis): + chunk = {"text": premise or "", "numbers": extract_numbers(premise)} + issue = find_contradiction( + claim=hypothesis, + chunk=chunk, + extract_numbers=extract_numbers, + has_negation_mismatch=has_negation_mismatch, + score=1.0, + threshold=0.0, + ) + if issue: + return NLICheck("CONTRADICTION", 0.84, issue.get("reason", "Contradiction"), model_quality=0.75) + + premise_tokens = _tokens(premise) + hypothesis_tokens = _tokens(hypothesis) + premise_numbers = set(extract_numbers(premise)) + hypothesis_numbers = set(extract_numbers(hypothesis)) + if hypothesis_numbers and not premise_numbers: + return NLICheck("NEUTRAL", 0.35, "Missing number evidence", model_quality=0.75) + if hypothesis_numbers and premise_numbers and not hypothesis_numbers.issubset(premise_numbers): + return NLICheck("NEUTRAL", 0.42, "Number evidence differs", model_quality=0.75) + if hypothesis_tokens: + token_overlap = len(premise_tokens & hypothesis_tokens) / len(hypothesis_tokens) + premise_lemmas = _content_lemmas(premise) + hypothesis_lemmas = _content_lemmas(hypothesis) + lemma_overlap = len(premise_lemmas & hypothesis_lemmas) / len(hypothesis_lemmas) if hypothesis_lemmas else 0.0 + overlap = max(token_overlap, lemma_overlap) + if overlap >= 0.70: + return NLICheck("ENTAILMENT", min(0.60 + overlap * 0.30, 0.92), model_quality=0.75) + return NLICheck("NEUTRAL", 0.50, model_quality=0.75) + + +def sentence_nli(processed_sentence, document=None, nli_model=None, hits=None): + model = nli_model or NLIModel() + claim = getattr(processed_sentence, "resolved_text", processed_sentence) + claim = claim if isinstance(claim, str) else str(claim) + relevant_hits = hits or [] + if not relevant_hits and document is not None: + relevant_hits = [{"sentence": s.resolved_text, "score": 1.0} for s in document[:1]] + + best_hit_score = max( + ( + (hit.get("score", 0.0) if isinstance(hit, dict) else getattr(hit, "score", 0.0)) + for hit in relevant_hits + ), + default=0.0, + ) + min_hit_score = max(0.30, best_hit_score * 0.80) + premises = [] + hypotheses = [] + hit_scores = [] + for hit in relevant_hits[:5]: + premise = hit.get("sentence") if isinstance(hit, dict) else getattr(hit, "sentence", str(hit)) + hit_score = hit.get("score", 0.0) if isinstance(hit, dict) else getattr(hit, "score", 0.0) + if hit_score < min_hit_score: + continue + premises.append(premise) + hypotheses.append(claim) + hit_scores.append(hit_score) + if not premises: + return NLICheck("NEUTRAL", 0.50, model_quality=getattr(model, "model_quality", 1.0)) + + results = model.predict_batch(premises, hypotheses) + entailments = [r for r in results if r.label == "ENTAILMENT"] + strong_entailment = max(entailments, key=lambda r: r.score) if entailments else None + contradictions = [r for r in results if r.label == "CONTRADICTION"] + if strong_entailment and strong_entailment.score >= 0.82: + return strong_entailment + if contradictions: + return max(contradictions, key=lambda r: r.score) + if entailments: + return max(entailments, key=lambda r: r.score) + return max(results, key=lambda r: r.score) diff --git a/Halgorithem/checks/similarity.py b/Halgorithem/checks/similarity.py new file mode 100644 index 0000000..42f1be9 --- /dev/null +++ b/Halgorithem/checks/similarity.py @@ -0,0 +1,54 @@ +import heapq + +from ..models import SimilarityCheck, SimilarityHit +def _token_overlap(left_tokens, right_tokens): + left_tokens = {token for token in left_tokens if len(token) > 2} + if not left_tokens or not right_tokens: + return 0.0 + return len(left_tokens & set(right_tokens)) / len(left_tokens) + + +def _similarities(embedder, left, rights): + if hasattr(embedder, "similarity_many"): + try: + return embedder.similarity_many(left, rights) + except Exception: + pass + return [float(embedder.similarity(left, right)) for right in rights] + + +def similarity_search(processed_sentence, document, embedder, top_k=5): + query_embedding = processed_sentence.embedding + if query_embedding is None: + query_embedding = embedder.encode(processed_sentence.resolved_text, convert_to_tensor=True) + + embeddings = [] + for doc_sentence in document: + embedding = doc_sentence.embedding + if embedding is None: + embedding = embedder.encode(doc_sentence.resolved_text, convert_to_tensor=True) + embeddings.append(embedding) + + raw_scores = _similarities(embedder, query_embedding, embeddings) + hits = [] + for doc_sentence, raw_score in zip(document, raw_scores): + overlap = _token_overlap(processed_sentence.tokens, doc_sentence.tokens) + lemma_overlap = _token_overlap(processed_sentence.lemmas, doc_sentence.lemmas) + number_bonus = 0.05 if processed_sentence.numbers and processed_sentence.numbers.issubset(doc_sentence.numbers) else 0.0 + overlap = max(overlap, lemma_overlap) + score = min(raw_score + 0.12 * overlap, 1.0) + score = min(score + number_bonus, 1.0) + hits.append( + SimilarityHit( + sentence=doc_sentence.context_text or doc_sentence.resolved_text, + score=score, + source=doc_sentence.source, + sentence_id=doc_sentence.sentence_id, + source_quality=doc_sentence.source_quality, + ) + ) + + selected = heapq.nlargest(top_k, hits, key=lambda hit: hit.score) + source_quality = max((hit.source_quality for hit in selected), default=0.55) + score = selected[0].score if selected else 0.0 + return SimilarityCheck(score=score, hits=selected, source_quality=source_quality) diff --git a/Halgorithem/checks/units.py b/Halgorithem/checks/units.py new file mode 100644 index 0000000..09a6573 --- /dev/null +++ b/Halgorithem/checks/units.py @@ -0,0 +1,38 @@ +UNIT_ALIASES = { + "g": "gram", + "gram": "gram", + "grams": "gram", + "kg": "kilogram", + "kilogram": "kilogram", + "kilograms": "kilogram", + "lb": "pound", + "lbs": "pound", + "pound": "pound", + "pounds": "pound", + "m": "meter", + "meter": "meter", + "meters": "meter", + "cm": "centimeter", + "centimeter": "centimeter", + "centimeters": "centimeter", + "km": "kilometer", + "kilometer": "kilometer", + "kilometers": "kilometer", + "mile": "mile", + "miles": "mile", +} + +NORMALIZATION = { + "gram": ("mass", 0.001), + "kilogram": ("mass", 1.0), + "pound": ("mass", 0.45359237), + "meter": ("length", 1.0), + "centimeter": ("length", 0.01), + "kilometer": ("length", 1000.0), + "mile": ("length", 1609.344), +} + + +def normalize_unit(unit): + canonical = UNIT_ALIASES.get((unit or "").lower()) + return NORMALIZATION.get(canonical) if canonical else None diff --git a/Halgorithem/claim_extraction.py b/Halgorithem/claim_extraction.py index 99c851f..577b42f 100644 --- a/Halgorithem/claim_extraction.py +++ b/Halgorithem/claim_extraction.py @@ -1,18 +1,14 @@ import re -from .nlp import nlp +from .nlp import parse CLAIM_SPLIT_RE = re.compile(r"\s*(?:;|\n+|\s+-\s+)\s*") -CONJUNCTION_RE = re.compile( - r"\s+(?:and|but|while|whereas)\s+" - r"(?=(?:[A-Z][a-z]+|\d|it\b|he\b|she\b|they\b|the\b|a\b|an\b))", - re.IGNORECASE, -) +CONJUNCTION_RE = re.compile(r"\s+(?:and|but|while|whereas)\s+", re.IGNORECASE) def _has_factual_shape(text): - doc = nlp(text) + doc = parse(text) has_subject = any(t.dep_ in {"nsubj", "nsubjpass"} for t in doc) has_verb = any(t.pos_ in {"VERB", "AUX"} for t in doc) has_anchor = any(doc.ents) or any(t.like_num for t in doc) or any(t.pos_ == "PROPN" for t in doc) @@ -27,7 +23,7 @@ def _has_factual_shape(text): def _has_event_verb(text): - doc = nlp(text) + doc = parse(text) return any(t.pos_ in {"VERB", "AUX"} for t in doc) diff --git a/Halgorithem/confidence.py b/Halgorithem/confidence.py index 60d8959..76a4ae9 100644 --- a/Halgorithem/confidence.py +++ b/Halgorithem/confidence.py @@ -1,3 +1,6 @@ +from .nlp import parse + + INFERENTIAL_TERMS = { "helped", "made", @@ -9,6 +12,14 @@ "significant", "influential", } +INFERENTIAL_ROOT_LEMMAS = { + "help", + "ease", + "learn", + "influence", + "matter", + "signify", +} NEGATION_TERMS = { "no", @@ -28,8 +39,9 @@ def is_inferential_claim(claim): - words = set((claim or "").lower().replace(".", "").split()) - return bool(words & INFERENTIAL_TERMS) + doc = parse(claim) + root = next((t for t in doc if t.dep_ == "ROOT"), None) + return bool(root and root.lemma_.lower() in INFERENTIAL_ROOT_LEMMAS) def is_negative_claim(claim): @@ -42,7 +54,14 @@ def classify_support(score, threshold=0.30, contradiction=None, unsupported_term supported_threshold = max(threshold + 0.10, 0.40) hard_contradiction = contradiction and contradiction.get("reason") in { - "Date mismatch", "Number mismatch", "Unit mismatch", "Negation mismatch" + "Date mismatch", + "Number mismatch", + "Unit mismatch", + "Negation mismatch", + "Entity-role mismatch", + "Location mismatch", + "Source qualifier mismatch", + "NLI contradiction", } if hard_contradiction: return "CONTRADICTION" @@ -71,6 +90,6 @@ def confidence_score(score, evidence_count=0, contradiction=None, unsupported_te confidence = max(0.0, min(float(score), 1.0)) confidence += min(evidence_count, 3) * 0.04 confidence -= min(len(unsupported_terms), 4) * 0.06 - if contradiction: + if contradiction and status in {"CONTRADICTION", "HALLUCINATION"}: confidence += 0.10 return round(max(0.0, min(confidence, 1.0)), 3) diff --git a/Halgorithem/contradiction.py b/Halgorithem/contradiction.py index 372baaf..da039c9 100644 --- a/Halgorithem/contradiction.py +++ b/Halgorithem/contradiction.py @@ -7,6 +7,9 @@ } UNIT_ALIASES = { + "g": "gram", + "gram": "gram", + "grams": "gram", "kg": "kilogram", "kilogram": "kilogram", "kilograms": "kilogram", @@ -36,6 +39,19 @@ "euros": "eur", "euro": "eur", } +UNIT_TO_BASE = { + "gram": ("mass", 0.001), + "kilogram": ("mass", 1.0), + "pound": ("mass", 0.45359237), + "kilometer": ("length", 1000.0), + "meter": ("length", 1.0), + "centimeter": ("length", 0.01), + "mile": ("length", 1609.344), + "celsius": ("temperature_c", 1.0), + "fahrenheit": ("temperature_f", 1.0), + "usd": ("money", 1.0), + "eur": ("money", 1.0), +} def numbers_conflict(claim, chunk, extract_numbers): @@ -46,23 +62,26 @@ def numbers_conflict(claim, chunk, extract_numbers): if claim_numbers.issubset(truth_numbers): return None - def skip(number): + def skip_year(number): try: value = float(number) - return 1400 <= value <= 2100 or value <= 31 + return 1400 <= value <= 2100 except (ValueError, TypeError): return True for claim_number in claim_numbers: - if skip(claim_number): + if skip_year(claim_number): continue claim_value = float(claim_number) for truth_number in truth_numbers: - if skip(truth_number): + if skip_year(truth_number): continue truth_value = float(truth_number) if claim_value == 0 or truth_value == 0: continue + if 0 < min(claim_value, truth_value) <= 100: + if abs(claim_value - truth_value) / max(claim_value, truth_value) < 0.05: + continue if min(claim_value, truth_value) / max(claim_value, truth_value) >= 0.5: if claim_value != truth_value: return { @@ -84,12 +103,60 @@ def _units(text): return units +def _quantities(text): + import re + + quantities = [] + for value, unit in re.findall(r"\b(\d+(?:\.\d+)?)\s*([A-Za-z$]+)\b", text or ""): + canonical = UNIT_ALIASES.get(unit.lower().replace("$", "usd")) + if canonical: + quantities.append((float(value), canonical)) + return quantities + + +def _base_value(value, unit): + dimension, multiplier = UNIT_TO_BASE[unit] + if dimension == "temperature_c": + return "temperature", value + if dimension == "temperature_f": + return "temperature", (value - 32) * 5 / 9 + return dimension, value * multiplier + + +def equivalent_unit_numbers(claim, chunk_text, tolerance=0.02): + claim_quantities = _quantities(claim) + truth_quantities = _quantities(chunk_text) + equivalents = set() + for claim_value, claim_unit in claim_quantities: + if claim_unit not in UNIT_TO_BASE: + continue + claim_dim, claim_base = _base_value(claim_value, claim_unit) + for truth_value, truth_unit in truth_quantities: + if truth_unit not in UNIT_TO_BASE: + continue + truth_dim, truth_base = _base_value(truth_value, truth_unit) + if claim_dim != truth_dim: + continue + denom = max(abs(claim_base), abs(truth_base), 1.0) + if abs(claim_base - truth_base) / denom <= tolerance: + equivalents.add(str(claim_value).rstrip("0").rstrip(".")) + equivalents.add(str(truth_value).rstrip("0").rstrip(".")) + return equivalents + + def unit_conflict(claim, chunk_text): claim_units = _units(claim) truth_units = _units(chunk_text) for value, units in claim_units.items(): truth = truth_units.get(value) if truth and units.isdisjoint(truth): + claim_unit = next(iter(units)) + truth_unit = next(iter(truth)) + if claim_unit in UNIT_TO_BASE and truth_unit in UNIT_TO_BASE: + claim_dim, _ = UNIT_TO_BASE[claim_unit] + truth_dim, _ = UNIT_TO_BASE[truth_unit] + if claim_dim == truth_dim and equivalent_unit_numbers(claim, chunk_text): + continue return { "reason": "Unit mismatch", "claim_units": sorted(units), @@ -137,6 +204,30 @@ def source_qualifier_conflict(claim, chunk_text): return None +def location_conflict(claim, chunk_text): + import re + + pattern = r"\b(?:in|at|from|near)\s+([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,3})" + claim_locations = {m.group(1).lower() for m in re.finditer(pattern, claim or "")} + truth_locations = {m.group(1).lower() for m in re.finditer(pattern, chunk_text or "")} + if claim_locations and truth_locations and claim_locations.isdisjoint(truth_locations): + return { + "reason": "Location mismatch", + "claim_locations": sorted(claim_locations), + "truth_locations": sorted(truth_locations), + } + return None + + +def missing_location_evidence(claim, chunk_text): + import re + + pattern = r"\b(?:in|at|from|near)\s+([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,3})" + claim_locations = {m.group(1).lower() for m in re.finditer(pattern, claim or "")} + truth_locations = {m.group(1).lower() for m in re.finditer(pattern, chunk_text or "")} + return bool(claim_locations and not (claim_locations & truth_locations)) + + def entity_role_conflict(claim, chunk_text): claim_rel = _relation(claim) truth_relations = _relations(chunk_text) @@ -197,6 +288,10 @@ def find_contradiction(claim, chunk, extract_numbers, has_negation_mismatch, sco if source_issue and score >= threshold: return source_issue + location_issue = location_conflict(claim, chunk.get("text", "")) + if location_issue and score >= threshold: + return location_issue + temporal_issue = temporal_conflict(claim, chunk.get("text", "")) if temporal_issue and score >= threshold: return temporal_issue diff --git a/Halgorithem/core.py b/Halgorithem/core.py index 976de59..d81fe29 100644 --- a/Halgorithem/core.py +++ b/Halgorithem/core.py @@ -6,10 +6,16 @@ import pysbd from sklearn.feature_extraction.text import HashingVectorizer from sklearn.metrics.pairwise import cosine_similarity +from scipy import sparse from .claim_extraction import extract_claims -from .confidence import classify_support, confidence_score -from .contradiction import find_contradiction, numbers_conflict +from .confidence import classify_support, confidence_score, is_negative_claim +from .contradiction import ( + equivalent_unit_numbers, + find_contradiction, + missing_location_evidence, + numbers_conflict, +) from .evidence import best_evidence, build_evidence from .math_utils import numbers_close, safe_eval from .retrieval import rank_chunks @@ -25,7 +31,7 @@ lemmatize_tokens, tokenize, ) -from .nlp import nlp +from .nlp import parse class LocalEmbedder: @@ -38,11 +44,19 @@ def __init__(self): ) def encode(self, text, convert_to_tensor=False): + if isinstance(text, (list, tuple)): + return self.vectorizer.transform([t or "" for t in text]) return self.vectorizer.transform([text or ""]) def similarity(self, left, right): return float(cosine_similarity(left, right)[0][0]) + def similarity_many(self, left, rights): + if not rights: + return [] + matrix = sparse.vstack(rights) + return [float(score) for score in cosine_similarity(left, matrix)[0]] + def _load_embedder(): mode = os.getenv("HALGORITHEM_EMBEDDER", "local").lower() @@ -73,9 +87,18 @@ def similarity(self, left, right): class Halgorithm: - def __init__(self, sentences_per_chunk=2, sentence_overlap=1): + def __init__(self, sentences_per_chunk=2, sentence_overlap=1, embedder=None): + if sentences_per_chunk < 1: + raise ValueError("sentences_per_chunk must be at least 1.") + if sentence_overlap < 0: + raise ValueError("sentence_overlap must be non-negative.") + if sentence_overlap >= sentences_per_chunk: + raise ValueError("sentence_overlap must be smaller than sentences_per_chunk.") self.sentences_per_chunk = sentences_per_chunk self.sentence_overlap = sentence_overlap + self.embedder = embedder or _embedder + self._claim_embedding_cache = {} + self._nli_model = None self.parser = pysbd.Segmenter(language="en", clean=False) # ── Text prep ───────────────────────────────────────────────────────────── @@ -152,9 +175,10 @@ def chunk_text(self, text, doc_id=1, source_name=None): "sentence_end": min(end, len(sentences)), "text": chunk, "tokens": self.tokenize(chunk), + "lemmas": self.lemmatize_tokens(chunk), "entities": self.extract_entities(chunk), "numbers": self.extract_numbers(chunk), - "embedding": _embedder.encode(chunk, convert_to_tensor=True), + "embedding": self.embedder.encode(chunk, convert_to_tensor=True), }) chunk_id += 1 if end >= len(sentences): @@ -165,14 +189,22 @@ def chunk_text(self, text, doc_id=1, source_name=None): # ── Scoring ─────────────────────────────────────────────────────────────── def support_score(self, claim, chunk): - # semantic similarity via sentence-transformers — topic-agnostic - claim_emb = _embedder.encode(claim, convert_to_tensor=True) - return _embedder.similarity(claim_emb, chunk["embedding"]) + return self._claim_score_fn(claim)(claim, chunk) + + def _claim_score_fn(self, claim): + if claim not in self._claim_embedding_cache: + self._claim_embedding_cache[claim] = self.embedder.encode(claim, convert_to_tensor=True) + claim_emb = self._claim_embedding_cache[claim] + + def score_fn(_claim, chunk): + return self.embedder.similarity(claim_emb, chunk["embedding"]) + + return score_fn # ── Math claims ─────────────────────────────────────────────────────────── def classify_claim_type(self, claim): - if re.search(r"\d+\s*[\+\-\*/%]\s*\d+|(? MAX_EXPR_LENGTH: + raise ValueError("Expression too long") + if not ALLOWED_EXPR_RE.fullmatch(expr): + raise ValueError("Expression contains unsupported characters") + for exponent in re.findall(r"(?:\*\*|\^)\s*([+-]?\d+(?:\.\d+)?)", expr): + if abs(float(exponent)) > MAX_EXPONENT_ABS: + raise ValueError("Exponent too large") + return expr.replace("^", "**") def safe_eval(expr): try: - result = parse_expr(str(expr), transformations=TRANSFORMATIONS) + expr = _validate_expr(expr) + result = parse_expr(expr, transformations=TRANSFORMATIONS, evaluate=True) return float(result.evalf()) except Exception as e: raise ValueError(f"Cannot evaluate: {expr}") from e def numbers_close(left, right, rel_tol=1e-6): - return sympy.Abs(sympy.Float(left) - sympy.Float(right)) <= rel_tol * max(sympy.Abs(sympy.Float(left)), sympy.Abs(sympy.Float(right)), sympy.Float(1)) \ No newline at end of file + return sympy.Abs(sympy.Float(left) - sympy.Float(right)) <= rel_tol * max(sympy.Abs(sympy.Float(left)), sympy.Abs(sympy.Float(right)), sympy.Float(1)) diff --git a/Halgorithem/model_runtime.py b/Halgorithem/model_runtime.py new file mode 100644 index 0000000..f18a574 --- /dev/null +++ b/Halgorithem/model_runtime.py @@ -0,0 +1,70 @@ +from functools import lru_cache +import re + +from .claim_extraction import split_atomic_claims +from .core import _embedder + + +PRONOUN_RE = re.compile( + r"\b(it|he|she|they|his|her|their|its|him|them|this|that|these|those)\b", + re.IGNORECASE, +) + + +class NoOpCoref: + def resolve_text(self, text): + return text or "" + + +def maybe_resolve(text, coref): + if not PRONOUN_RE.search(text or ""): + return text or "" + return coref.resolve_text(text or "") + + +@lru_cache(maxsize=1) +def default_embedder(): + return _embedder + + +@lru_cache(maxsize=1) +def default_coref(): + return NoOpCoref() + + +@lru_cache(maxsize=1) +def default_nli_model(): + from .checks.nli import NLIModel + + return NLIModel() + + +class RuleClaimExtractor: + def extract(self, text): + return split_atomic_claims(text) + + def __call__(self, text): + return self.extract(text) + + +@lru_cache(maxsize=1) +def _default_claim_extractor_instance(): + return RuleClaimExtractor() + + +def default_claim_extractor(text=None): + extractor = _default_claim_extractor_instance() + if text is None: + return extractor + return extractor.extract(text) + + +def encode_texts(embedder, texts): + texts = [text or "" for text in texts] + if not texts: + return [] + try: + encoded = embedder.encode(texts, convert_to_tensor=True) + return [encoded[index] for index in range(len(texts))] + except Exception: + return [embedder.encode(text, convert_to_tensor=True) for text in texts] diff --git a/Halgorithem/models.py b/Halgorithem/models.py new file mode 100644 index 0000000..1cfb340 --- /dev/null +++ b/Halgorithem/models.py @@ -0,0 +1,86 @@ +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class DocumentSentence: + doc_id: int + source: str + sentence_id: int + text: str + resolved_text: str + context_text: str = "" + embedding: Any = None + source_quality: float = 0.55 + tokens: set[str] = field(default_factory=set) + lemmas: set[str] = field(default_factory=set) + numbers: set[str] = field(default_factory=set) + + +@dataclass +class ProcessedSentence: + sentence_id: int + text: str + resolved_text: str + embedding: Any = None + claims: list[str] = field(default_factory=list) + tokens: set[str] = field(default_factory=set) + lemmas: set[str] = field(default_factory=set) + numbers: set[str] = field(default_factory=set) + + +@dataclass +class SimilarityHit: + sentence: str + score: float + source: str = "" + sentence_id: int | None = None + source_quality: float = 0.55 + + +@dataclass +class SimilarityCheck: + score: float + hits: list[SimilarityHit] = field(default_factory=list) + source_quality: float = 0.55 + + +@dataclass +class NLICheck: + label: str + score: float + reason: str = "" + model_quality: float = 1.0 + + +@dataclass +class AtomicClaim: + claim: str + verdict: str + confidence: float + evidence: str = "" + + +@dataclass +class AtomicCheck: + claims: list[AtomicClaim] = field(default_factory=list) + score: float | None = None + status: str = "ok" + + +@dataclass +class VoteResult: + verdict: str + confidence: float + diagnostics: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class VerificationResult: + sentence: str + verdict: str + confidence: float + similarity: SimilarityCheck + nli: NLICheck + atomic: AtomicCheck + diagnostics: dict[str, Any] = field(default_factory=dict) diff --git a/Halgorithem/nlp.py b/Halgorithem/nlp.py index de66382..7ce7b5d 100644 --- a/Halgorithem/nlp.py +++ b/Halgorithem/nlp.py @@ -1,3 +1,6 @@ +from functools import lru_cache +import threading + import nltk import spacy from negspacy.negation import Negex @@ -40,3 +43,20 @@ def _load_spacy_model(): WORDNET_AVAILABLE = True except LookupError: WORDNET_AVAILABLE = False + + +_parse_lock = threading.Lock() + + +@lru_cache(maxsize=2048) +def _cached_parse(text): + return nlp(text or "") + + +def parse(text): + with _parse_lock: + return _cached_parse(text or "") + + +parse.cache_info = _cached_parse.cache_info +parse.cache_clear = _cached_parse.cache_clear diff --git a/Halgorithem/process.py b/Halgorithem/process.py new file mode 100644 index 0000000..b76521b --- /dev/null +++ b/Halgorithem/process.py @@ -0,0 +1,29 @@ +from .core import Halgorithm +from .model_runtime import default_claim_extractor, default_coref, default_embedder, encode_texts, maybe_resolve +from .models import ProcessedSentence +from .text_processing import extract_numbers, lemmatize_tokens, tokenize + + +def process_response(text, embedder=None, coref=None, claim_extractor=None): + embedder = embedder or default_embedder() + coref = coref or default_coref() + claim_extractor = claim_extractor or default_claim_extractor + splitter = Halgorithm() + raw_sentences = splitter.split_sentences(text) + resolved_sentences = [maybe_resolve(sentence, coref) for sentence in raw_sentences] + embeddings = encode_texts(embedder, resolved_sentences) + sentences = [] + for sentence_id, (sentence, resolved, embedding) in enumerate(zip(raw_sentences, resolved_sentences, embeddings), 1): + sentences.append( + ProcessedSentence( + sentence_id=sentence_id, + text=sentence, + resolved_text=resolved, + embedding=embedding, + claims=claim_extractor(resolved), + tokens=set(tokenize(resolved)), + lemmas=set(lemmatize_tokens(resolved)), + numbers=set(extract_numbers(resolved)), + ) + ) + return sentences diff --git a/Halgorithem/retrieval.py b/Halgorithem/retrieval.py index f5e1684..c0720a8 100644 --- a/Halgorithem/retrieval.py +++ b/Halgorithem/retrieval.py @@ -1,24 +1,58 @@ +import re + + +TOKEN_ALIASES = { + "created": "create", + "creates": "create", + "creating": "create", + "creator": "create", + "invented": "invent", + "invents": "invent", + "inventor": "invent", + "developed": "develop", + "developer": "develop", + "developers": "develop", + "released": "release", + "running": "run", + "ran": "run", +} + + +def _tokens(text): + return {TOKEN_ALIASES.get(t.lower(), t.lower()) for t in re.findall(r"\b[\w'-]+\b", text or "") if len(t) > 2} + + def rank_chunks( claim, chunks, score_fn, extract_numbers, has_negation_mismatch, + lemmatize_fn=None, threshold=0.30, top_k=5, ): candidates = [] claim_numbers = set(extract_numbers(claim)) + claim_tokens = _tokens(claim) + claim_lemmas = set() + if lemmatize_fn is not None: + claim_lemmas = {TOKEN_ALIASES.get(t, t) for t in lemmatize_fn(claim) if len(t) > 2} for chunk in chunks: raw_score = score_fn(claim, chunk) score = raw_score signals = [] - claim_tokens = {t.lower() for t in claim.replace(".", " ").replace(",", " ").split() if t.strip()} - chunk_tokens = set(chunk.get("tokens", [])) + chunk_tokens = {TOKEN_ALIASES.get(t, t) for t in set(chunk.get("tokens", []))} + if not chunk_tokens: + chunk_tokens = _tokens(chunk.get("text", "")) content_tokens = {t for t in claim_tokens if len(t) > 2} + overlap = 0.0 if content_tokens: - overlap = len(content_tokens & chunk_tokens) / len(content_tokens) + token_overlap = len(content_tokens & chunk_tokens) / len(content_tokens) + chunk_lemmas = {TOKEN_ALIASES.get(t, t) for t in set(chunk.get("lemmas", []))} + lemma_overlap = len(claim_lemmas & chunk_lemmas) / len(claim_lemmas) if claim_lemmas and chunk_lemmas else 0.0 + overlap = max(token_overlap, lemma_overlap) if overlap >= 0.85: score = min(score + 0.18, 1.0) signals.append("high_token_overlap") @@ -29,6 +63,8 @@ def rank_chunks( if claim_numbers and claim_numbers.issubset(set(chunk.get("numbers", []))): score = min(score + 0.10, 1.0) signals.append("number_subset") + elif claim_numbers and set(chunk.get("numbers", [])) and overlap >= 0.60: + signals.append("number_anchor_overlap") if has_negation_mismatch(claim, chunk.get("text", "")) and score >= threshold: score = max(score - 0.30, 0.0) diff --git a/Halgorithem/source_quality.py b/Halgorithem/source_quality.py index 7d28f8c..708962a 100644 --- a/Halgorithem/source_quality.py +++ b/Halgorithem/source_quality.py @@ -19,13 +19,15 @@ def score_source(source_name, text): host = parsed.netloc.lower().removeprefix("www.") score = 0.55 + domain_matched = False for domain, domain_score in HIGH_TRUST_DOMAINS.items(): if host == domain or host.endswith("." + domain): score = max(score, domain_score) + domain_matched = True if source_name.startswith("inline_text") or not parsed.scheme: score = max(score, 0.65) - if len(text.split()) < 80: + if not domain_matched and len(text.split()) < 80: score -= 0.15 if text.count("\n") > len(text.split()) / 4: score -= 0.05 diff --git a/Halgorithem/temporal.py b/Halgorithem/temporal.py index cf341dd..924697a 100644 --- a/Halgorithem/temporal.py +++ b/Halgorithem/temporal.py @@ -1,8 +1,11 @@ import re from datetime import date +from .nlp import parse + YEAR_RE = re.compile(r"\b(?:1[5-9]\d{2}|20\d{2}|21\d{2})\b") +ENTITY_RE = re.compile(r"\b[A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+)*\b") CURRENT_TERMS = { "current", "currently", @@ -26,14 +29,40 @@ def has_temporal_language(text): def temporal_conflict(claim, chunk_text): claim_years = extract_years(claim) chunk_years = extract_years(chunk_text) - if claim_years and chunk_years and claim_years.isdisjoint(chunk_years): - return { - "reason": "Date mismatch", - "claim_years": sorted(claim_years), - "truth_years": sorted(chunk_years), - } + if not (claim_years and chunk_years and claim_years.isdisjoint(chunk_years)): + return None + claim_anchors = temporal_anchors(claim) + chunk_anchors = temporal_anchors(chunk_text) + if claim_anchors and chunk_anchors and claim_anchors.isdisjoint(chunk_anchors): + return None + return { + "reason": "Date mismatch", + "claim_years": sorted(claim_years), + "truth_years": sorted(chunk_years), + } - return None + +def temporal_anchors(text): + anchors = {m.group(0).lower() for m in ENTITY_RE.finditer(text or "")} + stop = { + "the", "a", "an", "in", "on", "at", "by", "of", "for", "with", + "was", "is", "are", "were", "as", "current", "currently", + "created", "invented", "developed", "designed", "launched", + "released", "founded", "reported", "started", "ended", + } + anchors.update( + token.lower() + for token in re.findall(r"\b[a-zA-Z][a-zA-Z'-]+\b", text or "") + if len(token) > 3 and token.lower() not in stop + ) + doc = parse(text) + for ent in doc.ents: + if ent.label_ not in {"DATE", "TIME", "CARDINAL", "ORDINAL", "QUANTITY", "PERCENT", "MONEY"}: + anchors.add(ent.text.lower()) + for token in doc: + if token.pos_ in {"PROPN", "NOUN"} and not token.is_stop and not token.like_num: + anchors.add(token.lemma_.lower()) + return {anchor for anchor in anchors if anchor and not YEAR_RE.fullmatch(anchor)} def temporal_warning(claim): diff --git a/Halgorithem/text_processing.py b/Halgorithem/text_processing.py index 4127652..393ba8d 100644 --- a/Halgorithem/text_processing.py +++ b/Halgorithem/text_processing.py @@ -8,11 +8,21 @@ from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS import textacy.preprocessing as tprep -from .nlp import WORDNET_AVAILABLE, nlp +from .nlp import WORDNET_AVAILABLE, parse STOPWORDS = set(ENGLISH_STOP_WORDS) md = MarkdownIt() +NUMBER_WORD_RE = re.compile( + r"\b(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|" + r"eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|" + r"eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|" + r"eighty|ninety|hundred|thousand|million|billion|trillion|" + r"first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|" + r"tenth|percent|percentage|dollars?|euros?|grams?|kilograms?|" + r"pounds?|meters?|centimeters?|kilometers?|miles?)\b", + re.IGNORECASE, +) @lru_cache(maxsize=4096) @@ -63,7 +73,7 @@ def clean_text(text): def tokenize(text): - doc = nlp(text) + doc = parse(text) return [ t.text.lower() for t in doc if not t.is_punct and not t.is_space @@ -72,7 +82,7 @@ def tokenize(text): def lemmatize_tokens(text): - doc = nlp(text) + doc = parse(text) return [ t.lemma_.lower() for t in doc if not t.is_punct and not t.is_space @@ -81,7 +91,11 @@ def lemmatize_tokens(text): ] -def extract_numbers(text): +@lru_cache(maxsize=4096) +def _extract_numbers_cached(text): + text = text or "" + if not re.search(r"\d", text) and not NUMBER_WORD_RE.search(text): + return () # quantulum3 handles "seven billion", "3.5 million", "$4.2B", ordinals try: quantities = qparser.parse(text) @@ -100,11 +114,15 @@ def extract_numbers(text): if d not in seen: extracted.append(d) seen.add(d) - return extracted + return tuple(extracted) + + +def extract_numbers(text): + return list(_extract_numbers_cached(text or "")) def extract_entities(text): - doc = nlp(text) + doc = parse(text) entities = set() for ent in doc.ents: tokens = tuple( @@ -118,8 +136,8 @@ def extract_entities(text): def has_negation_mismatch(claim, chunk_text): # negspacy marks negated entities on the doc - claim_doc = nlp(claim) - chunk_doc = nlp(chunk_text) + claim_doc = parse(claim) + chunk_doc = parse(chunk_text) claim_has_negation = any( getattr(t._, "negex", False) for t in claim_doc ) @@ -129,7 +147,7 @@ def has_negation_mismatch(claim, chunk_text): if not claim_has_negation and not chunk_has_negation: negation_terms = { "no", "not", "never", "neither", "nor", "without", "didn't", - "doesn't", "wasn't", "isn't", "aren't", "can't", "cannot", "did" + "doesn't", "wasn't", "isn't", "aren't", "can't", "cannot" } claim_tokens = {t.text.lower() for t in claim_doc} chunk_tokens = {t.text.lower() for t in chunk_doc} diff --git a/Halgorithem/voting.py b/Halgorithem/voting.py new file mode 100644 index 0000000..7935bdf --- /dev/null +++ b/Halgorithem/voting.py @@ -0,0 +1,114 @@ +from .models import AtomicCheck, NLICheck, SimilarityCheck, VoteResult +from .nlp import parse + + +def entropy_gate(sentence_text, threshold=0.92): + doc = parse(sentence_text) + if not any(t.dep_ == "conj" for t in doc): + return None, None, 1.0 + if "," not in sentence_text and ";" not in sentence_text: + return None, None, 1.0 + if getattr(doc._, "has_coref_resolution", False): + return None, None, 1.0 + pronouns = { + "it", "its", "they", "them", "their", "this", "that", "these", + "those", "he", "him", "his", "she", "her", + } + has_ambiguous_pronoun = any(t.text.lower() in pronouns for t in doc) + has_named_anchor = any(doc.ents) or any(t.pos_ == "PROPN" for t in doc) + if has_ambiguous_pronoun and not has_named_anchor: + return "UNVERIFIABLE", 0.5, threshold + return None, None, 1.0 + + +def similarity_weight(check: SimilarityCheck): + quality = max(0.0, min(check.source_quality, 1.0)) + return 0.2 + 0.3 * quality + + +def nli_score(check: NLICheck): + if check.label == "ENTAILMENT": + return check.score + if check.label == "CONTRADICTION": + return 1.0 - check.score + return 0.0 + + +def contradiction_confidence(nli: NLICheck, atomic: AtomicCheck): + scores = [] + if nli.label == "CONTRADICTION": + scores.append(nli.score) + scores.extend(claim.confidence for claim in atomic.claims if claim.verdict == "CONTRADICT") + return max(scores, default=0.0) + + +def atomic_score(check: AtomicCheck): + """Return atomic support on [-1, 1]. + + Negative values are deliberate: they represent atomic contradiction strength + and are combined with separate contradiction confidence in fuse_votes(). + """ + if check.score is not None: + return check.score + if not check.claims: + return None + entail = sum(1 for c in check.claims if c.verdict == "ENTAIL") + contradict = sum(1 for c in check.claims if c.verdict == "CONTRADICT") + total = len(check.claims) + return (entail - contradict) / total if total else None + + +def fuse_votes(similarity: SimilarityCheck, nli: NLICheck, atomic: AtomicCheck): + """Fuse support evidence while keeping contradiction confidence separate. + + NLI model_quality scales how much the NLI support score affects the weighted + support average. The rule-based fallback reports 0.75; a transformer-backed + model can report 1.0 to carry the full NLI weight. + """ + weighted = [] + weighted.append((similarity_weight(similarity), similarity.score)) + weighted.append((0.5 * max(0.0, min(nli.model_quality, 1.0)), nli_score(nli))) + atom = atomic_score(atomic) + if atom is not None: + weighted.append((0.3, atom)) + + total_weight = sum(weight for weight, _ in weighted) or 1.0 + support_score = sum(weight * score for weight, score in weighted) / total_weight + atomic_contradictions = sum(1 for claim in atomic.claims if claim.verdict == "CONTRADICT") + atomic_entails = sum(1 for claim in atomic.claims if claim.verdict == "ENTAIL") + contra = contradiction_confidence(nli, atomic) + if contra >= 0.80 and (atomic_contradictions or similarity.score >= 0.25): + verdict = "CONTRADICTION" + confidence = contra + elif atomic_contradictions and atom is not None and atom < -0.25: + verdict = "CONTRADICTION" + confidence = max(contra, abs(atom)) + elif nli.label == "ENTAILMENT" and nli.score >= 0.80 and atomic_entails and similarity.score >= 0.30: + verdict = "SUPPORTED" + confidence = max(support_score, nli.score * 0.9) + elif support_score >= 0.55: + verdict = "SUPPORTED" + confidence = support_score + elif support_score >= 0.30: + verdict = "WEAK_SUPPORT" + confidence = support_score + elif similarity.score < 0.12 and nli.label == "NEUTRAL" and (atom is None or atom <= 0.0): + verdict = "HALLUCINATION" + confidence = 1.0 - max(similarity.score, support_score) + else: + verdict = "UNVERIFIABLE" + confidence = 1.0 - max(support_score, contra) + + diagnostics = { + "similarity_score": similarity.score, + "similarity_source_quality": similarity.source_quality, + "nli_label": nli.label, + "nli_score": nli.score, + "atomic_score": atom, + "atomic_check_status": atomic.status, + "support_score": support_score, + "contradiction_confidence": contra, + "atomic_contradictions": atomic_contradictions, + "atomic_entails": atomic_entails, + } + return VoteResult(verdict=verdict, confidence=round(confidence, 3), diagnostics=diagnostics) diff --git a/Halgorithem/web.py b/Halgorithem/web.py index 3d0a73a..bedd7ac 100644 --- a/Halgorithem/web.py +++ b/Halgorithem/web.py @@ -1,50 +1,77 @@ +import asyncio +import concurrent.futures +from pathlib import Path +from urllib.parse import unquote + from bs4 import BeautifulSoup -import requests import html2text +import httpx + class WebScraper: - def __init__(self, list_of_urls): + def __init__(self, list_of_urls, output_dir="."): self.urls = list_of_urls + self.output_dir = Path(output_dir) self.converter = html2text.HTML2Text() self.converter.ignore_links = True self.converter.ignore_images = True self.converter.ignore_tables = False - self.counter = 0 + + async def _fetch_wikipedia(self, client, url): + title = unquote(url.split("/wiki/")[-1]) + api_url = f"https://en.wikipedia.org/api/rest_v1/page/mobile-sections/{title}" + response = await client.get(api_url) + response.raise_for_status() + data = response.json() + sections = data.get("lead", {}).get("sections", []) + plain_text = "\n".join(section.get("text", "") for section in sections if section.get("text")) + if plain_text: + return plain_text + summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}" + response = await client.get(summary_url) + response.raise_for_status() + return response.json().get("extract", "") + + async def _fetch_page(self, client, url): + response = await client.get(url) + response.raise_for_status() + soup = BeautifulSoup(response.content, "html.parser") + for tag in soup(["nav", "footer", "script", "style", "header", "aside"]): + tag.decompose() + return self.converter.handle(str(soup))[:8000] + + async def _scrape_one(self, client, index, url): + try: + if "wikipedia.org/wiki/" in url: + plain_text = await self._fetch_wikipedia(client, url) + else: + plain_text = await self._fetch_page(client, url) + self.output_dir.mkdir(parents=True, exist_ok=True) + path = self.output_dir / f"file{index}.txt" + path.write_text(plain_text, encoding="utf-8") + print(f"Scraped: {url} -> {path}") + return str(path) + except httpx.TimeoutException: + print(f"Timeout: {url}") + except httpx.HTTPStatusError as e: + print(f"HTTP error {e}: {url}") + except Exception as e: + print(f"Failed {url}: {e}") + return None + + async def scrape_async(self): + headers = {"User-Agent": "Mozilla/5.0 (compatible; HalgorithemBot/1.0)"} + timeout = httpx.Timeout(5.0) + async with httpx.AsyncClient(headers=headers, timeout=timeout, follow_redirects=True) as client: + results = await asyncio.gather( + *(self._scrape_one(client, index, url) for index, url in enumerate(self.urls)) + ) + return [path for path in results if path] def scrape(self): - headers = { - "User-Agent": "Mozilla/5.0 (compatible; HalgorithemBot/1.0)" - } - for url in self.urls: - try: - # use clean Wikipedia API instead of scraping - if "wikipedia.org/wiki/" in url: - title = url.split("/wiki/")[-1] - api_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}" - response = requests.get(api_url, - timeout=5, - headers=headers) - response.raise_for_status() - plain_text = response.json().get("extract", "") - else: - page = requests.get(url, timeout=5, headers=headers) - page.raise_for_status() - soup = BeautifulSoup(page.content, "html.parser") - for tag in soup(["nav", "footer", "script", - "style", "header", "aside"]): - tag.decompose() - plain_text = self.converter.handle(str(soup)) - plain_text = plain_text[:8000] # cap non-wiki sources - - with open(f"file{self.counter}.txt", "w", - encoding="utf-8") as f: - f.write(plain_text) - print(f"Scraped: {url} → file{self.counter}.txt") - self.counter += 1 - - except requests.exceptions.Timeout: - print(f"Timeout: {url}") - except requests.exceptions.HTTPError as e: - print(f"HTTP error {e}: {url}") - except Exception as e: - print(f"Failed {url}: {e}") \ No newline at end of file + try: + asyncio.get_running_loop() + except RuntimeError: + return asyncio.run(self.scrape_async()) + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + return pool.submit(asyncio.run, self.scrape_async()).result() diff --git a/README.md b/README.md index 71a6971..60cfdae 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,26 @@ If neither spaCy model is installed, Halgorithem falls back to `spacy.blank("en" ## Quick Start +Recommended high-accuracy verifier: + +```python +from Halgorithem import verify + +results = verify( + docs="BASIC was created in 1964 by John Kemeny at Dartmouth College.", + response_text="BASIC was created in 1964.", +) + +for result in results: + print(result.verdict, result.confidence, result.diagnostics) +``` + +The `verify()` helper and `HalgorithemVerifier` return dataclass results and use the newer similarity, NLI, atomic-claim, and vote-fusion pipeline. This is the preferred API for new integrations. + +Vote fusion keeps support and contradiction evidence separate. Atomic scores intentionally use `[-1, 1]`, where negative values represent contradiction strength. NLI checks also expose `model_quality`: the built-in rule-based fallback is weighted lower than a transformer-backed NLI model that reports full quality. + +Legacy chunk API: + ```python from Halgorithem import Halgorithm @@ -63,8 +83,24 @@ for result in results: print(result["status"], result["claim"], result["reason"]) ``` +`Halgorithm.compare_to_docs()` remains supported for compatibility and returns dictionaries. It uses the lower-level chunk scoring path, so its output shape differs from `verify()` / `HalgorithemVerifier`. + ## Python API +Preferred verifier: + +```python +from Halgorithem import HalgorithemVerifier + +with HalgorithemVerifier() as verifier: + results = verifier.verify( + docs="BASIC was created in 1964.", + response_text="BASIC was created in 1964.", + ) +``` + +Legacy verifier: + ```python from Halgorithem import Halgorithm @@ -145,7 +181,21 @@ It reports accuracy, accuracy by category, a confusion matrix, failures, tempora ## Output Schema -Every claim result includes: +The preferred verifier returns `VerificationResult` dataclasses: + +```python +{ + "sentence": str, + "verdict": "SUPPORTED | WEAK_SUPPORT | CONTRADICTION | HALLUCINATION | UNVERIFIABLE", + "confidence": float, + "similarity": SimilarityCheck, + "nli": NLICheck, + "atomic": AtomicCheck, + "diagnostics": dict, +} +``` + +The legacy `Halgorithm` API returns dictionaries: ```python { diff --git a/pyproject.toml b/pyproject.toml index 59c494a..0fe05b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "beautifulsoup4>=4.12", "clean-text>=0.6", "html2text>=2024.2.26", + "httpx>=0.27", "markdown-it-py>=3", "negspacy>=1.0", "nltk>=3.8", @@ -25,6 +26,7 @@ dependencies = [ "requests>=2.31", "rich>=13", "scikit-learn>=1.3", + "scipy>=1.10", "sentence-transformers>=2.7", "spacy>=3.7", "sympy>=1.12", diff --git a/requirements.txt b/requirements.txt index e07f5a7..786f6e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ beautifulsoup4>=4.12 clean-text>=0.6 html2text>=2024.2.26 +httpx>=0.27 markdown-it-py>=3 negspacy>=1.0 nltk>=3.8 @@ -10,6 +11,7 @@ quantulum3[classifier]>=0.9 requests>=2.31 rich>=13 scikit-learn>=1.3 +scipy>=1.10 sentence-transformers>=2.7 spacy>=3.7 sympy>=1.12 diff --git a/tests/__pycache__/test_core_pipeline.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_core_pipeline.cpython-312-pytest-9.0.3.pyc new file mode 100644 index 0000000..38ac16b Binary files /dev/null and b/tests/__pycache__/test_core_pipeline.cpython-312-pytest-9.0.3.pyc differ diff --git a/tests/__pycache__/test_core_pipeline.cpython-312.pyc b/tests/__pycache__/test_core_pipeline.cpython-312.pyc new file mode 100644 index 0000000..342fe08 Binary files /dev/null and b/tests/__pycache__/test_core_pipeline.cpython-312.pyc differ diff --git a/tests/__pycache__/test_halgorithem.cpython-312.pyc b/tests/__pycache__/test_halgorithem.cpython-312.pyc new file mode 100644 index 0000000..ae99ee0 Binary files /dev/null and b/tests/__pycache__/test_halgorithem.cpython-312.pyc differ diff --git a/tests/__pycache__/test_voting.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_voting.cpython-312-pytest-9.0.3.pyc new file mode 100644 index 0000000..60151b3 Binary files /dev/null and b/tests/__pycache__/test_voting.cpython-312-pytest-9.0.3.pyc differ diff --git a/tests/__pycache__/test_voting.cpython-312.pyc b/tests/__pycache__/test_voting.cpython-312.pyc new file mode 100644 index 0000000..a9b311b Binary files /dev/null and b/tests/__pycache__/test_voting.cpython-312.pyc differ diff --git a/tests/test_halgorithem.py b/tests/test_halgorithem.py index 9edfc89..137fbb8 100644 --- a/tests/test_halgorithem.py +++ b/tests/test_halgorithem.py @@ -1,9 +1,22 @@ import pytest +import asyncio -from Halgorithem import Halgorithm +from Halgorithem import Halgorithm, HalgorithemVerifier from Halgorithem.claim_extraction import split_atomic_claims -from Halgorithem.contradiction import find_contradiction +from Halgorithem.contradiction import equivalent_unit_numbers, find_contradiction, numbers_conflict +from Halgorithem.math_utils import safe_eval from Halgorithem.retrieval import rank_chunks +from Halgorithem.source_quality import score_source +from Halgorithem.temporal import temporal_conflict +from Halgorithem.voting import atomic_score, similarity_weight +from Halgorithem.models import AtomicCheck, AtomicClaim, SimilarityCheck +from Halgorithem.models import DocumentSentence, NLICheck +from Halgorithem.evidence import candidate_to_evidence +from Halgorithem.model_runtime import default_claim_extractor +from Halgorithem.process import process_response +from Halgorithem.text_processing import has_negation_mismatch +from Halgorithem.voting import entropy_gate, fuse_votes, nli_score +from Halgorithem.web import WebScraper @pytest.fixture() @@ -37,6 +50,12 @@ def test_claim_extraction_splits_atomic_claims(): assert "BASIC was created in 1964." in claims +def test_claim_extraction_splits_verb_led_conjunction(): + claims = split_atomic_claims("Python was created in 1991 and released publicly in 1994.") + assert "Python was created in 1991." in claims + assert "released publicly in 1994." in claims + + def test_retrieval_ranks_best_chunk(algo): chunks = algo.chunk_text( "Cats sleep often. BASIC was created in 1964 at Dartmouth College.", @@ -61,7 +80,7 @@ def test_weak_support(algo, docs): def test_hallucination(algo, docs): - assert first_status(algo, docs, "BASIC was created by NASA.") == "HALLUCINATION" + assert first_status(algo, docs, "BASIC was created by NASA.") == "CONTRADICTION" def test_denial(algo, docs): @@ -80,6 +99,24 @@ def test_unit_contradiction(algo, docs): assert result["reason"] == "Unit mismatch" +def test_equivalent_unit_numbers_support_grams(): + assert equivalent_unit_numbers("The sample weighs 1000 grams.", "The sample weighs 1 kilogram.") + + +def test_percentage_rounding_tolerance(): + chunk = {"numbers": ["31"]} + assert numbers_conflict("The rate was 30%.", chunk, lambda text: ["30"] if "30" in text else ["31"]) is None + + +def test_temporal_conflict_requires_shared_anchor(): + assert temporal_conflict("Apollo launched in 1969.", "Gemini launched in 1965.") is None + assert temporal_conflict("Apollo launched in 1970.", "Apollo launched in 1969.")["reason"] == "Date mismatch" + + +def test_trusted_short_sources_keep_domain_quality(): + assert score_source("https://www.nasa.gov/example", "short text") == 0.92 + + def test_math_checks(algo): supported = algo.compare_to_docs("Math source.", "2 + 2 = 4.")[0] contradicted = algo.compare_to_docs("Math source.", "2 + 2 = 5.")[0] @@ -89,6 +126,14 @@ def test_math_checks(algo): assert malformed["status"] == "ERROR" +def test_safe_eval_blocks_non_math_input(): + assert safe_eval("2^3") == 8.0 + with pytest.raises(ValueError): + safe_eval("__import__('os').system('echo nope')") + with pytest.raises(ValueError): + safe_eval("9**999999") + + def test_temporal_warning(algo, docs): result = algo.compare_to_docs(docs, "The current status of Project Helios is active.")[0] assert result["warning"] == "Time-sensitive claim" @@ -126,3 +171,151 @@ def test_runtime_hardening_errors(algo, tmp_path): with pytest.raises(ValueError): algo.compare_to_docs([{"file_path": "bad"}], "A claim.") assert algo.compare_to_docs("A source.", "") == [] + + +def test_verifier_stack_exists_and_returns_result(docs): + with HalgorithemVerifier() as verifier: + result = verifier.verify(docs, "BASIC was created in 1964.")[0] + assert result.verdict in {"SUPPORTED", "WEAK_SUPPORT"} + assert result.similarity.hits + assert "atomic_check_status" in result.diagnostics + + +def test_voting_weights_and_atomic_fallback(): + weight = similarity_weight(SimilarityCheck(score=0.8, source_quality=0.92)) + assert weight > 0.45 + check = AtomicCheck(claims=[ + AtomicClaim("a", "ENTAIL", 0.9), + AtomicClaim("b", "CONTRADICT", 0.8), + ]) + assert atomic_score(check) == 0 + + +def test_entropy_gate_returns_unverifiable_for_ambiguous_compound(): + status, confidence, entropy = entropy_gate("It rose quickly, and they said it changed.") + assert status == "UNVERIFIABLE" + assert confidence == 0.5 + assert entropy == 0.92 + + +def test_fuse_votes_uses_unverifiable_fallback_not_hallucination(): + vote = fuse_votes( + SimilarityCheck(score=0.20, source_quality=0.5), + NLICheck("NEUTRAL", 0.50), + AtomicCheck(claims=[], score=None, status="empty"), + ) + assert vote.verdict == "UNVERIFIABLE" + + +def test_nli_contradiction_score_stays_in_support_range(): + assert nli_score(NLICheck("CONTRADICTION", 0.90)) == pytest.approx(0.10) + + +def test_did_is_not_negation(): + assert not has_negation_mismatch("She did create Python.", "She created Python.") + + +def test_default_claim_extractor_works_as_factory_and_function(): + extractor = default_claim_extractor() + assert extractor.extract("Python was created in 1991.") + assert default_claim_extractor("Python was created in 1991.") + + +def test_document_sentence_evidence_compatibility(): + sentence = DocumentSentence( + doc_id=7, + source="doc.txt", + sentence_id=3, + text="Raw", + resolved_text="Resolved", + source_quality=0.8, + ) + evidence = candidate_to_evidence({"chunk": sentence, "score": 0.9}) + assert evidence["doc_id"] == 7 + assert evidence["source"] == "doc.txt" + assert evidence["chunk_id"] == 3 + assert evidence["text"] == "Resolved" + + +def test_web_scraper_accepts_output_dir(tmp_path): + scraper = WebScraper([], output_dir=tmp_path) + assert scraper.scrape() == [] + + +def test_web_scraper_scrape_inside_event_loop(tmp_path): + async def run(): + scraper = WebScraper([], output_dir=tmp_path) + return scraper.scrape() + + assert asyncio.run(run()) == [] + + +def test_process_response_batches_embeddings(): + class BatchEmbedder: + def __init__(self): + self.calls = [] + + def encode(self, text, convert_to_tensor=False): + self.calls.append(text) + if isinstance(text, list): + return list(text) + return text + + embedder = BatchEmbedder() + sentences = process_response("BASIC was created in 1964. It was designed for students.", embedder=embedder) + assert len(sentences) == 2 + assert len(embedder.calls) == 1 + assert isinstance(embedder.calls[0], list) + + +def test_atomic_check_batches_nli(docs): + class BatchNLI: + def __init__(self): + self.batch_calls = 0 + + def predict_batch(self, premises, hypotheses): + self.batch_calls += 1 + from Halgorithem.models import NLICheck + return [NLICheck("ENTAILMENT", 0.9) for _ in hypotheses] + + from Halgorithem.ingest import ingest_documents + from Halgorithem.process import process_response + from Halgorithem.checks.atomic import atomic_claim_nli, prepare_document_claims + + document = ingest_documents(docs) + sentence = process_response("BASIC was created in 1964 and it was designed for students.")[0] + nli = BatchNLI() + result = atomic_claim_nli(sentence, document, nli_model=nli, doc_claims=prepare_document_claims(document)) + assert nli.batch_calls == 1 + assert result.score is not None + + +def test_atomic_empty_token_claim_stays_neutral(): + from Halgorithem.checks.atomic import atomic_claim_nli + + result = atomic_claim_nli("It.", []) + assert result.status == "no_document_claims" + + from Halgorithem.models import DocumentSentence + doc = [DocumentSentence(doc_id=1, source="x", sentence_id=1, text="A.", resolved_text="A.")] + result = atomic_claim_nli("It.", doc) + assert result.claims[0].verdict == "NEUTRAL" + assert result.claims[0].evidence == "" + + +def test_similarity_search_does_not_mutate_embeddings(): + from Halgorithem.checks.similarity import similarity_search + from Halgorithem.models import DocumentSentence, ProcessedSentence + + class Embedder: + def encode(self, text, convert_to_tensor=False): + return {text} + + def similarity(self, left, right): + return 1.0 if left == right else 0.0 + + sentence = ProcessedSentence(sentence_id=1, text="A.", resolved_text="A.") + doc = [DocumentSentence(doc_id=1, source="x", sentence_id=1, text="A.", resolved_text="A.")] + similarity_search(sentence, doc, Embedder()) + assert sentence.embedding is None + assert doc[0].embedding is None