diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 560b871..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b7fdd95 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.py[cod] +.pytest_cache/ +.DS_Store +*.egg-info/ +dist/ +build/ +.venv/ +venv/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..31b5093 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,431 @@ +# Halgorithem Project - Contributing guide + +Welcome to the Halgorithem Project. If you're reading this guide, that probably means you'd like to get involved and start contributing to the project. This document should hopefully help you become a full templateer (which is how we refer to the members of our community). + +One of the core missions of the Halgorithem Project is to provide high-quality documentation templates to open source software projects and beyond. However, we also engage in many other similar initiatives around docs advocacy, docs education, and docs tooling. We value all contributions to the Halgorithem Project initiatives, including templates and our other initiatives. The [Join the community](#join-the-community) section of this guide explains how to get involved in those other related initiatives and provides links for more information. + +## Table of contents + +[TOC] + +## What do we work on + +The template working groups create or improve templates for a variety of content types used in documentation projects. + +Each template project consists of the following required files: + +* **Template file** - The raw template for the content type. +* **Template guide** - Provides a deeper explanation of how to fill in the template. +* **Process** - Explains best practices for researching, writing, and maintaining this content type. +* **Resources** - Includes the resources consulted during the research phase of creating the template. 
Also includes high quality examples of that content type that served as inspiration for the template. +* **Example** - After a template project is complete, our Chronologue working group creates an example of the template. They test the template for overall usability. + +See [Template deliverables](template_deliverables.md) for more detailed information about each template deliverable. + +### Template packs + +A template pack is a collection of templates organized together by: + +* Common use cases or tasks +* Needs of particular user personas +* Popular or interesting documentation frameworks +* Maturity models +* Future criteria or needs based on user research and feedback + +The core documentation pack is our flagship template pack and it includes the core, fundamental content types that every documentation project needs. If you download one template pack for your project, it should be this one. + +From a contribution standpoint, we anticipate that our long-term contributors develop a specialty or area of expertise in some specific template packs over time. + +### The Halgorithem Project tactic articles + +The Halgorithem Project takes in requests from users and stakeholders about what types of content they would like us to provide. Sometimes we receive requests for content that doesn't make sense as a template and instead makes more sense as an article or blog entry. + +We call these types of articles **tactics** and publish them on our website under a section called [The Halgorithem Project tactics](https://www.thegooddocsproject.dev/tactic). Articles written for this framework use the title format {Article Title} Tactic, such as the Docs Landing Page tactic. + +Tactic projects go through the same contributing process as templates, but we publish them on our website instead of in the template repository. 
+ +### Template issues and boards + +All the template and tactics projects that contributors are actively working on or which contributors might work on have a corresponding issue in the templates repository. +Use the issue list or the kanban board to find a project to work on and track your progress as your project moves throughout the template writing phases. + +* [Template issues list](https://gitlab.com/tgdp/templates/-/issues) +* [Templates in progress kanban board](https://gitlab.com/tgdp/templates/-/boards/4801048) + +## Before you start + +Before starting, register for a [Welcome Wagon meeting](https://thegooddocsproject.dev/welcome/). At this 1-hour orientation meeting, you get an introduction to our project's goals, key concepts, and workflows. + +We expect all members of our project to be nice to each other and to follow our [Code of Conduct](https://thegooddocsproject.dev/code-of-conduct/) when interacting with other members of the Halgorithem Project. + +### Time commitment + +Most of us participate in one of our working groups, which meet weekly or bi-weekly for 1 hour a week. To contribute templates to a project, all template writers must join one of the template writing working groups. Check our [community calendar](https://thegooddocsproject.dev/community/#calendar) or ask a member of our community for meeting times. We offer two possible template working group meeting times per geolocation (AMER, APAC, EMEA): + +* Team Alpaca for AMER and APAC regions +* Team Dolphin for AMER and EMEA regions +* Team Macaw for APAC and EMEA regions + +Working at a pace of 1-2 hours a week, most template projects take 6-12 months to complete. Keep in mind that we take what you can give. You and your family come first, then work, then volunteering. So, if something in your life prevents you from working on your project, that's okay. Try to let your working group leader know if you aren't able to continue working for a space of time. 
+ +### People who are here to support you + +As you work on contributing templates to our project, various resources and members of our community are available to help you along the process. These include: + +* **Templateers** - Any individual who contributes to the Halgorithem Project (including you!). +* **The template working group leads** - These templateers oversee our overall template development process as a project manager and provide assistance to contributors working on templates. The working group leads also usually review and approve merge requests submitted to the templates repository. +* **Template mentors and buddies** - New templateers are usually assigned a mentor or buddy to provide guidance and mentorship while working on templates. +* **Template peers** - Your fellow templateer peers are available to review templates during the research and community feedback phases. They include members of your template writing working group, but also members from the larger Halgorithem Project community. +* **Template editorial team** - This group reviews templates as they're nearing completion to ensure our templates follow best practices for technical writing, have no major organization or structural issues, have no gaps or missing content, and that they're consistent with our style guide. + +### Template working group meetings + +The template working groups have these types of meetings: + +* **Writer's workshops** - At these meetings, each templateer gives an update on their template project and poses a question to the group about some element of their draft they would like feedback and advice about. +* **Co-working sessions** - Templateers meet with their template writing partners, mentors, or buddies to work together on their template projects. 
+* **Community review sessions** - When a template project is ready for review from the rest of the community, the working group leads schedule a dedicated session for everyone in the group to read and provide feedback about the template files. +* **Planning or retros** - At the beginning of a release cycle, we've a few meetings dedicated to planning our commitments or initiatives for the release cycle. At the end of the release, we always do a working group retrospective to talk about what went well and what we could improve in the next release cycle. + +## Definition of done + +A template project is complete when: + +1. It has progressed through all the [template writing phases](#overview-of-the-template-writing-phasees) +2. It has all the required [template file deliverables](template-deliverables.md). + +### Overview of the template writing phases + +Contributing a template project to our repository has these phases: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
#PhaseYour goalsDefinition of done
1Join the community
    +
  • Join our project by attending a Welcome Wagon meeting.
  • +
  • Decide which working group you'd like to work on based on your interests and experience.
  • +
+
Consider this phase complete when you join a working group.
2Adopt a template
    +
  • Join a template working group.
  • +
  • Work with a template working group lead and your group to decide which template or tactic project to work on.
  • +
  • Assign yourself to the corresponding issue for that template or tactic. Note that this requires a GitLab account and you need to join the template repository as a member.
  • +
  • Use the Google Docs attached to that issue to compose the drafts of the template file deliverables.
  • +
+
Consider this phase complete when you assign yourself to the issue tracking your template or tactic project.
3Research the template

(Research phase)
    +
  • Research examples and best practices for the content type you're creating a template for.
  • +
  • Collaborate and get early feedback on your research from your template working group lead and/or other templateers as part of a template writing working group.
  • +
+
Consider this phase complete when you have finished a draft of your Resources document.
4Draft the template deliverables

(Drafting phase)
    +
  • Use the Google Docs attached to your issue to draft the rest of your deliverables: template file, template guide, and template process.
  • +
  • Meet with your working group or writing partners to workshop and get advice on your drafts while you are working on them.
  • +
+
This phase completes when you schedule your drafts for review by other members of your working group or community.
5Get feedback on drafts from the community

(Community review phase)
    +
  • When your template deliverables are ready for review, your working group lead schedules 1-2 sessions in the community where other members of the project review and provide feedback on your template file deliverables.
  • +
  • Optional: Revise and refine your draft with subject matter experts and individuals beyond our community (such as Write the Docs or subject matter experts).
  • +
  • After making revisions, work with your template working group to determine when your draft is ready for the next phase.
  • +
+
This phase completes after you incorporate the feedback into your draft and your drafts are ready for a deeper expert review. Ensure you have permission from the working group lead to move to the editorial review phase.
6Get a review from the template editorial team

(Editorial team review phase)
    +
  • When your draft is in a state where you feel it's ready, you can work with your working group lead to request an editorial team review. The template editorial team comprises experienced members of the project who review your template project in Google Docs to ensure that it: +
      +
    • Follows best practices for technical writing.
    • +
    • Has no major organization or structural issues.
    • +
    • Has no gaps or missing content.
    • +
    • Is consistent with our style guide.
    • +
    +
  • +
+
This phase completes after you incorporate the feedback into your draft and your drafts are in a final state. Ensure you have permission from the working group lead to move to the final review phase.
7Submit a merge request

(Final review phase)
    +
  • Convert drafts from Google Docs to Markdown.
  • +
  • Convert drafts from Google Docs to Markdown.
  • +
  • Ensure the Markdown format is clean. NOTE: This project uses EkLine.io to check content and format of Markdown files that you add or modify in the Merge Request.
  • +
  • Ensure the Markdown renders to HTML correctly.
  • +
  • Open a merge request against the templates repository. NOTE: You are responsible for educating yourself in how to use Git and GitLab, but you can consult your working group lead and fellow templateers for help.
  • +
  • Revise documents based on requests from merge request reviewers.
  • +
+
This phase completes when the template merges into the repository.
8Hand off to the Chronologue team for user testing

(Chronologue phase)
    +
  • After completing the previous phase, your template is officially part of the Halgorithem Project and is available to our users.
  • +
  • After a template project is complete, our Chronologue working group creates an example of the template. While creating the example, the Chronologue group tests whether your template is user-friendly and can serve a real documentation project. If you're still involved in the community during this phase, these team members might reach out to you for feedback or to collaborate on possible template revisions.
  • +
  • As additional users try your template out in the wild, they may report usability issues or provide feedback for improvements to the template.
  • +
  • Either the Chronologue writer, the original template author, or another templateer evaluates feedback and incorporates it into future versions of the template. If extensive revisions arise, the template may need to go through the same previous template writing phases again.
  • +
+
The Chronologue team considers this phase complete when they create an example for the template.
+ +Each phase has more depth in the remaining sections. + +## 1. Join the community + +To join our community, you need to register for a [Welcome Wagon meeting](https://thegooddocsproject.dev/welcome/). At this 1-hour orientation meeting, you get: + +* A brief overview of our project's goals and mission. +* A bit of information about our community and reasons to consider joining. +* An overview of our key initiatives and working groups that you might consider contributing to. +* An in-depth orientation to one working group of your choice. + +After registering for this meeting, you get an email with a link to join our Slack workspace. You are eligible to be a member of our repository after you attend a Welcome Wagon meeting. + +To become a full-fledged templateer, you need to join our communication channels so that you can talk to us: + +* **Slack** - Our Slack workspace is one of the primary means of communicating with members of our project. After joining our workspace, join the `#welcome` channel to introduce yourself. Consider also joining these Slack channels if you plan to work on a template or tactic project: + * `#templates` + * `#ask-a-community-manager` + * `#tech-requests` +* **Working groups** - We organize our project into several different working groups that meet on a regular basis to work on the project's key initiatives. One of the best ways to get started with our project is to join and meet with one of our working groups. See [The Halgorithem Project Working Groups](https://thegooddocsproject.dev/working-groups/) for a list of our current active groups. NOTE: If you plan to contribute to our project by writing templates, you must join one of the template working groups. +* **Weekly meetings** - The project leaders hold weekly meetings to discuss project-level decisions. Feel free to join one of these meetings to introduce yourself to the project leaders and discover next steps for getting involved in the project. 
See the [community calendar](https://thegooddocsproject.dev/community/#calendar) for meeting times. + +As you begin to join our project, remember that this is a project composed entirely of volunteers. We love to welcome new members, but want to be careful not to burn out our core project contributors. This level of mindfulness helps us ensure that we retain our project's capacity to produce high-quality work. As such, we ask that you respect the time of our project maintainers and contributors. +(And expect us to respect your time in return!) + +We expect all members of our project to be nice to each other and to follow our [Code of Conduct](https://thegooddocsproject.dev/code-of-conduct/) when interacting with other members of the Halgorithem Project. + +## 2. Adopt a template + +In this phase, you decide which template or tactic project you work on and assign yourself to the issue tracking that project. + +Be aware that: + +* Each template or tactic project relates to a corresponding issue in the templates repository. +* You use this issue to communicate the status of your template project as it moves through the different phases of the template writing process. +* The Halgorithem Project managers use a kanban board that shows all the issues for the current template projects. This tool allows the templateers and project stakeholders to track the overall progress of each template and assist templateers whose progress has stalled. + +Links: + +* [Template issues list](https://gitlab.com/tgdp/templates/-/issues) +* [Templates in progress kanban board](https://gitlab.com/tgdp/templates/-/boards/4801048) + +To adopt a template: + +1. Scroll through the list of available template and tactic issues and see if one interests you and/or matches your skill set. Alternatively, if you have an idea for a template or tactic project that doesn't yet have an issue, and you have the support of a template working group lead, you can create a new issue for your project. + +2. 
Assign yourself to the issue. Note that this requires a [GitLab account](https://gitlab.com/users/sign_up). You must attend a [Welcome Wagon meeting](https://thegooddocsproject.dev/welcome/) to become a member of the templates repository. After you've attended that meeting, you can request access in the `#tech-requests` Slack channel. + +3. Notify your template working group lead that you have adopted a template project. + +If you claim a template or tactic project and later realize that you don't have the time or energy to complete the template project, let your working group lead know. + +### Guidelines for choosing a template + +Keep in mind that you don't need to be an expert on any content type before you adopt it. If you want to write a particular template or tactic article and you are eager enough to do some research to learn more about it, that's all the preparation you need and we welcome your efforts. Even if you don't have a ton of experience writing a particular type of document, you can still write a high-quality template that is useful to others. With commitment, research, guided mentorship, and feedback from our community, you can create something that has value to others. + +With that in mind, when deciding which template project is right for you, scroll through the list of template issues and ask yourself the following questions: + +* Does something about this type of document or template intrigue you, spark your curiosity, and make you excited to research and learn more? +* Do you wish you knew how to create the best version of this type of document? Are you energized by the idea of researching best practices or gleaning insights from subject matter experts about this type of document? +* Do you have experience writing for this type of document which you would like to share? Would having a high quality version of this type of template make your life easier at your workplace or for your open source project? 
+* Do you feel like there is a strong need for improved versions of this type of document in the world? Do you see lots of bad examples of this document that frustrate you? +* Has the Halgorithem Project labeled this type of template as a high priority for our project? (Keep in mind that you can work on any template that you feel enthusiastic about, regardless of priority. That said, we welcome work on our high priority templates.) + +If you answered yes to more than one of these questions about a specific type of template, that might be the right template for you to work on. + +### Priority levels + +The following table explains the priority levels given to different template or tactic projects: + + + + + + + + + + + + + + + + + + + + + + +
PriorityDescription
Critical
    +
  • A template project that's included in the core template pack. The core template pack is our flagship template pack and the one with the highest visibility and quality.
  • +
  • A template project or tactic that's in high demand from our users, meaning 10 or more users have requested it.
  • +
  • Any template work that's blocking other template work or which would improve our overall template processes or usability.
  • +
  • Any work to get a core template into compliance with our quality standards and/or deliverables.
  • +
+
High
    +
  • Any template or tactic for which there is high demand from our users, meaning 5-9 users have requested it.
  • +
  • Any template project that the project steering committee or template leads have earmarked for a specific release for whatever reason.
  • +
  • Any work to get a high-demand template or tactic into compliance with our quality standards and/or deliverables.
  • +
+
Medium
    +
  • Any new template or tactic for which there is moderate demand from users, meaning 2-4 users have requested it.
  • +
  • Any work that is on the template roadmap but isn't earmarked for a specific release.
  • +
  • Any work to get a moderate-demand template or tactic into compliance with our quality standards and/or deliverables.
  • +
+
Low
    +
  • Any new template or tactic for which there is low or no demand from users, meaning 1 user has requested it.
  • +
  • Specialized template projects for a niche audience or area of expertise.
  • +
  • Any work to get a low-demand template or tactic into compliance with our quality standards and/or deliverables.
  • +
+
+ +## 3. Research the template + +Before starting the research phase, read the [Template deliverables](template-deliverables.md) for more detailed information about each template deliverable. Ensure you understand the purpose of each deliverable. + +In this phase, you research examples and identify best practices for the type of template you're working on. While you are working on the research phase, you should create a draft for the **resources** template deliverable file. The resources file is where you keep your notes about which resources you consulted and which examples you looked at for guidance. + +Our project composes rough drafts of templates in Google Docs that the project leads own and maintain. The Halgorithem Project owns these files so that we can maintain our project archive and history. With that in mind, the project has pre-generated Google Doc files for you to use as you are researching and drafting your template project. These files include a starting point for the structure of each file that should help you as you draft the documents. The pre-generated Google Doc files are attached to each open issue. + +We require your draft in a Google Doc because it: + +* Is free (no license required) and easy to use. +* Is relatively straightforward to share with collaborators both inside and outside of the Halgorithem Project (such as with the Write the Docs community). +* Allows collaborators to give feedback and advice in the form of comments. +* Tracks comment history for later reference. +* Has version control capabilities. + +### Recommended research strategies + +In our experience, successful templateers usually research their template by: + +* **Looking at lots of examples.** Start by searching for examples of the type of document you want to create a template for. The more examples you can look at, the better. While it's better to review good examples of that type of document, there is actually a lot of value in reviewing bad examples too. 
Consider keeping a spreadsheet to track which examples you used, what elements each one had in common, and what you thought was effective or ineffective. +* **Searching for guides, books, blog posts, conference presentations, or videos about best practices.** Search the Internet to find advice, tips, or expert research about how to create that type of document. Consider posting in a forum for resource ideas. For example, asking for helpful guides or insights on a community forum like the Write the Docs Slack workspace could be beneficial. Be mindful of, and respect copyright terms of source material. Don't plagiarize and offer attribution where appropriate. +* **Reaching out to experts.** When you find people you admire, who have researched your topic already, try reaching out to them. They often have a "how to contact me" webpage. Ask if they'd be okay with using their material. (They might need to republish under a different copyright.) Invite them to participate in the template working group. They might even lead it. If you feel shy about reaching out yourself, your template mentor or senior Halgorithem Project member might offer to help. +* **Collaborating with others in a working group.** Work with your template writing working groups to discuss research ideas and findings. + +## 4. Draft the template deliverables + +After you conclude your research, you create drafts of your template file deliverables in Google Docs. See the [Template deliverables](template-deliverables.md) for more detailed information about each template deliverable. + +You can also look at examples of other templates in the repository to see examples of each template file. Be aware that some templates might be missing some files. + +Your working group helps you as you work on drafting your templates. At writer's workshop meetings, you can workshop your template by asking for advice or asking questions to get clarification about your template project and content type. 
+ +When your draft is in a good place, contact your working group lead to schedule a community review. + +## 5. Get feedback on drafts from the community + +In this phase, you begin to share your drafts with community reviewers and invite feedback. Optionally, you might also consider sharing it beyond our community with other technical writing communities such as Write the Docs or beyond. The feedback and revision phase is arguably the most crucial and important phase in the template writing process, so your template project might spend the bulk of its time in this phase. + +To share your Google Docs drafts: + +1. Inside the draft, click the **Share** button and change the **Get Link** settings to: **Anyone on the internet with this link can create comments.** + +2. Copy the link to your Google Doc drafts into the issue that corresponds with your template in the templates repository. + +3. Notify your working group lead, who helps you schedule a community review session for your template with your working group or another templates working group as needed. + +When you've received sufficient community input and incorporated suggestions into your draft, notify your templates working group lead that your draft is ready to move to the next phase. + +> :triangular_flag_on_post: **NOTE: You can only move to the next phase (submitting a merge request) after the templates working group lead has approved your draft to move on.** + +### Giving feedback to others + +See our [Commenting guide for collaborative document reviews](https://gitlab.com/tgdp/governance/-/blob/main/DocCommentingGuide.md?ref_type=heads) for information about how to provide feedback to others. + +Also see [Conventional comments](https://conventionalcomments.org/). + +### Accepting feedback from others + +It's normal to feel nervous about sharing your drafts, especially if you're a new writer or if you don't feel as confident in your subject matter knowledge yet. 
But your draft can only become the best template it can be if you invite and incorporate high quality feedback into your drafts. Successfully accepting advice on a draft is a key element that distinguishes expert writers from novice writers. + +Sharing your work with reviewers: + +* Allows you to see your draft with fresh eyes the way a new user would see it. +* Can make you aware of key insights or perspectives that you hadn't yet considered. +* Can help you identify which parts of your draft need more careful thought, attention, and revision. + +As you receive feedback, try to give each comment the benefit of the doubt and consider it. Sometimes new writers may react defensively to feedback on their work, but remember that your reviewers have the same goals that you have: to produce a high quality template. But also keep in mind that you don't need to accept every suggestion. If you can make a good argument not to adopt a suggestion, that's important to consider as well. + +One other thing that might help you get more high quality reviews is to indicate what kind of feedback you're looking for, based on areas of the draft you think need some improvement. Do you need: + +* Global-level feedback, which includes advice on the big picture, general content, tone, clarity, and overall organization or flow of the document? +* Local-level feedback, which includes wordsmithing paragraphs or sentences and polishing up the draft for final revision? + +Remember to be positive and show appreciation when people take time to review your drafts. Providing feedback takes time and energy. Treat each piece of feedback as a gift (even feedback that you possibly choose to disregard). Happy editing! + +## 6. Get a review from the template editorial team + +The purpose of this phase is to ensure your template project meets the standards of the Halgorithem Project and is ready for public distribution. 
+ +When your draft is in a state where you feel it's ready to get merged in, you can work with your working group lead to request an editorial team review. The template editorial team comprises experienced members of the project who review your template project to ensure that it: + +* Follows best practices for technical writing. +* Has no major organization or structural issues. +* Has no gaps or missing content. +* Is consistent with our style guide. + +This review aims to be a final quality check to determine whether the template is ready to be officially included in the Halgorithem Project. + +This phase completes after you incorporate the feedback into your draft and your drafts are in a final state. Ensure you have permission from the working group lead to move to the final review phase. + +## 7. Submit a merge request + +The purpose of this phase is to check that your Markdown is formatted correctly, renders correctly, and is ready for publication. In this phase, you convert your template documents into Markdown and open a merge request in the `templates` repository on GitLab. + +If you aren't comfortable working in Markdown, Git, or GitLab, ask your working group lead for advice. + +Once you submit a merge request, your template working group lead reviews your template and/or works with other working group leads to review your template. Once the template has at least one approval from a template repository maintainer, it merges into the final project. + +## 8. Hand off to the Chronologue team for user testing + +Once it passes all reviews, your template merges in and you get a personal acknowledgement in our Slack community and in our next template release notes. + +:sparkles: :mega: :raised_hands: + +Great documents are never fully done, and there is always room for improvement. After a template project is complete, our Chronologue working group creates an example of the template. 
While creating the example, the Chronologue group tests whether your template is user-friendly and can support a real documentation project. It's possible that the Chronologue team identifies major or minor revisions that need updating in the template. + +If you're still involved in the community during this phase, these team members might reach out to you for feedback or to collaborate on possible template revisions. Either the Chronologue writer, the original template author, or another templateer makes any necessary revisions of the templates. If the template requires extensive revisions, the template goes through the same previous template writing phases again. + +After a Chronologue example is complete and users begin to try your template in their own documentation projects, they may report usability issues or provide feedback for improvements to the template. If our project receives this feedback and you're still around to work on your original template, we encourage you to review this feedback and incorporate these revisions into future versions. If you aren't around to continue working on your original template or if you are too busy, we can find a different templateer to respond to user feedback on your behalf. + +If a templateer determines that a new version of a template warrants updating, they take the template through the same contributing process starting from the beginning. 
diff --git a/Halgorithem/__init__.py b/Halgorithem/__init__.py index 5d992c1..16e9b6f 100644 --- a/Halgorithem/__init__.py +++ b/Halgorithem/__init__.py @@ -1,3 +1,4 @@ from .core import Halgorithm +from .main import HalgorithemVerifier -__all__ = ["Halgorithm"] +__all__ = ["Halgorithm", "HalgorithemVerifier"] diff --git a/Halgorithem/__pycache__/__init__.cpython-312.pyc b/Halgorithem/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 74b4cd8..0000000 Binary files a/Halgorithem/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/claim_extraction.cpython-312.pyc b/Halgorithem/__pycache__/claim_extraction.cpython-312.pyc deleted file mode 100644 index 15ada15..0000000 Binary files a/Halgorithem/__pycache__/claim_extraction.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/confidence.cpython-312.pyc b/Halgorithem/__pycache__/confidence.cpython-312.pyc deleted file mode 100644 index fe8159f..0000000 Binary files a/Halgorithem/__pycache__/confidence.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/contradiction.cpython-312.pyc b/Halgorithem/__pycache__/contradiction.cpython-312.pyc deleted file mode 100644 index b14ddae..0000000 Binary files a/Halgorithem/__pycache__/contradiction.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/core.cpython-312.pyc b/Halgorithem/__pycache__/core.cpython-312.pyc deleted file mode 100644 index e96ecf1..0000000 Binary files a/Halgorithem/__pycache__/core.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/evidence.cpython-312.pyc b/Halgorithem/__pycache__/evidence.cpython-312.pyc deleted file mode 100644 index 79908e2..0000000 Binary files a/Halgorithem/__pycache__/evidence.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/math_utils.cpython-312.pyc b/Halgorithem/__pycache__/math_utils.cpython-312.pyc deleted file mode 100644 index b6fe118..0000000 Binary files 
a/Halgorithem/__pycache__/math_utils.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/nlp.cpython-312.pyc b/Halgorithem/__pycache__/nlp.cpython-312.pyc deleted file mode 100644 index 63d57a8..0000000 Binary files a/Halgorithem/__pycache__/nlp.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/retrieval.cpython-312.pyc b/Halgorithem/__pycache__/retrieval.cpython-312.pyc deleted file mode 100644 index fc0cceb..0000000 Binary files a/Halgorithem/__pycache__/retrieval.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/source_quality.cpython-312.pyc b/Halgorithem/__pycache__/source_quality.cpython-312.pyc deleted file mode 100644 index d6ee0a1..0000000 Binary files a/Halgorithem/__pycache__/source_quality.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/temporal.cpython-312.pyc b/Halgorithem/__pycache__/temporal.cpython-312.pyc deleted file mode 100644 index afb177f..0000000 Binary files a/Halgorithem/__pycache__/temporal.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/text_processing.cpython-312.pyc b/Halgorithem/__pycache__/text_processing.cpython-312.pyc deleted file mode 100644 index b8d892c..0000000 Binary files a/Halgorithem/__pycache__/text_processing.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/__pycache__/web.cpython-312.pyc b/Halgorithem/__pycache__/web.cpython-312.pyc deleted file mode 100644 index 3acf210..0000000 Binary files a/Halgorithem/__pycache__/web.cpython-312.pyc and /dev/null differ diff --git a/Halgorithem/checks/__init__.py b/Halgorithem/checks/__init__.py new file mode 100644 index 0000000..a714ac9 --- /dev/null +++ b/Halgorithem/checks/__init__.py @@ -0,0 +1,5 @@ +from .atomic import atomic_claim_nli +from .nli import sentence_nli +from .similarity import similarity_search + +__all__ = ["atomic_claim_nli", "sentence_nli", "similarity_search"] diff --git a/Halgorithem/checks/atomic.py 
b/Halgorithem/checks/atomic.py new file mode 100644 index 0000000..e774723 --- /dev/null +++ b/Halgorithem/checks/atomic.py @@ -0,0 +1,56 @@ +from .nli import NLIModel +from .utils import token_set +from ..models import AtomicCheck, AtomicClaimResult, IngestedDocument, ProcessedSentence + + +def claim_text(claim): + return claim.text or f"{claim.subject} {claim.relation} {claim.object}".strip() + + +def claim_overlap(left, right): + left_text = f"{left.subject} {left.relation} {left.object}".strip() or claim_text(left) + right_text = f"{right.subject} {right.relation} {right.object}".strip() or claim_text(right) + left_tokens = token_set(left_text) + right_tokens = token_set(right_text) + if not left_tokens: + return 0.0 + return len(left_tokens & right_tokens) / len(left_tokens) + + +def atomic_claim_nli(ai_sentence: ProcessedSentence, document: IngestedDocument, *, nli_model=None): + nli_model = nli_model or NLIModel() + results = [] + for ai_claim in ai_sentence.claims: + candidates = sorted(document.claims, key=lambda doc_claim: claim_overlap(ai_claim, doc_claim), reverse=True) + best_claim = candidates[0] if candidates else None + if not best_claim: + results.append(AtomicClaimResult(claim=claim_text(ai_claim), verdict="NEUTRAL", confidence=0.5)) + continue + verdict, confidence = nli_model.predict(claim_text(best_claim), claim_text(ai_claim)) + results.append( + AtomicClaimResult( + claim=claim_text(ai_claim), + verdict=verdict, + confidence=confidence, + evidence=claim_text(best_claim), + ) + ) + + evidence = "" + if results: + evidence = max(results, key=lambda result: result.confidence).evidence + return AtomicCheck(claims=results, score=score_atomic_results(results), evidence=evidence) + + +def score_atomic_results(results): + if not results: + return None + scores = [] + for result in results: + if result.verdict == "ENTAIL": + scores.append(result.confidence) + elif result.verdict == "CONTRADICT": + scores.append(1.0 - result.confidence) + else: + 
scores.append(0.5) + return sum(scores) / len(scores) diff --git a/Halgorithem/checks/nli.py b/Halgorithem/checks/nli.py new file mode 100644 index 0000000..d48dad9 --- /dev/null +++ b/Halgorithem/checks/nli.py @@ -0,0 +1,115 @@ +import os +import warnings + +import torch + +from .similarity import similarity_search +from .units import normalize_units, unit_representation_mismatch +from .utils import clamp, overlap_ratio +from ..contradiction import find_contradiction +from ..models import IngestedDocument, NLICheck, ProcessedSentence +from ..text_processing import extract_numbers, has_negation_mismatch + + +class NLIModel: + def __init__(self, model_name=None): + self.model_name = model_name or os.getenv("HALGORITHEM_NLI_MODEL", "cross-encoder/nli-deberta-v3-large") + self.kind = "deberta-nli" + self.fallback_reason = None + if self.model_name.lower() in {"rule", "local", "deterministic"}: + self.kind = "rule" + self.model_name = "rule" + self.tokenizer = None + self.model = None + return + try: + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + allow_download = os.getenv("HALGORITHEM_ALLOW_MODEL_DOWNLOAD", "").lower() in {"1", "true", "yes"} + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, local_files_only=not allow_download) + self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, local_files_only=not allow_download) + self.model.eval() + except Exception as exc: + warnings.warn( + f"Could not load NLI model {self.model_name!r} ({exc}); using deterministic NLI fallback.", + RuntimeWarning, + ) + self.kind = "rule" + self.fallback_reason = str(exc) + self.tokenizer = None + self.model = None + + def predict(self, premise, hypothesis): + if self.model is None: + return rule_nli(premise, hypothesis) + inputs = self.tokenizer(premise, hypothesis, return_tensors="pt", truncation=True, max_length=512) + with torch.no_grad(): + logits = self.model(**inputs).logits[0] + probs = 
torch.softmax(logits, dim=-1) + labels = self.model.config.id2label + best_idx = int(torch.argmax(probs).item()) + raw_label = labels.get(best_idx, str(best_idx)).upper() + confidence = float(probs[best_idx].item()) + if "ENTAIL" in raw_label: + verdict = "ENTAIL" + elif "CONTRAD" in raw_label: + verdict = "CONTRADICT" + else: + verdict = "NEUTRAL" + return verdict, confidence + + @property + def diagnostics(self): + return {"nli": self.kind, "nli_model": self.model_name if self.kind != "rule" else "rule", "nli_fallback_reason": self.fallback_reason} + + +def rule_nli(premise, hypothesis): + chunk = {"text": premise, "numbers": extract_numbers(premise)} + contradiction = find_contradiction( + claim=hypothesis, + chunk=chunk, + extract_numbers=extract_numbers, + has_negation_mismatch=has_negation_mismatch, + score=1.0, + threshold=0.0, + ) + if contradiction: + return "CONTRADICT", 0.86 + overlap = overlap_ratio(hypothesis, premise) + if overlap >= 0.72: + return "ENTAIL", clamp(0.55 + overlap * 0.4) + if overlap >= 0.35: + return "NEUTRAL", clamp(0.50 + overlap * 0.25) + return "NEUTRAL", 0.62 + + +def sentence_nli(ai_sentence: ProcessedSentence, document: IngestedDocument, *, nli_model=None, top_k=5): + nli_model = nli_model or NLIModel() + hits = similarity_search(ai_sentence, document, top_k=top_k).hits + if not hits: + return NLICheck(verdict="NEUTRAL", confidence=0.0) + best_score = hits[0].score + relevant_hits = [ + hit + for hit in hits + if hit.score >= 0.4 and hit.score >= best_score * 0.75 + ] or [hits[0]] + premise = " ".join(hit.sentence for hit in relevant_hits) + normalized_premise, premise_unit_changes = normalize_units(premise) + normalized_hypothesis, hypothesis_unit_changes = normalize_units(ai_sentence.resolved_text) + unit_mismatch = unit_representation_mismatch(premise, ai_sentence.resolved_text) + verdict, confidence = nli_model.predict(normalized_premise, normalized_hypothesis) + unit_details = [] + if unit_mismatch: + 
unit_details.append(unit_mismatch) + unit_details.extend({"source_change": change} for change in premise_unit_changes) + unit_details.extend({"response_change": change} for change in hypothesis_unit_changes) + return NLICheck( + verdict=verdict, + confidence=confidence, + evidence=hits[0].sentence, + evidence_index=hits[0].sentence_index, + unit_mismatch=bool(unit_mismatch), + unit_representation_change=bool(unit_mismatch), + unit_details=unit_details, + ) diff --git a/Halgorithem/checks/similarity.py b/Halgorithem/checks/similarity.py new file mode 100644 index 0000000..8ec2d18 --- /dev/null +++ b/Halgorithem/checks/similarity.py @@ -0,0 +1,32 @@ +from .utils import clamp +from ..model_runtime import default_embedder, default_reranker +from ..models import IngestedDocument, ProcessedSentence, SimilarityCheck, SimilarityHit + + +def similarity_search(ai_sentence: ProcessedSentence, document: IngestedDocument, *, embedder=None, reranker=None, top_k=5): + embedder = embedder or default_embedder() + reranker = reranker or default_reranker() + query = embedder.encode(ai_sentence.resolved_text) + hits = [] + for doc_sentence in document.sentences: + score = clamp(embedder.similarity(query, doc_sentence.embedding)) + hits.append( + SimilarityHit( + sentence_index=doc_sentence.index, + sentence=doc_sentence.text, + score=score, + source=doc_sentence.source, + source_quality=doc_sentence.source_quality, + ) + ) + hits.sort(key=lambda hit: hit.score, reverse=True) + shortlist = hits[:max(20, top_k)] + top_hits = reranker.rerank(ai_sentence.resolved_text, shortlist, text_fn=lambda hit: hit.sentence, top_k=top_k) + best = top_hits[0] if top_hits else None + return SimilarityCheck( + score=best.score if best else 0.0, + evidence=best.sentence if best else "", + source=best.source if best else "", + source_quality=best.source_quality if best else 0.65, + hits=top_hits, + ) diff --git a/Halgorithem/checks/units.py b/Halgorithem/checks/units.py new file mode 100644 index 
0000000..b0d373c --- /dev/null +++ b/Halgorithem/checks/units.py @@ -0,0 +1,106 @@ +import re + + +UNIT_ALIASES = { + "g": "gram", + "gram": "gram", + "grams": "gram", + "kg": "kilogram", + "kilogram": "kilogram", + "kilograms": "kilogram", + "m": "meter", + "meter": "meter", + "meters": "meter", + "km": "kilometer", + "kilometer": "kilometer", + "kilometers": "kilometer", + "mile": "mile", + "miles": "mile", + "c": "celsius", + "celsius": "celsius", + "f": "fahrenheit", + "fahrenheit": "fahrenheit", +} + +NORMALIZATION = { + "gram": ("kilogram", 0.001, 0.0), + "kilogram": ("kilogram", 1.0, 0.0), + "meter": ("meter", 1.0, 0.0), + "kilometer": ("meter", 1000.0, 0.0), + "mile": ("meter", 1609.34, 0.0), +} + +QUANTITY_RE = re.compile(r"\b(?P\d+(?:\.\d+)?)\s*(?P[A-Za-z]+)\b") + + +def format_number(value): + if abs(value - round(value)) < 1e-9: + return str(int(round(value))) + return f"{value:.6f}".rstrip("0").rstrip(".") + + +def normalized_quantity(value, unit): + canonical = UNIT_ALIASES.get(unit.lower()) + if not canonical: + return None + if canonical == "celsius": + return float(value), "celsius" + if canonical == "fahrenheit": + return (float(value) - 32.0) * 5.0 / 9.0, "celsius" + target = NORMALIZATION.get(canonical) + if not target: + return None + target_unit, factor, offset = target + return float(value) * factor + offset, target_unit + + +def normalize_units(sentence): + changes = [] + + def replace(match): + raw_value = match.group("value") + raw_unit = match.group("unit") + normalized = normalized_quantity(raw_value, raw_unit) + if not normalized: + return match.group(0) + normalized_value, normalized_unit = normalized + normalized_text = f"{format_number(normalized_value)} {normalized_unit}" + original_text = match.group(0) + if original_text.lower() != normalized_text.lower(): + changes.append( + { + "original": original_text, + "normalized": normalized_text, + "value": normalized_value, + "unit": normalized_unit, + } + ) + return normalized_text + + 
return QUANTITY_RE.sub(replace, sentence or ""), changes + + +def unit_representation_mismatch(left, right, tolerance=0.03): + left_quantities = [ + (match.group(0), *normalized_quantity(match.group("value"), match.group("unit"))) + for match in QUANTITY_RE.finditer(left or "") + if normalized_quantity(match.group("value"), match.group("unit")) + ] + right_quantities = [ + (match.group(0), *normalized_quantity(match.group("value"), match.group("unit"))) + for match in QUANTITY_RE.finditer(right or "") + if normalized_quantity(match.group("value"), match.group("unit")) + ] + for left_original, left_value, left_unit in left_quantities: + for right_original, right_value, right_unit in right_quantities: + if left_unit != right_unit or left_original.lower() == right_original.lower(): + continue + if right_value == 0: + continue + if abs(left_value - right_value) / abs(right_value) <= tolerance: + return { + "source": left_original, + "response": right_original, + "normalized": f"{format_number(left_value)} {left_unit}", + } + return None diff --git a/Halgorithem/checks/utils.py b/Halgorithem/checks/utils.py new file mode 100644 index 0000000..9379495 --- /dev/null +++ b/Halgorithem/checks/utils.py @@ -0,0 +1,17 @@ +import re + + +def clamp(value, low=0.0, high=1.0): + return max(low, min(high, float(value))) + + +def token_set(text): + return set(re.findall(r"[a-z0-9]+", (text or "").lower())) + + +def overlap_ratio(left, right): + left_tokens = {t for t in token_set(left) if len(t) > 2} + right_tokens = {t for t in token_set(right) if len(t) > 2} + if not left_tokens: + return 0.0 + return len(left_tokens & right_tokens) / len(left_tokens) diff --git a/Halgorithem/confidence.py b/Halgorithem/confidence.py index 60d8959..2d008e1 100644 --- a/Halgorithem/confidence.py +++ b/Halgorithem/confidence.py @@ -41,15 +41,27 @@ def classify_support(score, threshold=0.30, contradiction=None, unsupported_term unsupported_terms = unsupported_terms or [] supported_threshold = 
max(threshold + 0.10, 0.40) - hard_contradiction = contradiction and contradiction.get("reason") in { - "Date mismatch", "Number mismatch", "Unit mismatch", "Negation mismatch" + numeric_or_logical_contradiction = contradiction and contradiction.get("reason") in { + "Date mismatch", + "Number mismatch", + "Unit mismatch", + "Negation mismatch", } - if hard_contradiction: + relation_contradiction = contradiction and contradiction.get("reason") in { + "Location mismatch", + "Entity-role mismatch", + "Source qualifier mismatch", + } + if contradiction and contradiction.get("reason") == "Number mismatch" and unsupported_terms: + return "HALLUCINATION" + if numeric_or_logical_contradiction: return "CONTRADICTION" if unsupported_terms and is_negative_claim(claim): return "UNVERIFIABLE_DENIAL" if unsupported_terms: return "HALLUCINATION" + if relation_contradiction: + return "CONTRADICTION" if contradiction: return "CONTRADICTION" if is_inferential_claim(claim) and score >= 0.08: @@ -58,6 +70,8 @@ def classify_support(score, threshold=0.30, contradiction=None, unsupported_term return "SUPPORTED" if score >= threshold: return "WEAK_SUPPORT" + if is_negative_claim(claim): + return "UNVERIFIABLE_DENIAL" return "HALLUCINATION" diff --git a/Halgorithem/contradiction.py b/Halgorithem/contradiction.py index 372baaf..513d914 100644 --- a/Halgorithem/contradiction.py +++ b/Halgorithem/contradiction.py @@ -1,3 +1,5 @@ +import re + from .temporal import temporal_conflict @@ -35,6 +37,19 @@ "eur": "eur", "euros": "eur", "euro": "eur", + "g": "gram", + "gram": "gram", + "grams": "gram", +} + +UNIT_TO_BASE = { + "gram": ("mass", 0.001), + "kilogram": ("mass", 1.0), + "pound": ("mass", 0.45359237), + "meter": ("length", 1.0), + "centimeter": ("length", 0.01), + "kilometer": ("length", 1000.0), + "mile": ("length", 1609.344), } @@ -49,7 +64,7 @@ def numbers_conflict(claim, chunk, extract_numbers): def skip(number): try: value = float(number) - return 1400 <= value <= 2100 or value <= 31 
+ return 1400 <= value <= 2100 except (ValueError, TypeError): return True @@ -74,8 +89,6 @@ def skip(number): def _units(text): - import re - units = {} for value, unit in re.findall(r"\b(\d+(?:\.\d+)?)\s*([A-Za-z$]+)\b", text or ""): canonical = UNIT_ALIASES.get(unit.lower().replace("$", "usd")) @@ -84,6 +97,15 @@ def _units(text): return units +def _quantities(text): + quantities = [] + for value, unit in re.findall(r"\b(\d+(?:\.\d+)?)\s*([A-Za-z$]+)\b", text or ""): + canonical = UNIT_ALIASES.get(unit.lower().replace("$", "usd")) + if canonical: + quantities.append((float(value), canonical)) + return quantities + + def unit_conflict(claim, chunk_text): claim_units = _units(claim) truth_units = _units(chunk_text) @@ -95,18 +117,86 @@ def unit_conflict(claim, chunk_text): "claim_units": sorted(units), "truth_units": sorted(truth), } + for claim_value, claim_unit in _quantities(claim): + claim_base = UNIT_TO_BASE.get(claim_unit) + if not claim_base: + continue + claim_dimension, claim_factor = claim_base + for truth_value, truth_unit in _quantities(chunk_text): + truth_base = UNIT_TO_BASE.get(truth_unit) + if not truth_base: + continue + truth_dimension, truth_factor = truth_base + if claim_dimension != truth_dimension: + continue + claim_normalized = claim_value * claim_factor + truth_normalized = truth_value * truth_factor + if truth_normalized == 0: + continue + relative_error = abs(claim_normalized - truth_normalized) / abs(truth_normalized) + if relative_error <= 0.03: + return None + if claim_unit != truth_unit or min(claim_normalized, truth_normalized) / max(claim_normalized, truth_normalized) >= 0.2: + return { + "reason": "Unit mismatch", + "claim_units": [claim_unit], + "truth_units": [truth_unit], + } return None -def _relations(text): - import re +def equivalent_unit_numbers(claim, chunk_text): + equivalent = set() + for claim_value, claim_unit in _quantities(claim): + claim_base = UNIT_TO_BASE.get(claim_unit) + if not claim_base: + continue + 
claim_dimension, claim_factor = claim_base + for truth_value, truth_unit in _quantities(chunk_text): + truth_base = UNIT_TO_BASE.get(truth_unit) + if not truth_base: + continue + truth_dimension, truth_factor = truth_base + if claim_dimension != truth_dimension: + continue + claim_normalized = claim_value * claim_factor + truth_normalized = truth_value * truth_factor + if truth_normalized and abs(claim_normalized - truth_normalized) / abs(truth_normalized) <= 0.03: + equivalent.add(str(int(claim_value)) if claim_value.is_integer() else str(claim_value)) + return equivalent + + +def unit_representation_change(claim, chunk_text): + for claim_value, claim_unit in _quantities(claim): + claim_base = UNIT_TO_BASE.get(claim_unit) + if not claim_base: + continue + claim_dimension, claim_factor = claim_base + for truth_value, truth_unit in _quantities(chunk_text): + truth_base = UNIT_TO_BASE.get(truth_unit) + if not truth_base: + continue + truth_dimension, truth_factor = truth_base + if claim_dimension != truth_dimension or claim_unit == truth_unit: + continue + claim_normalized = claim_value * claim_factor + truth_normalized = truth_value * truth_factor + if truth_normalized and abs(claim_normalized - truth_normalized) / abs(truth_normalized) <= 0.03: + return { + "reason": "Equivalent value with changed unit representation", + "claim_quantity": [claim_value, claim_unit], + "truth_quantity": [truth_value, truth_unit], + } + return None + +def _relations(text): lowered = (text or "").lower() pattern = ( r"\b(?P[a-z][a-z .-]{1,60}?)\s+" r"(?:was\s+|is\s+)?" - r"(?Pcreated|invented|developed|discovered|founded|wrote|designed)\s+" - r"(?:by\s+)?" + r"(?Pcreated|invented|developed|discovered|founded|wrote|designed|located)\s+" + r"(?:(?:by|in|at)\s+)?" 
r"(?P[a-z0-9][a-z0-9 .-]{1,60}?)(?:\.|,|$)" ) relations = [] @@ -124,8 +214,6 @@ def _relation(text): def source_qualifier_conflict(claim, chunk_text): - import re - claim_reports = set(re.findall(r"\breport\s+([a-z]+)\b", (claim or "").lower())) truth_reports = set(re.findall(r"\breport\s+([a-z]+)\b", (chunk_text or "").lower())) if claim_reports and truth_reports and claim_reports.isdisjoint(truth_reports): @@ -137,6 +225,56 @@ def source_qualifier_conflict(claim, chunk_text): return None +def _locations(text): + lowered = (text or "").lower() + locations = {} + for subject, place in re.findall( + r"\b([a-z][a-z .-]{1,60}?)\s+(?:is|was)\s+(?:located\s+)?(?:in|at)\s+([a-z][a-z .-]{1,60}?)(?:\.|,|$)", + lowered, + ): + locations.setdefault(" ".join(subject.split()), set()).add(" ".join(place.split())) + for subject, place in re.findall(r"\b([a-z][a-z .-]{1,60}?),\s*([a-z][a-z .-]{1,60}?)(?:\.|,|$)", lowered): + subject_tokens = set(subject.split()) + place_tokens = set(place.split()) + if subject_tokens & {"as", "of", "today", "current", "latest"}: + continue + if place_tokens & {"has", "status", "price", "version"}: + continue + locations.setdefault(" ".join(subject.split()), set()).add(" ".join(place.split())) + return locations + + +def location_conflict(claim, chunk_text): + claim_locations = _locations(claim) + truth_locations = _locations(chunk_text) + for claim_subject, claim_places in claim_locations.items(): + claim_subject_tokens = set(claim_subject.split()) + for truth_subject, truth_places in truth_locations.items(): + if not (claim_subject_tokens & set(truth_subject.split())): + continue + if claim_places.isdisjoint(truth_places): + return { + "reason": "Location mismatch", + "claim_locations": sorted(claim_places), + "truth_locations": sorted(truth_places), + } + return None + + +def missing_location_evidence(claim, chunk_text): + claim_locations = _locations(claim) + if not claim_locations: + return False + truth_locations = _locations(chunk_text) + 
if not truth_locations: + return True + for claim_subject in claim_locations: + claim_subject_tokens = set(claim_subject.split()) + if any(claim_subject_tokens & set(truth_subject.split()) for truth_subject in truth_locations): + return False + return True + + def entity_role_conflict(claim, chunk_text): claim_rel = _relation(claim) truth_relations = _relations(chunk_text) @@ -193,6 +331,10 @@ def find_contradiction(claim, chunk, extract_numbers, has_negation_mismatch, sco if role_issue and score >= threshold: return role_issue + location_issue = location_conflict(claim, chunk.get("text", "")) + if location_issue and score >= threshold: + return location_issue + source_issue = source_qualifier_conflict(claim, chunk.get("text", "")) if source_issue and score >= threshold: return source_issue diff --git a/Halgorithem/core.py b/Halgorithem/core.py index 976de59..883a5c5 100644 --- a/Halgorithem/core.py +++ b/Halgorithem/core.py @@ -9,7 +9,7 @@ from .claim_extraction import extract_claims from .confidence import classify_support, confidence_score -from .contradiction import find_contradiction, numbers_conflict +from .contradiction import equivalent_unit_numbers, find_contradiction, missing_location_evidence, numbers_conflict from .evidence import best_evidence, build_evidence from .math_utils import numbers_close, safe_eval from .retrieval import rank_chunks @@ -29,6 +29,10 @@ class LocalEmbedder: + kind = "lexical" + model_name = "HashingVectorizer" + fallback_reason = None + def __init__(self): self.vectorizer = HashingVectorizer( n_features=2 ** 14, @@ -45,39 +49,65 @@ def similarity(self, left, right): def _load_embedder(): - mode = os.getenv("HALGORITHEM_EMBEDDER", "local").lower() - if mode in {"sentence-transformers", "sentence_transformers", "st"}: + mode = os.getenv("HALGORITHEM_EMBEDDER", "semantic").lower() + if mode in {"semantic", "sentence-transformers", "sentence_transformers", "st"}: try: from sentence_transformers import SentenceTransformer, util - 
model = SentenceTransformer(os.getenv("HALGORITHEM_EMBEDDING_MODEL", "all-MiniLM-L6-v2")) + model_name = os.getenv("HALGORITHEM_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2") + allow_download = os.getenv("HALGORITHEM_ALLOW_MODEL_DOWNLOAD", "").lower() in {"1", "true", "yes"} + model = SentenceTransformer(model_name, local_files_only=not allow_download) class SentenceTransformerEmbedder: + kind = "semantic" + fallback_reason = None + model_name = None + + def __init__(self, loaded_model_name): + self.model_name = loaded_model_name + def encode(self, text, convert_to_tensor=False): return model.encode(text or "", convert_to_tensor=True) def similarity(self, left, right): return float(util.cos_sim(left, right)) - return SentenceTransformerEmbedder() + return SentenceTransformerEmbedder(model_name) except Exception as exc: warnings.warn( - f"Could not load sentence-transformers embedder ({exc}); using local hashing embedder.", + f"Could not load semantic embedder ({exc}); using local lexical hashing embedder.", RuntimeWarning, ) + fallback = LocalEmbedder() + fallback.fallback_reason = str(exc) + return fallback return LocalEmbedder() - - -_embedder = _load_embedder() INITIAL_RE = re.compile(r"\b[a-z]\.$", re.IGNORECASE) class Halgorithm: - def __init__(self, sentences_per_chunk=2, sentence_overlap=1): + def __init__(self, sentences_per_chunk=2, sentence_overlap=1, embedder=None): + sentences_per_chunk = int(sentences_per_chunk) + sentence_overlap = int(sentence_overlap) + if sentences_per_chunk < 1: + raise ValueError("sentences_per_chunk must be at least 1.") + if sentence_overlap < 0: + raise ValueError("sentence_overlap must be at least 0.") + if sentence_overlap >= sentences_per_chunk: + raise ValueError("sentence_overlap must be less than sentences_per_chunk.") self.sentences_per_chunk = sentences_per_chunk self.sentence_overlap = sentence_overlap + self.embedder = embedder or _load_embedder() self.parser = pysbd.Segmenter(language="en", clean=False) 
+ @property + def diagnostics(self): + return { + "embedder": getattr(self.embedder, "kind", "unknown"), + "embedding_model": getattr(self.embedder, "model_name", None), + "embedding_fallback_reason": getattr(self.embedder, "fallback_reason", None), + } + # ── Text prep ───────────────────────────────────────────────────────────── def clean_text(self, text): @@ -154,7 +184,7 @@ def chunk_text(self, text, doc_id=1, source_name=None): "tokens": self.tokenize(chunk), "entities": self.extract_entities(chunk), "numbers": self.extract_numbers(chunk), - "embedding": _embedder.encode(chunk, convert_to_tensor=True), + "embedding": self.embedder.encode(chunk, convert_to_tensor=True), }) chunk_id += 1 if end >= len(sentences): @@ -166,13 +196,13 @@ def chunk_text(self, text, doc_id=1, source_name=None): def support_score(self, claim, chunk): # semantic similarity via sentence-transformers — topic-agnostic - claim_emb = _embedder.encode(claim, convert_to_tensor=True) - return _embedder.similarity(claim_emb, chunk["embedding"]) + claim_emb = self.embedder.encode(claim, convert_to_tensor=True) + return self.embedder.similarity(claim_emb, chunk["embedding"]) # ── Math claims ─────────────────────────────────────────────────────────── def classify_claim_type(self, claim): - if re.search(r"\d+\s*[\+\-\*/%]\s*\d+|(? 
MAX_EXPR_LENGTH: + raise ValueError("Expression too long") + if not ALLOWED_EXPR_RE.fullmatch(expr): + raise ValueError("Expression contains unsupported symbols") + if "**" in expr: + for exponent in re.findall(r"\*\*\s*(\d+)", expr): + if int(exponent) > 12: + raise ValueError("Exponent too large") try: - result = parse_expr(str(expr), transformations=TRANSFORMATIONS) + result = parse_expr( + expr, + transformations=TRANSFORMATIONS, + local_dict={}, + global_dict={ + "__builtins__": {}, + "Integer": sympy.Integer, + "Float": sympy.Float, + "Rational": sympy.Rational, + }, + evaluate=True, + ) return float(result.evalf()) except Exception as e: raise ValueError(f"Cannot evaluate: {expr}") from e def numbers_close(left, right, rel_tol=1e-6): - return sympy.Abs(sympy.Float(left) - sympy.Float(right)) <= rel_tol * max(sympy.Abs(sympy.Float(left)), sympy.Abs(sympy.Float(right)), sympy.Float(1)) \ No newline at end of file + return sympy.Abs(sympy.Float(left) - sympy.Float(right)) <= rel_tol * max(sympy.Abs(sympy.Float(left)), sympy.Abs(sympy.Float(right)), sympy.Float(1)) diff --git a/Halgorithem/model_runtime.py b/Halgorithem/model_runtime.py new file mode 100644 index 0000000..38b588b --- /dev/null +++ b/Halgorithem/model_runtime.py @@ -0,0 +1,409 @@ +import os +import re +import warnings +from functools import lru_cache + +from sklearn.metrics.pairwise import cosine_similarity + +from .core import LocalEmbedder +from .models import AtomicClaim +from .nlp import SPACY_MODEL, nlp + + +def model_flag(name, default="1"): + return os.getenv(name, default).lower() in {"1", "true", "yes", "on"} + + +class SentenceEmbedder: + def __init__(self, model_name=None): + self.model_name = model_name or os.getenv("HALGORITHEM_RETRIEVAL_MODEL", "sentence-transformers/all-mpnet-base-v2") + self.kind = "sentence-transformer" + self.fallback_reason = None + self._local = None + if self.model_name.lower() in {"local", "lexical", "hashing"}: + self.kind = "lexical" + self.model_name = 
"HashingVectorizer" + self._model = None + self._local = LocalEmbedder() + return + try: + from sentence_transformers import SentenceTransformer, util + + allow_download = model_flag("HALGORITHEM_ALLOW_MODEL_DOWNLOAD", "0") + self._util = util + self._model = SentenceTransformer(self.model_name, local_files_only=not allow_download) + except Exception as exc: + warnings.warn( + f"Could not load retrieval embedder {self.model_name!r} ({exc}); using lexical hashing fallback.", + RuntimeWarning, + ) + self.kind = "lexical" + self.fallback_reason = str(exc) + self._model = None + self._local = LocalEmbedder() + + def encode(self, text): + if self._model is not None: + return self._model.encode(text or "", convert_to_tensor=True) + return self._local.encode(text or "", convert_to_tensor=True) + + def similarity(self, left, right): + if self._model is not None: + return float(self._util.cos_sim(left, right)) + return float(cosine_similarity(left, right)[0][0]) + + @property + def diagnostics(self): + return { + "retrieval_embedder": self.kind, + "retrieval_model": self.model_name if self.kind == "sentence-transformer" else "HashingVectorizer", + "retrieval_fallback_reason": self.fallback_reason, + } + + +class CrossEncoderReranker: + """Reranks a bi-encoder shortlist with a cross-encoder, falling back to original order offline.""" + + def __init__(self, model_name=None): + self.model_name = model_name or os.getenv("HALGORITHEM_CROSS_ENCODER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2") + self.kind = "cross-encoder" + self.fallback_reason = None + if self.model_name.lower() in {"none", "off", "disabled", "local", "passthrough"}: + self.kind = "passthrough" + self.model_name = "passthrough" + self._model = None + return + try: + from sentence_transformers import CrossEncoder + + allow_download = model_flag("HALGORITHEM_ALLOW_MODEL_DOWNLOAD", "0") + self._model = CrossEncoder(self.model_name, local_files_only=not allow_download) + except Exception as exc: + warnings.warn( 
+ f"Could not load cross-encoder reranker {self.model_name!r} ({exc}); using bi-encoder order.", + RuntimeWarning, + ) + self.kind = "passthrough" + self.fallback_reason = str(exc) + self._model = None + + def rerank(self, query, items, *, text_fn, top_k=5): + if not items or self._model is None: + return list(items)[:top_k] + pairs = [(query or "", text_fn(item) or "") for item in items] + try: + scores = self._model.predict(pairs) + except Exception as exc: + self.kind = "passthrough" + self.fallback_reason = str(exc) + return list(items)[:top_k] + ranked = sorted(zip(items, scores), key=lambda pair: float(pair[1]), reverse=True) + return [item for item, _ in ranked[:top_k]] + + @property + def diagnostics(self): + return { + "cross_encoder_reranker": self.kind, + "cross_encoder_model": self.model_name if self.kind == "cross-encoder" else "passthrough", + "cross_encoder_fallback_reason": self.fallback_reason, + } + + +class CoreferenceResolver: + """Resolves pronouns with fastcoref when available, otherwise leaves text unchanged.""" + + def __init__(self): + self.kind = "spacy" + self.fallback_reason = None + self.model_name = os.getenv("HALGORITHEM_COREF_MODEL", "biu-nlp/f-coref") + if model_flag("HALGORITHEM_USE_COREF", "1"): + try: + from fastcoref.modeling import FCorefModel + from fastcoref import FCoref + + # Compatibility shim for newer transformers versions expecting this attribute. 
+ if not hasattr(FCorefModel, "all_tied_weights_keys"): + FCorefModel.all_tied_weights_keys = {} + device = os.getenv("HALGORITHEM_COREF_DEVICE") or None + self._model = FCoref( + model_name_or_path=self.model_name, + device=device, + nlp=SPACY_MODEL or "en_core_web_sm", + enable_progress_bar=False, + ) + self.kind = "fastcoref" + except Exception as exc: + self._model = None + self.fallback_reason = str(exc) + else: + self._model = None + + def resolve_text(self, text): + if not text or self._model is None: + return text or "" + try: + preds = self._model.predict(texts=[text]) + clusters = preds[0].get_clusters(as_strings=False) + if not clusters: + return text + # replace each non-first mention with the antecedent text + chars = list(text) + replacements = [] + for cluster in clusters: + if len(cluster) < 2: + continue + antecedent_start, antecedent_end = cluster[0] + antecedent = text[antecedent_start:antecedent_end] + for mention_start, mention_end in cluster[1:]: + mention = text[mention_start:mention_end] + # only replace short pronouns, not full noun phrases + if len(mention.split()) <= 3: + replacements.append((mention_start, mention_end, antecedent)) + # apply replacements in reverse so indices stay valid + for start, end, replacement in sorted(replacements, reverse=True): + chars[start:end] = list(replacement) + return "".join(chars) + except Exception as exc: + self.fallback_reason = str(exc) + return text + + @property + def diagnostics(self): + return { + "coreference": self.kind, + "coreference_model": self.model_name if self.kind == "fastcoref" else None, + "spacy_model": SPACY_MODEL, + "coreference_fallback_reason": self.fallback_reason, + } + + +class RebelClaimExtractor: + def __init__(self, model_name=None): + self.model_name = model_name or os.getenv("HALGORITHEM_REBEL_MODEL", "Babelscape/rebel-large") + self.kind = "rebel" + self.fallback_reason = None + if self.model_name.lower() in {"rule", "local", "deterministic"}: + self.kind = "rule" + 
self.model_name = "rule" + self._tokenizer = None + self._model = None + return + try: + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + + allow_download = model_flag("HALGORITHEM_ALLOW_MODEL_DOWNLOAD", "0") + self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, local_files_only=not allow_download) + self._model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name, local_files_only=not allow_download) + except Exception as exc: + self.kind = "rule" + self.fallback_reason = str(exc) + self._tokenizer = None + self._model = None + + def extract(self, text): + if not text: + return [] + if self._model is not None: + try: + return self._extract_rebel(text) + except Exception as exc: + self.kind = "rule" + self.fallback_reason = str(exc) + return self._extract_rules(text) + + def _extract_rebel(self, text): + inputs = self._tokenizer(text, return_tensors="pt", truncation=True, max_length=512) + output = self._model.generate(**inputs, max_length=256, num_beams=3) + decoded = self._tokenizer.batch_decode(output, skip_special_tokens=False)[0] + return parse_rebel_output(decoded) or self._extract_rules(text) + + def _extract_rules(self, text): + claims = [] + patterns = [ + r"(?P<subject>[A-Za-z][A-Za-z0-9 .'-]{1,80}?)\s+(?:was|is)\s+(?P<relation>created|invented|developed|discovered|founded|designed|maintained|located|priced)\s+(?:by|in|at)?\s*(?P<object>[A-Za-z0-9][A-Za-z0-9 .'-]{0,80}?)(?:\.|,|$)", + r"(?P<subject>[A-Za-z][A-Za-z0-9 .'-]{1,80}?)\s+(?:is|was)\s+(?P<relation>located\s+)?(?:in|at)\s+(?P<object>[A-Za-z][A-Za-z .'-]{1,80}?)(?:\.|,|$)", + r"(?P<subject>[A-Za-z][A-Za-z0-9 .'-]{1,80}?)\s+(?P<relation>has|had|contains|weighs|costs)\s+(?P<object>[^.]{1,100})(?:\.|$)", + r"(?P<subject>[A-Za-z][A-Za-z0-9 .'-]{1,80}?),\s*(?P<relation>[A-Za-z][A-Za-z .'-]{1,80}),\s*(?P<object>\d+(?:\.\d+)?)", + ] + for pattern in patterns: + for match in re.finditer(pattern, text, flags=re.IGNORECASE): + subject = clean_part(match.group("subject")) + relation = normalize_relation(clean_part(match.group("relation"))) + obj = clean_part(match.group("object")) +
if subject and relation and obj: + claims.append(AtomicClaim(subject=subject, relation=relation, object=obj, text=f"{subject} {relation} {obj}")) + if not claims: + doc = nlp(text) + root = next((t for t in doc if t.dep_ == "ROOT"), None) + subj = next((t for t in doc if t.dep_ in {"nsubj", "nsubjpass"}), None) + obj = next((t for t in doc if t.dep_ in {"dobj", "attr", "pobj"}), None) + if root and subj and obj: + claims.append(AtomicClaim(subject=subj.text, relation=root.lemma_, object=obj.text, text=text.strip())) + return dedupe_claims(claims) + + @property + def diagnostics(self): + return { + "claim_extractor": self.kind, + "claim_model": self.model_name if self.kind == "rebel" else "rule", + "claim_fallback_reason": self.fallback_reason, + } + + +class FactScoreDecomposer: + """Turns a sentence into standalone atomic English facts for NLI-friendly atomic checking.""" + + prompt_template = ( + "Decompose the following sentence into simple atomic facts.\n" + "Each fact should be a complete standalone English sentence.\n" + "Output one fact per line, nothing else.\n" + "Sentence: {sentence}" + ) + + def __init__(self, model_name=None): + self.model_name = model_name or os.getenv("HALGORITHEM_DECOMPOSER_MODEL", "google/flan-t5-base") + self.kind = "factscore" + self.fallback_reason = None + self._fallback = RebelClaimExtractor(model_name="rule") + if self.model_name.lower() in {"rule", "local", "deterministic"}: + self.kind = "rule" + self.model_name = "rule" + self._tokenizer = None + self._model = None + return + try: + from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + + allow_download = model_flag("HALGORITHEM_ALLOW_MODEL_DOWNLOAD", "0") + self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, local_files_only=not allow_download) + self._model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name, local_files_only=not allow_download) + except Exception as exc: + warnings.warn( + f"Could not load FActScore decomposer 
{self.model_name!r} ({exc}); using rule-based extraction.", + RuntimeWarning, + ) + self.kind = "rule" + self.fallback_reason = str(exc) + self._tokenizer = None + self._model = None + + def extract(self, text): + if not text: + return [] + if self._model is None: + return self._fallback.extract(text) + prompt = self.prompt_template.format(sentence=text) + try: + inputs = self._tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512) + output = self._model.generate(**inputs, max_length=256, num_beams=3) + decoded = self._tokenizer.batch_decode(output, skip_special_tokens=True)[0] + facts = [ + clean_part(line) + for line in decoded.splitlines() + if 10 <= len(clean_part(line)) <= 150 + ] + claims = [AtomicClaim(subject="", relation="", object="", text=fact) for fact in facts] + return dedupe_claims(claims) or self._fallback.extract(text) + except Exception as exc: + self.kind = "rule" + self.fallback_reason = str(exc) + return self._fallback.extract(text) + + @property + def diagnostics(self): + return { + "claim_extractor": self.kind, + "claim_model": self.model_name if self.kind == "factscore" else "rule", + "claim_fallback_reason": self.fallback_reason, + "decomposer_model": self.model_name if self.kind == "factscore" else "rule", + "decomposer_fallback_reason": self.fallback_reason, + } + + +def clean_part(value): + return " ".join((value or "").strip(" .,;:-").split()) + + +def normalize_relation(value): + normalized = " ".join((value or "").lower().split()) + if normalized in {"", "in", "at", "located"}: + return "located" + return normalized + + +def is_valid_triplet(claim): + if not claim.subject and not claim.relation and not claim.object: + fact = clean_part(claim.text) + return 10 <= len(fact) <= 150 + subject = clean_part(claim.subject) + relation = clean_part(claim.relation) + obj = clean_part(claim.object) + if not subject or not relation or not obj: + return False + if "." in subject or "." 
in obj: + return False + if subject.lower() == obj.lower(): + return False + if len(subject) < 2 or len(obj) < 2: + return False + return True + + +def dedupe_claims(claims): + seen = set() + unique = [] + for claim in claims: + if not is_valid_triplet(claim): + continue + key = ( + claim.subject.lower(), + claim.relation.lower(), + claim.object.lower(), + claim.text.lower() if not (claim.subject or claim.relation or claim.object) else "", + ) + if key not in seen: + unique.append(claim) + seen.add(key) + return unique + + +def parse_rebel_output(text): + triplets = [] + current = {"subject": "", "relation": "", "object": ""} + field = None + tokens = text.replace("<s>", "").replace("</s>", "").split() + for token in tokens: + if token == "<triplet>": + if all(current.values()): + triplets.append(AtomicClaim(**current, text=f"{current['subject']} {current['relation']} {current['object']}")) + current = {"subject": "", "relation": "", "object": ""} + field = "subject" + elif token == "<subj>": + field = "object" + elif token == "<obj>": + field = "relation" + elif field: + current[field] = clean_part(f"{current[field]} {token}") + if all(current.values()): + triplets.append(AtomicClaim(**current, text=f"{current['subject']} {current['relation']} {current['object']}")) + return dedupe_claims(triplets) + + +@lru_cache(maxsize=1) +def default_coref(): + return CoreferenceResolver() + + +@lru_cache(maxsize=1) +def default_claim_extractor(): + return FactScoreDecomposer() + + +@lru_cache(maxsize=1) +def default_embedder(): + return SentenceEmbedder() + + +@lru_cache(maxsize=1) +def default_reranker(): + return CrossEncoderReranker() diff --git a/Halgorithem/models.py b/Halgorithem/models.py new file mode 100644 index 0000000..51730ca --- /dev/null +++ b/Halgorithem/models.py @@ -0,0 +1,101 @@ +from typing import Any, Literal + +from pydantic import BaseModel, ConfigDict, Field + + +Verdict = Literal["ENTAIL", "NEUTRAL", "CONTRADICT"] +FinalVerdict = Literal["SUPPORTED", "HALLUCINATED",
"UNVERIFIABLE"] + + +class AtomicClaim(BaseModel): + subject: str = "" + relation: str = "" + object: str = "" + text: str + + +class DocumentSentence(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + index: int + text: str + resolved_text: str + source: str = "" + source_quality: float = 0.65 + claims: list[AtomicClaim] = Field(default_factory=list) + embedding: Any = Field(default=None, exclude=True) + + +class IngestedDocument(BaseModel): + sentences: list[DocumentSentence] + claims: list[AtomicClaim] + diagnostics: dict[str, Any] = Field(default_factory=dict) + + +class ProcessedSentence(BaseModel): + index: int + text: str + resolved_text: str + claims: list[AtomicClaim] = Field(default_factory=list) + + +class ProcessedResponse(BaseModel): + sentences: list[ProcessedSentence] + diagnostics: dict[str, Any] = Field(default_factory=dict) + + +class SimilarityHit(BaseModel): + sentence_index: int + sentence: str + score: float + source: str = "" + source_quality: float = 0.65 + + +class SimilarityCheck(BaseModel): + score: float + evidence: str = "" + source: str = "" + source_quality: float = 0.65 + hits: list[SimilarityHit] = Field(default_factory=list) + + +class NLICheck(BaseModel): + verdict: Verdict + confidence: float + evidence: str = "" + evidence_index: int | None = None + unit_mismatch: bool = False + unit_representation_change: bool = False + unit_details: list[dict[str, Any]] = Field(default_factory=list) + + +class AtomicClaimResult(BaseModel): + claim: str + verdict: Verdict + confidence: float + evidence: str = "" + + +class AtomicCheck(BaseModel): + claims: list[AtomicClaimResult] = Field(default_factory=list) + score: float | None = None + evidence: str = "" + + +class SentenceVerification(BaseModel): + sentence: str + similarity_score: float + entropy_score: float = 1.0 + source: str = "" + source_quality: float = 0.65 + nli_verdict: Verdict + nli_confidence: float + atomic_claims: list[AtomicClaimResult] + 
final_verdict: FinalVerdict + confidence: float + evidence: str + unit_mismatch: bool = False + unit_representation_change: bool = False + unit_details: list[dict[str, Any]] = Field(default_factory=list) + diagnostics: dict[str, Any] = Field(default_factory=dict) diff --git a/Halgorithem/nlp.py b/Halgorithem/nlp.py index de66382..044b32b 100644 --- a/Halgorithem/nlp.py +++ b/Halgorithem/nlp.py @@ -8,7 +8,7 @@ def _load_spacy_model(): global SPACY_MODEL, SPACY_MODEL_WARNING - for model_name in ("en_core_web_lg", "en_core_web_sm"): + for model_name in ("en_core_web_trf", "en_core_web_lg", "en_core_web_sm"): try: SPACY_MODEL = model_name return spacy.load(model_name) @@ -17,9 +17,9 @@ def _load_spacy_model(): SPACY_MODEL = "blank_en" SPACY_MODEL_WARNING = ( - "spaCy model 'en_core_web_lg' or 'en_core_web_sm' is not installed; " + "spaCy model 'en_core_web_trf', 'en_core_web_lg', or 'en_core_web_sm' is not installed; " "falling back to spacy.blank('en'). Install one with " - "'python -m spacy download en_core_web_sm' for better accuracy." + "'python -m spacy download en_core_web_trf' for better accuracy." 
) blank = spacy.blank("en") blank.add_pipe("sentencizer") diff --git a/Halgorithem/process.py b/Halgorithem/process.py new file mode 100644 index 0000000..c197059 --- /dev/null +++ b/Halgorithem/process.py @@ -0,0 +1,26 @@ +from .core import Halgorithm, LocalEmbedder +from .model_runtime import default_claim_extractor, default_coref +from .models import ProcessedResponse, ProcessedSentence + + +def process_response(response_text, *, coref=None, extractor=None): + splitter = Halgorithm(sentences_per_chunk=1, sentence_overlap=0, embedder=LocalEmbedder()) + coref = coref or default_coref() + extractor = extractor or default_claim_extractor() + + sentences = [] + for index, sentence in enumerate(splitter.split_sentences(response_text), 1): + resolved = coref.resolve_text(sentence) + sentences.append( + ProcessedSentence( + index=index, + text=sentence, + resolved_text=resolved, + claims=extractor.extract(resolved), + ) + ) + + diagnostics = {} + diagnostics.update(coref.diagnostics) + diagnostics.update(extractor.diagnostics) + return ProcessedResponse(sentences=sentences, diagnostics=diagnostics) diff --git a/Halgorithem/retrieval.py b/Halgorithem/retrieval.py index f5e1684..77ff3a6 100644 --- a/Halgorithem/retrieval.py +++ b/Halgorithem/retrieval.py @@ -1,3 +1,25 @@ +import re + + +TOKEN_ALIASES = { + "delivered": "made", + "deliver": "made", + "renowned": "famous", + "well-known": "famous", + "speech": "speech", + "population": "population", + "located": "located", +} + + +def _tokens(text): + return { + TOKEN_ALIASES.get(token, token) + for token in re.findall(r"[a-z0-9]+(?:-[a-z0-9]+)?", (text or "").lower()) + if len(token) > 2 + } + + def rank_chunks( claim, chunks, @@ -6,6 +28,7 @@ def rank_chunks( has_negation_mismatch, threshold=0.30, top_k=5, + reranker=None, ): candidates = [] claim_numbers = set(extract_numbers(claim)) @@ -14,8 +37,8 @@ def rank_chunks( raw_score = score_fn(claim, chunk) score = raw_score signals = [] - claim_tokens = {t.lower() for t in 
claim.replace(".", " ").replace(",", " ").split() if t.strip()} - chunk_tokens = set(chunk.get("tokens", [])) + claim_tokens = _tokens(claim) + chunk_tokens = {TOKEN_ALIASES.get(t, t) for t in set(chunk.get("tokens", []))} | _tokens(chunk.get("text", "")) content_tokens = {t for t in claim_tokens if len(t) > 2} if content_tokens: overlap = len(content_tokens & chunk_tokens) / len(content_tokens) @@ -30,6 +53,10 @@ def rank_chunks( score = min(score + 0.10, 1.0) signals.append("number_subset") + if claim_numbers and claim_numbers & set(chunk.get("numbers", [])) and len(content_tokens & chunk_tokens) >= 2: + score = min(score + 0.12, 1.0) + signals.append("number_anchor_overlap") + if has_negation_mismatch(claim, chunk.get("text", "")) and score >= threshold: score = max(score - 0.30, 0.0) signals.append("negation_penalty") @@ -41,4 +68,15 @@ def rank_chunks( "signals": signals, }) - return sorted(candidates, key=lambda c: c["score"], reverse=True)[:top_k] + ranked = sorted(candidates, key=lambda c: c["score"], reverse=True) + if reranker is None: + try: + from .model_runtime import default_reranker + + reranker = default_reranker() + except Exception: + reranker = None + if reranker is not None: + shortlist = ranked[:max(20, top_k)] + return reranker.rerank(claim, shortlist, text_fn=lambda item: item["chunk"].get("text", ""), top_k=top_k) + return ranked[:top_k] diff --git a/Halgorithem/text_processing.py b/Halgorithem/text_processing.py index 4127652..cb98e15 100644 --- a/Halgorithem/text_processing.py +++ b/Halgorithem/text_processing.py @@ -129,7 +129,7 @@ def has_negation_mismatch(claim, chunk_text): if not claim_has_negation and not chunk_has_negation: negation_terms = { "no", "not", "never", "neither", "nor", "without", "didn't", - "doesn't", "wasn't", "isn't", "aren't", "can't", "cannot", "did" + "doesn't", "wasn't", "isn't", "aren't", "can't", "cannot" } claim_tokens = {t.text.lower() for t in claim_doc} chunk_tokens = {t.text.lower() for t in chunk_doc} 
diff --git a/Halgorithem/voting.py b/Halgorithem/voting.py new file mode 100644 index 0000000..1063079 --- /dev/null +++ b/Halgorithem/voting.py @@ -0,0 +1,119 @@ +import re + +from .models import AtomicCheck, FinalVerdict, NLICheck, SimilarityCheck + + +SIMILARITY_THRESHOLD = 0.4 +NLI_CONFIDENCE_THRESHOLD = 0.6 +SUPPORTED_THRESHOLD = 0.68 +HALLUCINATED_THRESHOLD = 0.38 + + +def entropy_gate(sentence_text, embedder, threshold=0.85): + """Checks deterministic paraphrase embedding consistency before expensive verification work.""" + parts = [part.strip() for part in re.split(r"\s*(?:,|;|\band\b)\s*", sentence_text or "") if part.strip()] + if len(parts) <= 1: + paraphrases = [sentence_text or ""] * 5 + else: + paraphrases = [ + " ".join(parts), + "; ".join(parts), + ", ".join(reversed(parts)), + f"{parts[0]} and {' '.join(parts[1:])}", + " ".join(part for part in sorted(parts, key=str.lower)), + ] + embeddings = [embedder.encode(text) for text in paraphrases[:5]] + scores = [] + for left_index, left in enumerate(embeddings): + for right in embeddings[left_index + 1:]: + scores.append(embedder.similarity(left, right)) + entropy_score = sum(scores) / len(scores) if scores else 1.0 + if entropy_score < threshold: + return "UNVERIFIABLE", 0.5, entropy_score + return None, None, entropy_score + + +def nli_score(check: NLICheck): + if check.confidence < NLI_CONFIDENCE_THRESHOLD: + return None + if check.verdict == "ENTAIL": + return check.confidence + if check.verdict == "CONTRADICT": + return 1.0 - check.confidence + return 0.5 + + +def similarity_score(check: SimilarityCheck): + if check.score < SIMILARITY_THRESHOLD: + return None + return check.score + + +def similarity_weight(check: SimilarityCheck): + quality = max(0.0, min(check.source_quality, 1.0)) + return 0.2 * (0.75 + 0.25 * quality) + + +def atomic_score(check: AtomicCheck): + return check.score + + +def has_strong_atomic_entailment(check: AtomicCheck): + return any(result.verdict == "ENTAIL" and 
result.confidence >= 0.85 for result in check.claims) + + +def has_strong_atomic_contradiction(check: AtomicCheck): + return any(result.verdict == "CONTRADICT" and result.confidence >= 0.85 for result in check.claims) + + +def fuse_votes(similarity: SimilarityCheck, nli: NLICheck, atomic: AtomicCheck): + weighted = [] + sim = similarity_score(similarity) + sent = nli_score(nli) + atom = atomic_score(atomic) + if sim is not None: + weighted.append((similarity_weight(similarity), sim)) + if sent is not None: + weighted.append((0.5, sent)) + if atom is not None: + weighted.append((0.3, atom)) + + if not weighted: + return "UNVERIFIABLE", 0.0 + + weight_total = sum(weight for weight, _ in weighted) + final_score = sum(weight * score for weight, score in weighted) / weight_total + + strong_atomic_entail = has_strong_atomic_entailment(atomic) + nli_override_is_contested = ( + nli.verdict == "CONTRADICT" + and nli.confidence >= 0.85 + and similarity.score >= 0.85 + and strong_atomic_entail + ) + + if nli.verdict == "CONTRADICT" and nli.confidence >= 0.85 and not nli_override_is_contested: + return "HALLUCINATED", max(nli.confidence, 1.0 - final_score) + if has_strong_atomic_contradiction(atomic): + return "HALLUCINATED", max(result.confidence for result in atomic.claims if result.verdict == "CONTRADICT") + if final_score >= SUPPORTED_THRESHOLD: + return "SUPPORTED", final_score + if final_score <= HALLUCINATED_THRESHOLD: + return "HALLUCINATED", 1.0 - final_score + return "UNVERIFIABLE", 1.0 - abs(0.5 - final_score) * 2 + + +def choose_evidence(similarity: SimilarityCheck, nli: NLICheck, atomic: AtomicCheck, final_verdict: FinalVerdict): + if final_verdict == "HALLUCINATED": + if nli.verdict == "CONTRADICT" and nli.evidence: + return nli.evidence + for result in atomic.claims: + if result.verdict == "CONTRADICT" and result.evidence: + return result.evidence + if final_verdict == "UNVERIFIABLE" and nli.evidence: + return nli.evidence + if nli.verdict == "ENTAIL" and 
nli.evidence: + return nli.evidence + if atomic.evidence: + return atomic.evidence + return similarity.evidence diff --git a/Halgorithem/web.py b/Halgorithem/web.py index 3d0a73a..42045c8 100644 --- a/Halgorithem/web.py +++ b/Halgorithem/web.py @@ -1,10 +1,12 @@ from bs4 import BeautifulSoup import requests import html2text +from pathlib import Path class WebScraper: - def __init__(self, list_of_urls): + def __init__(self, list_of_urls, output_dir="."): self.urls = list_of_urls + self.output_dir = Path(output_dir) self.converter = html2text.HTML2Text() self.converter.ignore_links = True self.converter.ignore_images = True @@ -12,6 +14,8 @@ def __init__(self, list_of_urls): self.counter = 0 def scrape(self): + results = [] + self.output_dir.mkdir(parents=True, exist_ok=True) headers = { "User-Agent": "Mozilla/5.0 (compatible; HalgorithemBot/1.0)" } @@ -36,15 +40,25 @@ def scrape(self): plain_text = self.converter.handle(str(soup)) plain_text = plain_text[:8000] # cap non-wiki sources - with open(f"file{self.counter}.txt", "w", - encoding="utf-8") as f: + file_path = self.output_dir / f"file{self.counter}.txt" + with file_path.open("w", encoding="utf-8") as f: f.write(plain_text) - print(f"Scraped: {url} → file{self.counter}.txt") + print(f"Scraped: {url} → {file_path.name}") + results.append({"url": url, "file_path": str(file_path), "text": plain_text, "ok": True, "error": None}) self.counter += 1 except requests.exceptions.Timeout: print(f"Timeout: {url}") + results.append({"url": url, "file_path": None, "text": "", "ok": False, "error": "timeout"}) except requests.exceptions.HTTPError as e: print(f"HTTP error {e}: {url}") + results.append({"url": url, "file_path": None, "text": "", "ok": False, "error": str(e)}) except Exception as e: - print(f"Failed {url}: {e}") \ No newline at end of file + print(f"Failed {url}: {e}") + results.append({"url": url, "file_path": None, "text": "", "ok": False, "error": str(e)}) + return results + + +def scrape_url_texts(urls, 
output_dir="."): + """Scrapes URL sources into text records that can feed the PRISM verifier directly.""" + return [result for result in WebScraper(urls, output_dir=output_dir).scrape() if result.get("ok") and result.get("text")] diff --git a/README (1).md b/README (1).md new file mode 100644 index 0000000..5361fa6 --- /dev/null +++ b/README (1).md @@ -0,0 +1,242 @@ +Halgorithem logo + +Halgorithem (codename **CORE** — Claim-Oriented Recognition Engine) is a deterministic hallucination detection library for checking AI output against trusted source material. It extracts factual claims from AI-generated text, retrieves the closest evidence from your documents, and labels each claim as supported, weakly supported, contradicted, hallucinated, or an unverifiable denial. Verification is meaning-based by default, using sentence-transformer embeddings to match paraphrases, with deterministic guardrails for names, numbers, dates, units, negation, and source qualifiers. + +## Documentation + +Full API reference, design notes, and usage examples can be found in the [docs](docs/). The output schema, verdict definitions, and benchmark details are covered below. + +## Forums & Community + +Have a question, idea, or bug report? Open a [GitHub Issue](https://github.com/TangibleResearch/Halgorithem/issues) or start a [Discussion](https://github.com/TangibleResearch/Halgorithem/discussions). Please review the [Code of Conduct](CODE_OF_CONDUCT.md) before participating. + +## Contributing + +Contributions to the codebase, benchmark datasets, and documentation are all welcome. See the [contributing guide](CONTRIBUTING.md) for details on how to get started. + +## Getting Started + +### Install + +```bash +python -m pip install -e . 
+``` + +Recommended NLP model: + +```bash +python -m spacy download en_core_web_lg +``` + +Lightweight fallback: + +```bash +python -m spacy download en_core_web_sm +``` + +If neither spaCy model is installed, Halgorithem falls back to `spacy.blank("en")` with reduced linguistic accuracy rather than crashing. + +### Quick Start + +```python +from Halgorithem import Halgorithm + +algo = Halgorithm() + +results = algo.compare_to_docs( + truth_docs=[ + { + "file_id": 1, + "file_path": "source.txt", + "text": "BASIC was created in 1964 by John Kemeny at Dartmouth College.", + } + ], + ai_output="BASIC was created in 1972 by NASA.", +) + +for result in results: + print(result["status"], result["claim"], result["reason"]) +``` + +## Python API + +```python +from Halgorithem import Halgorithm + +algo = Halgorithm(sentences_per_chunk=2, sentence_overlap=1) +``` + +Verify in-memory documents: + +```python +algo.compare_to_docs( + truth_docs="BASIC was created in 1964.", + ai_output="BASIC was created in 1964.", +) +``` + +Verify files on disk: + +```python +algo.compare_to_files( + truth_file_paths=["sources/basic.txt"], + ai_output="BASIC was created by NASA.", +) +``` + +Optional generation wrapper (may call OpenAI; the verifier itself remains fully deterministic): + +```python +from engine import run + +result = run( + prompt="Summarize this source.", + truth_file_paths=["sources/basic.txt"], +) +``` + +## Verification Mode + +By default, Halgorithem loads `sentence-transformers/all-MiniLM-L6-v2` from the local model cache and verifies claims by semantic similarity rather than strict word matching. 
+ +```bash +HALGORITHEM_EMBEDDER=semantic python tui.py +``` + +To allow the model to download when missing from the local cache: + +```bash +HALGORITHEM_ALLOW_MODEL_DOWNLOAD=1 python tui.py +``` + +For fully lexical, deterministic fallback behavior: + +```bash +HALGORITHEM_EMBEDDER=local python tui.py +``` + +## CORE Pipeline + +The newer deterministic pipeline is available through `HalgorithemVerifier` and the JSON CLI. It runs document ingestion once, then processes each AI sentence through three independent checks: + +- Similarity retrieval with `sentence-transformers/all-mpnet-base-v2` +- Sentence-level NLI with `cross-encoder/nli-deberta-v3-large` +- Atomic claim NLI over REBEL-style triplets from `Babelscape/rebel-large` + +All model loads are local/offline by default. If REBEL, DeBERTa, mpnet, or fastcoref are not installed or cached, Halgorithem falls back to deterministic local checks and surfaces that in `diagnostics`. + +```bash +python main.py --document doc.txt --response response.txt +``` + +```python +from Halgorithem import HalgorithemVerifier + +results = HalgorithemVerifier().verify(document_text, response_text) +print([result.model_dump(mode="json") for result in results]) +``` + +## CLI Usage + +Interactive terminal UI: + +```bash +python tui.py +``` + +Benchmark runner: + +```bash +python bench.py +``` + +## Tests + +```bash +python -m pytest +``` + +The test suite is designed to be fully network-free and uses local documents only.
+ +## Output Schema + +Every claim result includes: + +```python +{ + "claim": str, + "status": "SUPPORTED | WEAK_SUPPORT | CONTRADICTION | HALLUCINATION | UNVERIFIABLE_DENIAL | ERROR", + "confidence": float, + "score": float, + "matched_source": str | None, + "matched_chunk_id": int | None, + "matched_chunk": str, + "chunk_text": str, + "evidence": list, + "unsupported_terms": list[str], + "reason": str, + "warning": str | None, +} +``` + +### Verdict Meanings + +| Verdict | Meaning | +|:-------:|---------| +| `SUPPORTED` | Strong evidence is present in the supplied sources. | +| `WEAK_SUPPORT` | Related evidence exists, but the claim is inferential or not fully direct. | +| `CONTRADICTION` | Relevant source evidence conflicts with the claim. | +| `HALLUCINATION` | The claim lacks adequate source support. | +| `UNVERIFIABLE_DENIAL` | The claim denies a fact or entity absent from the sources — absence alone cannot prove it. | +| `ERROR` | The verifier could not parse or evaluate the claim (mostly malformed math). | + +## Benchmark + +`bench.py` runs a release benchmark across the following categories: + +- Supported claims +- Paraphrases +- Weak support +- Hallucinations +- Date mismatches +- Entity-role swaps +- Unit errors +- Current/latest claims +- Table-like facts +- Multi-source disagreement +- Denial claims +- Missing-source cases + +It reports accuracy, per-category accuracy, a confusion matrix, failures, temporal warning checks, and a pass/fail threshold. + +## Runtime Hardening + +Halgorithem handles the following gracefully: + +- Missing or empty files +- Empty AI output +- Malformed `truth_docs` or missing `text` fields +- Bad UTF-8 file encodings +- No extracted claims +- Math parse errors +- Missing spaCy or embedding models + +## Limitations + +- Rule-based entity-role detection handles common simple patterns, not arbitrary grammar. 
+- Semantic verification depends on a local sentence-transformer model; the lexical hashing fallback is deterministic and CI-safe but less meaning-aware. +- Multi-source disagreement is only surfaced when the source qualifier is explicit. +- Table-like facts work best when row values are near each other in the source text. +- Current/latest claims are warned about, not externally refreshed. + +## Roadmap + +- Optional structured table parser +- Better contradiction handling for passive and nested clauses +- Calibrated benchmark sets by domain +- Machine-readable benchmark artifacts +- Additional CLI commands beyond the interactive TUI + +## Release Readiness + +A v1.0 release requires: tests passing, benchmark meeting its threshold, CI green, packaging installing cleanly, README limitations documented, and the release checklist complete. diff --git a/README.md b/README.md deleted file mode 100644 index 71a6971..0000000 --- a/README.md +++ /dev/null @@ -1,208 +0,0 @@ -# Halgorithem - -Deterministic hallucination detection for checking AI output against trusted source material. - -Halgorithem takes source documents and AI output, extracts factual claims, retrieves the closest evidence, and labels each claim as supported, weakly supported, contradicted, hallucinated, or an unverifiable denial. - -## What It Does - -- Verifies AI-generated factual claims against supplied sources. -- Splits multi-fact text into atomic claims. -- Retrieves evidence chunks from one or more sources. -- Detects date, number, unit, negation, source-qualifier, and simple entity-role conflicts. -- Flags time-sensitive claims such as "current", "latest", "today", and "now". -- Returns a structured result for every extracted claim. - -## What It Does Not Do - -- It does not prove truth in the real world. -- It does not browse the web unless you use the optional URL wrapper. -- It does not replace source quality review. 
-- It does not guarantee perfect paraphrase understanding, especially with the local fallback embedder. -- It does not use an LLM for verification. - -## Install - -```bash -python -m pip install -e . -``` - -Recommended NLP model: - -```bash -python -m spacy download en_core_web_lg -``` - -Lightweight fallback: - -```bash -python -m spacy download en_core_web_sm -``` - -If neither spaCy model is installed, Halgorithem falls back to `spacy.blank("en")` with reduced linguistic accuracy instead of crashing. - -## Quick Start - -```python -from Halgorithem import Halgorithm - -algo = Halgorithm() - -results = algo.compare_to_docs( - truth_docs=[ - { - "file_id": 1, - "file_path": "source.txt", - "text": "BASIC was created in 1964 by John Kemeny at Dartmouth College.", - } - ], - ai_output="BASIC was created in 1972 by NASA.", -) - -for result in results: - print(result["status"], result["claim"], result["reason"]) -``` - -## Python API - -```python -from Halgorithem import Halgorithm - -algo = Halgorithm(sentences_per_chunk=2, sentence_overlap=1) -``` - -Verify in-memory documents: - -```python -algo.compare_to_docs( - truth_docs="BASIC was created in 1964.", - ai_output="BASIC was created in 1964.", -) -``` - -Verify files: - -```python -algo.compare_to_files( - truth_file_paths=["sources/basic.txt"], - ai_output="BASIC was created by NASA.", -) -``` - -Optional generation wrapper: - -```python -from engine import run - -result = run( - prompt="Summarize this source.", - truth_file_paths=["sources/basic.txt"], -) -``` - -The wrapper may call OpenAI for generation. The verifier in `Halgorithem/` remains deterministic. - -## CLI Usage - -Interactive terminal UI: - -```bash -python tui.py -``` - -Benchmark: - -```bash -python bench.py -``` - -## Tests - -```bash -python -m pytest -``` - -The pytest suite is designed to be network-free and uses local documents. 
- -## Benchmark - -`bench.py` runs a release benchmark across: - -- supported claims -- paraphrases -- weak support -- hallucinations -- date mismatches -- entity-role swaps -- unit errors -- current/latest claims -- table-like facts -- multi-source disagreement -- denial claims -- missing-source cases - -It reports accuracy, accuracy by category, a confusion matrix, failures, temporal warning checks, and a pass/fail threshold. - -## Output Schema - -Every claim result includes: - -```python -{ - "claim": str, - "status": "SUPPORTED | WEAK_SUPPORT | CONTRADICTION | HALLUCINATION | UNVERIFIABLE_DENIAL | ERROR", - "confidence": float, - "score": float, - "matched_source": str | None, - "matched_chunk_id": int | None, - "matched_chunk": str, - "chunk_text": str, - "evidence": list, - "unsupported_terms": list[str], - "reason": str, - "warning": str | None, -} -``` - -## Verdict Meanings - -- `SUPPORTED`: strong evidence is present in the supplied sources. -- `WEAK_SUPPORT`: related evidence exists, but the claim is inferential or not fully direct. -- `CONTRADICTION`: relevant source evidence conflicts with the claim. -- `HALLUCINATION`: the claim lacks adequate source support. -- `UNVERIFIABLE_DENIAL`: the claim denies a fact or entity absent from the sources, so absence alone cannot prove it. -- `ERROR`: the verifier could not parse or evaluate the claim, mostly for malformed math. - -## Runtime Hardening - -Halgorithem handles: - -- missing files -- empty sources -- empty AI output -- malformed `truth_docs` -- missing `text` fields -- bad UTF-8 file encodings -- no extracted claims -- math parse errors -- missing spaCy or embedding models - -## Limitations - -- Rule-based entity-role detection handles common simple patterns, not arbitrary grammar. -- Local hashing embeddings are deterministic and CI-safe but less semantic than sentence-transformers. -- Multi-source disagreement is surfaced when the source qualifier is explicit. 
-- Table-like facts work best when row values are near each other in text. -- Current/latest claims are warned, not externally refreshed. - -## Roadmap - -- Optional structured table parser. -- Better contradiction handling for passive and nested clauses. -- Calibrated benchmark sets by domain. -- Machine-readable benchmark artifacts. -- More CLI commands beyond the interactive TUI. - -## Release Readiness - -v1.0 readiness means tests pass, the benchmark meets its threshold, CI passes, packaging installs, README limitations are documented, and the release checklist is complete. diff --git a/assets/Halgorithem.png b/assets/Halgorithem.png new file mode 100644 index 0000000..eed5d75 Binary files /dev/null and b/assets/Halgorithem.png differ diff --git a/engine.py b/engine.py index 4c3a0fa..fa6960e 100644 --- a/engine.py +++ b/engine.py @@ -27,12 +27,7 @@ def scrape_urls(self, urls: Iterable[str]): return [] docs = [] with TemporaryDirectory(prefix="halgorithem-scrape-") as tmp: - prev = Path.cwd() - os.chdir(tmp) - try: - WebScraper(urls).scrape() - finally: - os.chdir(prev) + WebScraper(urls, output_dir=tmp).scrape() for i, url in enumerate(urls, 1): f = Path(tmp) / f"file{i - 1}.txt" if not f.exists(): @@ -70,7 +65,11 @@ def verify(self, ai_output, source_docs, threshold=0.30): ai_output=ai_output, threshold=threshold, ) - return {"claims": claims, "summary": self.summarize(claims)} + return { + "claims": claims, + "summary": self.summarize(claims), + "diagnostics": self.algo.diagnostics, + } def _load_sources(self, urls=None, truth_file_paths=None): return self.scrape_urls(urls or []) + self.load_truth_files(truth_file_paths or []) diff --git a/main.py b/main.py new file mode 100644 index 0000000..d33f287 --- /dev/null +++ b/main.py @@ -0,0 +1,5 @@ +from Halgorithem.main import main + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 59c494a..411dad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,19 +7,22 @@ name = 
"halgorithem" version = "1.0.0" description = "Deterministic hallucination detection against trusted source material." readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { file = "LICENCE" } authors = [ { name = "Tangible Research" } ] dependencies = [ + "numpy<2", "beautifulsoup4>=4.12", "clean-text>=0.6", + "fastcoref>=2.1", "html2text>=2024.2.26", "markdown-it-py>=3", "negspacy>=1.0", "nltk>=3.8", "openai>=1.0", + "pydantic>=2", "pysbd>=0.3", "quantulum3[classifier]>=0.9", "requests>=2.31", @@ -27,13 +30,20 @@ dependencies = [ "scikit-learn>=1.3", "sentence-transformers>=2.7", "spacy>=3.7", + "spacy-curated-transformers>=0.3", "sympy>=1.12", "textacy>=0.13", "stemming>=1.0", + "torch>=2", + "transformers>=4", ] +[project.scripts] +halgorithem = "Halgorithem.main:main" + [project.optional-dependencies] dev = ["pytest>=8"] +advanced = ["spacy-curated-transformers>=0.3"] [tool.setuptools.packages.find] include = ["Halgorithem*"] diff --git a/requirements.txt b/requirements.txt index e07f5a7..17e07b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,13 @@ beautifulsoup4>=4.12 +fastcoref>=2.1 +numpy<2 clean-text>=0.6 html2text>=2024.2.26 markdown-it-py>=3 negspacy>=1.0 nltk>=3.8 openai>=1.0 +pydantic>=2 pysbd>=0.3 quantulum3[classifier]>=0.9 requests>=2.31 @@ -12,12 +15,15 @@ rich>=13 scikit-learn>=1.3 sentence-transformers>=2.7 spacy>=3.7 +spacy-curated-transformers>=0.3 sympy>=1.12 textacy>=0.13 stemming>=1.0 +torch>=2 +transformers>=4 # Optional but recommended for best NLP accuracy: -# python -m spacy download en_core_web_lg +# python -m spacy download en_core_web_trf # # CI and lightweight installs may use: # python -m spacy download en_core_web_sm diff --git a/tests/__pycache__/test_halgorithem.cpython-312-pytest-9.0.3.pyc b/tests/__pycache__/test_halgorithem.cpython-312-pytest-9.0.3.pyc deleted file mode 100644 index 1f34cf1..0000000 Binary files a/tests/__pycache__/test_halgorithem.cpython-312-pytest-9.0.3.pyc 
and /dev/null differ diff --git a/tests/test_core_pipeline.py b/tests/test_core_pipeline.py new file mode 100644 index 0000000..cdacd4d --- /dev/null +++ b/tests/test_core_pipeline.py @@ -0,0 +1,65 @@ +from Halgorithem.checks.nli import rule_nli +from Halgorithem.checks.units import normalize_units, unit_representation_mismatch +from Halgorithem.ingest import ingest_document +from Halgorithem.model_runtime import RebelClaimExtractor, dedupe_claims +from Halgorithem.models import AtomicClaim + + +def test_rule_claim_extractor_normalizes_short_location_relation(): + extractor = RebelClaimExtractor(model_name="rule") + + claims = extractor.extract("Lima is in Peru.") + + assert len(claims) == 1 + assert claims[0].subject.lower() == "lima" + assert claims[0].relation == "located" + assert claims[0].object.lower() == "peru" + + +def test_rule_nli_contradicts_short_and_explicit_location_mismatch(): + verdict, confidence = rule_nli("Lima is in Peru.", "Lima is located in Japan.") + + assert verdict == "CONTRADICT" + assert confidence >= 0.8 + + +def test_unit_normalization_matches_equivalent_rewrites(): + normalized, changes = normalize_units("The Mars sample container has a mass of 10000 grams.") + + assert "10 kilogram" in normalized + assert changes[0]["original"] == "10000 grams" + assert changes[0]["normalized"] == "10 kilogram" + + +def test_unit_representation_mismatch_flags_equivalent_rewrite(): + mismatch = unit_representation_mismatch( + "The Mars sample container has a mass of 10 kilograms.", + "The Mars sample container has a mass of 10000 grams.", + ) + + assert mismatch["source"] == "10 kilograms" + assert mismatch["response"] == "10000 grams" + assert mismatch["normalized"] == "10 kilogram" + + +def test_rebel_triplet_filter_rejects_malformed_artifacts(): + claims = dedupe_claims( + [ + AtomicClaim(subject="nasa.basic", relation="owned by", object="nasa", text="bad"), + AtomicClaim(subject="nasa", relation="owner of", object="nasa.basic", text="bad"), + 
AtomicClaim(subject="Lima", relation="country", object="Peru", text="good"), + ] + ) + + assert [claim.text for claim in claims] == ["good"] + + +def test_ingest_records_source_quality(): + document = ingest_document( + "NASA launched the test mission.", + source_name="https://www.nasa.gov/example", + extractor=RebelClaimExtractor(model_name="rule"), + ) + + assert document.sentences[0].source == "https://www.nasa.gov/example" + assert document.sentences[0].source_quality > 0.7 diff --git a/tests/test_halgorithem.py b/tests/test_halgorithem.py index 9edfc89..4ed4300 100644 --- a/tests/test_halgorithem.py +++ b/tests/test_halgorithem.py @@ -1,8 +1,10 @@ import pytest +import tui from Halgorithem import Halgorithm from Halgorithem.claim_extraction import split_atomic_claims from Halgorithem.contradiction import find_contradiction +from Halgorithem.core import LocalEmbedder from Halgorithem.retrieval import rank_chunks @@ -56,6 +58,10 @@ def test_supported_claim(algo, docs): assert first_status(algo, docs, "BASIC was created in 1964.") == "SUPPORTED" +def test_semantic_paraphrase_support(algo, docs): + assert first_status(algo, docs, "John Kemeny developed BASIC at Dartmouth in 1964.") == "SUPPORTED" + + def test_weak_support(algo, docs): assert first_status(algo, docs, "BASIC helped beginners learn programming.") == "WEAK_SUPPORT" @@ -80,6 +86,11 @@ def test_unit_contradiction(algo, docs): assert result["reason"] == "Unit mismatch" +def test_unit_conversion_support(algo, docs): + result = algo.compare_to_docs(docs, "The sample has a mass of 10000 grams.")[0] + assert result["status"] == "SUPPORTED" + + def test_math_checks(algo): supported = algo.compare_to_docs("Math source.", "2 + 2 = 4.")[0] contradicted = algo.compare_to_docs("Math source.", "2 + 2 = 5.")[0] @@ -89,6 +100,19 @@ def test_math_checks(algo): assert malformed["status"] == "ERROR" +def test_percent_claims_are_verified_as_source_claims(algo): + docs = "The drug reduced mortality by 20 percent." 
+ result = algo.compare_to_docs(docs, "The drug reduced mortality by 20 percent.")[0] + assert result["type"] == "SOURCE" + assert result["status"] == "SUPPORTED" + + +def test_math_rejects_unsafe_symbols(algo): + result = algo.verify_math_claim("__import__('os') = 1") + assert result["status"] == "ERROR" + assert "unsupported symbols" in result["reason"] + + def test_temporal_warning(algo, docs): result = algo.compare_to_docs(docs, "The current status of Project Helios is active.")[0] assert result["warning"] == "Time-sensitive claim" @@ -119,6 +143,10 @@ def test_compare_to_files(algo, tmp_path): def test_runtime_hardening_errors(algo, tmp_path): + with pytest.raises(ValueError): + Halgorithm(sentences_per_chunk=0, sentence_overlap=0) + with pytest.raises(ValueError): + Halgorithm(sentences_per_chunk=1, sentence_overlap=1) with pytest.raises(FileNotFoundError): algo.compare_to_files([str(tmp_path / "missing.txt")], "A claim.") with pytest.raises(ValueError): @@ -126,3 +154,76 @@ def test_runtime_hardening_errors(algo, tmp_path): with pytest.raises(ValueError): algo.compare_to_docs([{"file_path": "bad"}], "A claim.") assert algo.compare_to_docs("A source.", "") == [] + + +def test_location_mismatch(algo): + docs = "Lima, Peru, 9752000. Tokyo, Japan, 14000000." + result = algo.compare_to_docs(docs, "Lima is located in Japan.")[0] + assert result["status"] == "CONTRADICTION" + assert result["reason"] == "Location mismatch" + + +def test_missing_table_entity_with_number_is_hallucination(algo): + docs = "Product A price 19 USD. Product B price 25 USD." + result = algo.compare_to_docs(docs, "Product C has price 31 USD.")[0] + assert result["status"] == "HALLUCINATION" + assert "c" in result["unsupported_terms"] + + +def test_britannica_nehru_paraphrase_with_lexical_fallback(): + algo = Halgorithm(sentences_per_chunk=2, sentence_overlap=1, embedder=LocalEmbedder()) + docs = ( + "Midnight on August 14-15, 1947, was a landmark moment. 
" + "Shortly before the stroke of midnight on August 14, India's first prime minister, " + "Jawaharlal Nehru, made a famous speech entitled A Tryst with Destiny." + ) + claim = ( + "India's first prime minister, Jawaharlal Nehru, delivered his renowned " + "Tryst with Destiny speech shortly before midnight on August 14, 1947." + ) + result = algo.compare_to_docs(docs, claim)[0] + assert result["status"] in {"SUPPORTED", "WEAK_SUPPORT"} + assert result["status"] != "HALLUCINATION" + + +def test_semantic_paraphrase_does_not_require_nearby_year(algo): + docs = ( + "Shortly before the stroke of midnight on August 14, India's first prime minister, " + "Jawaharlal Nehru, made a famous speech entitled A Tryst with Destiny." + ) + claim = ( + "India's first prime minister, Jawaharlal Nehru, delivered his renowned " + "Tryst with Destiny speech shortly before midnight on August 14, 1947." + ) + result = algo.compare_to_docs(docs, claim)[0] + assert result["status"] in {"SUPPORTED", "WEAK_SUPPORT"} + + +def test_result_exposes_embedder_diagnostics(): + algo = Halgorithm(embedder=LocalEmbedder()) + result = algo.compare_to_docs("BASIC was created in 1964.", "BASIC was created in 1964.")[0] + assert result["embedder"] == "lexical" + assert result["embedding_model"] == "HashingVectorizer" + + +def test_tui_api_key_prompt_is_visible(monkeypatch, tmp_path): + answers = iter([ + "test-key", + "files", + str(tmp_path / "source.txt"), + "0.30", + "2", + "1", + "What is true?", + ]) + prompt_kwargs = [] + + (tmp_path / "source.txt").write_text("A source.", encoding="utf-8") + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setattr(tui.console, "clear", lambda: None) + monkeypatch.setattr(tui.Prompt, "ask", lambda *args, **kwargs: prompt_kwargs.append(kwargs) or next(answers)) + + config = tui.collect_inputs() + + assert config["prompt"] == "What is true?" 
+ assert prompt_kwargs[0].get("password") is None diff --git a/tests/test_voting.py b/tests/test_voting.py new file mode 100644 index 0000000..4c6a793 --- /dev/null +++ b/tests/test_voting.py @@ -0,0 +1,74 @@ +import pytest + +from Halgorithem.models import AtomicCheck, AtomicClaimResult, NLICheck, SimilarityCheck +from Halgorithem.voting import fuse_votes + + +def test_voting_supports_strong_entailment(): + verdict, confidence = fuse_votes( + SimilarityCheck(score=0.82, evidence="source"), + NLICheck(verdict="ENTAIL", confidence=0.91, evidence="source"), + AtomicCheck(claims=[AtomicClaimResult(claim="A", verdict="ENTAIL", confidence=0.88)], score=0.88), + ) + assert verdict == "SUPPORTED" + assert confidence >= 0.85 + + +def test_voting_discards_weak_nli(): + verdict, confidence = fuse_votes( + SimilarityCheck(score=0.78, evidence="source"), + NLICheck(verdict="CONTRADICT", confidence=0.40, evidence="source"), + AtomicCheck(claims=[], score=None), + ) + assert verdict == "SUPPORTED" + assert confidence == pytest.approx(0.78) + + +def test_voting_hallucinates_confident_contradiction(): + verdict, confidence = fuse_votes( + SimilarityCheck(score=0.76, evidence="source"), + NLICheck(verdict="CONTRADICT", confidence=0.93, evidence="source"), + AtomicCheck(claims=[], score=None), + ) + assert verdict == "HALLUCINATED" + assert confidence >= 0.93 + + +def test_voting_does_not_let_contested_nli_override_win_alone(): + verdict, confidence = fuse_votes( + SimilarityCheck(score=0.94, evidence="source"), + NLICheck(verdict="CONTRADICT", confidence=0.93, evidence="source"), + AtomicCheck( + claims=[AtomicClaimResult(claim="A", verdict="ENTAIL", confidence=0.92)], + score=0.92, + ), + ) + assert verdict == "UNVERIFIABLE" + assert confidence > 0.9 + + +def test_voting_source_quality_scales_similarity_weight_only(): + trusted_verdict, trusted_confidence = fuse_votes( + SimilarityCheck(score=0.82, evidence="source", source_quality=0.95), + NLICheck(verdict="NEUTRAL", 
confidence=0.7, evidence="source"), + AtomicCheck(claims=[], score=None), + ) + weak_verdict, weak_confidence = fuse_votes( + SimilarityCheck(score=0.82, evidence="source", source_quality=0.25), + NLICheck(verdict="NEUTRAL", confidence=0.7, evidence="source"), + AtomicCheck(claims=[], score=None), + ) + + assert trusted_verdict == "UNVERIFIABLE" + assert weak_verdict == "UNVERIFIABLE" + assert trusted_confidence < weak_confidence + + +def test_voting_unverifiable_when_only_weak_signals_exist(): + verdict, confidence = fuse_votes( + SimilarityCheck(score=0.22), + NLICheck(verdict="NEUTRAL", confidence=0.52), + AtomicCheck(claims=[], score=None), + ) + assert verdict == "UNVERIFIABLE" + assert confidence == 0.0 diff --git a/tui.py b/tui.py index 2949af1..3d41e58 100644 --- a/tui.py +++ b/tui.py @@ -53,7 +53,7 @@ def collect_inputs(): if existing_key: console.print(f"[dim]Using existing OPENAI_API_KEY ({existing_key[:4]}...)[/dim]") else: - api_key = Prompt.ask("[bold green]OpenAI API key[/bold green]", password=True) + api_key = Prompt.ask("[bold green]OpenAI API key[/bold green]") if not api_key.strip(): console.print("[red]No API key provided. Exiting.[/red]") raise SystemExit(1) @@ -186,6 +186,13 @@ def render_results(source_docs, ai_output, verification, config): config_table.add_row("Sentences/chunk", str(config["sentences_per_chunk"])) config_table.add_row("Overlap", str(config["sentence_overlap"])) config_table.add_row("Sources", str(len(source_docs))) + diagnostics = verification.get("diagnostics") or {} + if diagnostics: + config_table.add_row("Embedder", str(diagnostics.get("embedder", "unknown"))) + if diagnostics.get("embedding_model"): + config_table.add_row("Embedding model", str(diagnostics["embedding_model"])) + if diagnostics.get("embedding_fallback_reason"): + config_table.add_row("Fallback", str(diagnostics["embedding_fallback_reason"])[:80]) console.print(config_table) console.print()