Spaces:

vladbogo
/

Filtir

Sleeping

File size: 4,574 Bytes

from step1_api_claim_extractor import ClaimExtractor
from step2_api_fix_passage_anchors import FixAnchors
from step3_api_identify_objective_claims import ClassifyClaims
from step41_api_fetch_cohere_wikipedia_evidence import CohereEvidence
from step42_api_fetch_google_search_evidence import GoogleEvidence
from step5_api_embed_search_results import EmbedResults
from step6_api_claims_to_evidence import ClaimToEvidence
from step7_api_check_claims_against_evidence import CheckClaimAgainstEvidence
from step8_api_format_fact_checked_document import FormatDocument

import argparse
import json
import os
import copy
from dotenv import load_dotenv

# Module-level side effect: runs python-dotenv's load_dotenv() at import time,
# before any pipeline step is constructed.
# NOTE(review): presumably this supplies the API keys the step classes read
# from the environment -- confirm against the step modules.
load_dotenv()


def get_fact_checked(text_input, model="gpt-3.5-turbo", mode="slow"):
    """Run the step1..step8 fact-checking pipeline over ``text_input``.

    The pipeline extracts claims, fixes their passage anchors, classifies them
    and keeps the objective ones, gathers evidence (``CohereEvidence`` always,
    ``GoogleEvidence`` only when ``mode == "slow"``), embeds the evidence,
    links claims to evidence, checks the claims and finally formats the
    document in both "verbose" and "terse" footnote styles.

    Args:
        text_input: Text to fact-check; leading/trailing whitespace is stripped.
        model: Model name passed to every step that accepts a ``model`` argument.
        mode: ``"slow"`` additionally gathers Google search evidence; any other
            value skips that step.

    Returns:
        A dict holding a deep copy of each step's output (keys such as
        ``"step1_claims"``, ``"step7_evaluated_claims"``) plus
        ``"fact_checked_md"`` and ``"fact_checked_terse"``. When no objective
        claims are found, a dict containing only ``"fact_checked_md"`` with an
        explanatory message is returned instead.

    Raises:
        Exception: Whatever a step raises is propagated, except that a step-2
            failure is retried once with gpt-4 (when ``model`` is not already
            gpt-4) and a Google-search failure is logged and ignored.
    """
    text_input = text_input.strip()

    results = {}

    # STEP1
    print("Step1: Extracting claims")
    step1 = ClaimExtractor(model=model)
    step1_json = step1.extract_claims(text_input)
    results["step1_claims"] = copy.deepcopy(step1_json)

    # STEP2
    print("Step2: Anchoring claims")
    try:
        step2 = FixAnchors(model=model)
        step2_json = step2.fix_passage_anchors(step1_json, text_input)
    except Exception as e:
        # There is no stronger model to fall back to: surface the real error
        # instead of continuing with an unbound ``step2_json``.
        if model == "gpt-4":
            raise
        print(f"Step2 failed with {model} ({e}), trying with gpt-4!")
        step2 = FixAnchors(model="gpt-4")
        step2_json = step2.fix_passage_anchors(step1_json, text_input)
    results["step2_anchored_claims"] = copy.deepcopy(step2_json)

    # STEP3
    print("Step3: Classifying claims")
    step3 = ClassifyClaims(model=model)
    step3_json = step3.classify_claims(step2_json)
    step3_filter = step3.filter_to_objective_claims(step3_json)
    results["step3_classify_claims"] = copy.deepcopy(step3_json)
    results["step3_objective_claims"] = copy.deepcopy(step3_filter)

    # Nothing to verify: skip the evidence-gathering steps entirely.
    if len(step3_filter) == 0:
        return {"fact_checked_md": "No objective claims found!"}

    # STEP4.1
    print("Step4.1: Gathering evidence")
    step4_cohere = CohereEvidence()
    step4_json_cohere = (
        step4_cohere.fetch_cohere_semantic_search_results_to_gather_evidence(
            step3_filter
        )
    )
    results["step41_cohere_evidence"] = copy.deepcopy(step4_json_cohere)

    # STEP4.2
    print("Step4.2: Gathering evidence")
    step4_json_google = None
    if mode == "slow":
        # Best effort: on failure the empty string is recorded and the
        # pipeline continues with the Cohere evidence only.
        step4_json_google = ""
        try:
            step4_google = GoogleEvidence(model=model)
            step4_json_google = step4_google.fetch_search_results_to_gather_evidence(
                step3_filter
            )
        except Exception as e:
            print(f"Google search failed: {e}")
        results["step42_google_evidence"] = copy.deepcopy(step4_json_google)

    embedding_model = "text-embedding-ada-002"
    text_embedding_chunk_size = 500

    # Only non-empty Google results are embedded alongside the Cohere ones.
    srcs = [step4_json_cohere]
    if step4_json_google:
        srcs.append(step4_json_google)

    # STEP 5
    print("Step5: Embedding evidence")
    step5 = EmbedResults(
        embedding_model=embedding_model,
        text_embedding_chunk_size=text_embedding_chunk_size,
    )
    faiss_db = step5.embed_for_uuid(srcs)

    # STEP 6
    print("Step6: Linking claims to evidence")
    step6 = ClaimToEvidence()
    step6_json = step6.link_claims_to_evidence(step3_filter, faiss_db)
    results["step6_claim_to_evidence"] = copy.deepcopy(step6_json)

    # STEP 7
    print("Step7: Checking claims against evidence")
    step7 = CheckClaimAgainstEvidence(model=model)
    step7_json = step7.check_claims_against_evidence(step6_json)
    results["step7_evaluated_claims"] = copy.deepcopy(step7_json)

    # STEP 8
    print("Step8: Formatting")
    step8 = FormatDocument(model=model, footnote_style="verbose")
    step8_md = step8.reformat_document_to_include_claims(
        text_input, step7_json, footnote_style="verbose"
    )
    step8_md_terse = step8.reformat_document_to_include_claims(
        text_input, step7_json, footnote_style="terse"
    )

    results["fact_checked_md"] = copy.deepcopy(step8_md)
    results["fact_checked_terse"] = copy.deepcopy(step8_md_terse)
    return results


def main(args):
    """Fact-check the file named by ``args.file`` and print the result.

    Reads the whole file, runs ``get_fact_checked`` in "slow" mode with
    ``args.model``, and prints the ``"fact_checked_md"`` entry of its output.
    """
    with open(args.file, "r") as source:
        document = source.read()

    report = get_fact_checked(document, model=args.model, mode="slow")
    markdown = report["fact_checked_md"]
    print(markdown)


if __name__ == "__main__":
    # Script entry point: --file and --model are both required string options.
    parser = argparse.ArgumentParser(description="Process a file.")
    for flag, help_text in (
        ("--file", "File to process"),
        ("--model", "Model to use"),
    ):
        parser.add_argument(flag, type=str, help=help_text, required=True)
    args = parser.parse_args()
    main(args)