File size: 4,574 Bytes
7a8b33f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e56b403
 
 
 
 
 
 
7a8b33f
e56b403
 
7a8b33f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from step1_api_claim_extractor import ClaimExtractor
from step2_api_fix_passage_anchors import FixAnchors
from step3_api_identify_objective_claims import ClassifyClaims
from step41_api_fetch_cohere_wikipedia_evidence import CohereEvidence
from step42_api_fetch_google_search_evidence import GoogleEvidence
from step5_api_embed_search_results import EmbedResults
from step6_api_claims_to_evidence import ClaimToEvidence
from step7_api_check_claims_against_evidence import CheckClaimAgainstEvidence
from step8_api_format_fact_checked_document import FormatDocument

import argparse
import json
import os
import copy
from dotenv import load_dotenv

# Load settings from a .env file into the process environment at import time.
# NOTE(review): presumably this provides the API credentials used by the
# pipeline steps imported above — confirm against those modules.
load_dotenv()


def _anchor_claims_with_fallback(claims, text_input, model):
    """Run step 2 (anchor fixing), retrying once with gpt-4 on failure.

    Args:
        claims: Output of the claim-extraction step.
        text_input: The stripped source text the claims were extracted from.
        model: Name of the model to try first.

    Returns:
        Whatever ``FixAnchors.fix_passage_anchors`` returns.

    Raises:
        Exception: The original error when ``model`` is already ``"gpt-4"``
            (there is nothing to fall back to), or the error raised by the
            gpt-4 retry itself.
    """
    try:
        return FixAnchors(model=model).fix_passage_anchors(claims, text_input)
    except Exception as err:
        # Only ``Exception`` is caught so Ctrl-C / SystemExit still propagate.
        if model == "gpt-4":
            # No stronger model to retry with: surface the real failure
            # instead of continuing with an unbound result.
            raise
        print(f"Step2 failed with {model}, trying with gpt-4! ({err})")
        return FixAnchors(model="gpt-4").fix_passage_anchors(claims, text_input)


def get_fact_checked(text_input: str, model: str = "gpt-3.5-turbo", mode: str = "slow") -> dict:
    """Run the fact-checking pipeline (steps 1-8) over ``text_input``.

    Each step prints a progress line to stdout, and a deep copy of its output
    is stored in the returned dictionary so later steps cannot mutate it.

    Args:
        text_input: Text to fact-check; surrounding whitespace is stripped.
        model: Model name passed to every step that accepts one. If step 2
            fails with a model other than ``"gpt-4"`` it is retried once
            with ``"gpt-4"``.
        mode: When equal to ``"slow"`` the Google search evidence step (4.2)
            is also run; any other value skips it.

    Returns:
        A dict of intermediate and final results with the keys
        ``step1_claims``, ``step2_anchored_claims``, ``step3_classify_claims``,
        ``step3_objective_claims``, ``step41_cohere_evidence``,
        ``step42_google_evidence`` (only in ``"slow"`` mode),
        ``step6_claim_to_evidence``, ``step7_evaluated_claims``,
        ``fact_checked_md`` and ``fact_checked_terse``.
        If step 3 yields no objective claims, a dict containing only
        ``fact_checked_md`` with an explanatory message is returned instead.

    Raises:
        Exception: Any error raised by a pipeline step, except a failure of
            the Google search step, which is reported and then ignored.
    """
    text_input = text_input.strip()

    results = {}

    # STEP1
    print("Step1: Extracting claims")
    step1 = ClaimExtractor(model=model)
    step1_json = step1.extract_claims(text_input)
    results["step1_claims"] = copy.deepcopy(step1_json)

    # STEP2
    print("Step2: Anchoring claims")
    step2_json = _anchor_claims_with_fallback(step1_json, text_input, model)
    results["step2_anchored_claims"] = copy.deepcopy(step2_json)

    # STEP3
    print("Step3: Classifying claims")
    step3 = ClassifyClaims(model=model)
    step3_json = step3.classify_claims(step2_json)
    step3_filter = step3.filter_to_objective_claims(step3_json)
    results["step3_classify_claims"] = copy.deepcopy(step3_json)
    results["step3_objective_claims"] = copy.deepcopy(step3_filter)

    if not step3_filter:
        # Nothing to verify; the single-key shape is kept for existing callers.
        return {"fact_checked_md": "No objective claims found!"}

    # STEP4.1
    print("Step4.1: Gathering evidence")
    step4_cohere = CohereEvidence()
    step4_json_cohere = (
        step4_cohere.fetch_cohere_semantic_search_results_to_gather_evidence(
            step3_filter
        )
    )
    results["step41_cohere_evidence"] = copy.deepcopy(step4_json_cohere)

    # STEP4.2
    print("Step4.2: Gathering evidence")
    step4_json_google = None
    if mode == "slow":
        # Empty string is the recorded value when the search fails.
        step4_json_google = ""
        try:
            step4_google = GoogleEvidence(model=model)
            step4_json_google = step4_google.fetch_search_results_to_gather_evidence(
                step3_filter
            )
        except Exception as e:
            # Best effort: the pipeline continues with Cohere evidence only.
            print(f"Google search failed: {e}")
        results["step42_google_evidence"] = copy.deepcopy(step4_json_google)

    embedding_model = "text-embedding-ada-002"
    text_embedding_chunk_size = 500

    # Google evidence is only embedded when the search actually returned some.
    srcs = [step4_json_cohere]
    if step4_json_google:
        srcs.append(step4_json_google)

    # STEP 5
    print("Step5: Embedding evidence")
    step5 = EmbedResults(
        embedding_model=embedding_model,
        text_embedding_chunk_size=text_embedding_chunk_size,
    )
    faiss_db = step5.embed_for_uuid(srcs)

    # STEP 6
    print("Step6: Linking claims to evidence")
    step6 = ClaimToEvidence()
    step6_json = step6.link_claims_to_evidence(step3_filter, faiss_db)
    results["step6_claim_to_evidence"] = copy.deepcopy(step6_json)

    # STEP 7
    print("Step7: Checking claims against evidence")
    step7 = CheckClaimAgainstEvidence(model=model)
    step7_json = step7.check_claims_against_evidence(step6_json)
    results["step7_evaluated_claims"] = copy.deepcopy(step7_json)

    # STEP 8
    print("Step8: Formatting")
    step8 = FormatDocument(model=model, footnote_style="verbose")
    step8_md = step8.reformat_document_to_include_claims(
        text_input, step7_json, footnote_style="verbose"
    )
    step8_md_terse = step8.reformat_document_to_include_claims(
        text_input, step7_json, footnote_style="terse"
    )

    results["fact_checked_md"] = copy.deepcopy(step8_md)
    results["fact_checked_terse"] = copy.deepcopy(step8_md_terse)
    return results


def main(args):
    """Fact-check the file named by ``args.file`` and print the result.

    The file is read in full, passed to ``get_fact_checked`` in ``"slow"``
    mode with the model from ``args.model``, and the ``fact_checked_md``
    entry of the returned dictionary is printed to stdout.
    """
    with open(args.file, mode="r") as source_file:
        document_text = source_file.read()

    report = get_fact_checked(document_text, model=args.model, mode="slow")
    markdown = report["fact_checked_md"]
    print(markdown)


if __name__ == "__main__":
    # Command-line entry point: --file and --model are both mandatory.
    cli = argparse.ArgumentParser(description="Process a file.")
    for flag, help_text in (
        ("--file", "File to process"),
        ("--model", "Model to use"),
    ):
        cli.add_argument(flag, type=str, help=help_text, required=True)
    args = cli.parse_args()
    main(args)