File size: 8,829 Bytes
7a8b33f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import argparse
import json
import multiprocessing as mp
from zsvision.zs_multiproc import starmap_with_kwargs
from zsvision.zs_utils import BlockTimer
from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim
from llm_api_utils import (
call_openai_with_exponetial_backoff,
estimate_cost_of_text_generation_api_call,
init_openai_with_api_key,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
class ClaimExtractor:
    """Extract discrete factual claims from free text using an OpenAI chat model.

    The input text is split into chunks, each chunk is sent to the model with a
    few-shot prompt, and every response line is parsed into a claim paired with
    the passage quote that supports it.
    """

    # Bracketed tag forms that a response line is normalised to before parsing.
    _QUOTE_TAG = "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]"
    _CLAIM_TAG = "[CLAIM]"

    # System message sent with every request.
    _PERSONA = "You are a careful research assistant who helps with fact-checking and editing informative articles."

    def __init__(
        self,
        temperature=0,
        model="gpt-3.5-turbo",
        filter_str="",
        processes=1,
        refresh=False,
    ):
        """Store the extraction settings on the instance.

        Args:
            temperature: Sampling temperature forwarded to the chat model.
            model: Name of the chat model to query.
            filter_str: Stored on the instance; not read by any method of
                this class.
            processes: Number of worker processes. ``1`` runs the passages
                sequentially in the current process; any other value uses a
                ``multiprocessing.Pool`` of that size.
            refresh: Stored on the instance; not read by any method of
                this class.
        """
        self.temperature = temperature
        self.model = model
        self.filter_str = filter_str
        self.processes = processes
        self.refresh = refresh

    @staticmethod
    def _build_prompt(passage: str) -> str:
        """Return the few-shot user prompt for extracting claims from ``passage``."""
        # NOTE: the template body sits at column 0 on purpose -- any leading
        # indentation would become part of the prompt text. Every example quote
        # below must remain an exact substring of the example passage, because
        # the prompt demands precisely that of the model.
        return f"""\
Task:
Enumerate all the discrete factual claims or logical assertions stated in the passage that follows the dashed horizontal line below. \
To allow the claims to be linked to the passage, use the format: `VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: <verbatim passage quote for claim>, CLAIM: <claim>` on each line. \
The <verbatim passage quote for claim> must be A SINGLE UNEDITED SUBSTRING from the passage that uniquely identifies the claim. \
The <verbatim passage quote for claim> must carefully preserve all punctuation and clauses from the original passage. \
This text will be used in the final national exam.
----------
Here is an example passage, together with the verbatim passage quotes and claims that should be extracted from it:
Passage:
Immanuel Kant was born in 1724 into a modest, devoutly religious family, with his father working as a saddle-maker. \
He was one of nine children, but only five, including Kant, survived to adulthood. \
His upbringing was steeped in the Pietist tradition, emphasizing intense religious devotion, a literal interpretation of the Bible, and a strong focus on personal morality. \
Kant attended the University of Königsberg, studying various subjects, including theology, metaphysics, and natural science. \
After completing his studies, Kant worked as a private tutor for nine years before returning to the University of Königsberg as a lecturer in 1755. \
In his works Groundwork of the Metaphysics of Morals (1785) and Critique of Practical Reason (1788), Kant argues that morality is not contingent upon personal desires or cultural norms. \
Extracted source phrases and claims:
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born in 1724.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a modest family.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a devoutly religious family.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] with his father working as a saddle-maker [CLAIM] Immanuel Kant's father worked as a saddle-maker.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] He was one of nine children [CLAIM] Immanuel Kant was one of nine children.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] but only five, including Kant, survived to adulthood [CLAIM] Only five of Immanuel Kant's parents' children survived to adulthood.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] His upbringing was steeped in the Pietist tradition [CLAIM] Immanuel Kant's upbringing was steeped in the Pietist tradition.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] emphasizing intense religious devotion [CLAIM] Immanuel Kant's upbringing emphasized intense religious devotion.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a literal interpretation of the Bible [CLAIM] Immanuel Kant's upbringing emphasized a literal interpretation of the Bible.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a strong focus on personal morality [CLAIM] Immanuel Kant's upbringing emphasized a strong focus on personal morality.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Kant attended the University of Königsberg [CLAIM] Immanuel Kant attended the University of Königsberg.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied theology.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied metaphysics.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied natural science.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies [CLAIM] Immanuel Kant completed his studies.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies, Kant worked as a private tutor for nine years [CLAIM] After completing his studies, Immanuel Kant worked as a private tutor.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] before returning to the University of Königsberg as a lecturer in 1755 [CLAIM] Immanuel Kant returned to the University of Königsberg as a lecturer in 1755.
----------
Passage:
{passage}
Extracted source phrases and claims:\
"""

    @staticmethod
    def _normalise_claim_line(line: str):
        """Rewrite one response line into the bracketed tag format.

        The prompt's instructions describe a ``TAG: value`` layout while its
        examples use ``[TAG] value``, so the model may answer in either form.
        Unbracketed tags are rewritten to the bracketed form here.

        Returns:
            The normalised line, or ``None`` when the line carries no quote
            tag (e.g. blank lines or model chatter) and should be skipped.
        """
        line = line.strip()
        if ClaimExtractor._QUOTE_TAG not in line:
            line = line.replace(
                "VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: ",
                "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]: ",
            )
        if ClaimExtractor._CLAIM_TAG not in line:
            line = line.replace(" CLAIM:", " [CLAIM]:")
        if ClaimExtractor._QUOTE_TAG not in line:
            return None
        return line.strip()

    def extract_claims_from_passage(
        self,
        idx: int,
        total: int,
        passage: str,
    ) -> dict:
        """Query the chat model for the claims made in a single passage.

        Args:
            idx: Zero-based position of the passage; used for progress output.
            total: Total number of passages; used for progress output.
            passage: Passage text to extract claims from.

        Returns:
            A dict with two keys: ``"claims"``, a list of the parsed claim
            records, each extended with an ``"is_unique_and_verbatim"`` entry
            holding the result of ``is_unique_verbatim_quote`` for its quote,
            and ``"cost"``, the estimated cost of the API call.
        """
        # Called on every invocation (not once in __init__) so that it also
        # runs inside pool worker processes.
        init_openai_with_api_key()
        print(f"Processing passage {idx + 1} of {total}")

        messages = [
            {"role": "system", "content": self._PERSONA},
            {"role": "user", "content": self._build_prompt(passage)},
        ]
        with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
            response = call_openai_with_exponetial_backoff(
                model=self.model,
                temperature=self.temperature,
                messages=messages,
            )
        cost = estimate_cost_of_text_generation_api_call(
            model=self.model, response=response, verbose=True
        )

        content = response.choices[0].message.content.strip()
        parsed_claims = []
        for raw_line in content.split("\n"):
            line = self._normalise_claim_line(raw_line)
            if line is None:
                continue
            parsed = parse_passage_quote_and_claim(line)
            parsed["is_unique_and_verbatim"] = is_unique_verbatim_quote(
                verbatim_quote=parsed["verbatim_quote"], original_passage=passage
            )
            parsed_claims.append(parsed)
        return {"claims": parsed_claims, "cost": cost}

    def extract_claims(self, text_input) -> list:
        """Extract claims from ``text_input`` and return them as a list.

        The text is split into chunks of at most 1000 characters with no
        overlap; each chunk is processed by ``extract_claims_from_passage``,
        sequentially when ``self.processes == 1`` and in a process pool
        otherwise.

        Args:
            text_input: The text to extract claims from.

        Returns:
            The claim records of all chunks, concatenated in chunk order of
            the collected results.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.create_documents([text_input])
        print(f"Split text into {len(docs)} documents")

        total = len(docs)
        kwarg_list = [
            {
                "idx": idx,
                "total": total,
                # remove newlines from the passage to avoid a confusing prompt format
                "passage": doc.page_content.replace("\n", " "),
            }
            for idx, doc in enumerate(docs)
        ]

        if self.processes == 1:
            results = [
                self.extract_claims_from_passage(**kwargs) for kwargs in kwarg_list
            ]
        else:  # multiprocess
            with mp.Pool(processes=self.processes) as pool:
                results = starmap_with_kwargs(
                    pool=pool,
                    func=self.extract_claims_from_passage,
                    kwargs_iter=kwarg_list,
                )

        cost = sum(result["cost"] for result in results)
        all_claims = []
        for result in results:
            all_claims.extend(result["claims"])
        print(f"Returning {len(all_claims)} claims (cost: {cost} USD)")
        return all_claims
|