Synced repo using 'sync_with_huggingface' Github Action
Files changed:
- __pycache__/fact_checker.cpython-310.pyc +0 -0
- app.py +75 -28
- assets/overall.png +2 -2
- fact_checker.py +135 -28
- visualize.ipynb +0 -0
__pycache__/fact_checker.cpython-310.pyc
CHANGED
Binary files a/__pycache__/fact_checker.cpython-310.pyc and b/__pycache__/fact_checker.cpython-310.pyc differ
app.py
CHANGED
@@ -154,34 +154,81 @@ button[kind="primary"] {
 st.session_state.last_claim = claim
 st.session_state.feedback_submitted = False  # Reset feedback state for new claim
 
-[28 removed lines: the previous results-display block; only a truncated st.markdown(f"{ fragment is recoverable from this render]
+# Display results from session state
+if st.session_state.result:
+    result = st.session_state.result
+
+    # Show entity verification results
+    st.subheader("Entity Verification Results")
+    entities = result.get("entities", [])
+    if entities:
+        for idx, entity_result in enumerate(entities, 1):
+            st.markdown(f"### Entity {idx}: {entity_result.get('entity', '')} ({entity_result.get('type', '')})")
+
+            if "error" in entity_result:
+                st.error(f"Error: {entity_result['error']}")
+                if "raw_response" in entity_result:
+                    with st.expander("Show raw LLM response"):
+                        st.code(entity_result["raw_response"])
+                continue
+
+            verdict_color = {
+                "Valid": "green",
+                "Invalid": "red",
+                "Unverified": "orange"
+            }.get(entity_result.get("verdict", ""), "gray")
+            st.markdown(f"**Verdict:** :{verdict_color}[{entity_result.get('verdict', 'Unknown')}]")
+
+            # Confidence
+            st.metric("Confidence Score", f"{entity_result.get('confidence', 0):.2f}")
+
+            # Evidence
+            with st.expander("View Supporting Evidence"):
+                for i, evidence in enumerate(entity_result.get("evidence", []), 1):
+                    st.markdown(f"{i}. {evidence}")
+
+            # Reasoning
+            st.markdown("**Analysis:**")
+            st.write(entity_result.get("reasoning", "No reasoning provided"))
+    else:
+        st.write("No entities detected or verified.")
+
+    # Show claim verification results
+    st.subheader("Detected Claims and Verification Results")
+    claims = result.get("claims", [])
+    if not claims:
+        st.info("No check-worthy claims detected in the input.")
+    else:
+        for idx, claim_result in enumerate(claims, 1):
+            st.markdown(f"### Claim {idx}")
+            st.markdown(f"> {claim_result.get('claim', '')}")
+
+            if "error" in claim_result:
+                st.error(f"Error: {claim_result['error']}")
+                if "raw_response" in claim_result:
+                    with st.expander("Show raw LLM response"):
+                        st.code(claim_result["raw_response"])
+                continue
+
+            verdict_color = {
+                "True": "green",
+                "False": "red",
+                "Unverifiable": "orange"
+            }.get(claim_result.get("verdict", ""), "gray")
+            st.markdown(f"**Verdict:** :{verdict_color}[{claim_result.get('verdict', 'Unknown')}]")
+
+            # Confidence
+            st.metric("Confidence Score", f"{claim_result.get('confidence', 0):.2f}")
+
+            # Evidence
+            with st.expander("View Supporting Evidence"):
+                for i, evidence in enumerate(claim_result.get("evidence", []), 1):
+                    st.markdown(f"{i}. {evidence}")
+
+            # Reasoning
+            st.markdown("**Analysis:**")
+            st.write(claim_result.get("reasoning", "No reasoning provided"))
+
 
 # Feedback system
 feedback_key = f"feedback_radio_{st.session_state.last_claim}"

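Note: the new display block reads st.session_state.result and expects the dict returned by FactChecker.verify_claim (see the fact_checker.py diff below), with an "entities" list and a "claims" list. A minimal sketch of that shape for readers of the UI code; the variable name example_result and all values are invented for illustration only:

# Illustrative only: the dict shape the display block above iterates over.
# The keys mirror FactChecker.verify_claim() in the fact_checker.py diff below;
# the concrete values here are made up.
example_result = {
    "entities": [
        {
            "entity": "Eiffel Tower",   # spaCy entity text
            "type": "FAC",              # spaCy entity label
            "verdict": "Valid",         # UI colors Valid / Invalid / Unverified
            "confidence": 0.91,
            "evidence": ["The Eiffel Tower is located in Paris, France."],
            "reasoning": "Entity matches verified records.",
        }
    ],
    "claims": [
        {
            "claim": "The Eiffel Tower was completed in 1889.",
            "verdict": "True",          # UI colors True / False / Unverifiable
            "confidence": 0.87,
            "evidence": ["The Eiffel Tower opened in 1889."],
            "reasoning": "The completion year in the claim matches the evidence.",
        }
    ],
}

One thing to watch: the entity color map in app.py keys on Valid/Invalid/Unverified, while the entity-verification prompt in fact_checker.py asks the model for True/False/Unverified, so unmatched verdict strings fall back to gray.
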
assets/overall.png
CHANGED
Git LFS tracked image; the LFS pointer was updated (+2 -2). Old and new versions are not rendered here.
fact_checker.py
CHANGED
@@ -6,6 +6,9 @@ import re
 from openai import OpenAI
 import re
 import json
+import spacy
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+from transformers import pipeline
 
 def robust_json_extractor(response_content):
     # Preprocess: Remove markdown code blocks and extra whitespace
@@ -56,49 +59,55 @@ class FactChecker:
         )
         self.groq_client = groq_client
         self.model_name = "llama3-8b-8192"
+        self.ner = spacy.load("en_core_web_sm")
+
+
+        self.claim_tokenizer = T5Tokenizer.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
+        self.claim_model = T5ForConditionalGeneration.from_pretrained("Babelscape/t5-base-summarization-claim-extractor")
+
+    def extract_entities(self, text):
+        doc = self.ner(text)
+        return [(ent.text, ent.label_) for ent in doc.ents]
+
+    def extract_claims(self, text, threshold=0.5):
+        tok_input = self.claim_tokenizer.batch_encode_plus([text], return_tensors="pt", padding=True)
+        outputs = self.claim_model.generate(**tok_input)
+        claims = self.claim_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        claims = [claim.strip() for claim in claims if len(claim.strip()) > 0]
+        return claims
+
 
-    def ...   (old method signature truncated in this render)
-        # Vector search returns full verified statements with distances
+    def verify_single_claim(self, claim, confidence_threshold=0.5):
         results = self.collection.query(
             query_texts=[claim],
             n_results=3,
             include=["documents", "metadatas", "distances"]
         )
-
-        # Pair documents with their distances and sort by similarity (ascending distance)
         zipped_results = sorted(
             zip(results['documents'][0], results['metadatas'][0], results['distances'][0]),
             key=lambda x: x[2]
         )
-
-        # Format evidence with similarity scores (full sentences, not fragments)
         evidence = []
         for doc, meta, distance in zipped_results:
             source = meta["source"] if meta and "source" in meta else "Unknown source"
-            # Convert distance to similarity score (higher = more similar)
             similarity_score = 1 - (distance / 2)  # Assuming cosine distance in [0,2]
             evidence.append(
                 f'"{doc}" (Source: {source}, Similarity: {similarity_score:.2f})'
             )
-
-
-        # Calculate overall confidence
        avg_distance = sum(d for _, _, d in zipped_results) / len(zipped_results)
        confidence = 1 - (avg_distance / 2)  # Normalize to 0-1 range
 
-        # Threshold check
         if confidence < confidence_threshold:
             return {
                 "verdict": "Unverifiable",
                 "confidence": confidence,
                 "evidence": [e.split(" (Source:")[0] for e in evidence],
                 "reasoning": "Claim is too vague or lacks sufficient evidence"
             }
 
-        # LLM verification with distance-aware prompt
         evidence_str = "\n".join([f"- {e}" for e in evidence])
-        prompt = f"""
-Relying on the similarity scores, also carefully check whether all factual details in the claim (such as dates, names, locations, and events) exactly match
+        prompt = f"""You are a powerful fact checker. Analyze the claim below against the provided verified information.
+Relying on the similarity scores, also carefully check whether all factual details in the claim (such as dates, names, locations, and events) exactly match the evidence.
 If there is any factual mismatch (for example, the date in the claim is different from the evidence), classify the claim as False. Any factual mismatch, even if the overall context is similar, should lead to a False classification.
 If the evidence is too vague or lacks strong matches, classify as Unverifiable.
 If evidence directly contradicts the claim, classify as False.
@@ -113,7 +122,6 @@ Evidence (with similarity scores):
 
 Guidelines:
 1. Give more weight to evidence with higher similarity scores, but do not ignore factual mismatches.
-2. If any one piece of evidence independently supports the claim, without factual mismatches, classify as True.
 2. Pay close attention to details such as dates, names, locations, and events.
 3. If the claim and evidence differ on any factual point, do not classify as True.
 4. Respond only in JSON format without any additional text.
@@ -127,23 +135,14 @@ Respond in JSON format:
   "reasoning": "Explanation of the verdict based on evidence and factual details"
 }}
 """
-
-
         completion = self.groq_client.chat.completions.create(
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
             temperature=0.1,
             max_tokens=400
         )
-
-        # Process response
         response_content = completion.choices[0].message.content
-        print(f"Response from Groq: {response_content}")
-
-        # Use the robust JSON extractor
         parsed = robust_json_extractor(response_content)
-        print(f"Parsed JSON: {parsed}")
-
         if "error" in parsed:
             return {
                 "error": parsed["error"],
@@ -151,7 +150,6 @@ Respond in JSON format:
                 "raw_response": parsed.get("raw", response_content)
             }
         else:
-            # Validate required fields
             required_keys = ["verdict", "evidence", "reasoning"]
             if all(key in parsed for key in required_keys):
                 return {
@@ -165,4 +163,113 @@ Respond in JSON format:
                     "error": f"Missing required keys: {[k for k in required_keys if k not in parsed]}",
                     "confidence": confidence,
                     "raw_response": response_content
                 }
+
+    def verify_single_entity(self, entity_text, confidence_threshold=0.5):
+        """Verify a single named entity against the fact database"""
+        # Vector similarity search
+        results = self.collection.query(
+            query_texts=[entity_text],
+            n_results=3,
+            include=["documents", "metadatas", "distances"]
+        )
+
+        # Process evidence with similarity normalization
+        evidence = []
+        total_distance = 0
+        for doc, meta, distance in zip(results['documents'][0],
+                                       results['metadatas'][0],
+                                       results['distances'][0]):
+            similarity = 1 - (distance / 2)  # Convert cosine distance to similarity
+            evidence.append({
+                "text": doc,
+                "source": meta.get("source", "Unknown"),
+                "similarity": similarity
+            })
+            total_distance += distance
+
+        avg_similarity = 1 - (total_distance / len(results['distances'][0]) / 2)
+
+        # Prepare LLM verification prompt
+        evidence_str = "\n".join([
+            f"- {e['text']} (Similarity: {e['similarity']:.2f})"
+            for e in evidence
+        ])
+
+        prompt = f"""**Entity Verification Task**
+Entity: "{entity_text}"
+
+**Verified Evidence:**
+{evidence_str}
+
+**Instructions:**
+1. Verify if this entity exists in official records
+2. Check for exact matches of names/titles
+3. Confirm associated details (locations, dates, roles)
+4. Return JSON with: verdict (True/False/Unverified), confidence (0-1), reasoning
+
+**JSON Response:"""
+
+        try:
+            response = self.groq_client.chat.completions.create(
+                model=self.model_name,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.2,
+                response_format={"type": "json_object"}
+            )
+
+            result = json.loads(response.choices[0].message.content)
+            return {
+                "verdict": result.get("verdict", "Unverified"),
+                "confidence": min(max(result.get("confidence", avg_similarity), 0), 1),
+                "evidence": [e["text"] for e in evidence],
+                "reasoning": result.get("reasoning", "No reasoning provided")
+            }
+
+        except Exception as e:
+            return {
+                "verdict": "Error",
+                "confidence": 0,
+                "evidence": [],
+                "reasoning": f"Verification failed: {str(e)}"
+            }
+
+    def verify_claim(self, text, confidence_threshold=0.5):
+        """
+        Main method: takes input text, extracts entities and claims,
+        verifies each, and returns JSON results
+        """
+        # Extract entities and claims
+        entities = self.extract_entities(text)
+        claims = self.extract_claims(text)
+
+        # Verify claims
+        claim_results = []
+        for claim in claims:
+            verification = self.verify_single_claim(claim, confidence_threshold)
+            claim_results.append({
+                "claim": claim,
+                "verdict": verification.get("verdict", "Error"),
+                "confidence": verification.get("confidence", 0),
+                "evidence": verification.get("evidence", []),
+                "reasoning": verification.get("reasoning", "Analysis failed")
+            })
+
+        # Verify entities
+        entity_results = []
+        for entity_text, entity_label in entities:
+            verification = self.verify_single_entity(entity_text, confidence_threshold)
+            entity_results.append({
+                "entity": entity_text,
+                "type": entity_label,
+                "verdict": verification.get("verdict", "Error"),
+                "confidence": verification.get("confidence", 0),
+                "evidence": verification.get("evidence", []),
+                "reasoning": verification.get("reasoning", "Analysis failed")
+            })
+
+        return {
+            "entities": entity_results,
+            "claims": claim_results
+        }
+
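For orientation, a small usage sketch of the new pipeline. It assumes a FactChecker instance has already been constructed elsewhere (the constructor, Chroma collection, and Groq client setup are not part of this diff); the helper name run_fact_check and the checker variable are hypothetical, and only the method names and the returned dict shape come from the code above. Note that each evidence similarity is computed as 1 - distance / 2, so a cosine distance of 0.4 maps to a similarity of 0.80, and the claim-level confidence applies the same transform to the average distance of the top 3 matches.

import json

def run_fact_check(checker, text):
    """Sketch: run the new entity + claim pipeline on `checker` (a FactChecker instance)."""
    entities = checker.extract_entities(text)   # [(entity_text, spaCy label), ...]
    claims = checker.extract_claims(text)       # claim sentences from the T5 claim extractor
    print(f"Extracted {len(entities)} entities and {len(claims)} claims")

    # verify_claim() verifies every entity and claim and returns
    # {"entities": [...], "claims": [...]}, the dict app.py stores in session state.
    report = checker.verify_claim(text, confidence_threshold=0.5)
    return json.dumps(report, indent=2)

# Hypothetical call:
# print(run_fact_check(checker, "The Eiffel Tower was completed in 1889."))
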
visualize.ipynb
ADDED
The diff for this file is too large to render.