tsrivallabh commited on
Commit
11cc0d3
·
verified ·
1 Parent(s): decec61

Synced repo using 'sync_with_huggingface' Github Action

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,21 +1,28 @@
1
- FROM python:3.9-slim
2
 
 
 
 
 
 
 
3
  WORKDIR /app
4
 
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- software-properties-common \
9
- git \
10
- && rm -rf /var/lib/apt/lists/*
11
 
12
- COPY requirements.txt ./
13
- COPY src/ ./src/
14
 
15
- RUN pip3 install -r requirements.txt
 
 
16
 
17
- EXPOSE 8501
 
18
 
19
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
20
 
21
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
1
 
2
+
3
+ # Use an official Python runtime as a parent image
4
+ FROM python:3.11-slim
5
+
6
+
7
+ # Set the working directory in the container
8
  WORKDIR /app
9
 
10
+ ENV HF_HOME=/data/hf_cache
11
+ ENV TRANSFORMERS_CACHE=/data/hf_cache/transformers
12
+ ENV HF_DATASETS_CACHE=/data/hf_cache/datasets
13
+ ENV HF_HUB_CACHE=/data/hf_cache/hub
 
 
14
 
15
+ RUN mkdir -p /data/hf_cache/transformers /data/hf_cache/datasets /data/hf_cache/hub && chmod -R 777 /data/hf_cache
 
16
 
17
+ # Copy requirements.txt and install dependencies
18
+ COPY requirements.txt .
19
+ RUN pip install --no-cache-dir -r requirements.txt
20
 
21
+ # Copy the rest of your app's code
22
+ COPY . .
23
 
24
+ # Expose the port Streamlit runs on
25
+ EXPOSE 8501
26
 
27
+ # Run Streamlit
28
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
__pycache__/fact_checker.cpython-310.pyc ADDED
Binary file (5.84 kB). View file
 
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from fact_checker import FactChecker
3
+ from openai import OpenAI
4
+ import os
5
+ from dotenv import load_dotenv
6
+ import csv
7
+ from datetime import datetime
8
+
9
+ load_dotenv()
10
+
11
def store_feedback_csv(claim, result, feedback, csv_file="feedback_log.csv"):
    """Append one feedback record to a CSV log, writing the header on first use.

    Args:
        claim: The claim text the user submitted.
        result: Mapping produced by the fact checker; the keys ``verdict``,
            ``confidence``, ``evidence`` and ``reasoning`` are read when present.
        feedback: The feedback label chosen by the user.
        csv_file: Path of the CSV log to append to.
    """
    header = ["datetime", "claim", "verdict", "confidence", "evidence", "reasoning", "feedback"]

    evidence = result.get("evidence") or []
    if isinstance(evidence, str):
        # A bare string would otherwise be joined character by character.
        evidence = [evidence]

    row = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        claim,
        result.get("verdict", ""),
        result.get("confidence", ""),
        "|".join(str(item) for item in evidence),
        result.get("reasoning", ""),
        feedback,
    ]

    # A missing file and an existing-but-empty file both still need the header.
    needs_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

    # Append mode creates the file when absent, so a single open is enough.
    with open(csv_file, "a", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerow(row)
34
+
35
@st.cache_resource
def initialize_services():
    """Build the FactChecker used by the UI.

    Streamlit re-executes the script on every widget interaction, so the
    result is cached with ``st.cache_resource``; without it the Chroma client
    and the embedding function would be constructed again on each rerun.

    Returns:
        A ``FactChecker`` bound to the ``pib_titles`` collection under
        ``chroma_db`` and to an OpenAI-compatible client pointed at the Groq
        endpoint, authenticated with the ``GROQ_API_KEY`` environment variable.
    """
    return FactChecker(
        chroma_path="chroma_db",
        collection_name="pib_titles",
        groq_client=OpenAI(
            api_key=os.getenv("GROQ_API_KEY"),
            base_url="https://api.groq.com/openai/v1"
        )
    )
44
+
45
def main():
    """Render the fact-checker page: claim input, verdict display and feedback."""
    st.title("🔍 Fact Checker")
    checker = initialize_services()

    # Seed the per-session values the page relies on, without clobbering existing ones.
    session_defaults = {
        "feedback_submitted": False,
        "last_claim": "",
        "result": None,
    }
    for state_key, default_value in session_defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = default_value

    claim = st.text_area("Enter a claim to verify:", height=150)
    confidence_threshold = st.slider("Confidence Threshold", 0.0, 1.0, 0.5, 0.05)

    if st.button("Verify Claim"):
        if not claim.strip():
            st.error("Please enter a claim to verify")
            return

        with st.spinner("Analyzing..."):
            # Keep the outcome in session state so it survives the reruns
            # triggered by the feedback widget below.
            st.session_state.result = checker.verify_claim(claim, confidence_threshold)
            st.session_state.last_claim = claim
            st.session_state.feedback_submitted = False  # a new claim re-opens feedback

    result = st.session_state.result
    if not result:
        return

    # Failed analysis: show the error and, when available, the raw LLM output.
    if "error" in result:
        st.error(f"Error: {result['error']}")
        if "raw_response" in result:
            with st.expander("Show raw LLM response"):
                st.code(result["raw_response"])
        return

    # Verdict, coloured by outcome (gray for anything unexpected).
    palette = {
        "True": "green",
        "False": "red",
        "Unverifiable": "orange"
    }
    verdict_color = palette.get(result["verdict"], "gray")
    st.markdown(f"**Verdict:** :{verdict_color}[{result['verdict']}]")

    st.metric("Confidence Score", f"{result.get('confidence', 0):.2f}")

    with st.expander("View Supporting Evidence"):
        for idx, evidence in enumerate(result.get("evidence", []), 1):
            st.markdown(f"{idx}. {evidence}")

    st.markdown("**Analysis:**")
    st.write(result.get("reasoning", "No reasoning provided"))

    # Feedback: the widget key is tied to the claim so a new claim gets a fresh radio.
    feedback_key = f"feedback_radio_{st.session_state.last_claim}"
    if st.session_state.feedback_submitted:
        st.success("Thank you for your feedback! Your input helps improve the system.")
        return

    feedback = st.radio(
        "Was this analysis helpful?",
        ["", "👍 Yes", "👎 No"],
        horizontal=True,
        key=feedback_key
    )
    if feedback:
        store_feedback_csv(st.session_state.last_claim, result, feedback)
        st.session_state.feedback_submitted = True
        st.rerun()  # redraw immediately so the thank-you message replaces the radio


if __name__ == "__main__":
    main()
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8146ecc3e4c3a36ea9b3edc3778630c452f483990ec942d38e8006f4661e430
3
+ size 16760000
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18f1e924efbb5e1af5201e3fbab86a97f5c195c311abe651eeec525884e5e449
3
+ size 100
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6546ec686723f09f7d5947cb57d3636b6cbcaf7e56fc7dbb5a0c9730cefa3aa
3
+ size 40000
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/link_lists.bin ADDED
File without changes
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:594c12b15d2ef0b8cfc94dd49539e7cd5c9a0639193d7d42b96557d6863b008a
3
+ size 344064
chroma_key.key ADDED
@@ -0,0 +1 @@
 
 
1
+ YDqTej_W3zFT8JRh5D7eFIqHJjMDiQY07Ceyln1Msjs=
debug.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.utils import embedding_functions
3
+
4
+ # Adjust these as needed
5
+ CHROMA_PATH = "chroma_db"
6
+ COLLECTION_NAME = "pib_titles"
7
+
8
# Open the persisted collection with the all-MiniLM-L6-v2 embedding function.
embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedder)

# Fetch every stored document with its metadata (ids are always returned).
all_docs = collection.get(include=["documents", "metadatas"])
records = zip(all_docs["ids"], all_docs["documents"], all_docs["metadatas"])

print("Total documents:", len(all_docs["ids"]))
for position, (doc_id, doc, meta) in enumerate(records, start=1):
    print(f"\n--- Document {position} ---")
    print("ID:", doc_id)
    print("Document:", doc)
    print("Metadata:", meta)
decrypt_chroma.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from cryptography.fernet import Fernet
3
+
4
+ KEY_FILE = "chroma_key.key"
5
+ CHROMA_PATH = "chroma_db"
6
+
7
def decrypt_chroma_files():
    """Decrypt every ``.enc`` file under CHROMA_PATH back to its original name.

    The Fernet key is read from KEY_FILE. Each encrypted file is decrypted,
    written next to itself without the ``.enc`` suffix, and then deleted.
    A failure on one file is printed and does not stop the remaining files.
    """
    with open(KEY_FILE, "rb") as key_handle:
        fernet = Fernet(key_handle.read())

    for folder, _, names in os.walk(CHROMA_PATH):
        for name in names:
            if not name.endswith(".enc"):
                continue

            encrypted_path = os.path.join(folder, name)
            original_path = encrypted_path[:-len(".enc")]

            try:
                with open(encrypted_path, "rb") as source:
                    plaintext = fernet.decrypt(source.read())
                with open(original_path, "wb") as target:
                    target.write(plaintext)
                os.remove(encrypted_path)
                print(f"Decrypted: {original_path}")
            except Exception as e:
                print(f"Error decrypting {encrypted_path}: {e}")


if __name__ == "__main__":
    decrypt_chroma_files()
    print("Decryption complete. ChromaDB ready for use.")
encrypt_chroma.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from cryptography.fernet import Fernet
3
+
4
+ KEY_FILE = "chroma_key.key"
5
+ CHROMA_PATH = "chroma_db"
6
+
7
# Load the Fernet key from KEY_FILE, generating and persisting one on first run.
if os.path.exists(KEY_FILE):
    with open(KEY_FILE, "rb") as f:
        key = f.read()
else:
    key = Fernet.generate_key()
    with open(KEY_FILE, "wb") as f:
        f.write(key)
    print(f"Encryption key generated and saved as {KEY_FILE}.")
16
+
17
def encrypt_chroma_files():
    """Encrypt every file under CHROMA_PATH in place with the key from KEY_FILE.

    Each file that is not already ``.enc`` is read, encrypted with Fernet,
    written to ``<path>.enc`` and then removed. Files that cannot be processed
    are reported and counted as skipped; a summary is printed at the end.
    """
    with open(KEY_FILE, "rb") as key_handle:
        fernet = Fernet(key_handle.read())

    done = 0
    skipped = 0

    for folder, _, names in os.walk(CHROMA_PATH):
        for name in names:
            plain_path = os.path.join(folder, name)
            if name.endswith(".enc"):
                continue
            try:
                with open(plain_path, "rb") as source:
                    ciphertext = fernet.encrypt(source.read())
                with open(f"{plain_path}.enc", "wb") as target:
                    target.write(ciphertext)
                os.remove(plain_path)
                print(f"Encrypted and removed: {plain_path}")
                done += 1
            except PermissionError:
                print(f"Skipped (file in use): {plain_path}")
                skipped += 1
            except Exception as e:
                print(f"Error encrypting {plain_path}: {e}")
                skipped += 1

    print(f"\nEncryption complete. {done} files encrypted, {skipped} files skipped.")


if __name__ == "__main__":
    encrypt_chroma_files()
fact_checker.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+ from chromadb.utils import embedding_functions
4
+ import json
5
+ import re
6
+ from openai import OpenAI
7
+ import re
8
+ import json
9
+
10
def robust_json_extractor(response_content):
    """Extract ``verdict``, ``evidence`` and ``reasoning`` from an LLM reply.

    The reply is expected to contain a JSON object, possibly wrapped in
    markdown code fences or surrounded by extra text. Each key is first
    located with its own regex so that one malformed field does not spoil
    the others; if any key is missing, the outermost ``{...}`` span is parsed
    as JSON instead.

    Args:
        response_content: Raw text returned by the model. ``None`` is treated
            as an empty reply.

    Returns:
        A dict with the three keys on success (``evidence`` is a list when it
        is valid JSON, otherwise the raw matched text), the parsed object from
        the fallback path, or ``{"error": ..., "raw": ...}`` when nothing
        usable was found.
    """
    if response_content is None:
        response_content = ""
    elif not isinstance(response_content, str):
        response_content = str(response_content)

    # Strip markdown fences (``` or ```json) and surrounding whitespace.
    cleaned = re.sub(r'```(?:json)?', '', response_content, flags=re.IGNORECASE).strip()

    # Key-specific regex patterns
    patterns = {
        "verdict": r'"verdict"\s*:\s*"((?:\\"|[^"])*)"',
        "evidence": r'"evidence"\s*:\s*(\[[^\]]*?\]|\[.*?\])(?=\s*[,}])',
        "reasoning": r'"reasoning"\s*:\s*"((?:\\"|[^"])*)"'
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, cleaned, re.DOTALL)
        if not match:
            continue
        raw_value = match.group(1)
        try:
            if key == "evidence":
                # The captured text is already a JSON array; parse it as-is.
                # (Escaping its quotes first would make it invalid JSON.)
                result[key] = json.loads(raw_value)
            else:
                # Re-wrap in quotes so json.loads resolves escape sequences.
                result[key] = json.loads(f'"{raw_value}"')
        except json.JSONDecodeError:
            # Fallback: keep the raw matched text rather than dropping the key.
            result[key] = raw_value

    required_keys = ["verdict", "evidence", "reasoning"]
    if all(key in result for key in required_keys):
        return result

    # Fallback to standard JSON parsing of the outermost object.
    whole_object = re.search(r'\{.*\}', cleaned, re.DOTALL)
    if whole_object:
        try:
            return json.loads(whole_object.group())
        except json.JSONDecodeError:
            pass
    return {"error": "Failed to extract required keys", "raw": cleaned}
47
+
48
class FactChecker:
    """Check a claim against a ChromaDB collection and ask an LLM for a verdict.

    Retrieval supplies the nearest stored statements and a distance-based
    confidence; the chat model is only consulted when that confidence reaches
    the caller's threshold.
    """

    def __init__(self, chroma_path, collection_name, groq_client):
        """Open the persisted collection and keep the chat client.

        Args:
            chroma_path: Directory handed to ``chromadb.PersistentClient``.
            collection_name: Name of an existing collection (fetched with
                ``get_collection``; it is not created here).
            groq_client: Client exposing ``chat.completions.create``.
        """
        self.client = chromadb.PersistentClient(path=chroma_path)
        self.collection = self.client.get_collection(
            name=collection_name,
            embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )
        )
        self.groq_client = groq_client
        self.model_name = "llama3-8b-8192"

    @staticmethod
    def _similarity(distance):
        """Convert a distance to a similarity via ``1 - d/2``, clamped to [0, 1].

        The formula assumes distances in [0, 2]; the clamp keeps the score in
        range if the collection's metric yields larger values.
        """
        return max(0.0, min(1.0, 1 - (distance / 2)))

    def verify_claim(self, claim, confidence_threshold=0.5):
        """Classify ``claim`` as True, False or Unverifiable.

        Args:
            claim: The statement to verify.
            confidence_threshold: Minimum retrieval confidence required before
                the LLM is consulted.

        Returns:
            On success a dict with ``verdict``, ``confidence``, ``evidence``
            (the retrieved statements, quoted) and ``reasoning``. When the LLM
            reply cannot be parsed, a dict with ``error``, ``confidence`` and
            ``raw_response``.
        """
        results = self.collection.query(
            query_texts=[claim],
            n_results=3,
            include=["documents", "metadatas", "distances"]
        )

        documents = (results.get("documents") or [[]])[0]
        metadatas = (results.get("metadatas") or [[]])[0]
        distances = (results.get("distances") or [[]])[0]

        # Most similar first (ascending distance).
        ranked = sorted(zip(documents, metadatas, distances), key=lambda hit: hit[2])

        # Nothing retrieved: there is no evidence to average or to show the LLM.
        if not ranked:
            return {
                "verdict": "Unverifiable",
                "confidence": 0.0,
                "evidence": [],
                "reasoning": "Claim is too vague or lacks sufficient evidence"
            }

        # Keep the plain quoted statements (returned to the caller) separate
        # from the annotated lines (shown to the LLM) instead of re-splitting text.
        quoted_statements = []
        evidence = []
        for doc, meta, distance in ranked:
            source = meta["source"] if meta and "source" in meta else "Unknown source"
            quoted = f'"{doc}"'
            quoted_statements.append(quoted)
            evidence.append(
                f'{quoted} (Source: {source}, Similarity: {self._similarity(distance):.2f})'
            )

        # Overall confidence from the mean distance of the retrieved hits.
        avg_distance = sum(d for _, _, d in ranked) / len(ranked)
        confidence = self._similarity(avg_distance)

        if confidence < confidence_threshold:
            return {
                "verdict": "Unverifiable",
                "confidence": confidence,
                "evidence": quoted_statements,
                "reasoning": "Claim is too vague or lacks sufficient evidence"
            }

        # LLM verification with distance-aware prompt
        evidence_str = "\n".join([f"- {e}" for e in evidence])
        prompt = f""" You are a powerful fact checker. Analyze the claim below against the provided verified information.
        Relying on the similarity scores, also carefully check whether all factual details in the claim (such as dates, names, locations, and events) exactly match the evidence.
        If there is any factual mismatch (for example, the date in the claim is different from the evidence), classify the claim as False. Any factual mismatch, even if the overall context is similar, should lead to a False classification.
        If the evidence is too vague or lacks strong matches, classify as Unverifiable.
        If evidence directly contradicts the claim, classify as False.
        Any discrepancy in factual details, even if the overall context is similar, should lead to a False classification.
        If the evidence fully supports the claim with all factual details matching, classify as True.

        Claim:
        {claim}

        Evidence (with similarity scores):
        {evidence_str}

        Guidelines:
        1. Give more weight to evidence with higher similarity scores, but do not ignore factual mismatches.
        2. Pay close attention to details such as dates, names, locations, and events.
        3. If the claim and evidence differ on any factual point, do not classify as True.
        4. Respond only in JSON format without any additional text.
        5. In the "evidence" array, include only full evidence statements as strings, without any extra comments or explanations.
        6. Put all explanations or comparisons in the "reasoning" field.

        Respond in JSON format:
        {{
            "verdict": "Verdict",
            "evidence": [List of relevant facts from provided evidence],
            "reasoning": "Explanation of the verdict based on evidence and factual details"
        }}
        """

        completion = self.groq_client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=400
        )

        # The API may return no text content; treat that as an empty reply.
        response_content = completion.choices[0].message.content or ""
        print(f"Response from Groq: {response_content}")

        parsed = robust_json_extractor(response_content)
        print(f"Parsed JSON: {parsed}")

        if "error" in parsed:
            return {
                "error": parsed["error"],
                "confidence": confidence,
                "raw_response": parsed.get("raw", response_content)
            }

        required_keys = ["verdict", "evidence", "reasoning"]
        missing = [k for k in required_keys if k not in parsed]
        if missing:
            return {
                "error": f"Missing required keys: {missing}",
                "confidence": confidence,
                "raw_response": response_content
            }

        return {
            "verdict": parsed["verdict"],
            "confidence": confidence,
            "evidence": quoted_statements,
            "reasoning": parsed["reasoning"]
        }
feedback_log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-06-10 17:56:57,Hi,Unverifiable,0.2471650640169779,"""The Minister of State for Finance, Shri Pankaj Chaudhary addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""|""The Minister of State for Finance, Shri Pankaj Chaudhary lighting the lamp at the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025. The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman also seen.""|""The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""",Claim is too vague or lacks sufficient evidence,👍 Yes
2
+ 2025-06-10 17:57:49,"Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman inaugurates SPMCIL's New Corporate Office at World Trade Centre, Nauroji Nagar, New Delhi, today",True,0.8596701025962078,"""Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman inaugurates SPMCIL's New Corporate Office at World Trade Centre, Nauroji Nagar, New Delhi, today""|""The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""|""The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary to inaugurate the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""","The claim matches the evidence in all factual details, including the name of the minister, the location, and the event. The similarity scores are high, indicating a strong match. There are no discrepancies in dates, names, locations, or events, which confirms the claim as True.",👎 No
pib_titles.csv ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title,source
2
+ "Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman inaugurates SPMCIL's New Corporate Office at World Trade Centre, Nauroji Nagar, New Delhi, today",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135268
3
+ Union Minister Dr L. Murugan inaugurates NeVA Digital Platform for Puducherry Legislative Assembly,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135180
4
+ World today looks up to India to overcome the global challenges: Dr Jitendra Singh,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135185
5
+ "The Consulate General of India celebrated the fifth Curtain Raiser for the International Day of Yoga 2025, in collaboration with the Brahma Kumaris a unique, spiritual, value-based educational institution at Perth, in Australia on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185210
6
+ "Swami Vivekananda Cultural Centre (SVCC), Bali in collaboration with Genta Yoga Pemogan community in Denpasar organized the 36th pre-event of IDY2025. The session highlighted the power of connection, discipline, and tradition through a shared yoga practice, in Indonesia on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185211
7
+ "The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185229
8
+ The Office of Principal Scientific Adviser to Government of India organised the first State/UTs engagement workshop under the National One Health Mission,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135236
9
+ Earth Sciences Minister Dr. Jitendra Singh and the UNESCO emissary Vidar Helgesen hold bilateral with focus on conservation of oceans,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135183
10
+ Lok Samvardhan Parv Commemorating Completion of 11 years of Government to begin tomorrow at Raj Ghat,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135292
11
+ "The Minister of State for Finance, Shri Pankaj Chaudhary addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185230
12
+ "The Minister of State for Finance, Shri Pankaj Chaudhary lighting the lamp at the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025. The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman also seen.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185227
13
+ "The Union Minister of Communications and Development of North Eastern Region (DoNER), Shri Jyotiraditya M. Scindia addressing at unveil the theme for 9th Edition of India Mobile Congress 2025: ‘Innovate to Transform’ and launch the Student Volunteers led ‘Sanchar Mitra Scheme’ for citizen awareness, in New Delhi on May 26, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=184023
14
+ PM shares an article on how India's Technical Textiles Sector is witnessing rapid growth,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135306
15
+ "The Minister of State for Road Transport and Highways and Corporate Affairs, Shri Harsh Malhotra lighting the lamp at Poshan Abhiyan, in Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185195
16
+ "The Minister of State for Road Transport and Highways and Corporate Affairs, Shri Harsh Malhotra addressing at Poshan Abhiyan, in Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185197
17
+ From Drafting Legislations to Saving Lives: Legislative Department Organizes Blood Donation Camp on World Blood Donor Day 2025,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135243
18
+ Raksha Rajya Mantri visits Shimla-based Army Training Command,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135241
19
+ Delhi Chief Minister meets Prime Minister,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135330
20
+ "Following the announcement by Prime Minister Shri Narendra Modi regarding additional compensation for houses damaged in border areas, Union Home Minister Shri Amit Shah ensures swift action by facilitating an additional provision of ₹25 crore from the Ministry of Home Affairs for 2,060 houses",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135226
21
+ "Prime Minister hails unprecedented growth of India's Defence sector in last 11 Years, strengthening Self-Reliance and Modernisation",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135281
22
+ "During their port call in Chennai, the ships of Eastern Sword Sunrise Fleet experienced a rejuvenating yoga session, embracing holistic wellness. The session fostered harmony, resilience, and personal health, while promoting self-awareness and focus among the crew on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185247
23
+ "The Embassy of India in Berlin, Germany and The Tagore Cultural Centre in Berlin, Germany held a calm and grounding yoga session in front of Berlin’s Reichstag - Germany’s historic Parliament building and a symbol of democracy and unity on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185206
24
+ Union Commerce and Industry Minister Shri Piyush Goyal Engages with Swiss Industry to Deepen India–Switzerland Economic Partnership,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135293
25
+ "The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185234
26
+ "On the occasion of Harit Yoga, the spirited team of students, staff, and doctors of National Institute of Naturopathy, Pune embarked on a rejuvenating Yoga Trek to Sinhagad Fort blending the serenity of yoga with the strength of nature and the joy of togetherness on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185249
27
+ "The Embassy of India in Berlin, Germany and The Tagore Cultural Centre in Berlin, Germany held a calm and grounding yoga session in front of Berlin’s Reichstag - Germany’s historic Parliament building and a symbol of democracy and unity on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185204
28
+ "On the occasion of Harit Yoga, the spirited team of students, staff, and doctors of National Institute of Naturopathy, Pune embarked on a rejuvenating Yoga Trek to Sinhagad Fort blending the serenity of yoga with the strength of nature and the joy of togetherness on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185248
29
+ "On the occasion of Harit Yoga, the spirited team of students, staff, and doctors of National Institute of Naturopathy, Pune embarked on a rejuvenating Yoga Trek to Sinhagad Fort blending the serenity of yoga with the strength of nature and the joy of togetherness on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185250
30
+ "The Minister of State for Defence, Shri Sanjay Seth visited the Army Training Command (ARTRAC) based at Shimla, in Himachal Pradesh on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185243
31
+ Shri Dharmendra Pradhan chairs Global Young Scientists Conference and Annual General Meeting of the Global Young Academy,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135357
32
+ "The Embassy of India in Berlin, Germany and The Tagore Cultural Centre in Berlin, Germany held a calm and grounding yoga session in front of Berlin’s Reichstag - Germany’s historic Parliament building and a symbol of democracy and unity on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185205
33
+ Union Agriculture Minister Shri Shivraj Singh Chouhan Meets Farmers in Telangana,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135232
34
+ "The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary to inaugurate the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185224
35
+ "The Minister of State for Defence, Shri Sanjay Seth visited the Army Training Command (ARTRAC) based at Shimla, in Himachal Pradesh on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185244
36
+ PM shares an article highlighting India’s Digital Connectivity Revolution in last 11 years,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135354
37
+ BHASHINI and CRIS Sign MoU to Build Next-Gen Multilingual AI Solutions for Indian Railways,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135178
38
+ PM shares an article highlighting expansion of India's sports infrastructure in last 11 years,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135308
39
+ "The Minister of State for Road Transport and Highways and Corporate Affairs, Shri Harsh Malhotra addressing at Poshan Abhiyan, in Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185198
40
+ KEEL LAYING OF FOURTH EX-GSL NGOPV (YARD 1283),https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135188
41
+ "Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman chairs 6th meeting of the Governing Council of National Investment and Infrastructure Fund (NIIF) in New Delhi, today",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135239
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
1
+ requests==2.32.4
2
+ beautifulsoup4==4.13.4
3
+ lxml==4.9.3
4
+ chromadb
5
+ sentence-transformers
6
+ cryptography
7
+ openai
8
+ streamlit
9
+ python-dotenv
scrape_chroma.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import chromadb
5
+ from chromadb.utils import embedding_functions
6
+ import gc
7
+ import csv
8
+
9
+ # === CONFIGURATION ===
10
+ CHROMA_PATH = "chroma_db"
11
+ COLLECTION_NAME = "pib_titles"
12
+
13
def save_titles_to_csv(titles, filename="pib_titles.csv"):
    """Write ``(title, source)`` pairs to a CSV file with a header row.

    Args:
        titles: Iterable of ``(title, source)`` pairs. Any iterable is
            accepted; it is materialised once so it can be counted.
        filename: Destination path; an existing file is overwritten.
    """
    rows = [(title, source) for title, source in titles]
    with open(filename, mode="w", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["title", "source"])  # header
        writer.writerows(rows)
    print(f"Saved {len(rows)} titles to {filename}")
20
+
21
def scrape_and_store():
    """Fetch PIB RSS titles, save them to CSV and index them in ChromaDB.

    Each feed is fetched independently; a failing feed is reported and
    skipped. Unique ``(title, link)`` pairs are sorted so that row order and
    the ``title_<i>`` ids are reproducible for the same data, written to
    ``pib_titles.csv`` and upserted into the collection. If nothing was
    fetched, the existing CSV and collection are left untouched.
    """
    RSS_URLS = [
        "https://www.pib.gov.in/RssMain.aspx?ModId=6&Lang=1&Regid=3",
        "https://www.pib.gov.in/RssMain.aspx?ModId=8&Lang=1&Regid=3"
    ]

    all_titles_sources = set()
    for url in RSS_URLS:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "xml")
            for item in soup.find_all("item"):
                title_tag = item.find("title")
                link_tag = item.find("link")
                if title_tag and title_tag.text and link_tag and link_tag.text:
                    all_titles_sources.add((title_tag.text.strip(), link_tag.text.strip()))
        except Exception as e:
            # Best effort per feed: report and carry on with the remaining URLs.
            print(f"Error fetching {url}: {e}")

    # Sets have no stable order; sort so ids map to the same rows on every run.
    all_titles_sources = sorted(all_titles_sources)
    print(f"Fetched {len(all_titles_sources)} unique titles.")

    if not all_titles_sources:
        # Avoid overwriting the CSV with only a header and handing ChromaDB
        # empty lists.
        print("No titles fetched; leaving pib_titles.csv and ChromaDB unchanged.")
        return

    # Save to CSV
    save_titles_to_csv(all_titles_sources, filename="pib_titles.csv")

    # Prepare for ChromaDB
    documents = [title for title, source in all_titles_sources]
    metadatas = [{"source": source} for title, source in all_titles_sources]
    ids = [f"title_{i}" for i in range(len(all_titles_sources))]

    # Store in ChromaDB
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
    )
    # upsert refreshes records whose ids already exist, so a re-run updates
    # the collection to match the latest scrape.
    collection.upsert(documents=documents, ids=ids, metadatas=metadatas)

    # Drop references and collect so the client's file handles are released
    # before the database files are encrypted.
    del collection
    del client
    gc.collect()


if __name__ == "__main__":
    scrape_and_store()
    print("Scraping complete. ChromaDB ready for encryption.")