Synced repo using 'sync_with_huggingface' Github Action
Browse files- .gitattributes +1 -0
- Dockerfile +20 -13
- __pycache__/fact_checker.cpython-310.pyc +0 -0
- app.py +119 -0
- chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/data_level0.bin +3 -0
- chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/header.bin +3 -0
- chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/length.bin +3 -0
- chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/link_lists.bin +0 -0
- chroma_db/chroma.sqlite3 +3 -0
- chroma_key.key +1 -0
- debug.py +24 -0
- decrypt_chroma.py +35 -0
- encrypt_chroma.py +48 -0
- fact_checker.py +167 -0
- feedback_log.csv +2 -0
- pib_titles.csv +41 -0
- requirements.txt +9 -3
- scrape_chroma.py +70 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
CHANGED
@@ -1,21 +1,28 @@
|
|
1 |
-
FROM python:3.9-slim
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
WORKDIR /app
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
git \
|
10 |
-
&& rm -rf /var/lib/apt/lists/*
|
11 |
|
12 |
-
|
13 |
-
COPY src/ ./src/
|
14 |
|
15 |
-
|
|
|
|
|
16 |
|
17 |
-
|
|
|
18 |
|
19 |
-
|
|
|
20 |
|
21 |
-
|
|
|
|
|
|
1 |
|
2 |
+
|
3 |
+
# Use an official Python runtime as a parent image
|
4 |
+
FROM python:3.11-slim
|
5 |
+
|
6 |
+
|
7 |
+
# Set the working directory in the container
|
8 |
WORKDIR /app
|
9 |
|
10 |
+
ENV HF_HOME=/data/hf_cache
|
11 |
+
ENV TRANSFORMERS_CACHE=/data/hf_cache/transformers
|
12 |
+
ENV HF_DATASETS_CACHE=/data/hf_cache/datasets
|
13 |
+
ENV HF_HUB_CACHE=/data/hf_cache/hub
|
|
|
|
|
14 |
|
15 |
+
RUN mkdir -p /data/hf_cache/transformers /data/hf_cache/datasets /data/hf_cache/hub && chmod -R 777 /data/hf_cache
|
|
|
16 |
|
17 |
+
# Copy requirements.txt and install dependencies
|
18 |
+
COPY requirements.txt .
|
19 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
20 |
|
21 |
+
# Copy the rest of your app's code
|
22 |
+
COPY . .
|
23 |
|
24 |
+
# Expose the port Streamlit runs on
|
25 |
+
EXPOSE 8501
|
26 |
|
27 |
+
# Run Streamlit
|
28 |
+
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
__pycache__/fact_checker.cpython-310.pyc
ADDED
Binary file (5.84 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from fact_checker import FactChecker
|
3 |
+
from openai import OpenAI
|
4 |
+
import os
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
import csv
|
7 |
+
from datetime import datetime
|
8 |
+
|
9 |
+
load_dotenv()
|
10 |
+
|
11 |
+
def store_feedback_csv(claim, result, feedback, csv_file="feedback_log.csv"):
    """Append one user-feedback record to the CSV audit log.

    Args:
        claim: The claim text the user submitted.
        result: Verification result dict; ``verdict``, ``confidence``,
            ``evidence`` (list of str) and ``reasoning`` are read with
            defaults, so a partial dict is accepted.
        feedback: The user's feedback value (e.g. the radio selection).
        csv_file: Path of the log file; created with a header row on
            first use.
    """
    header = ["datetime", "claim", "verdict", "confidence", "evidence", "reasoning", "feedback"]
    row = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        claim,
        result.get("verdict", ""),
        result.get("confidence", ""),
        # Evidence list is flattened into a single pipe-delimited cell.
        "|".join(result.get("evidence", [])),
        result.get("reasoning", ""),
        feedback
    ]

    # Single append-mode open: previously the file was opened once to write
    # the header and a second time to append, leaving a window where two
    # writers could both decide to create the header. Append mode creates
    # the file if missing; we only prepend the header when it is new.
    write_header = not os.path.exists(csv_file)
    with open(csv_file, "a", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(header)
        writer.writerow(row)
|
34 |
+
|
35 |
+
def initialize_services():
    """Build the FactChecker backed by the local Chroma store and a Groq-hosted LLM.

    The OpenAI client is pointed at Groq's OpenAI-compatible endpoint; the
    API key is read from the GROQ_API_KEY environment variable.
    """
    groq = OpenAI(
        api_key=os.getenv("GROQ_API_KEY"),
        base_url="https://api.groq.com/openai/v1"
    )
    return FactChecker(
        chroma_path="chroma_db",
        collection_name="pib_titles",
        groq_client=groq
    )
|
44 |
+
|
45 |
+
def main():
    """Streamlit entry point: claim input, verification, result display, feedback.

    State is kept in ``st.session_state`` so the rendered result survives the
    reruns Streamlit triggers on every widget interaction.
    """
    st.title("🔍 Fact Checker")
    checker = initialize_services()

    # Initialize session state variables on first run only; later reruns
    # must not clobber them or the displayed result would disappear.
    if "feedback_submitted" not in st.session_state:
        st.session_state.feedback_submitted = False
    if "last_claim" not in st.session_state:
        st.session_state.last_claim = ""
    if "result" not in st.session_state:
        st.session_state.result = None

    claim = st.text_area("Enter a claim to verify:", height=150)
    confidence_threshold = st.slider("Confidence Threshold", 0.0, 1.0, 0.5, 0.05)

    if st.button("Verify Claim"):
        # Guard: reject empty / whitespace-only input before calling the checker.
        if not claim.strip():
            st.error("Please enter a claim to verify")
            return

        with st.spinner("Analyzing..."):
            # Store result in session state so it persists across reruns.
            st.session_state.result = checker.verify_claim(claim, confidence_threshold)
            st.session_state.last_claim = claim
            st.session_state.feedback_submitted = False  # Reset feedback state for new claim

    # Display results from session state (also runs on reruns after feedback).
    if st.session_state.result:
        result = st.session_state.result
        if "error" in result:
            # Error path: verify_claim returned an error dict instead of a verdict.
            st.error(f"Error: {result['error']}")
            if "raw_response" in result:
                with st.expander("Show raw LLM response"):
                    st.code(result["raw_response"])
        else:
            # Display verdict with a color keyed to the verdict string;
            # unknown verdict strings fall back to gray.
            verdict_color = {
                "True": "green",
                "False": "red",
                "Unverifiable": "orange"
            }.get(result["verdict"], "gray")
            st.markdown(f"**Verdict:** :{verdict_color}[{result['verdict']}]")

            # Display confidence score (defaults to 0 if missing).
            st.metric("Confidence Score", f"{result.get('confidence', 0):.2f}")

            # Display evidence as a numbered list inside a collapsible panel.
            with st.expander("View Supporting Evidence"):
                for idx, evidence in enumerate(result.get("evidence", []), 1):
                    st.markdown(f"{idx}. {evidence}")

            # Display reasoning
            st.markdown("**Analysis:**")
            st.write(result.get("reasoning", "No reasoning provided"))

            # Feedback system: the widget key embeds the claim text so a new
            # claim gets a fresh radio widget (old selection is not reused).
            feedback_key = f"feedback_radio_{st.session_state.last_claim}"
            if not st.session_state.feedback_submitted:
                feedback = st.radio(
                    "Was this analysis helpful?",
                    ["", "👍 Yes", "👎 No"],
                    horizontal=True,
                    key=feedback_key
                )

                # Empty string (the default option) is falsy, so nothing is
                # logged until the user actively picks Yes/No.
                if feedback:
                    store_feedback_csv(st.session_state.last_claim, result, feedback)
                    st.session_state.feedback_submitted = True
                    st.rerun()  # Use st.rerun() instead of experimental_rerun()
            else:
                st.success("Thank you for your feedback! Your input helps improve the system.")


if __name__ == "__main__":
    main()
|
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8146ecc3e4c3a36ea9b3edc3778630c452f483990ec942d38e8006f4661e430
|
3 |
+
size 16760000
|
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:18f1e924efbb5e1af5201e3fbab86a97f5c195c311abe651eeec525884e5e449
|
3 |
+
size 100
|
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a6546ec686723f09f7d5947cb57d3636b6cbcaf7e56fc7dbb5a0c9730cefa3aa
|
3 |
+
size 40000
|
chroma_db/973cff8c-e7bc-4ee9-b987-873e62fd3ab6/link_lists.bin
ADDED
File without changes
|
chroma_db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:594c12b15d2ef0b8cfc94dd49539e7cd5c9a0639193d7d42b96557d6863b008a
|
3 |
+
size 344064
|
chroma_key.key
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
YDqTej_W3zFT8JRh5D7eFIqHJjMDiQY07Ceyln1Msjs=
|
debug.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
from chromadb.utils import embedding_functions

# Adjust these as needed
CHROMA_PATH = "chroma_db"
COLLECTION_NAME = "pib_titles"

# Open the persisted store and attach the same sentence-transformer embedder
# that was used at ingestion time.
embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedder)

# Dump every record; ids are always returned regardless of `include`.
records = collection.get(include=["documents", "metadatas"])

print("Total documents:", len(records["ids"]))
triples = zip(records["ids"], records["documents"], records["metadatas"])
for i, (doc_id, doc, meta) in enumerate(triples):
    print(f"\n--- Document {i+1} ---")
    print("ID:", doc_id)
    print("Document:", doc)
    print("Metadata:", meta)
|
decrypt_chroma.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
from cryptography.fernet import Fernet

KEY_FILE = "chroma_key.key"
CHROMA_PATH = "chroma_db"


def decrypt_chroma_files():
    """Restore every ``*.enc`` file under CHROMA_PATH to its original name.

    Loads the Fernet key from KEY_FILE, decrypts each encrypted file, writes
    the plaintext beside it, and deletes the ``.enc`` version. Per-file
    failures are reported and skipped rather than aborting the whole run.
    """
    # Load the symmetric key written by encrypt_chroma.py.
    with open(KEY_FILE, "rb") as key_handle:
        cipher = Fernet(key_handle.read())

    for root, _, files in os.walk(CHROMA_PATH):
        # Only *.enc files are encryption artifacts; everything else is left alone.
        for name in (n for n in files if n.endswith(".enc")):
            encrypted_path = os.path.join(root, name)
            original_path = encrypted_path[:-4]  # strip the ".enc" suffix
            try:
                with open(encrypted_path, "rb") as f:
                    payload = f.read()
                plaintext = cipher.decrypt(payload)
                with open(original_path, "wb") as f:
                    f.write(plaintext)
                os.remove(encrypted_path)
                print(f"Decrypted: {original_path}")
            except Exception as e:
                print(f"Error decrypting {encrypted_path}: {e}")


if __name__ == "__main__":
    decrypt_chroma_files()
    print("Decryption complete. ChromaDB ready for use.")
|
encrypt_chroma.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
from cryptography.fernet import Fernet

KEY_FILE = "chroma_key.key"
CHROMA_PATH = "chroma_db"

# Ensure a Fernet key exists on disk, generating one on first run.
# NOTE(review): the key file sits beside the data it protects and appears to
# be committed to the repo — confirm this is intentional.
if os.path.exists(KEY_FILE):
    with open(KEY_FILE, "rb") as f:
        key = f.read()
else:
    key = Fernet.generate_key()
    with open(KEY_FILE, "wb") as f:
        f.write(key)
    print(f"Encryption key generated and saved as {KEY_FILE}.")


def encrypt_chroma_files():
    """Encrypt every file under CHROMA_PATH in place.

    Each plaintext file is replaced by a ``<name>.enc`` sibling; files that
    already carry the ``.enc`` suffix are skipped. Files locked by another
    process (PermissionError) or failing for any other reason are counted as
    skipped, and a summary is printed at the end.
    """
    with open(KEY_FILE, "rb") as key_handle:
        cipher = Fernet(key_handle.read())
    encrypted_count = 0
    skipped_count = 0

    for root, _, files in os.walk(CHROMA_PATH):
        for name in files:
            file_path = os.path.join(root, name)
            if name.endswith(".enc"):
                continue  # already encrypted on a previous run
            try:
                with open(file_path, "rb") as f:
                    plaintext = f.read()
                ciphertext = cipher.encrypt(plaintext)
                with open(f"{file_path}.enc", "wb") as f:
                    f.write(ciphertext)
                os.remove(file_path)
                print(f"Encrypted and removed: {file_path}")
                encrypted_count += 1
            except PermissionError:
                print(f"Skipped (file in use): {file_path}")
                skipped_count += 1
            except Exception as e:
                print(f"Error encrypting {file_path}: {e}")
                skipped_count += 1

    print(f"\nEncryption complete. {encrypted_count} files encrypted, {skipped_count} files skipped.")


if __name__ == "__main__":
    encrypt_chroma_files()
|
fact_checker.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import chromadb
|
3 |
+
from chromadb.utils import embedding_functions
|
4 |
+
import json
|
5 |
+
import re
|
6 |
+
from openai import OpenAI
|
7 |
+
import re
|
8 |
+
import json
|
9 |
+
|
10 |
+
def robust_json_extractor(response_content):
    """Extract ``verdict``/``evidence``/``reasoning`` from an LLM response.

    Tolerates markdown code fences and minor JSON malformation by pulling
    each field with a targeted regex before falling back to parsing the
    first ``{...}`` span as plain JSON.

    Args:
        response_content: Raw text returned by the model.

    Returns:
        A dict with the three required keys on success, otherwise
        ``{"error": ..., "raw": <cleaned text>}``.
    """
    # FIX: the previous pattern r'``````' only removed a literal run of six
    # backticks; markdown fences are ``` optionally followed by "json".
    cleaned = re.sub(r'```(?:json)?', '', response_content).strip()

    # Key-specific regex patterns (quoted strings allow escaped quotes).
    patterns = {
        "verdict": r'"verdict"\s*:\s*"((?:\\"|[^"])*)"',
        "evidence": r'"evidence"\s*:\s*(\[[^\]]*?\]|\[.*?\])(?=\s*[,}])',
        "reasoning": r'"reasoning"\s*:\s*"((?:\\"|[^"])*)"'
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, cleaned, re.DOTALL)
        if not match:
            continue
        raw = match.group(1)
        try:
            if key == "evidence":
                # FIX: the old code escaped every unescaped quote first, which
                # also escaped the JSON string delimiters and made json.loads
                # always fail; parse the captured array directly instead.
                result[key] = json.loads(raw)
            else:
                # Re-wrap the captured scalar so JSON unescaping applies.
                result[key] = json.loads(f'"{raw}"')
        except (json.JSONDecodeError, ValueError):
            # Fallback: keep the raw matched string rather than dropping the key.
            result[key] = raw

    # Validation: all three fields must have been recovered.
    required_keys = ["verdict", "evidence", "reasoning"]
    if all(key in result for key in required_keys):
        return result

    # Last resort: parse the first {...} span as standard JSON.
    obj_match = re.search(r'\{.*\}', cleaned, re.DOTALL)
    if obj_match is not None:
        try:
            return json.loads(obj_match.group())
        except json.JSONDecodeError:
            pass
    return {"error": "Failed to extract required keys", "raw": cleaned}
|
47 |
+
|
48 |
+
class FactChecker:
    """Verify claims against a ChromaDB collection of trusted statements.

    Retrieval supplies the nearest verified statements plus distances; a
    Groq-hosted LLM then judges the claim against that evidence and returns
    a structured verdict.
    """

    def __init__(self, chroma_path, collection_name, groq_client):
        """Open the persistent Chroma collection and keep the LLM client.

        Args:
            chroma_path: Directory of the persisted Chroma store.
            collection_name: Existing collection to query (raises if absent).
            groq_client: OpenAI-compatible client pointed at Groq.
        """
        self.client = chromadb.PersistentClient(path=chroma_path)
        self.collection = self.client.get_collection(
            name=collection_name,
            # Must match the embedder used when the collection was built.
            embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )
        )
        self.groq_client = groq_client
        self.model_name = "llama3-8b-8192"

    def verify_claim(self, claim, confidence_threshold=0.5):
        """Verify *claim* against stored evidence.

        Returns a dict with ``verdict``/``confidence``/``evidence``/``reasoning``
        on success, or ``error``/``confidence``/``raw_response`` when the LLM
        output cannot be parsed.
        """
        # Vector search returns full verified statements with distances.
        results = self.collection.query(
            query_texts=[claim],
            n_results=3,
            include=["documents", "metadatas", "distances"]
        )

        # Pair documents with their distances and sort ascending
        # (most similar first).
        zipped_results = sorted(
            zip(results['documents'][0], results['metadatas'][0], results['distances'][0]),
            key=lambda x: x[2]
        )

        # FIX: an empty collection (or zero matches) previously crashed with
        # ZeroDivisionError when averaging distances below.
        if not zipped_results:
            return {
                "verdict": "Unverifiable",
                "confidence": 0.0,
                "evidence": [],
                "reasoning": "No reference statements were found in the knowledge base"
            }

        # Format evidence with similarity scores (full sentences, not fragments).
        evidence = []
        for doc, meta, distance in zipped_results:
            source = meta["source"] if meta and "source" in meta else "Unknown source"
            # Convert distance to similarity (assumes cosine distance in
            # [0, 2] — TODO confirm the collection's distance metric).
            similarity_score = 1 - (distance / 2)
            evidence.append(
                f'"{doc}" (Source: {source}, Similarity: {similarity_score:.2f})'
            )

        # Overall confidence: mean distance normalised to the 0-1 range.
        avg_distance = sum(d for _, _, d in zipped_results) / len(zipped_results)
        confidence = 1 - (avg_distance / 2)

        # Below the threshold we do not consult the LLM at all.
        if confidence < confidence_threshold:
            return {
                "verdict": "Unverifiable",
                "confidence": confidence,
                "evidence": [e.split(" (Source:")[0] for e in evidence],  # Cleaned evidence
                "reasoning": "Claim is too vague or lacks sufficient evidence"
            }

        # LLM verification with distance-aware prompt.
        evidence_str = "\n".join([f"- {e}" for e in evidence])
        prompt = f""" You are a powerful fact checker. Analyze the claim below against the provided verified information.
Relying on the similarity scores, also carefully check whether all factual details in the claim (such as dates, names, locations, and events) exactly match the evidence.
If there is any factual mismatch (for example, the date in the claim is different from the evidence), classify the claim as False. Any factual mismatch, even if the overall context is similar, should lead to a False classification.
If the evidence is too vague or lacks strong matches, classify as Unverifiable.
If evidence directly contradicts the claim, classify as False.
Any discrepancy in factual details, even if the overall context is similar, should lead to a False classification.
If the evidence fully supports the claim with all factual details matching, classify as True.

Claim:
{claim}

Evidence (with similarity scores):
{evidence_str}

Guidelines:
1. Give more weight to evidence with higher similarity scores, but do not ignore factual mismatches.
2. Pay close attention to details such as dates, names, locations, and events.
3. If the claim and evidence differ on any factual point, do not classify as True.
4. Respond only in JSON format without any additional text.
5. In the "evidence" array, include only full evidence statements as strings, without any extra comments or explanations.
6. Put all explanations or comparisons in the "reasoning" field.

Respond in JSON format:
{{
"verdict": "Verdict",
"evidence": [List of relevant facts from provided evidence],
"reasoning": "Explanation of the verdict based on evidence and factual details"
}}
"""

        completion = self.groq_client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,  # near-deterministic judging
            max_tokens=400
        )

        # Process response.
        response_content = completion.choices[0].message.content
        print(f"Response from Groq: {response_content}")

        # Use the robust JSON extractor (handles fenced / slightly broken JSON).
        parsed = robust_json_extractor(response_content)
        print(f"Parsed JSON: {parsed}")

        if "error" in parsed:
            return {
                "error": parsed["error"],
                "confidence": confidence,
                "raw_response": parsed.get("raw", response_content)
            }

        # Validate required fields before trusting the LLM output.
        required_keys = ["verdict", "evidence", "reasoning"]
        if all(key in parsed for key in required_keys):
            return {
                "verdict": parsed["verdict"],
                "confidence": confidence,
                # Return retrieval evidence (source suffix stripped), not the
                # LLM's own evidence list, so output stays grounded.
                "evidence": [e.split(" (Source:")[0] for e in evidence],
                "reasoning": parsed["reasoning"]
            }
        return {
            "error": f"Missing required keys: {[k for k in required_keys if k not in parsed]}",
            "confidence": confidence,
            "raw_response": response_content
        }
|
feedback_log.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
2025-06-10 17:56:57,Hi,Unverifiable,0.2471650640169779,"""The Minister of State for Finance, Shri Pankaj Chaudhary addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""|""The Minister of State for Finance, Shri Pankaj Chaudhary lighting the lamp at the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025. The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman also seen.""|""The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""",Claim is too vague or lacks sufficient evidence,👍 Yes
|
2 |
+
2025-06-10 17:57:49,"Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman inaugurates SPMCIL's New Corporate Office at World Trade Centre, Nauroji Nagar, New Delhi, today",True,0.8596701025962078,"""Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman inaugurates SPMCIL's New Corporate Office at World Trade Centre, Nauroji Nagar, New Delhi, today""|""The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""|""The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary to inaugurate the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.""","The claim matches the evidence in all factual details, including the name of the minister, the location, and the event. The similarity scores are high, indicating a strong match. There are no discrepancies in dates, names, locations, or events, which confirms the claim as True.",👎 No
|
pib_titles.csv
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
title,source
|
2 |
+
"Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman inaugurates SPMCIL's New Corporate Office at World Trade Centre, Nauroji Nagar, New Delhi, today",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135268
|
3 |
+
Union Minister Dr L. Murugan inaugurates NeVA Digital Platform for Puducherry Legislative Assembly,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135180
|
4 |
+
World today looks up to India to overcome the global challenges: Dr Jitendra Singh,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135185
|
5 |
+
"The Consulate General of India celebrated the fifth Curtain Raiser for the International Day of Yoga 2025, in collaboration with the Brahma Kumaris a unique, spiritual, value-based educational institution at Perth, in Australia on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185210
|
6 |
+
"Swami Vivekananda Cultural Centre (SVCC), Bali in collaboration with Genta Yoga Pemogan community in Denpasar organized the 36th pre-event of IDY2025. The session highlighted the power of connection, discipline, and tradition through a shared yoga practice, in Indonesia on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185211
|
7 |
+
"The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185229
|
8 |
+
The Office of Principal Scientific Adviser to Government of India organised the first State/UTs engagement workshop under the National One Health Mission,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135236
|
9 |
+
Earth Sciences Minister Dr. Jitendra Singh and the UNESCO emissary Vidar Helgesen hold bilateral with focus on conservation of oceans,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135183
|
10 |
+
Lok Samvardhan Parv Commemorating Completion of 11 years of Government to begin tomorrow at Raj Ghat,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135292
|
11 |
+
"The Minister of State for Finance, Shri Pankaj Chaudhary addressing at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185230
|
12 |
+
"The Minister of State for Finance, Shri Pankaj Chaudhary lighting the lamp at the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025. The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman also seen.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185227
|
13 |
+
"The Union Minister of Communications and Development of North Eastern Region (DoNER), Shri Jyotiraditya M. Scindia addressing at unveil the theme for 9th Edition of India Mobile Congress 2025: ‘Innovate to Transform’ and launch the Student Volunteers led ‘Sanchar Mitra Scheme’ for citizen awareness, in New Delhi on May 26, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=184023
|
14 |
+
PM shares an article on how India's Technical Textiles Sector is witnessing rapid growth,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135306
|
15 |
+
"The Minister of State for Road Transport and Highways and Corporate Affairs, Shri Harsh Malhotra lighting the lamp at Poshan Abhiyan, in Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185195
|
16 |
+
"The Minister of State for Road Transport and Highways and Corporate Affairs, Shri Harsh Malhotra addressing at Poshan Abhiyan, in Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185197
|
17 |
+
From Drafting Legislations to Saving Lives: Legislative Department Organizes Blood Donation Camp on World Blood Donor Day 2025,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135243
|
18 |
+
Raksha Rajya Mantri visits Shimla-based Army Training Command,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135241
|
19 |
+
Delhi Chief Minister meets Prime Minister,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135330
|
20 |
+
"Following the announcement by Prime Minister Shri Narendra Modi regarding additional compensation for houses damaged in border areas, Union Home Minister Shri Amit Shah ensures swift action by facilitating an additional provision of ₹25 crore from the Ministry of Home Affairs for 2,060 houses",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135226
|
21 |
+
"Prime Minister hails unprecedented growth of India's Defence sector in last 11 Years, strengthening Self-Reliance and Modernisation",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135281
|
22 |
+
"During their port call in Chennai, the ships of Eastern Sword Sunrise Fleet experienced a rejuvenating yoga session, embracing holistic wellness. The session fostered harmony, resilience, and personal health, while promoting self-awareness and focus among the crew on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185247
|
23 |
+
"The Embassy of India in Berlin, Germany and The Tagore Cultural Centre in Berlin, Germany held a calm and grounding yoga session in front of Berlin’s Reichstag - Germany’s historic Parliament building and a symbol of democracy and unity on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185206
|
24 |
+
Union Commerce and Industry Minister Shri Piyush Goyal Engages with Swiss Industry to Deepen India–Switzerland Economic Partnership,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135293
|
25 |
+
"The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary at the Corporate Office of Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185234
|
26 |
+
"On the occasion of Harit Yoga, the spirited team of students, staff, and doctors of National Institute of Naturopathy, Pune embarked on a rejuvenating Yoga Trek to Sinhagad Fort blending the serenity of yoga with the strength of nature and the joy of togetherness on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185249
|
27 |
+
"The Embassy of India in Berlin, Germany and The Tagore Cultural Centre in Berlin, Germany held a calm and grounding yoga session in front of Berlin’s Reichstag - Germany’s historic Parliament building and a symbol of democracy and unity on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185204
|
28 |
+
"On the occasion of Harit Yoga, the spirited team of students, staff, and doctors of National Institute of Naturopathy, Pune embarked on a rejuvenating Yoga Trek to Sinhagad Fort blending the serenity of yoga with the strength of nature and the joy of togetherness on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185248
|
29 |
+
"On the occasion of Harit Yoga, the spirited team of students, staff, and doctors of National Institute of Naturopathy, Pune embarked on a rejuvenating Yoga Trek to Sinhagad Fort blending the serenity of yoga with the strength of nature and the joy of togetherness on June 10, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185250
|
30 |
+
"The Minister of State for Defence, Shri Sanjay Seth visited the Army Training Command (ARTRAC) based at Shimla, in Himachal Pradesh on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185243
|
31 |
+
Shri Dharmendra Pradhan chairs Global Young Scientists Conference and Annual General Meeting of the Global Young Academy,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135357
|
32 |
+
"The Embassy of India in Berlin, Germany and The Tagore Cultural Centre in Berlin, Germany held a calm and grounding yoga session in front of Berlin’s Reichstag - Germany’s historic Parliament building and a symbol of democracy and unity on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185205
|
33 |
+
Union Agriculture Minister Shri Shivraj Singh Chouhan Meets Farmers in Telangana,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135232
|
34 |
+
"The Union Minister for Finance and Corporate Affairs, Smt. Nirmala Sitharaman and The Minister of State for Finance, Shri Pankaj Chaudhary to inaugurate the Corporate Office of the Security Printing and Minting Corporation of India (SPMCIL), followed by award ceremony, in New Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185224
|
35 |
+
"The Minister of State for Defence, Shri Sanjay Seth visited the Army Training Command (ARTRAC) based at Shimla, in Himachal Pradesh on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185244
|
36 |
+
PM shares an article highlighting India’s Digital Connectivity Revolution in last 11 years,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135354
|
37 |
+
BHASHINI and CRIS Sign MoU to Build Next-Gen Multilingual AI Solutions for Indian Railways,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135178
|
38 |
+
PM shares an article highlighting expansion of India's sports infrastructure in last 11 years,https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135308
|
39 |
+
"The Minister of State for Road Transport and Highways and Corporate Affairs, Shri Harsh Malhotra addressing at Poshan Abhiyan, in Delhi on June 09, 2025.",https://pib.gov.in/FrontPhotoGallery.aspx?CategoryId=185198
|
40 |
+
KEEL LAYING OF FOURTH EX-GSL NGOPV (YARD 1283),https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135188
|
41 |
+
"Union Minister for Finance and Corporate Affairs Smt. Nirmala Sitharaman chairs 6th meeting of the Governing Council of National Investment and Infrastructure Fund (NIIF) in New Delhi, today",https://pib.gov.in/PressReleaseIframePage.aspx?PRID=2135239
|
requirements.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
requests==2.32.4
|
2 |
+
beautifulsoup4==4.13.4
|
3 |
+
lxml==4.9.3
|
4 |
+
chromadb
|
5 |
+
sentence-transformers
|
6 |
+
cryptography
|
7 |
+
openai
|
8 |
+
streamlit
|
9 |
+
python-dotenv
|
scrape_chroma.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
import chromadb
|
5 |
+
from chromadb.utils import embedding_functions
|
6 |
+
import gc
|
7 |
+
import csv
|
8 |
+
|
9 |
+
# === CONFIGURATION ===
|
10 |
+
CHROMA_PATH = "chroma_db"
|
11 |
+
COLLECTION_NAME = "pib_titles"
|
12 |
+
|
13 |
+
def save_titles_to_csv(titles, filename="pib_titles.csv"):
    """Write (title, source) pairs to a CSV file with a header row.

    Args:
        titles: iterable of (title, source_url) tuples; must be sized
            (a list/set), since len() is taken for the summary message.
        filename: destination CSV path; overwritten if it already exists.
    """
    # newline='' is required by the csv module so it controls line endings itself.
    with open(filename, mode="w", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["title", "source"])  # header
        for title, source in titles:
            writer.writerow([title, source])
    # Bug fix: the message previously printed the literal text "(unknown)"
    # instead of the actual destination path.
    print(f"Saved {len(titles)} titles to {filename}")
|
20 |
+
|
21 |
+
def scrape_and_store():
    """Fetch PIB RSS feeds, persist unique titles to CSV, and index them in ChromaDB.

    Side effects: writes pib_titles.csv in the working directory and
    upserts all titles into the persistent ChromaDB collection at CHROMA_PATH.
    """
    RSS_URLS = [
        "https://www.pib.gov.in/RssMain.aspx?ModId=6&Lang=1&Regid=3",
        "https://www.pib.gov.in/RssMain.aspx?ModId=8&Lang=1&Regid=3"
    ]

    # Collect unique (title, link) pairs across both feeds; a set dedupes
    # items that appear in more than one feed.
    unique_pairs = set()
    for feed_url in RSS_URLS:
        try:
            resp = requests.get(feed_url, timeout=10)
            resp.raise_for_status()
            parsed = BeautifulSoup(resp.content, "xml")
            for entry in parsed.find_all("item"):
                title_node = entry.find("title")
                link_node = entry.find("link")
                if title_node and title_node.text and link_node and link_node.text:
                    unique_pairs.add((title_node.text.strip(), link_node.text.strip()))
        except Exception as e:
            # Best-effort: a failing feed is reported but does not abort the run.
            print(f"Error fetching {feed_url}: {e}")

    all_titles_sources = list(unique_pairs)
    print(f"Fetched {len(all_titles_sources)} unique titles.")

    # Persist a plain-text copy alongside the vector store.
    save_titles_to_csv(all_titles_sources, filename="pib_titles.csv")

    # Split the pairs into the parallel lists ChromaDB expects.
    documents = [pair[0] for pair in all_titles_sources]
    metadatas = [{"source": pair[1]} for pair in all_titles_sources]
    ids = [f"title_{idx}" for idx in range(len(all_titles_sources))]

    # Embed with all-MiniLM-L6-v2 and store in the persistent collection.
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
    )
    collection.add(documents=documents, ids=ids, metadatas=metadatas)

    # Drop references and force a GC pass so the on-disk DB files are
    # released before any later encryption step touches them.
    del collection
    del client
    gc.collect()
|
67 |
+
|
68 |
+
# Script entry point: refresh the CSV export and the ChromaDB index in one run.
if __name__ == "__main__":
    scrape_and_store()
    print("Scraping complete. ChromaDB ready for encryption.")
|