import csv
import gc
import os

import requests
from bs4 import BeautifulSoup

import chromadb
from chromadb.utils import embedding_functions

# === CONFIGURATION ===
CHROMA_PATH = "app/chroma_db"
COLLECTION_NAME = "pib_titles"

def save_titles_to_csv(titles, filename="pib_titles.csv"):
    # Create the parent directory if the path has one (the script writes to
    # data/pib_titles.csv below, and open() does not create directories).
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, mode="w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["title", "source"])  # header
        for title, source in titles:
            writer.writerow([title, source])
    print(f"Saved {len(titles)} titles to {filename}")

def scrape_and_store():
    RSS_URLS = [
        "https://www.pib.gov.in/RssMain.aspx?ModId=6&Lang=1&Regid=3",
        "https://www.pib.gov.in/RssMain.aspx?ModId=8&Lang=1&Regid=3"
    ]
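    # Each feed entry is expected to look roughly like:
    #   <item>
    #     <title>Press release headline</title>
    #     <link>https://pib.gov.in/PressReleasePage.aspx?PRID=...</link>
    #   </item>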

    all_titles_sources = set()
    for url in RSS_URLS:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            # The "xml" parser requires the lxml package to be installed.
            soup = BeautifulSoup(response.content, "xml")
            items = soup.find_all("item")
            for item in items:
                title_tag = item.find("title")
                link_tag = item.find("link")
                if title_tag and title_tag.text and link_tag and link_tag.text:
                    all_titles_sources.add((title_tag.text.strip(), link_tag.text.strip()))
        except Exception as e:
            print(f"Error fetching {url}: {e}")

    all_titles_sources = list(all_titles_sources)
    print(f"Fetched {len(all_titles_sources)} unique titles.")

    # Nothing fetched (e.g. both feeds failed): bail out here, since
    # ChromaDB's add() rejects empty input.
    if not all_titles_sources:
        print("No titles fetched; skipping CSV and ChromaDB update.")
        return

    # Save to CSV
    save_titles_to_csv(all_titles_sources, filename="data/pib_titles.csv")

    # Prepare for ChromaDB. IDs are positional (title_0, title_1, ...), so
    # they are only stable within a single run of this script.
    documents = [title for title, source in all_titles_sources]
    metadatas = [{"source": source} for title, source in all_titles_sources]
    ids = [f"title_{i}" for i in range(len(all_titles_sources))]

    # Store in ChromaDB
    client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
    )
    collection.add(documents=documents, ids=ids, metadatas=metadatas)

    # Release the client: drop the references and force a garbage-collection
    # pass so the persistent store's file handles can be freed.
    del collection
    del client
    gc.collect()

if __name__ == "__main__":
    scrape_and_store()
    print("Scraping complete. ChromaDB ready for encryption.")