updated ingestion
pipeline/news_ingest.py  CHANGED  +37 -13
@@ -1,6 +1,8 @@
 import sys
 import os
 import json
+from typing import List, Dict
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

 from components.indexers.news_indexer import get_or_build_index
@@ -8,6 +10,7 @@ from components.fetchers.google_search import fetch_google_news
 from components.fetchers.scraper import scrape_url
 from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.schema import Document

 # ✅ Set up local embedding model
 Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
@@ -24,40 +27,60 @@ QUERIES = [
 # ✅ Paths
 INDEX_DIR = "storage/index"
 DATA_DIR = "data/news"
-…
+RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")

-def …
+def write_articles_jsonl(articles: List[Dict], file_path: str):
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
     with open(file_path, "w", encoding="utf-8") as f:
         for article in articles:
-            f.write(…
+            f.write(json.dumps(article, ensure_ascii=False) + "\n")
+
+def build_documents(data: List[Dict]) -> List[Document]:
+    return [
+        Document(
+            text=entry["content"],
+            metadata={
+                "title": entry["title"],
+                "url": entry["url"],
+                "topic": entry["topic"],
+                "source": entry["source"]
+            }
+        )
+        for entry in data
+    ]

 if __name__ == "__main__":
     if not API_KEY or not CSE_ID:
         raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")

-    print("…
+    print("\U0001F30D Fetching news URLs from Google...")

     all_articles = []

     for query in QUERIES:
-        print(f"…
+        print(f"\U0001F50D Searching for: {query}")
         try:
             results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
             print(f" ✅ Found {len(results)} links for '{query}'.")

             for item in results:
                 url = item.get("link", "").strip()
-…
+                title = item.get("title", "").strip()
+                source = item.get("displayLink", "").strip()
+                if not url or not title:
                     continue

-                print(f"…
+                print(f"\U0001F310 Scraping: {url}")
                 article_text = scrape_url(url)

                 if article_text:
-…
-…
-…
+                    all_articles.append({
+                        "topic": query,
+                        "title": title,
+                        "url": url,
+                        "source": source,
+                        "content": article_text
+                    })
                 else:
                     print(f"⚠️ Skipped: {url}")

@@ -67,10 +90,11 @@ if __name__ == "__main__":
     if not all_articles:
         print("⚠️ No content scraped. Exiting.")
     else:
-        print(f"📝 Writing {len(all_articles)} articles to {…
-…
+        print(f"📝 Writing {len(all_articles)} articles to {RAW_JSON}...")
+        write_articles_jsonl(all_articles, RAW_JSON)

         print("🧠 Building index...")
-…
+        documents = build_documents(all_articles)
+        get_or_build_index(documents)

         print(f"✅ Indexed and stored at: {INDEX_DIR}")
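
For reference, here is a minimal, hypothetical sketch (not part of this commit) of how the data/news/news.jsonl file written by write_articles_jsonl could be loaded back and re-indexed without re-fetching or re-scraping. It assumes the sketch lives next to pipeline/news_ingest.py, that each line carries the keys written above (topic, title, url, source, content), and that get_or_build_index accepts a list of Document objects, as it is called in this diff; load_articles_jsonl is an illustrative helper, not an existing repo function.

import json
import os
import sys
from typing import Dict, List

# Same repo-root path hack as pipeline/news_ingest.py so `components` resolves.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from components.indexers.news_indexer import get_or_build_index
from llama_index.core.schema import Document
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Same local embedding model the ingestion script configures.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
)


def load_articles_jsonl(file_path: str) -> List[Dict]:
    # Mirror of write_articles_jsonl: one JSON-encoded article per line.
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


if __name__ == "__main__":
    articles = load_articles_jsonl("data/news/news.jsonl")
    documents = [
        Document(
            text=entry["content"],
            metadata={
                "title": entry["title"],
                "url": entry["url"],
                "topic": entry["topic"],
                "source": entry["source"],
            },
        )
        for entry in articles
    ]
    get_or_build_index(documents)
    print(f"Re-indexed {len(documents)} stored articles.")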