ragV98 committed
Commit 6d24925 · Parent(s): cb778a1

new space launch

.DS_Store ADDED
Binary file (6.15 kB).
 
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ # CMD ["python", "space_app.py"]
+
app.py ADDED
@@ -0,0 +1,15 @@
+ from fastapi import FastAPI
+ from routes.api import ingest, query
+ from llama_index.core.settings import Settings
+
+ Settings.llm = None
+
+
+ app = FastAPI()
+
+ @app.get("/")
+ def greet():
+     return {"hello": "world"}
+
+ app.include_router(ingest.router)
+ app.include_router(query.router)
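
For reference, a minimal client sketch for these endpoints (assuming the app is served locally on port 7860, as in the Dockerfile; a deployed Space would use its own host):

import requests

BASE = "http://localhost:7860"  # assumed local dev URL

print(requests.get(f"{BASE}/").json())             # -> {"hello": "world"}
print(requests.get(f"{BASE}/ingest-news").json())  # runs the full ingest pipeline (slow)
print(requests.get(f"{BASE}/query-news", params={"q": "Any tech news today?"}).json())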
components/__init__.py ADDED
File without changes
components/fetchers/__init__.py ADDED
File without changes
components/fetchers/google_search.py ADDED
@@ -0,0 +1,16 @@
+ import os
+ import requests
+ from typing import List, Dict
+
+ def fetch_google_news(query: str, api_key: str, cse_id: str, num_results: int = 10) -> List[Dict]:
+     url = "https://www.googleapis.com/customsearch/v1"
+     params = {
+         "q": query,
+         "key": api_key,
+         "cx": cse_id,
+         "num": num_results,
+     }
+     response = requests.get(url, params=params)
+     if response.status_code != 200:
+         raise Exception(f"Google News API error: {response.text}")
+     return response.json().get("items", [])
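
A quick usage sketch (assuming GOOGLE_API_KEY and GOOGLE_CX_ID are set in the environment, matching config/config.py):

import os
from components.fetchers.google_search import fetch_google_news

items = fetch_google_news("India news", os.environ["GOOGLE_API_KEY"], os.environ["GOOGLE_CX_ID"], num_results=5)
for item in items:
    print(item.get("title"), "->", item.get("link"))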
components/fetchers/scraper.py ADDED
@@ -0,0 +1,44 @@
+ import requests
+ import trafilatura
+ from newspaper import Article
+ from typing import Optional
+ from bs4 import BeautifulSoup
+
+ HEADERS = {
+     "User-Agent": (
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+         "AppleWebKit/537.36 (KHTML, like Gecko) "
+         "Chrome/115.0.0.0 Safari/537.36"
+     )
+ }
+
+ def clean_text(text: str) -> str:
+     # Strip leftover HTML tags and collapse whitespace runs
+     cleaned = text.replace("\n", " ").strip()
+     cleaned = BeautifulSoup(cleaned, "html.parser").text  # remove tags
+     cleaned = " ".join(cleaned.split())  # remove multiple spaces
+     return cleaned
+
+ def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
+     # First attempt: trafilatura, which handles boilerplate removal well
+     try:
+         response = requests.get(url, timeout=timeout, headers=HEADERS)
+         if response.status_code == 200:
+             html = response.text
+             extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
+             if extracted and len(extracted.split()) > 100:
+                 return clean_text(extracted)
+     except Exception as e:
+         print(f"⚠️ Trafilatura failed for {url}: {e}")
+
+     # Fallback: newspaper3k's own downloader and parser
+     try:
+         article = Article(url)
+         article.download()
+         article.parse()
+         if article.text and len(article.text.split()) > 100:
+             return clean_text(article.text)
+     except Exception as e:
+         print(f"⚠️ Newspaper3k failed for {url}: {e}")
+
+     return None
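
The function tries trafilatura first and falls back to newspaper3k, returning None when neither yields at least 100 words. A usage sketch (the URL is a placeholder):

from components.fetchers.scraper import scrape_url

text = scrape_url("https://example.com/some-article")  # placeholder URL
print(text[:300] if text else "Both extractors failed or the article was too short")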
components/gateways/__init__.py ADDED
File without changes
components/indexers/__init__.py ADDED
File without changes
components/indexers/news_indexer.py ADDED
@@ -0,0 +1,38 @@
+ from llama_index.vector_stores.upstash import UpstashVectorStore
+ from llama_index.core.storage.storage_context import StorageContext
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+ from llama_index.core.node_parser import SimpleNodeParser
+ from llama_index.core.settings import Settings
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ import os
+
+ Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ def get_upstash_vector_store():
+     return UpstashVectorStore(
+         url=os.environ["UPSTASH_VECTOR_REST_URL"],
+         token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
+     )
+
+ def build_news_index(data_dir: str) -> VectorStoreIndex:
+     documents = SimpleDirectoryReader(data_dir).load_data()
+     nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
+
+     vector_store = get_upstash_vector_store()
+     storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+     index = VectorStoreIndex(nodes, storage_context=storage_context)
+     return index
+
+ def load_news_index() -> VectorStoreIndex:
+     # The index lives in Upstash, so rebuild the in-memory wrapper directly
+     # from the remote vector store rather than from local storage files.
+     vector_store = get_upstash_vector_store()
+     return VectorStoreIndex.from_vector_store(vector_store)
+
+ def get_or_build_index(data_dir: str) -> VectorStoreIndex:
+     # Prefer the existing remote index; fall back to building from local data.
+     try:
+         return load_news_index()
+     except Exception:
+         return build_news_index(data_dir)
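
A usage sketch (assuming UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN are exported, and that data/news holds the ingested text files):

from components.indexers.news_indexer import get_or_build_index

index = get_or_build_index("data/news")
engine = index.as_query_engine()
print(engine.query("What are the latest finance headlines?"))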
components/query_engine/__init__.py ADDED
File without changes
config/__init__.py ADDED
File without changes
config/config.py ADDED
@@ -0,0 +1,17 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+ GOOGLE_CSE_ID = os.getenv("GOOGLE_CX_ID")
+
+ DATA_DIR = "data/raw"
+ INDEX_DIR = "storage/index"
+
+ DEFAULT_QUERIES = [
+     "India politics news",
+     "Global finance",
+     "Tech trends",
+     "Sports highlights",
+ ]
data/__init__.py ADDED
File without changes
data/news/finance.txt ADDED
File without changes
data/news/india.txt ADDED
File without changes
data/news/news.txt ADDED
File without changes
data/news/sports.txt ADDED
File without changes
data/news/tech.txt ADDED
File without changes
data/news/world.txt ADDED
File without changes
data/raw/sample.txt ADDED
@@ -0,0 +1 @@
+ The Indian government has launched an AI task force to promote innovation in the public sector.
pipeline/__init__.py ADDED
File without changes
pipeline/daily_job.py ADDED
File without changes
pipeline/news_ingest.py ADDED
@@ -0,0 +1,76 @@
+ import sys
+ import os
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+ from components.indexers.news_indexer import get_or_build_index
+ from components.fetchers.google_search import fetch_google_news
+ from components.fetchers.scraper import scrape_url
+ from llama_index.core.settings import Settings
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+ # ✅ Use the same local embedding model as the indexer, so query-time
+ # embeddings match the vectors stored during ingestion
+ Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ # 🔐 Environment variables
+ API_KEY = os.environ.get("GOOGLE_API_KEY")
+ CSE_ID = os.environ.get("GOOGLE_CX_ID")
+
+ # ✅ News topics to fetch
+ QUERIES = [
+     "India news", "World news", "Tech news", "Finance news", "Sports news"
+ ]
+
+ # ✅ Paths
+ INDEX_DIR = "storage/index"
+ DATA_DIR = "data/news"
+ RAW_FILE = os.path.join(DATA_DIR, "news.txt")
+
+ def write_articles_to_file(articles, file_path):
+     os.makedirs(os.path.dirname(file_path), exist_ok=True)
+     with open(file_path, "w", encoding="utf-8") as f:
+         for article in articles:
+             f.write(article.strip() + "\n\n")
+
+ if __name__ == "__main__":
+     if not API_KEY or not CSE_ID:
+         raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
+
+     print("🌍 Fetching news URLs from Google...")
+
+     all_articles = []
+
+     for query in QUERIES:
+         print(f"🔍 Searching for: {query}")
+         try:
+             results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
+             print(f"  → Found {len(results)} links for '{query}'.")
+
+             for item in results:
+                 url = item.get("link", "").strip()
+                 if not url:
+                     continue
+
+                 print(f"🌐 Scraping: {url}")
+                 article_text = scrape_url(url)
+
+                 if article_text:
+                     tagged_text = f"[{query.upper()}]\n{article_text}"
+                     print(f"  Adding article to corpus ({len(tagged_text)} chars)")
+                     all_articles.append(tagged_text)
+                 else:
+                     print(f"⚠️ Skipped: {url}")
+
+         except Exception as e:
+             print(f"❌ Error fetching '{query}': {e}")
+
+     if not all_articles:
+         print("⚠️ No content scraped. Exiting.")
+     else:
+         print(f"📝 Writing {len(all_articles)} articles to {RAW_FILE}...")
+         write_articles_to_file(all_articles, RAW_FILE)
+
+         print("🧠 Building index...")
+         get_or_build_index(DATA_DIR)
+
+         print(f"✅ Indexed and stored at: {INDEX_DIR}")
pipeline/query_demo.py ADDED
File without changes
pipeline/summarizer.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ fastapi
+ uvicorn[standard]
+ python-dotenv
+ requests
+ beautifulsoup4
+ llama-index
+ llama-index-embeddings-huggingface
+ llama-index-llms-huggingface
+ sentence-transformers  # used by the local embedding models
+ gradio
+ llama-index-vector-stores-upstash
+ trafilatura
+ newspaper3k
+ lxml_html_clean
routes/__init__.py ADDED
File without changes
routes/api/ingest.py ADDED
@@ -0,0 +1,14 @@
+ from fastapi import APIRouter
+ import subprocess
+ import sys
+
+ router = APIRouter()
+
+ @router.get("/ingest-news")
+ def ingest_news():
+     try:
+         # sys.executable is more reliable than a bare "python" inside the container
+         subprocess.run([sys.executable, "pipeline/news_ingest.py"], check=True)
+         return {"status": "success", "message": "News fetched and indexed."}
+     except subprocess.CalledProcessError as e:
+         return {"status": "error", "message": str(e)}
routes/api/query.py ADDED
@@ -0,0 +1,28 @@
+ from fastapi import APIRouter
+ from llama_index.core.indices.base import BaseIndex
+ from llama_index.core.query_engine import BaseQueryEngine
+ from components.indexers.news_indexer import get_or_build_index
+ from typing import Optional
+ import os
+
+ router = APIRouter()
+
+ DATA_PATH = "data/raw"
+
+ _index: Optional[BaseIndex] = None
+
+ def get_index() -> BaseIndex:
+     # Lazily build the index on first request and cache it afterwards
+     global _index
+     if _index is None:
+         if not os.path.exists(DATA_PATH):
+             raise RuntimeError("❌ `data/raw/` is missing! Add documents for indexing.")
+         _index = get_or_build_index(DATA_PATH)  # takes only the data directory
+     return _index
+
+ @router.get("/query-news")
+ def query_news(q: str):
+     index = get_index()
+     query_engine: BaseQueryEngine = index.as_query_engine()
+     response = query_engine.query(q)
+     return {"response": str(response)}
space_app.py ADDED
@@ -0,0 +1,34 @@
+ import os
+ import gradio as gr
+ from components.indexers.news_indexer import get_or_build_index
+ from llama_index.core.settings import Settings
+
+ # 💥 Block OpenAI LLM usage
+ Settings.llm = None
+
+ DATA_DIR = "data/raw"
+
+ # Create dummy file if needed
+ if not os.path.exists(DATA_DIR):
+     os.makedirs(DATA_DIR, exist_ok=True)
+     with open(os.path.join(DATA_DIR, "sample.txt"), "w") as f:
+         f.write("Sample news: India’s Prime Minister spoke today about the tech economy boom.")
+
+ # Load the remote index, building it from local data on first run
+ print("📥 Loading index...")
+ index = get_or_build_index(DATA_DIR)
+ query_engine = index.as_query_engine()
+
+ def query_news(question):
+     response = query_engine.query(question)
+     return str(response)
+
+ iface = gr.Interface(
+     fn=query_news,
+     inputs=gr.Textbox(label="Ask about the news"),
+     outputs=gr.Textbox(label="Answer"),
+     title="LucidFeed: News Assistant",
+     description="Ask questions about the latest headlines."
+ )
+
+ iface.launch()
storage/__init__.py ADDED
File without changes
tests/__init__.py ADDED
File without changes