new space launch
- .DS_Store +0 -0
- Dockerfile +18 -0
- app.py +15 -0
- components/__init__.py +0 -0
- components/fetchers/__init__.py +0 -0
- components/fetchers/google_search.py +16 -0
- components/fetchers/scraper.py +42 -0
- components/gateways/__init__.py +0 -0
- components/indexers/__init__.py +0 -0
- components/indexers/news_indexer.py +37 -0
- components/query_engine/__init__.py +0 -0
- config/__init__.py +0 -0
- config/config.py +17 -0
- data/__init__.py +0 -0
- data/news/finance.txt +0 -0
- data/news/india.txt +0 -0
- data/news/news.txt +0 -0
- data/news/sports.txt +0 -0
- data/news/tech.txt +0 -0
- data/news/world.txt +0 -0
- data/raw/sample.txt +1 -0
- pipeline/__init__.py +0 -0
- pipeline/daily_job.py +0 -0
- pipeline/news_ingest.py +76 -0
- pipeline/query_demo.py +0 -0
- pipeline/summarizer.py +0 -0
- requirements.txt +13 -0
- routes/__init__.py +0 -0
- routes/api/ingest.py +12 -0
- routes/api/query.py +27 -0
- space_app.py +44 -0
- storage/__init__.py +0 -0
- tests/__init__.py +0 -0
.DS_Store
ADDED
Binary file (6.15 kB)
Dockerfile
ADDED
@@ -0,0 +1,18 @@
+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+
+FROM python:3.9
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+# CMD ["python", "space_app.py"]
+
app.py
ADDED
@@ -0,0 +1,15 @@
+from fastapi import FastAPI
+from routes.api import ingest, query
+from llama_index.core.settings import Settings
+
+Settings.llm = None
+
+
+app = FastAPI()
+
+@app.get("/")
+def greet():
+    return {"hello": "world"}
+
+app.include_router(ingest.router)
+app.include_router(query.router)
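A minimal smoke-test sketch for the routes wired up above, assuming the container is running locally on port 7860 (the port from the Dockerfile); the base URL and query text are placeholders, not part of this commit:

import requests

BASE = "http://localhost:7860"  # assumed local instance of the Space

print(requests.get(f"{BASE}/").json())             # expect {"hello": "world"}
print(requests.get(f"{BASE}/ingest-news").json())  # triggers pipeline/news_ingest.py
print(requests.get(f"{BASE}/query-news", params={"q": "Latest tech news?"}).json())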
components/__init__.py
ADDED
File without changes
components/fetchers/__init__.py
ADDED
File without changes
components/fetchers/google_search.py
ADDED
@@ -0,0 +1,16 @@
+import os
+import requests
+from typing import List, Dict
+
+def fetch_google_news(query: str, api_key: str, cse_id: str, num_results: int = 10) -> List[Dict]:
+    url = "https://www.googleapis.com/customsearch/v1"
+    params = {
+        "q": query,
+        "key": api_key,
+        "cx": cse_id,
+        "num": num_results,
+    }
+    response = requests.get(url, params=params)
+    if response.status_code != 200:
+        raise Exception(f"Google News API error: {response.text}")
+    return response.json().get("items", [])
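A usage sketch for fetch_google_news, assuming GOOGLE_API_KEY and GOOGLE_CX_ID are exported as in pipeline/news_ingest.py; the query string is illustrative:

import os
from components.fetchers.google_search import fetch_google_news

items = fetch_google_news(
    "India news",
    api_key=os.environ["GOOGLE_API_KEY"],
    cse_id=os.environ["GOOGLE_CX_ID"],
    num_results=5,
)
for item in items:
    # Custom Search results carry "title" and "link" fields
    print(item.get("title"), "->", item.get("link"))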
components/fetchers/scraper.py
ADDED
@@ -0,0 +1,42 @@
+import requests
+import trafilatura
+from newspaper import Article
+from typing import Optional
+from bs4 import BeautifulSoup
+
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/115.0.0.0 Safari/537.36"
+    )
+}
+
+def clean_text(text: str) -> str:
+    # Remove excess whitespace, ads, and headings
+    cleaned = text.replace("\n", " ").strip()
+    cleaned = BeautifulSoup(cleaned, "html.parser").text  # remove tags
+    cleaned = " ".join(cleaned.split())  # remove multiple spaces
+    return cleaned
+
+def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
+    try:
+        response = requests.get(url, timeout=timeout, headers=HEADERS)
+        if response.status_code == 200:
+            html = response.text
+            extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
+            if extracted and len(extracted.split()) > 100:
+                return clean_text(extracted)
+    except Exception as e:
+        print(f"⚠️ Trafilatura failed for {url}: {e}")
+
+    try:
+        article = Article(url)
+        article.download()
+        article.parse()
+        if article.text and len(article.text.split()) > 100:
+            return clean_text(article.text)
+    except Exception as e:
+        print(f"⚠️ Newspaper3k failed for {url}: {e}")
+
+    return None
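A quick check of the two-stage scraper (trafilatura first, newspaper3k as fallback); the URL is a placeholder, and scrape_url returns None for pages shorter than ~100 words or when both extractors fail:

from components.fetchers.scraper import scrape_url

text = scrape_url("https://example.com/some-news-article")  # placeholder URL
if text:
    print(f"Extracted {len(text.split())} words")
else:
    print("No usable article text extracted")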
components/gateways/__init__.py
ADDED
File without changes
components/indexers/__init__.py
ADDED
File without changes
components/indexers/news_indexer.py
ADDED
@@ -0,0 +1,37 @@
+from llama_index.vector_stores.upstash import UpstashVectorStore
+from llama_index.core.storage.storage_context import StorageContext
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, load_index_from_storage
+from llama_index.core.node_parser import SimpleNodeParser
+from llama_index.core.settings import Settings
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+import os
+
+Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+def get_upstash_vector_store():
+    return UpstashVectorStore(
+        url=os.environ["UPSTASH_VECTOR_REST_URL"],
+        token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
+    )
+
+def build_news_index(data_dir: str) -> VectorStoreIndex:
+    documents = SimpleDirectoryReader(data_dir).load_data()
+    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
+
+    vector_store = get_upstash_vector_store()
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+    index = VectorStoreIndex(nodes, storage_context=storage_context)
+    return index
+
+def load_news_index() -> VectorStoreIndex:
+    vector_store = get_upstash_vector_store()
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    return load_index_from_storage(storage_context)
+
+def get_or_build_index(data_dir: str) -> VectorStoreIndex:
+    # This should check if the index already exists in Upstash
+    try:
+        return load_news_index()
+    except Exception:
+        return build_news_index(data_dir)
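A retrieval-only sketch of the indexer API above, assuming UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN are set and data/news/ contains scraped articles; importing the module downloads the MiniLM embedding model on first use:

from llama_index.core.settings import Settings
from components.indexers.news_indexer import get_or_build_index

Settings.llm = None  # retrieval only, as in app.py and space_app.py
index = get_or_build_index("data/news")
engine = index.as_query_engine()
print(engine.query("What are today's finance headlines?"))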
components/query_engine/__init__.py
ADDED
File without changes
config/__init__.py
ADDED
File without changes
config/config.py
ADDED
@@ -0,0 +1,17 @@
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+GOOGLE_CSE_ID = os.getenv("GOOGLE_CX_ID")
+
+DATA_DIR = "data/raw"
+INDEX_DIR = "storage/index"
+
+DEFAULT_QUERIES = [
+    "India politics news",
+    "Global finance",
+    "Tech trends",
+    "Sports highlights",
+]
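config/config.py does not appear to be imported by the other files in this commit (news_ingest.py reads the environment directly); a purely illustrative sketch of how it could be consumed:

from config.config import GOOGLE_API_KEY, GOOGLE_CSE_ID, DEFAULT_QUERIES

if not (GOOGLE_API_KEY and GOOGLE_CSE_ID):
    raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in .env")
for query in DEFAULT_QUERIES:
    print("Would fetch:", query)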
data/__init__.py
ADDED
File without changes
data/news/finance.txt
ADDED
File without changes
data/news/india.txt
ADDED
File without changes
data/news/news.txt
ADDED
File without changes
data/news/sports.txt
ADDED
File without changes
data/news/tech.txt
ADDED
File without changes
data/news/world.txt
ADDED
File without changes
data/raw/sample.txt
ADDED
@@ -0,0 +1 @@
+The Indian government has launched an AI task force to promote innovation in the public sector.
pipeline/__init__.py
ADDED
File without changes
pipeline/daily_job.py
ADDED
File without changes
pipeline/news_ingest.py
ADDED
@@ -0,0 +1,76 @@
+import sys
+import os
+import json
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+from components.indexers.news_indexer import get_or_build_index
+from components.fetchers.google_search import fetch_google_news
+from components.fetchers.scraper import scrape_url
+from llama_index.core.settings import Settings
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+# ✅ Set up local embedding model
+Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
+
+# 🔐 Environment variables
+API_KEY = os.environ.get("GOOGLE_API_KEY")
+CSE_ID = os.environ.get("GOOGLE_CX_ID")  # ✅ fixed typo
+
+# ✅ News topics to fetch
+QUERIES = [
+    "India news", "World news", "Tech news", "Finance news", "Sports news"
+]
+
+# ✅ Paths
+INDEX_DIR = "storage/index"
+DATA_DIR = "data/news"
+RAW_FILE = os.path.join(DATA_DIR, "news.txt")
+
+def write_articles_to_file(articles, file_path):
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w", encoding="utf-8") as f:
+        for article in articles:
+            f.write(article.strip() + "\n\n")
+
+if __name__ == "__main__":
+    if not API_KEY or not CSE_ID:
+        raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
+
+    print("🌍 Fetching news URLs from Google...")
+
+    all_articles = []
+
+    for query in QUERIES:
+        print(f"🔍 Searching for: {query}")
+        try:
+            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
+            print(f" → Found {len(results)} links for '{query}'.")
+
+            for item in results:
+                url = item.get("link", "").strip()
+                if not url:
+                    continue
+
+                print(f"🌐 Scraping: {url}")
+                article_text = scrape_url(url)
+
+                if article_text:
+                    tagged_text = f"[{query.upper()}]\n{article_text}"
+                    print("Adding text to vector", tagged_text)
+                    all_articles.append(tagged_text)
+                else:
+                    print(f"⚠️ Skipped: {url}")
+
+        except Exception as e:
+            print(f"❌ Error fetching '{query}': {e}")
+
+    if not all_articles:
+        print("⚠️ No content scraped. Exiting.")
+    else:
+        print(f"📝 Writing {len(all_articles)} articles to {RAW_FILE}...")
+        write_articles_to_file(all_articles, RAW_FILE)
+
+        print("🧠 Building index...")
+        get_or_build_index(DATA_DIR)
+
+        print(f"✅ Indexed and stored at: {INDEX_DIR}")
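A small pytest sketch for write_articles_to_file that could live under the empty tests/ package; pytest is an assumed dev dependency (not in requirements.txt), and importing the module initializes the embedding model, which is slow on first run:

from pipeline.news_ingest import write_articles_to_file

def test_write_articles_to_file(tmp_path):
    target = tmp_path / "news" / "news.txt"
    write_articles_to_file(["[TECH]\nSome article body."], str(target))
    content = target.read_text(encoding="utf-8")
    assert "[TECH]" in content       # topic tag preserved
    assert content.endswith("\n\n")  # articles are double-newline separated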
pipeline/query_demo.py
ADDED
File without changes
pipeline/summarizer.py
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+fastapi
+uvicorn[standard]
+python-dotenv
+requests
+llama-index
+llama-index-embeddings-huggingface
+llama-index-llms-huggingface
+sentence-transformers  # optional, if you use custom embedding
+gradio
+llama-index-vector-stores-upstash
+trafilatura
+newspaper3k
+lxml_html_clean
routes/__init__.py
ADDED
File without changes
routes/api/ingest.py
ADDED
@@ -0,0 +1,12 @@
+from fastapi import APIRouter
+import subprocess
+
+router = APIRouter()
+
+@router.get("/ingest-news")
+def ingest_news():
+    try:
+        subprocess.run(["python", "pipeline/news_ingest.py"], check=True)
+        return {"status": "success", "message": "News fetched and indexed."}
+    except subprocess.CalledProcessError as e:
+        return {"status": "error", "message": str(e)}
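subprocess.run above blocks the HTTP request until scraping and indexing finish; one possible variation (an assumption, not part of this commit) is to hand the job to FastAPI's BackgroundTasks so the route returns immediately:

from fastapi import APIRouter, BackgroundTasks
import subprocess

router = APIRouter()

def _run_ingest():
    subprocess.run(["python", "pipeline/news_ingest.py"], check=True)

@router.get("/ingest-news")
def ingest_news(background_tasks: BackgroundTasks):
    background_tasks.add_task(_run_ingest)  # runs after the response is sent
    return {"status": "accepted", "message": "Ingestion started in the background."}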
routes/api/query.py
ADDED
@@ -0,0 +1,27 @@
+from fastapi import APIRouter, HTTPException
+from llama_index.core.indices.base import BaseIndex
+from llama_index.core.query_engine import BaseQueryEngine
+from components.indexers.news_indexer import get_or_build_index
+import os
+
+router = APIRouter()
+
+DATA_PATH = "data/raw"
+INDEX_PATH = "storage/index"
+
+_index: BaseIndex = None
+
+def get_index() -> BaseIndex:
+    global _index
+    if _index is None:
+        if not os.path.exists(DATA_PATH):
+            raise RuntimeError("❌ `data/raw/` is missing! Add documents for indexing.")
+        _index = get_or_build_index(DATA_PATH)
+    return _index
+
+@router.get("/query-news")
+def query_news(q: str):
+    index = get_index()
+    query_engine: BaseQueryEngine = index.as_query_engine()
+    response = query_engine.query(q)
+    return {"response": str(response)}
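An in-process check of the route using FastAPI's TestClient (requires httpx, which is not pinned in requirements.txt); it still needs the Upstash credentials and data/raw/ present, since get_index() builds or loads the real index:

from fastapi.testclient import TestClient
from app import app

client = TestClient(app)
resp = client.get("/query-news", params={"q": "India news"})
print(resp.status_code, resp.json())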
space_app.py
ADDED
@@ -0,0 +1,44 @@
+import os
+import gradio as gr
+from components.indexers.news_indexer import (
+    build_news_index,
+    load_news_index
+)
+from llama_index.core.query_engine import RetrieverQueryEngine
+from llama_index.core.settings import Settings
+
+# 💥 Block OpenAI LLM usage
+Settings.llm = None
+
+DATA_DIR = "data/raw"
+INDEX_DIR = "storage/index"
+
+# Create dummy file if needed
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR, exist_ok=True)
+    with open(os.path.join(DATA_DIR, "sample.txt"), "w") as f:
+        f.write("Sample news: India’s Prime Minister spoke today about the tech economy boom.")
+
+# Build index if missing
+if not os.path.exists(os.path.join(INDEX_DIR, "docstore.json")):
+    print("📦 Index not found — building it now...")
+    build_news_index(data_dir=DATA_DIR)
+
+# Load index safely
+print("📥 Loading index...")
+index = load_news_index()
+query_engine = index.as_query_engine()
+
+def query_news(question):
+    response = query_engine.query(question)
+    return str(response)
+
+iface = gr.Interface(
+    fn=query_news,
+    inputs=gr.Textbox(label="Ask about the news"),
+    outputs=gr.Textbox(label="Answer"),
+    title="LucidFeed: News Assistant",
+    description="Ask questions about the latest headlines."
+)
+
+iface.launch()
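If the commented-out Dockerfile CMD (python space_app.py) were ever used instead of uvicorn, the Gradio launch would need to bind to the port the Space exposes; a possible variant of the last line, under that assumption:

iface.launch(server_name="0.0.0.0", server_port=7860)  # assumed Spaces host/port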
storage/__init__.py
ADDED
File without changes
tests/__init__.py
ADDED
File without changes