new space launch
- .DS_Store +0 -0
- Dockerfile +18 -0
- app.py +15 -0
- components/__init__.py +0 -0
- components/fetchers/__init__.py +0 -0
- components/fetchers/google_search.py +16 -0
- components/fetchers/scraper.py +42 -0
- components/gateways/__init__.py +0 -0
- components/indexers/__init__.py +0 -0
- components/indexers/news_indexer.py +37 -0
- components/query_engine/__init__.py +0 -0
- config/__init__.py +0 -0
- config/config.py +17 -0
- data/__init__.py +0 -0
- data/news/finance.txt +0 -0
- data/news/india.txt +0 -0
- data/news/news.txt +0 -0
- data/news/sports.txt +0 -0
- data/news/tech.txt +0 -0
- data/news/world.txt +0 -0
- data/raw/sample.txt +1 -0
- pipeline/__init__.py +0 -0
- pipeline/daily_job.py +0 -0
- pipeline/news_ingest.py +76 -0
- pipeline/query_demo.py +0 -0
- pipeline/summarizer.py +0 -0
- requirements.txt +13 -0
- routes/__init__.py +0 -0
- routes/api/ingest.py +12 -0
- routes/api/query.py +27 -0
- space_app.py +44 -0
- storage/__init__.py +0 -0
- tests/__init__.py +0 -0
.DS_Store
ADDED
Binary file (6.15 kB)
Dockerfile
ADDED
@@ -0,0 +1,18 @@
+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+
+FROM python:3.9
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+# CMD ["python", "space_app.py"]
+
app.py
ADDED
@@ -0,0 +1,15 @@
+from fastapi import FastAPI
+from routes.api import ingest, query
+from llama_index.core.settings import Settings
+
+Settings.llm = None
+
+
+app = FastAPI()
+
+@app.get("/")
+def greet():
+    return {"hello": "world"}
+
+app.include_router(ingest.router)
+app.include_router(query.router)
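A minimal smoke-test sketch for the routes wired up above, assuming the container is running locally on port 7860 (the port from the Dockerfile); the base URL and query text are placeholders, not part of this commit:

import requests

BASE = "http://localhost:7860"  # assumed local instance of the Space

print(requests.get(f"{BASE}/").json())             # expect {"hello": "world"}
print(requests.get(f"{BASE}/ingest-news").json())  # triggers pipeline/news_ingest.py
print(requests.get(f"{BASE}/query-news", params={"q": "Latest tech news?"}).json())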
components/__init__.py
ADDED
File without changes
components/fetchers/__init__.py
ADDED
File without changes
components/fetchers/google_search.py
ADDED
@@ -0,0 +1,16 @@
+import os
+import requests
+from typing import List, Dict
+
+def fetch_google_news(query: str, api_key: str, cse_id: str, num_results: int = 10) -> List[Dict]:
+    url = "https://www.googleapis.com/customsearch/v1"
+    params = {
+        "q": query,
+        "key": api_key,
+        "cx": cse_id,
+        "num": num_results,
+    }
+    response = requests.get(url, params=params)
+    if response.status_code != 200:
+        raise Exception(f"Google News API error: {response.text}")
+    return response.json().get("items", [])
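A usage sketch for fetch_google_news, assuming GOOGLE_API_KEY and GOOGLE_CX_ID are exported as in pipeline/news_ingest.py; the query string is illustrative:

import os
from components.fetchers.google_search import fetch_google_news

items = fetch_google_news(
    "India news",
    api_key=os.environ["GOOGLE_API_KEY"],
    cse_id=os.environ["GOOGLE_CX_ID"],
    num_results=5,
)
for item in items:
    # Custom Search results carry "title" and "link" fields
    print(item.get("title"), "->", item.get("link"))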
components/fetchers/scraper.py
ADDED
@@ -0,0 +1,42 @@
+import requests
+import trafilatura
+from newspaper import Article
+from typing import Optional
+from bs4 import BeautifulSoup
+
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/115.0.0.0 Safari/537.36"
+    )
+}
+
+def clean_text(text: str) -> str:
+    # Remove excess whitespace, ads, and headings
+    cleaned = text.replace("\n", " ").strip()
+    cleaned = BeautifulSoup(cleaned, "html.parser").text  # remove tags
+    cleaned = " ".join(cleaned.split())  # remove multiple spaces
+    return cleaned
+
+def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
+    try:
+        response = requests.get(url, timeout=timeout, headers=HEADERS)
+        if response.status_code == 200:
+            html = response.text
+            extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
+            if extracted and len(extracted.split()) > 100:
+                return clean_text(extracted)
+    except Exception as e:
+        print(f"⚠️ Trafilatura failed for {url}: {e}")
+
+    try:
+        article = Article(url)
+        article.download()
+        article.parse()
+        if article.text and len(article.text.split()) > 100:
+            return clean_text(article.text)
+    except Exception as e:
+        print(f"⚠️ Newspaper3k failed for {url}: {e}")
+
+    return None
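A quick check of the two-stage scraper (trafilatura first, newspaper3k as fallback); the URL is a placeholder, and scrape_url returns None for pages shorter than ~100 words or when both extractors fail:

from components.fetchers.scraper import scrape_url

text = scrape_url("https://example.com/some-news-article")  # placeholder URL
if text:
    print(f"Extracted {len(text.split())} words")
else:
    print("No usable article text extracted")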
components/gateways/__init__.py
ADDED
File without changes
components/indexers/__init__.py
ADDED
File without changes
components/indexers/news_indexer.py
ADDED
@@ -0,0 +1,37 @@
+from llama_index.vector_stores.upstash import UpstashVectorStore
+from llama_index.core.storage.storage_context import StorageContext
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, load_index_from_storage
+from llama_index.core.node_parser import SimpleNodeParser
+from llama_index.core.settings import Settings
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+import os
+
+Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+def get_upstash_vector_store():
+    return UpstashVectorStore(
+        url=os.environ["UPSTASH_VECTOR_REST_URL"],
+        token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
+    )
+
+def build_news_index(data_dir: str) -> VectorStoreIndex:
+    documents = SimpleDirectoryReader(data_dir).load_data()
+    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)
+
+    vector_store = get_upstash_vector_store()
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+    index = VectorStoreIndex(nodes, storage_context=storage_context)
+    return index
+
+def load_news_index() -> VectorStoreIndex:
+    vector_store = get_upstash_vector_store()
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    return load_index_from_storage(storage_context)
+
+def get_or_build_index(data_dir: str) -> VectorStoreIndex:
+    # This should check if the index already exists in Upstash
+    try:
+        return load_news_index()
+    except Exception:
+        return build_news_index(data_dir)
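A retrieval-only sketch of the indexer API above, assuming UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN are set and data/news/ contains scraped articles; importing the module downloads the MiniLM embedding model on first use:

from llama_index.core.settings import Settings
from components.indexers.news_indexer import get_or_build_index

Settings.llm = None  # retrieval only, as in app.py and space_app.py
index = get_or_build_index("data/news")
engine = index.as_query_engine()
print(engine.query("What are today's finance headlines?"))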
components/query_engine/__init__.py
ADDED
File without changes
config/__init__.py
ADDED
File without changes
config/config.py
ADDED
@@ -0,0 +1,17 @@
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+GOOGLE_CSE_ID = os.getenv("GOOGLE_CX_ID")
+
+DATA_DIR = "data/raw"
+INDEX_DIR = "storage/index"
+
+DEFAULT_QUERIES = [
+    "India politics news",
+    "Global finance",
+    "Tech trends",
+    "Sports highlights",
+]
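config/config.py does not appear to be imported by the other files in this commit (news_ingest.py reads the environment directly); a purely illustrative sketch of how it could be consumed:

from config.config import GOOGLE_API_KEY, GOOGLE_CSE_ID, DEFAULT_QUERIES

if not (GOOGLE_API_KEY and GOOGLE_CSE_ID):
    raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in .env")
for query in DEFAULT_QUERIES:
    print("Would fetch:", query)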
data/__init__.py
ADDED
File without changes
data/news/finance.txt
ADDED
File without changes
data/news/india.txt
ADDED
File without changes
data/news/news.txt
ADDED
File without changes
data/news/sports.txt
ADDED
File without changes
data/news/tech.txt
ADDED
File without changes
data/news/world.txt
ADDED
File without changes
data/raw/sample.txt
ADDED
@@ -0,0 +1 @@
+The Indian government has launched an AI task force to promote innovation in the public sector.
pipeline/__init__.py
ADDED
File without changes
pipeline/daily_job.py
ADDED
File without changes
pipeline/news_ingest.py
ADDED
@@ -0,0 +1,76 @@
+import sys
+import os
+import json
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+from components.indexers.news_indexer import get_or_build_index
+from components.fetchers.google_search import fetch_google_news
+from components.fetchers.scraper import scrape_url
+from llama_index.core.settings import Settings
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+# ✅ Set up local embedding model
+Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
+
+# 🔐 Environment variables
+API_KEY = os.environ.get("GOOGLE_API_KEY")
+CSE_ID = os.environ.get("GOOGLE_CX_ID")  # ✅ fixed typo
+
+# ✅ News topics to fetch
+QUERIES = [
+    "India news", "World news", "Tech news", "Finance news", "Sports news"
+]
+
+# ✅ Paths
+INDEX_DIR = "storage/index"
+DATA_DIR = "data/news"
+RAW_FILE = os.path.join(DATA_DIR, "news.txt")
+
+def write_articles_to_file(articles, file_path):
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w", encoding="utf-8") as f:
+        for article in articles:
+            f.write(article.strip() + "\n\n")
+
+if __name__ == "__main__":
+    if not API_KEY or not CSE_ID:
+        raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
+
+    print("🌍 Fetching news URLs from Google...")
+
+    all_articles = []
+
+    for query in QUERIES:
+        print(f"🔍 Searching for: {query}")
+        try:
+            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
+            print(f" → Found {len(results)} links for '{query}'.")
+
+            for item in results:
+                url = item.get("link", "").strip()
+                if not url:
+                    continue
+
+                print(f"🌐 Scraping: {url}")
+                article_text = scrape_url(url)
+
+                if article_text:
+                    tagged_text = f"[{query.upper()}]\n{article_text}"
+                    print("Adding text to vector", tagged_text)
+                    all_articles.append(tagged_text)
+                else:
+                    print(f"⚠️ Skipped: {url}")
+
+        except Exception as e:
+            print(f"❌ Error fetching '{query}': {e}")
+
+    if not all_articles:
+        print("⚠️ No content scraped. Exiting.")
+    else:
+        print(f"📝 Writing {len(all_articles)} articles to {RAW_FILE}...")
+        write_articles_to_file(all_articles, RAW_FILE)
+
+        print("🧠 Building index...")
+        get_or_build_index(DATA_DIR)
+
+        print(f"✅ Indexed and stored at: {INDEX_DIR}")
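A small pytest sketch for write_articles_to_file that could live under the empty tests/ package; pytest is an assumed dev dependency (not in requirements.txt), and importing the module initializes the embedding model, which is slow on first run:

from pipeline.news_ingest import write_articles_to_file

def test_write_articles_to_file(tmp_path):
    target = tmp_path / "news" / "news.txt"
    write_articles_to_file(["[TECH]\nSome article body."], str(target))
    content = target.read_text(encoding="utf-8")
    assert "[TECH]" in content       # topic tag preserved
    assert content.endswith("\n\n")  # articles are double-newline separated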
pipeline/query_demo.py
ADDED
File without changes
pipeline/summarizer.py
ADDED
File without changes
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+fastapi
+uvicorn[standard]
+python-dotenv
+requests
+llama-index
+llama-index-embeddings-huggingface
+llama-index-llms-huggingface
+sentence-transformers  # optional, if you use custom embedding
+gradio
+llama-index-vector-stores-upstash
+trafilatura
+newspaper3k
+lxml_html_clean
routes/__init__.py
ADDED
File without changes
routes/api/ingest.py
ADDED
@@ -0,0 +1,12 @@
+from fastapi import APIRouter
+import subprocess
+
+router = APIRouter()
+
+@router.get("/ingest-news")
+def ingest_news():
+    try:
+        subprocess.run(["python", "pipeline/news_ingest.py"], check=True)
+        return {"status": "success", "message": "News fetched and indexed."}
+    except subprocess.CalledProcessError as e:
+        return {"status": "error", "message": str(e)}
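subprocess.run above blocks the HTTP request until scraping and indexing finish; one possible variation (an assumption, not part of this commit) is to hand the job to FastAPI's BackgroundTasks so the route returns immediately:

from fastapi import APIRouter, BackgroundTasks
import subprocess

router = APIRouter()

def _run_ingest():
    subprocess.run(["python", "pipeline/news_ingest.py"], check=True)

@router.get("/ingest-news")
def ingest_news(background_tasks: BackgroundTasks):
    background_tasks.add_task(_run_ingest)  # runs after the response is sent
    return {"status": "accepted", "message": "Ingestion started in the background."}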
routes/api/query.py
ADDED
@@ -0,0 +1,27 @@
+from fastapi import APIRouter, HTTPException
+from llama_index.core.indices.base import BaseIndex
+from llama_index.core.query_engine import BaseQueryEngine
+from components.indexers.news_indexer import get_or_build_index
+import os
+
+router = APIRouter()
+
+DATA_PATH = "data/raw"
+INDEX_PATH = "storage/index"
+
+_index: BaseIndex = None
+
+def get_index() -> BaseIndex:
+    global _index
+    if _index is None:
+        if not os.path.exists(DATA_PATH):
+            raise RuntimeError("❌ `data/raw/` is missing! Add documents for indexing.")
+        _index = get_or_build_index(DATA_PATH)
+    return _index
+
+@router.get("/query-news")
+def query_news(q: str):
+    index = get_index()
+    query_engine: BaseQueryEngine = index.as_query_engine()
+    response = query_engine.query(q)
+    return {"response": str(response)}
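An in-process check of the route using FastAPI's TestClient (requires httpx, which is not pinned in requirements.txt); it still needs the Upstash credentials and data/raw/ present, since get_index() builds or loads the real index:

from fastapi.testclient import TestClient
from app import app

client = TestClient(app)
resp = client.get("/query-news", params={"q": "India news"})
print(resp.status_code, resp.json())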
space_app.py
ADDED
@@ -0,0 +1,44 @@
+import os
+import gradio as gr
+from components.indexers.news_indexer import (
+    build_news_index,
+    load_news_index
+)
+from llama_index.core.query_engine import RetrieverQueryEngine
+from llama_index.core.settings import Settings
+
+# 💥 Block OpenAI LLM usage
+Settings.llm = None
+
+DATA_DIR = "data/raw"
+INDEX_DIR = "storage/index"
+
+# Create dummy file if needed
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR, exist_ok=True)
+    with open(os.path.join(DATA_DIR, "sample.txt"), "w") as f:
+        f.write("Sample news: India’s Prime Minister spoke today about the tech economy boom.")
+
+# Build index if missing
+if not os.path.exists(os.path.join(INDEX_DIR, "docstore.json")):
+    print("📦 Index not found — building it now...")
+    build_news_index(data_dir=DATA_DIR)
+
+# Load index safely
+print("📥 Loading index...")
+index = load_news_index()
+query_engine = index.as_query_engine()
+
+def query_news(question):
+    response = query_engine.query(question)
+    return str(response)
+
+iface = gr.Interface(
+    fn=query_news,
+    inputs=gr.Textbox(label="Ask about the news"),
+    outputs=gr.Textbox(label="Answer"),
+    title="LucidFeed: News Assistant",
+    description="Ask questions about the latest headlines."
+)
+
+iface.launch()
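If the commented-out Dockerfile CMD (python space_app.py) were ever used instead of uvicorn, the Gradio launch would need to bind to the port the Space exposes; a possible variant of the last line, under that assumption:

iface.launch(server_name="0.0.0.0", server_port=7860)  # assumed Spaces host/port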
storage/__init__.py
ADDED
File without changes
tests/__init__.py
ADDED
File without changes