new module - generate feed
components/generators/__init__.py
ADDED
(empty file)
components/generators/daily_feed.py
ADDED
@@ -0,0 +1,92 @@
import os
import sys
import json
import requests

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import redis
from typing import Dict, List, Optional
from llama_index.core import VectorStoreIndex
from components.indexers.news_indexer import load_news_index

# Load environment variables
REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
MISTRAL_URL = os.environ.get("MISTRAL_URL")  # Inference endpoint URL
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face endpoint token

# Connect to Redis
redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)

# Topics to query
TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]

# Build the summarization prompt for a topic
def build_prompt(content: str, topic: str) -> str:
    return (
        f"You are a news summarizer. Summarize the following content in 25-30 words. "
        f"Make it engaging and informative. Include appropriate emojis. Topic: {topic}\n\n{content}"
    )

# Call Mistral via the inference endpoint
def call_mistral(prompt: str) -> Optional[str]:
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    }
    # Chat-style payload; the exact request/response schema depends on how
    # the endpoint is deployed
    payload = {
        "inputs": [
            {"role": "user", "content": prompt}
        ]
    }
    try:
        response = requests.post(MISTRAL_URL, headers=headers, json=payload, timeout=20)
        response.raise_for_status()
        return response.json()["outputs"][0]["content"].strip()
    except Exception as e:
        print(f"⚠️ Mistral error: {e}")
        return None

# Generate summaries for a topic using Mistral
def summarize_topic(docs: List[str], topic: str) -> List[Dict]:
    feed = []
    for doc in docs[:5]:
        prompt = build_prompt(doc, topic)
        summary = call_mistral(prompt)
        if summary:
            feed.append({
                "summary": summary,
                "image_url": "https://source.unsplash.com/800x600/?news",
                "article_link": "https://google.com/search?q=" + topic.replace(" ", "+")
            })
    return feed

# Main generation pipeline
def generate_and_cache_daily_feed():
    index: VectorStoreIndex = load_news_index()
    # as_query_engine() wraps the index in a retriever-backed query engine
    query_engine = index.as_query_engine()

    final_feed = []
    for topic in TOPICS:
        print(f"\nGenerating for: {topic}")
        response = query_engine.query(topic)
        docs = [str(node.get_content()) for node in response.source_nodes]

        topic_feed = summarize_topic(docs, topic)
        final_feed.append({
            "topic": topic.lower().replace(" news", ""),
            "feed": topic_feed
        })

    # Cache to Redis
    redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
    print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
    return final_feed

# Redis fetch for the API
def get_cached_daily_feed():
    cached = redis_client.get(REDIS_KEY)
    return json.loads(cached) if cached else []
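get_cached_daily_feed() is the read side of this cache: the API layer can serve the feed without touching the index or the LLM. A minimal sketch of that consumer, assuming a FastAPI app (the framework and route name here are illustrative, not part of this commit):

from fastapi import FastAPI

from components.generators.daily_feed import get_cached_daily_feed

app = FastAPI()

@app.get("/daily-feed")
def daily_feed():
    # Serves whatever generate_and_cache_daily_feed() last wrote to Redis;
    # returns [] until the pipeline has run at least once
    return get_cached_daily_feed()
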
pipeline/news_ingest.py
CHANGED
@@ -8,6 +8,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from components.indexers.news_indexer import get_or_build_index_from_docs
 from components.fetchers.google_search import fetch_google_news
 from components.fetchers.scraper import scrape_url
+from components.generators.daily_feed import generate_and_cache_daily_feed
 from llama_index.core.settings import Settings
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.core.schema import Document

@@ -97,4 +98,7 @@ if __name__ == "__main__":
     documents = build_documents(all_articles)
     get_or_build_index_from_docs(documents)

-    print(
+    print("⚡ Generating daily feed...")
+    generate_and_cache_daily_feed()  # headline builder; the function takes no arguments
+
+    print(f"✅ Indexed, headlines generated, and stored at: {INDEX_DIR}")