structural ingest change
components/generators/daily_feed.py
CHANGED
@@ -1,18 +1,10 @@
 import os
-import sys
 import json
-import requests
 import redis
-from typing import List, Dict
-from llama_index.core import VectorStoreIndex
-from llama_index.core.query_engine import RetrieverQueryEngine
+from typing import List, Dict
 from llama_index.core.schema import Document
-from llama_index.core.settings import Settings
 from components.LLMs.Mistral import call_mistral
 
-# ✅ Disable implicit LLM usage
-Settings.llm = None
-
 # Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
 REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
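With the llama_index retrieval stack and the Settings override gone, the module's external dependencies in this hunk shrink to the Redis client built from the two Upstash variables. Below is a minimal round-trip sketch of that dependency only; the key name and payload are placeholders, not the module's real feed.

import json
import os

import redis

# Same construction as in the diff: decode_responses=True makes get() return str,
# so json.loads() can be applied directly to the stored value.
r = redis.Redis.from_url(os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379"), decode_responses=True)

r.set("daily_feed_demo", json.dumps([{"topic": "tech", "feed": []}]))
print(json.loads(r.get("daily_feed_demo")))  # [{'topic': 'tech', 'feed': []}]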
@@ -20,11 +12,13 @@ REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 # ✅ Redis client
 redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 
-# 📰
+# 📰 Topic list
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 
-#
-
+# 🧠 Flattened topic keys for JSON output
+TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
+
+# 🧠 Summarization prompt
 BASE_PROMPT = (
     "You are Nuse's official news summarizer – fast, sharp, and never generic.\n"
     "Your task is to read the following **collection of news excerpts** and extract the most important stories.\n"
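For reference, the new TOPIC_KEYS constant just lowercases each topic label and strips the " news" suffix; those keys become the "topic" field of the cached JSON. A quick sanity check, assuming the TOPICS list shown in the hunk above:

TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
print(TOPIC_KEYS)  # ['india', 'world', 'tech', 'finance', 'sports']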
@@ -55,58 +49,55 @@
     "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
 )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+# 🧠 Categorize summary line into topic
+def categorize_summary(summary: str) -> str:
+    s = summary.lower()
+    if "india" in s or "modi" in s:
+        return "india"
+    elif any(x in s for x in ["us", "uk", "gaza", "china", "russia", "bangladesh", "israel", "trump", "biden", "world"]):
+        return "world"
+    elif any(x in s for x in ["ai", "tech", "space", "innovation", "startup", "software", "device"]):
+        return "tech"
+    elif any(x in s for x in ["market", "stock", "inflation", "finance", "fed", "reserve", "earnings", "revenue", "economy"]):
+        return "finance"
+    elif any(x in s for x in ["cricket", "football", "nba", "nfl", "sports", "match", "league", "tournament"]):
+        return "sports"
+    else:
+        return "world"
+
+# 🧪 Summarize the entire day's documents in one LLM pass
+def summarize_all_documents(documents: List[Document]) -> Dict[str, List[Dict]]:
+    merged_text = "\n\n---\n\n".join(doc.text.strip() for doc in documents if doc.text.strip())
+
+    print("\n🧠 Sending merged prompt to summarizer...\n")
+    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)
+
+    categorized_feed = {key: [] for key in TOPIC_KEYS}
 
     if summary_block:
         for line in summary_block.splitlines():
             line = line.strip()
-            if line.startswith("-")
-
-                if
-
-
+            if line.startswith("-"):
+                clean = line.lstrip("-–").strip()
+                if clean:
+                    topic_key = categorize_summary(clean)
+                    categorized_feed[topic_key].append({
+                        "summary": clean,
                         "image_url": "https://source.unsplash.com/800x600/?news",
-                        "article_link": f"https://google.com/search?q={
+                        "article_link": f"https://google.com/search?q={topic_key}+news"
                     })
+    return categorized_feed
 
-
-
-
-
-# ⚡ Generate and cache daily feed
+# Final callable to build and cache the feed
 def generate_and_cache_daily_feed(documents: List[Document]):
-
-
-    query_engine = RetrieverQueryEngine(retriever=retriever)
-
-    final_feed = []
-    for topic in TOPICS:
-        print(f"\nGenerating for: {topic}")
-        response = query_engine.query(topic)
-        docs = [str(node.get_content()) for node in response.source_nodes]
-        topic_feed = summarize_topic(docs, topic)
-        final_feed.append({
-            "topic": topic.lower().replace(" news", ""),
-            "feed": topic_feed
-        })
+    all_feed = summarize_all_documents(documents)
+    final_feed = [{"topic": topic, "feed": all_feed[topic]} for topic in TOPIC_KEYS]
 
     redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
     print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
     return final_feed
 
-# 📦
+# 📦 Utility to read cached data
 def get_cached_daily_feed():
    cached = redis_client.get(REDIS_KEY)
    return json.loads(cached) if cached else []
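Taken together, the new flow is: merge every ingested document into one prompt, let call_mistral return a block of dash-prefixed summaries, route each summary to a topic with categorize_summary, and cache the per-topic feed as JSON. A sketch of how the module might be driven end to end follows; the import path is inferred from the file location, the sample articles are invented, and it assumes the Upstash variables and Mistral credentials are configured.

from llama_index.core.schema import Document

from components.generators.daily_feed import (
    generate_and_cache_daily_feed,
    get_cached_daily_feed,
)

# Invented sample input; in production these come from the ingest pipeline.
docs = [
    Document(text="The central bank held rates steady as inflation cooled to 4.1%."),
    Document(text="India clinched the series with a five-wicket win in the final ODI."),
]

feed = generate_and_cache_daily_feed(docs)   # one LLM pass over every document
print([entry["topic"] for entry in feed])    # ['india', 'world', 'tech', 'finance', 'sports']

# Each entry in the cached list looks like:
# {"topic": "india",
#  "feed": [{"summary": "...",
#            "image_url": "https://source.unsplash.com/800x600/?news",
#            "article_link": "https://google.com/search?q=india+news"}]}

cached = get_cached_daily_feed()             # reads the same JSON back from Redis
assert cached == feed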