ragV98 committed on
Commit 6858714 · 1 Parent(s): 9266b3d

structural ingest change

Files changed (1)
  1. components/generators/daily_feed.py +42 -51
components/generators/daily_feed.py CHANGED
@@ -1,18 +1,10 @@
 import os
-import sys
 import json
-import requests
 import redis
-from typing import List, Dict, Optional
-from llama_index.core import VectorStoreIndex
-from llama_index.core.query_engine import RetrieverQueryEngine
+from typing import List, Dict
 from llama_index.core.schema import Document
-from llama_index.core.settings import Settings
 from components.LLMs.Mistral import call_mistral
 
-# ✅ Disable implicit LLM usage
-Settings.llm = None
-
 # 🔐 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
 REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
@@ -20,11 +12,13 @@ REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 # ✅ Redis client
 redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 
-# 📰 Topics
+# 📰 Topic list
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 
-# 🧠 Base summarization prompt (used for all topics)
-# 🧠 Define the base summarization prompt
+# 🔧 Flattened topic keys for JSON output
+TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
+
+# 🧠 Summarization prompt
 BASE_PROMPT = (
     "You are Nuse’s official news summarizer — fast, sharp, and never generic.\n"
     "Your task is to read the following **collection of news excerpts** and extract the most important stories.\n"
@@ -55,58 +49,55 @@ BASE_PROMPT = (
     "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
 )
 
-def summarize_topic(docs: List[str], topic: str) -> List[Dict]:
-    feed = []
-    if not docs:
-        return feed
-
-    # 🧠 Merge all docs with separators
-    merged_context = "\n\n---\n\n".join(doc.strip() for doc in docs)
-    tail_prompt = f"Topic: {topic}\n\n{merged_context}"
-
-    print(f"\n📀 Prompt tail for summarization:\n{tail_prompt[:500]}...\n")
-
-    # 🧠 Single call to summarizer
-    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=tail_prompt)
+# 🧠 Categorize summary line into topic
+def categorize_summary(summary: str) -> str:
+    s = summary.lower()
+    if "india" in s or "modi" in s:
+        return "india"
+    elif any(x in s for x in ["us", "uk", "gaza", "china", "russia", "bangladesh", "israel", "trump", "biden", "world"]):
+        return "world"
+    elif any(x in s for x in ["ai", "tech", "space", "innovation", "startup", "software", "device"]):
+        return "tech"
+    elif any(x in s for x in ["market", "stock", "inflation", "finance", "fed", "reserve", "earnings", "revenue", "economy"]):
+        return "finance"
+    elif any(x in s for x in ["cricket", "football", "nba", "nfl", "sports", "match", "league", "tournament"]):
+        return "sports"
+    else:
+        return "world"
+
+# 🧪 Summarize the entire day’s documents in one LLM pass
+def summarize_all_documents(documents: List[Document]) -> Dict[str, List[Dict]]:
+    merged_text = "\n\n---\n\n".join(doc.text.strip() for doc in documents if doc.text.strip())
+
+    print("\n🧠 Sending merged prompt to summarizer...\n")
+    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)
+
+    categorized_feed = {key: [] for key in TOPIC_KEYS}
 
     if summary_block:
         for line in summary_block.splitlines():
             line = line.strip()
-            if line.startswith("-") or line.startswith("–"):
-                clean_summary = line.lstrip("-–").strip()
-                if clean_summary:
-                    feed.append({
-                        "summary": clean_summary,
+            if line.startswith("-"):
+                clean = line.lstrip("-–").strip()
+                if clean:
+                    topic_key = categorize_summary(clean)
+                    categorized_feed[topic_key].append({
+                        "summary": clean,
                         "image_url": "https://source.unsplash.com/800x600/?news",
-                        "article_link": f"https://google.com/search?q={topic.replace(' ', '+')}"
+                        "article_link": f"https://google.com/search?q={topic_key}+news"
                     })
+    return categorized_feed
 
-    return feed
-
-
-
-# ⚡ Generate and cache daily feed
+# 🚀 Final callable to build and cache the feed
 def generate_and_cache_daily_feed(documents: List[Document]):
-    index = VectorStoreIndex.from_documents(documents)
-    retriever = index.as_retriever()
-    query_engine = RetrieverQueryEngine(retriever=retriever)
-
-    final_feed = []
-    for topic in TOPICS:
-        print(f"\n🔍 Generating for: {topic}")
-        response = query_engine.query(topic)
-        docs = [str(node.get_content()) for node in response.source_nodes]
-        topic_feed = summarize_topic(docs, topic)
-        final_feed.append({
-            "topic": topic.lower().replace(" news", ""),
-            "feed": topic_feed
-        })
+    all_feed = summarize_all_documents(documents)
+    final_feed = [{"topic": topic, "feed": all_feed[topic]} for topic in TOPIC_KEYS]
 
     redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
     print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
     return final_feed
 
-# 📦 For testing or API access
+# 📦 Utility to read cached data
 def get_cached_daily_feed():
     cached = redis_client.get(REDIS_KEY)
     return json.loads(cached) if cached else []
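
To sanity-check the new single-pass flow end to end, here is a minimal local smoke test, offered as a sketch only: it assumes UPSTASH_REDIS_URL and UPSTASH_REDIS_TOKEN are set, that components.LLMs.Mistral.call_mistral is configured and reachable, and that the two sample Document texts below are placeholders rather than real ingest output.

# Hypothetical smoke test for the single-pass feed generator (sample texts are placeholders).
from llama_index.core.schema import Document
from components.generators.daily_feed import generate_and_cache_daily_feed, get_cached_daily_feed

docs = [
    Document(text="India's cabinet approved a new semiconductor incentive scheme on Monday..."),
    Document(text="The Fed signalled it may hold rates steady as inflation cools..."),
]

# One merged Mistral call, keyword-based categorization, then a single Redis write.
feed = generate_and_cache_daily_feed(docs)
print([block["topic"] for block in feed])  # ['india', 'world', 'tech', 'finance', 'sports']

# Reads the cached JSON back from the same Redis key.
assert get_cached_daily_feed() == feed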