ragV98 committed
Commit af23d2f · 1 Parent(s): e4a76c1
app.py CHANGED
@@ -1,8 +1,33 @@
+# app.py
+import os
+import sys
 from fastapi import FastAPI
-from routes.api import ingest, query, headlines
-from llama_index.core.settings import Settings
 
-Settings.llm = None
+# --- Make the project modules importable ---
+# app.py is assumed to sit at the project root; these entries let Python
+# resolve 'components' and 'routes' as packages.
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "components")))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "routes")))
+# Needed when importing modules from routes/api directly.
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "routes", "api")))
+
+# Import the routers.
+# These imports expect routes/api/ingest.py, routes/api/query.py, and routes/api/headlines.py to exist.
+from routes.api import ingest as ingest_router_module
+from routes.api import query as query_router_module
+from routes.api import headlines as headlines_router_module
+
+
+# NOTE: Settings.llm = None
+# Leaving the global LlamaIndex LLM unset only matters if components in the
+# pipeline (e.g., a query engine) rely on Settings.llm. To route LLM calls
+# through LlamaIndex features, set a model here instead, for example
+# `Settings.llm = OpenAI()`.
+# In the current pipeline the OpenAI client is initialized explicitly inside
+# daily_feed.py and detailed_explainer.py, so a global setting is not needed;
+# the line below stays commented out as a deliberate placeholder.
+# Settings.llm = None
 
 
 app = FastAPI()
@@ -11,6 +36,8 @@ app = FastAPI()
 def greet():
     return {"welcome": "nuse ai"}
 
-app.include_router(ingest.router)
-app.include_router(query.router)
-app.include_router(headlines.router)
+# Include the routers.
+# Use .router to access the APIRouter instance on each imported module.
+app.include_router(ingest_router_module.router, prefix="/api/ingest")
+app.include_router(query_router_module.router, prefix="/api/query")
+app.include_router(headlines_router_module.router, prefix="/api/headlines")
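With the routers now mounted under explicit prefixes, a quick smoke test confirms the app still wires up. A minimal sketch using FastAPI's TestClient, assuming greet() is exposed at "/" (its decorator sits just outside the hunk above) and that the router modules import cleanly:

# smoke_test.py — a sketch; run from the project root so `app` is importable.
from fastapi.testclient import TestClient

from app import app

client = TestClient(app)

response = client.get("/")
assert response.status_code == 200
assert response.json() == {"welcome": "nuse ai"}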
components/generators/detailed_explainer.py ADDED
@@ -0,0 +1,191 @@
+import os
+import json
+import logging
+from typing import Any, Dict, Set
+
+import redis
+from openai import OpenAI
+
+from llama_index.core.settings import Settings
+from llama_index.core.vector_stores.types import (
+    VectorStoreQuery,
+    MetadataFilter,
+    MetadataFilters,
+    FilterOperator,
+)
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+from components.indexers.news_indexer import get_upstash_vector_store
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+# 🔐 Environment variables for this module
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
+
+# ✅ Redis client for this module
+try:
+    detailed_explainer_redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
+    detailed_explainer_redis_client.ping()
+    logging.info("Redis client initialized for detailed_explainer.py.")
+except Exception as e:
+    logging.critical(f"❌ FATAL ERROR: Could not connect to Redis in detailed_explainer.py: {e}")
+    raise
+
+# Cache key specific to detailed explanations
+DETAILED_FEED_CACHE_KEY = "detailed_news_feed_cache"
+
+# Ensure Settings.embed_model is configured globally.
+try:
+    if not hasattr(Settings, 'embed_model') or Settings.embed_model is None:
+        logging.info("Settings.embed_model not yet configured, initializing with default HuggingFaceEmbedding.")
+        Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")
+except Exception as e:
+    logging.error(f"Failed to initialize Settings.embed_model in detailed_explainer: {e}")
+
+
+# LLM prompt for detailed explanations
+EXPLAINER_PROMPT = (
+    "You are an expert news analyst. Based on the following article content, "
+    "generate a concise, detailed explanation (50-60 words) for the headline provided. "
+    "Focus on the 'why it matters' and key context. Do not include any introductory phrases, just the explanation itself."
+    "\n\nHeadline: {headline}"
+    "\n\nArticle Content:\n{article_content}"
+    "\n\nDetailed Explanation (50-60 words):"
+)
+
+async def get_detailed_explanation_from_vector(
+    summary_item: Dict[str, Any],
+    vector_store_client: Any
+) -> Dict[str, Any]:
+    """
+    Takes a summary item, queries the vector store for its original article content,
+    and generates a detailed explanation using an LLM.
+    """
+    headline_text = summary_item["summary"]
+    representative_article_link = summary_item["article_link"]
+    representative_title = summary_item["representative_title"]
+
+    detailed_content = ""
+    sources_found: Set[str] = set()
+
+    logging.info(f"Retrieving detailed content for headline: '{headline_text}' (from {representative_article_link})")
+
+    try:
+        query_text = f"{representative_title} {representative_article_link}" if representative_title else representative_article_link
+
+        query_embedding = Settings.embed_model.get_query_embedding(query_text)
+
+        filters = MetadataFilters(
+            filters=[MetadataFilter(key="url", value=representative_article_link, operator=FilterOperator.EQ)]
+        )
+
+        query = VectorStoreQuery(
+            query_embedding=query_embedding,
+            similarity_top_k=5,
+            filters=filters
+        )
+        result = vector_store_client.query(query)
+
+        if result.nodes:
+            for node in result.nodes:
+                node_content = node.get_content().strip()
+                if node_content:
+                    detailed_content += node_content + "\n\n"
+                if "source" in node.metadata:
+                    sources_found.add(node.metadata["source"])
+
+            if not detailed_content:
+                logging.warning(f"No usable content found in nodes retrieved for URL: {representative_article_link}. Falling back to title+url context.")
+                detailed_content = representative_title + " " + representative_article_link
+
+        else:
+            logging.warning(f"No original article found in vector store for URL: {representative_article_link}. Using summary as context.")
+            detailed_content = summary_item["summary"] + ". " + summary_item.get("explanation", "")
+
+    except Exception as e:
+        logging.error(f"❌ Error querying vector store for detailed content for '{representative_article_link}': {e}", exc_info=True)
+        detailed_content = summary_item["summary"] + ". " + summary_item.get("explanation", "")
+
+    # Generate the detailed explanation with the LLM
+    detailed_explanation_text = ""
+    try:
+        # Validate the key before constructing the client.
+        if not OPENAI_API_KEY:
+            raise ValueError("OPENAI_API_KEY is not set.")
+        client = OpenAI(api_key=OPENAI_API_KEY)
+
+        llm_response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": "You are a concise and informative news explainer."},
+                {"role": "user", "content": EXPLAINER_PROMPT.format(
+                    headline=headline_text,
+                    article_content=detailed_content
+                )},
+            ],
+            max_tokens=100,
+            temperature=0.4,
+        )
+        detailed_explanation_text = llm_response.choices[0].message.content.strip()
+        logging.info(f"Generated detailed explanation for '{headline_text}'.")
+
+    except Exception as e:
+        logging.error(f"❌ Error generating detailed explanation for '{headline_text}': {e}", exc_info=True)
+        detailed_explanation_text = summary_item.get("explanation", "Could not generate a detailed explanation.")
+
+    return {
+        "title": headline_text,
+        "description": detailed_explanation_text,
+        "sources": list(sources_found) if sources_found else ["General News Sources"]
+    }
+
+async def generate_detailed_feed(
+    cached_feed: Dict[str, Dict[int, Dict[str, Any]]]
+) -> Dict[str, Dict[int, Dict[str, Any]]]:
+    """
+    Generates detailed explanations for each summary in the cached feed.
+    Does NOT cache the result internally; the caller is responsible for caching.
+    """
+    if not cached_feed:
+        logging.info("No cached feed found to generate detailed explanations from.")
+        return {}
+
+    detailed_feed_structured: Dict[str, Dict[int, Dict[str, Any]]] = {}
+    vector_store = get_upstash_vector_store()
+
+    for topic_key, summaries_map in cached_feed.items():
+        logging.info(f"Processing detailed explanations for topic: {topic_key}")
+        detailed_summaries_for_topic: Dict[int, Dict[str, Any]] = {}
+
+        for summary_id in sorted(summaries_map.keys()):
+            summary_item = summaries_map[summary_id]
+            detailed_item = await get_detailed_explanation_from_vector(summary_item, vector_store)
+            detailed_summaries_for_topic[summary_id] = detailed_item
+
+        detailed_feed_structured[topic_key] = detailed_summaries_for_topic
+
+    logging.info("✅ Detailed explanation generation complete.")
+    return detailed_feed_structured
+
+
+def cache_detailed_feed(feed_data: Dict[str, Dict[int, Dict[str, Any]]]):
+    """Caches the given detailed feed data to Redis using its dedicated client."""
+    try:
+        detailed_explainer_redis_client.set(DETAILED_FEED_CACHE_KEY, json.dumps(feed_data, ensure_ascii=False))
+        detailed_explainer_redis_client.expire(DETAILED_FEED_CACHE_KEY, 86400)
+        logging.info(f"✅ Detailed feed cached under key '{DETAILED_FEED_CACHE_KEY}' with 24-hour expiry.")
+    except Exception as e:
+        logging.error(f"❌ [Redis detailed feed caching error]: {e}", exc_info=True)
+        raise
+
+
+def get_cached_detailed_feed() -> Dict[str, Dict[int, Dict[str, Any]]]:
+    """Retrieves the cached detailed feed from Redis using its dedicated client."""
+    try:
+        cached_raw = detailed_explainer_redis_client.get(DETAILED_FEED_CACHE_KEY)
+        if cached_raw:
+            logging.info(f"✅ Retrieved cached detailed feed from '{DETAILED_FEED_CACHE_KEY}'.")
+            # NOTE: the JSON round trip turns the int summary IDs into str keys;
+            # callers that rely on int keys should coerce them.
+            return json.loads(cached_raw)
+        else:
+            logging.info(f"ℹ️ No cached detailed feed found under key '{DETAILED_FEED_CACHE_KEY}'.")
+            return {}
+    except Exception as e:
+        logging.error(f"❌ [Redis detailed feed retrieval error]: {e}", exc_info=True)
+        return {}
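The module's public surface is generate_detailed_feed, cache_detailed_feed, and get_cached_detailed_feed. A minimal driver sketch, assuming OPENAI_API_KEY and UPSTASH_REDIS_URL are set, with a hypothetical single-topic feed shaped like the summary items the code reads ("summary", "article_link", "representative_title", "explanation"):

import asyncio

from components.generators.detailed_explainer import (
    generate_detailed_feed,
    cache_detailed_feed,
    get_cached_detailed_feed,
)

# Hypothetical input mirroring the fields get_detailed_explanation_from_vector reads.
sample_feed = {
    "technology": {
        0: {
            "summary": "Chipmaker unveils a new AI accelerator",
            "article_link": "https://example.com/ai-accelerator",
            "representative_title": "New AI accelerator announced",
            "explanation": "Placeholder short explanation.",
        }
    }
}

detailed = asyncio.run(generate_detailed_feed(sample_feed))
cache_detailed_feed(detailed)
print(get_cached_detailed_feed())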
routes/api/descriptive.py ADDED
@@ -0,0 +1,95 @@
+# routes/api/descriptive.py
+from fastapi import APIRouter, HTTPException, status
+import logging
+from typing import Dict, Any
+
+# Import functions directly from the now-standalone detailed_explainer.
+# The sys.path setup in app.py must allow imports from components/generators.
+from components.generators.detailed_explainer import (
+    generate_detailed_feed,
+    cache_detailed_feed,
+    get_cached_detailed_feed
+)
+# The initial summaries are managed by daily_feed.py.
+from components.generators.daily_feed import get_cached_daily_feed
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+router = APIRouter()
+
+@router.post("/generate-detailed")  # Endpoint for triggering detailed generation
+async def generate_detailed_headlines_endpoint() -> Dict[str, Any]:
+    """
+    Generates detailed explanations for the latest cached summaries.
+    Requires initial summaries to be present in the Redis cache (from daily_feed.py).
+    The final detailed feed is then cached by this endpoint under its dedicated key.
+    """
+    logging.info("API Call: POST /api/headlines/generate-detailed initiated.")
+    try:
+        # Step 1: Retrieve the cached initial summaries
+        initial_summaries = get_cached_daily_feed()  # Reads "initial_news_summary_cache"
+
+        if not initial_summaries:
+            logging.warning("No initial summaries found in cache to generate detailed explanations from.")
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="No initial news summaries found in cache. Please run the ingestion/summarization process first (e.g., /api/ingest/run)."
+            )
+
+        # Step 2: Generate detailed explanations (async call into detailed_explainer)
+        detailed_feed = await generate_detailed_feed(initial_summaries)
+
+        if not detailed_feed:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail="Failed to generate detailed explanations. Check server logs for errors during LLM calls or content retrieval."
+            )
+
+        # Step 3: Cache the final detailed feed; cache_detailed_feed uses its own
+        # Redis client and DETAILED_FEED_CACHE_KEY internally.
+        cache_detailed_feed(detailed_feed)
+
+        logging.info("API Call: POST /api/headlines/generate-detailed completed successfully.")
+
+        total_items = sum(len(topic_summaries) for topic_summaries in detailed_feed.values())
+
+        return {"status": "success", "message": "Detailed headlines generated and cached.", "items": total_items}
+
+    except HTTPException:
+        raise  # Re-raise FastAPI's HTTPExceptions unchanged
+    except Exception as e:
+        logging.error(f"Error in /api/headlines/generate-detailed: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"An unexpected error occurred during detailed feed generation: {e}"
+        )
+
+@router.get("/get-detailed")  # Endpoint for retrieving detailed headlines
+async def get_detailed_headlines_endpoint() -> Dict[str, Dict[int, Dict[str, Any]]]:
+    """
+    Retrieves the most recently cached *fully detailed* news feed.
+    Returns 404 if no detailed feed is found in the cache.
+    """
+    logging.info("API Call: GET /api/headlines/get-detailed initiated.")
+    try:
+        # Retrieve the cached detailed feed via detailed_explainer
+        cached_detailed_feed = get_cached_detailed_feed()
+
+        if not cached_detailed_feed:
+            logging.info("No full detailed news feed found in cache.")
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="No detailed news feed found in cache. Please run /api/headlines/generate-detailed first."
+            )
+
+        logging.info("API Call: GET /api/headlines/get-detailed completed successfully.")
+        return cached_detailed_feed
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logging.error(f"Error in /api/headlines/get-detailed: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"An unexpected error occurred while retrieving cached detailed feed: {e}"
+        )
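Note that app.py mounts routes.api.headlines at the /api/headlines prefix these log messages reference; wiring this new descriptive.py router in would need a matching include_router call. Assuming it is mounted there and the server is reachable locally, a short httpx sketch exercises both endpoints:

import httpx

BASE = "http://localhost:7860"  # assumed local dev address; adjust to your deployment

# Trigger detailed generation (the initial summaries cache must be populated first,
# e.g., via the ingestion endpoint). The long timeout allows for per-item LLM calls.
resp = httpx.post(f"{BASE}/api/headlines/generate-detailed", timeout=300.0)
print(resp.status_code, resp.json())

# Fetch the cached detailed feed.
resp = httpx.get(f"{BASE}/api/headlines/get-detailed")
print(resp.status_code, resp.json() if resp.status_code == 200 else resp.text)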