Spaces:

nuseAI
/

fastAPIv2

Sleeping

App Files Files Community

ragV98 committed on Jul 23

Commit

a0be762

1 Parent(s): a7ccef6

fix 101

Browse files

Files changed (1) hide show

components/generators/daily_feed.py +14 -15

components/generators/daily_feed.py CHANGED Viewed

@@ -7,7 +7,7 @@ from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 import logging
-import re
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,7 +27,7 @@ except Exception as e:
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
-# 🧠 Summarization Prompt - REVISED
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
     "Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
@@ -101,7 +101,7 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
         response = client.chat.completions.create(
-            model="gpt-4", # Consider "gpt-4o" or "gpt-3.5-turbo" for cost/speed
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
@@ -111,32 +111,31 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
         )
         llm_output = response.choices[0].message.content.strip()
-        # --- IMPORTANT: DEBUGGING STEP ---
         logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")
-        # --- END DEBUGGING STEP ---
         headlines = []
-        # Parse based on the new explicit format: "Headline -- Explanation"
         for line in llm_output.splitlines():
-            line = line.strip() # Remove any leading/trailing whitespace
-            if not line: # Skip empty lines
                 continue
-            # Regex to capture:
-            # - Optional leading bullet/dash/number (e.g., "- ", "1. ", "• ")
-            # - Anything before the '--' as the headline
-            # - Anything after the '--' as the explanation
-            match = re.match(r'^(?:[->•\d\.]+\s*)?([^-\n]+?)\s*--\s*(.*)$', line)
             if match:
                 headline_text = match.group(1).strip()
                 explanation_text = match.group(2).strip()
-                # Further clean explanation_text from any lingering unwanted intros if LLM adds them
                 explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()
                 # Basic validation: ensure both parts are reasonably non-empty
-                if len(headline_text.split()) > 2 and len(explanation_text.split()) > 2: # Headline at least 3 words, explanation at least 3 words
                     headlines.append({"summary": headline_text, "explanation": explanation_text})
                 else:
                     logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")

 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 import logging
+import re
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
+# 🧠 Summarization Prompt - REVISED (no change from previous)
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
     "Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
         response = client.chat.completions.create(
+            model="gpt-4",
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
         )
         llm_output = response.choices[0].message.content.strip()
         logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")
         headlines = []
         for line in llm_output.splitlines():
+            line = line.strip()
+            if not line:
                 continue
+            # --- THE CRITICAL REGEX FIX ---
+            # Pattern:
+            # ^(?:[->•\d\.]+\s*)?  -> Optional leading bullet/number
+            # (.*?)                -> Non-greedy capture for headline (any characters until --)
+            # \s*--\s*             -> The separator "--" with optional whitespace
+            # (.*)                 -> Greedy capture for explanation (rest of the line)
+            match = re.match(r'^(?:[->•\d\.]+\s*)?(.*?)\s*--\s*(.*)$', line)
             if match:
                 headline_text = match.group(1).strip()
                 explanation_text = match.group(2).strip()
+                # Further clean explanation_text if LLM adds unwanted intros despite prompt
                 explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()
                 # Basic validation: ensure both parts are reasonably non-empty
+                if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3: # Headline at least 2 words, explanation at least 3 words
                     headlines.append({"summary": headline_text, "explanation": explanation_text})
                 else:
                     logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")