ragV98 committed on
Commit
a0be762
·
1 Parent(s): a7ccef6
Files changed (1) hide show
  1. components/generators/daily_feed.py +14 -15
components/generators/daily_feed.py CHANGED
@@ -7,7 +7,7 @@ from openai import OpenAI
7
  from components.indexers.news_indexer import get_upstash_vector_store
8
  from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
9
  import logging
10
- import re
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,7 +27,7 @@ except Exception as e:
27
  TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
28
  TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
29
 
30
- # 🧠 Summarization Prompt - REVISED
31
  BASE_PROMPT = (
32
  "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
33
  "Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
@@ -101,7 +101,7 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
101
  try:
102
  client = OpenAI(api_key=OPENAI_API_KEY)
103
  response = client.chat.completions.create(
104
- model="gpt-4", # Consider "gpt-4o" or "gpt-3.5-turbo" for cost/speed
105
  messages=[
106
  {"role": "system", "content": BASE_PROMPT},
107
  {"role": "user", "content": content},
@@ -111,32 +111,31 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
111
  )
112
  llm_output = response.choices[0].message.content.strip()
113
 
114
- # --- IMPORTANT: DEBUGGING STEP ---
115
  logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")
116
- # --- END DEBUGGING STEP ---
117
 
118
  headlines = []
119
- # Parse based on the new explicit format: "Headline -- Explanation"
120
  for line in llm_output.splitlines():
121
- line = line.strip() # Remove any leading/trailing whitespace
122
- if not line: # Skip empty lines
123
  continue
124
 
125
- # Regex to capture:
126
- # - Optional leading bullet/dash/number (e.g., "- ", "1. ", "• ")
127
- # - Anything before the '--' as the headline
128
- # - Anything after the '--' as the explanation
129
- match = re.match(r'^(?:[->•\d\.]+\s*)?([^-\n]+?)\s*--\s*(.*)$', line)
 
 
130
 
131
  if match:
132
  headline_text = match.group(1).strip()
133
  explanation_text = match.group(2).strip()
134
 
135
- # Further clean explanation_text from any lingering unwanted intros if LLM adds them
136
  explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()
137
 
138
  # Basic validation: ensure both parts are reasonably non-empty
139
- if len(headline_text.split()) > 2 and len(explanation_text.split()) > 2: # Headline at least 3 words, explanation at least 3 words
140
  headlines.append({"summary": headline_text, "explanation": explanation_text})
141
  else:
142
  logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")
 
7
  from components.indexers.news_indexer import get_upstash_vector_store
8
  from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
9
  import logging
10
+ import re
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
27
  TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
28
  TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
29
 
30
+ # 🧠 Summarization Prompt - REVISED (no change from previous)
31
  BASE_PROMPT = (
32
  "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
33
  "Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
 
101
  try:
102
  client = OpenAI(api_key=OPENAI_API_KEY)
103
  response = client.chat.completions.create(
104
+ model="gpt-4",
105
  messages=[
106
  {"role": "system", "content": BASE_PROMPT},
107
  {"role": "user", "content": content},
 
111
  )
112
  llm_output = response.choices[0].message.content.strip()
113
 
 
114
  logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")
 
115
 
116
  headlines = []
 
117
  for line in llm_output.splitlines():
118
+ line = line.strip()
119
+ if not line:
120
  continue
121
 
122
+ # --- THE CRITICAL REGEX FIX ---
123
+ # Pattern:
124
+ # ^(?:[->•\d\.]+\s*)? -> Optional leading bullet/number
125
+ # (.*?) -> Non-greedy capture for headline (any characters until --)
126
+ # \s*--\s* -> The separator "--" with optional whitespace
127
+ # (.*) -> Greedy capture for explanation (rest of the line)
128
+ match = re.match(r'^(?:[->•\d\.]+\s*)?(.*?)\s*--\s*(.*)$', line)
129
 
130
  if match:
131
  headline_text = match.group(1).strip()
132
  explanation_text = match.group(2).strip()
133
 
134
+ # Further clean explanation_text if LLM adds unwanted intros despite prompt
135
  explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()
136
 
137
  # Basic validation: ensure both parts are reasonably non-empty
138
+ if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3: # Headline at least 2 words, explanation at least 3 words
139
  headlines.append({"summary": headline_text, "explanation": explanation_text})
140
  else:
141
  logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")