fix 101
Browse files
components/generators/daily_feed.py
CHANGED
@@ -7,7 +7,7 @@ from openai import OpenAI
|
|
7 |
from components.indexers.news_indexer import get_upstash_vector_store
|
8 |
from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
|
9 |
import logging
|
10 |
-
import re
|
11 |
|
12 |
# Configure logging
|
13 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
@@ -27,7 +27,7 @@ except Exception as e:
|
|
27 |
TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
|
28 |
TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
|
29 |
|
30 |
-
# 🧠 Summarization Prompt - REVISED
|
31 |
BASE_PROMPT = (
|
32 |
"You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
|
33 |
"Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
|
@@ -101,7 +101,7 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
|
|
101 |
try:
|
102 |
client = OpenAI(api_key=OPENAI_API_KEY)
|
103 |
response = client.chat.completions.create(
|
104 |
-
model="gpt-4",
|
105 |
messages=[
|
106 |
{"role": "system", "content": BASE_PROMPT},
|
107 |
{"role": "user", "content": content},
|
@@ -111,32 +111,31 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
|
|
111 |
)
|
112 |
llm_output = response.choices[0].message.content.strip()
|
113 |
|
114 |
-
# --- IMPORTANT: DEBUGGING STEP ---
|
115 |
logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")
|
116 |
-
# --- END DEBUGGING STEP ---
|
117 |
|
118 |
headlines = []
|
119 |
-
# Parse based on the new explicit format: "Headline -- Explanation"
|
120 |
for line in llm_output.splitlines():
|
121 |
-
line = line.strip()
|
122 |
-
if not line:
|
123 |
continue
|
124 |
|
125 |
-
#
|
126 |
-
#
|
127 |
-
#
|
128 |
-
# -
|
129 |
-
|
|
|
|
|
130 |
|
131 |
if match:
|
132 |
headline_text = match.group(1).strip()
|
133 |
explanation_text = match.group(2).strip()
|
134 |
|
135 |
-
# Further clean explanation_text
|
136 |
explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()
|
137 |
|
138 |
# Basic validation: ensure both parts are reasonably non-empty
|
139 |
-
if len(headline_text.split())
|
140 |
headlines.append({"summary": headline_text, "explanation": explanation_text})
|
141 |
else:
|
142 |
logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")
|
|
|
7 |
from components.indexers.news_indexer import get_upstash_vector_store
|
8 |
from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
|
9 |
import logging
|
10 |
+
import re
|
11 |
|
12 |
# Configure logging
|
13 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
27 |
TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
|
28 |
TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
|
29 |
|
30 |
+
# 🧠 Summarization Prompt - REVISED (no change from previous)
|
31 |
BASE_PROMPT = (
|
32 |
"You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
|
33 |
"Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
|
|
|
101 |
try:
|
102 |
client = OpenAI(api_key=OPENAI_API_KEY)
|
103 |
response = client.chat.completions.create(
|
104 |
+
model="gpt-4",
|
105 |
messages=[
|
106 |
{"role": "system", "content": BASE_PROMPT},
|
107 |
{"role": "user", "content": content},
|
|
|
111 |
)
|
112 |
llm_output = response.choices[0].message.content.strip()
|
113 |
|
|
|
114 |
logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")
|
|
|
115 |
|
116 |
headlines = []
|
|
|
117 |
for line in llm_output.splitlines():
|
118 |
+
line = line.strip()
|
119 |
+
if not line:
|
120 |
continue
|
121 |
|
122 |
+
# Parse each non-empty line in the expected "Headline -- Explanation" format.
|
123 |
+
# Pattern:
|
124 |
+
# ^(?:[->•\d\.]+\s*)?  : optional leading bullet or list number (note: this also strips digits that begin a headline, e.g. "2024 ...")
|
125 |
+
# (.*?) -> Non-greedy capture for headline (any characters until --)
|
126 |
+
# \s*--\s* -> The separator "--" with optional whitespace
|
127 |
+
# (.*) -> Greedy capture for explanation (rest of the line)
|
128 |
+
match = re.match(r'^(?:[->•\d\.]+\s*)?(.*?)\s*--\s*(.*)$', line)
|
129 |
|
130 |
if match:
|
131 |
headline_text = match.group(1).strip()
|
132 |
explanation_text = match.group(2).strip()
|
133 |
|
134 |
+
# Strip any leftover intro phrase from explanation_text in case the LLM adds one despite the prompt
|
135 |
explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()
|
136 |
|
137 |
# Basic validation: keep the pair only if the headline has at least 2 words and the explanation at least 3
|
138 |
+
if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3: # Headline at least 2 words, explanation at least 3 words
|
139 |
headlines.append({"summary": headline_text, "explanation": explanation_text})
|
140 |
else:
|
141 |
logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")
|