raghavNCI commited on
Commit
3611f6f
·
1 Parent(s): 2e99a5a

fixing the article generation

Browse files
Files changed (1) hide show
  1. nuse_modules/headlines_generator.py +35 -22
nuse_modules/headlines_generator.py CHANGED
@@ -1,13 +1,17 @@
1
  from __future__ import annotations
2
  import datetime as _dt
3
  import json, os
 
4
  from typing import List, Dict
5
 
6
- from clients.redis_client import redis_client as _r
7
- from nuse_modules.google_search import search_google_news
8
  from models_initialization.mistral_registry import mistral_generate
9
 
10
 
 
 
 
11
  _CATEGORIES = {
12
  "world": "world news top stories",
13
  "india": "india top stories",
@@ -18,8 +22,12 @@ _CATEGORIES = {
18
 
19
  _ARTICLES_PER_CAT = 5
20
  _SUMMARY_TOKENS = 120
21
- _REDIS_TTL_SECONDS = 24 * 3600
22
 
 
 
 
 
23
  def _dedupe_urls(articles: List[dict]) -> List[dict]:
24
  seen = set()
25
  out = []
@@ -30,19 +38,26 @@ def _dedupe_urls(articles: List[dict]) -> List[dict]:
30
  return out
31
 
32
 
33
- def _summarise_article(article: dict) -> str:
34
  prompt = (
35
  "You are a concise news assistant. Summarise the following article "
36
  "in one sentence (<=25 words). Omit source and author names.\n\n"
37
- f"ARTICLE:\n{article['content']}"
38
  )
39
- return mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
 
 
 
 
40
 
41
 
42
  def _redis_key(date: str, category: str) -> str:
43
  return f"headlines:{date}:{category}"
44
 
45
 
 
 
 
46
  def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
47
  """
48
  Fetches top articles per category, summarises them, stores in Redis,
@@ -54,26 +69,24 @@ def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dic
54
  for cat, query in _CATEGORIES.items():
55
  print(f"[HEADLINES] {cat.title()} …")
56
 
57
- # 1. Google -> list of {title, link, snippet, content}
58
  raw_articles = search_google_news([query], num_results=_ARTICLES_PER_CAT)
59
  raw_articles = _dedupe_urls(raw_articles)
60
 
61
- # 2. Summarise each article
62
  summaries = []
63
- for art in raw_articles:
64
- if not art["content"]:
65
- continue # skip if scraper failed
66
- summary = _summarise_article(art)
67
- summaries.append(
68
- {
69
- "title": art["title"],
70
- "url": art["link"],
71
- "summary": summary,
72
- "source_snippet": art["snippet"],
73
- }
74
- )
75
-
76
- # 3. Store in Upstash Redis
77
  redis_key = _redis_key(date_str, cat)
78
  _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
79
 
 
1
  from __future__ import annotations
2
  import datetime as _dt
3
  import json, os
4
+ import re
5
  from typing import List, Dict
6
 
7
+ from clients.redis_client import redis_client as _r
8
+ from nuse_modules.google_search import search_google_news
9
  from models_initialization.mistral_registry import mistral_generate
10
 
11
 
12
+ # ──────────────────────────────────────────────────────────────
13
+ # Config
14
+ # ──────────────────────────────────────────────────────────────
15
  _CATEGORIES = {
16
  "world": "world news top stories",
17
  "india": "india top stories",
 
22
 
23
  _ARTICLES_PER_CAT = 5
24
  _SUMMARY_TOKENS = 120
25
+ _REDIS_TTL_SECONDS = 24 * 3600
26
 
27
+
28
+ # ──────────────────────────────────────────────────────────────
29
+ # Helpers
30
+ # ──────────────────────────────────────────────────────────────
31
  def _dedupe_urls(articles: List[dict]) -> List[dict]:
32
  seen = set()
33
  out = []
 
38
  return out
39
 
40
 
41
def _summarise_article(content: str) -> str:
    """Summarise one article body into a single sentence (<=25 words).

    Sends the article text to the Mistral model with a fixed instruction
    prompt, then strips any echo of that prompt from the model output.

    Args:
        content: Full scraped article text (assumed non-empty; callers skip
            articles whose scrape failed).

    Returns:
        The cleaned one-sentence summary produced by the model.
    """
    prompt = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
        f"ARTICLE:\n{content}"
    )
    raw_output = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)

    # Some models echo the instruction back before the actual summary.
    # Anchor the strip to the very start of the output (\A): the previous
    # unanchored IGNORECASE|DOTALL pattern could match "you are ... article"
    # anywhere inside a legitimate summary and silently delete most of it.
    cleaned = re.sub(
        r"\Ayou are.*?article[:\n]+",
        "",
        raw_output,
        flags=re.IGNORECASE | re.DOTALL,
    ).strip()
    return cleaned
52
 
53
 
54
  def _redis_key(date: str, category: str) -> str:
55
  return f"headlines:{date}:{category}"
56
 
57
 
58
+ # ──────────────────────────────────────────────────────────────
59
+ # Main Generator
60
+ # ──────────────────────────────────────────────────────────────
61
  def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
62
  """
63
  Fetches top articles per category, summarises them, stores in Redis,
 
69
  for cat, query in _CATEGORIES.items():
70
  print(f"[HEADLINES] {cat.title()} …")
71
 
 
72
  raw_articles = search_google_news([query], num_results=_ARTICLES_PER_CAT)
73
  raw_articles = _dedupe_urls(raw_articles)
74
 
 
75
  summaries = []
76
+ for item in raw_articles:
77
+ if not item.get("content"):
78
+ continue # skip if no full text extracted
79
+
80
+ summary = _summarise_article(item["content"])
81
+
82
+ summaries.append({
83
+ "title": item.get("title"),
84
+ "url": item.get("link"),
85
+ "summary": summary,
86
+ "source_snippet": item.get("snippet"),
87
+ "image": item.get("image"), # added in google_search.py
88
+ })
89
+
90
  redis_key = _redis_key(date_str, cat)
91
  _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
92