raghavNCI committed on
Commit
588f923
·
1 Parent(s): 3611f6f

few more revamps to headlines generation

Browse files
Files changed (1) hide show
  1. nuse_modules/headlines_generator.py +18 -1
nuse_modules/headlines_generator.py CHANGED
@@ -37,11 +37,22 @@ def _dedupe_urls(articles: List[dict]) -> List[dict]:
37
  out.append(art)
38
  return out
39
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  def _summarise_article(content: str) -> str:
42
  prompt = (
43
  "You are a concise news assistant. Summarise the following article "
44
  "in one sentence (<=25 words). Omit source and author names.\n\n"
 
45
  f"ARTICLE:\n{content}"
46
  )
47
  raw_output = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
@@ -74,8 +85,14 @@ def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dic
74
 
75
  summaries = []
76
  for item in raw_articles:
 
 
 
77
  if not item.get("content"):
78
- continue # skip if no full text extracted
 
 
 
79
 
80
  summary = _summarise_article(item["content"])
81
 
 
37
  out.append(art)
38
  return out
39
 
40
def is_probably_article(url: str) -> bool:
    """Return True when *url* looks like an individual article page.

    Simple heuristic that filters out section/category landing pages
    (``/world``, ``/us``, ``/news``, ``/topics``, ``/home``, ``/video``),
    ``index.html`` pages and bare site homepages.

    The decision is made on the *last path segment* of the parsed URL rather
    than on a raw string suffix, so an article slug that merely ends in the
    same letters as a section name (``.../new-virus``, ``.../hello-world``)
    is not rejected, and a trailing query string or fragment does not hide a
    section page (``/world?utm=1``).

    Args:
        url: Absolute (or scheme-less) URL of the candidate page. ``None``,
            empty or non-string values are tolerated.

    Returns:
        ``False`` for missing/malformed URLs, homepages and section pages;
        ``True`` otherwise.
    """
    # Local import keeps this block self-contained; urllib is stdlib.
    from urllib.parse import urlsplit

    # A missing link cannot be confirmed as an article (callers pass
    # ``item.get("link")``, which may be None).
    if not url or not isinstance(url, str):
        return False

    section_slugs = {
        "world", "us", "news", "topics", "home", "video", "index.html",
    }

    candidate = url.strip()
    # Without a scheme urlsplit would treat the host as part of the path;
    # prefixing "//" makes it parse as a network location instead.
    if "://" not in candidate and not candidate.startswith("//"):
        candidate = "//" + candidate

    try:
        parts = urlsplit(candidate)
    except ValueError:
        # e.g. malformed IPv6 literal in the host part
        return False

    segments = [seg for seg in parts.path.split("/") if seg]
    if not segments:
        # No path at all: a homepage for any TLD, unless a query string
        # identifies a specific resource (e.g. "/?p=123").
        return bool(parts.query)

    return segments[-1].lower() not in section_slugs
49
+
50
 
51
  def _summarise_article(content: str) -> str:
52
  prompt = (
53
  "You are a concise news assistant. Summarise the following article "
54
  "in one sentence (<=25 words). Omit source and author names.\n\n"
55
+ "Some of these articles contain text which is not useful to the context so you can omit it."
56
  f"ARTICLE:\n{content}"
57
  )
58
  raw_output = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
 
85
 
86
  summaries = []
87
  for item in raw_articles:
88
+
89
+ link = item.get("link")
90
+
91
  if not item.get("content"):
92
+ continue
93
+
94
+ if not is_probably_article(link):
95
+ continue
96
 
97
  summary = _summarise_article(item["content"])
98