raghavNCI
committed on
Commit
·
588f923
1
Parent(s):
3611f6f
few more revamps to headlines generation
Browse files
nuse_modules/headlines_generator.py
CHANGED
@@ -37,11 +37,22 @@ def _dedupe_urls(articles: List[dict]) -> List[dict]:
|
|
37 |
out.append(art)
|
38 |
return out
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
def _summarise_article(content: str) -> str:
|
42 |
prompt = (
|
43 |
"You are a concise news assistant. Summarise the following article "
|
44 |
"in one sentence (<=25 words). Omit source and author names.\n\n"
|
|
|
45 |
f"ARTICLE:\n{content}"
|
46 |
)
|
47 |
raw_output = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
|
@@ -74,8 +85,14 @@ def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dic
|
|
74 |
|
75 |
summaries = []
|
76 |
for item in raw_articles:
|
|
|
|
|
|
|
77 |
if not item.get("content"):
|
78 |
-
continue
|
|
|
|
|
|
|
79 |
|
80 |
summary = _summarise_article(item["content"])
|
81 |
|
|
|
37 |
out.append(art)
|
38 |
return out
|
39 |
|
40 |
+
def is_probably_article(url: str) -> bool:
    """Return True when *url* looks like a single article, not a landing page.

    Simple heuristic: filters out category pages, homepages, etc.

    A URL is rejected when any of the following holds:

    * it is missing, not a string, or blank (nothing to judge, so it is
      treated as "not an article" instead of raising);
    * it is a bare site root (``https://host`` or ``https://host/``), or
      ends in ``.com`` / ``.org`` / ``.net``;
    * its final path segment is exactly a known section name such as
      ``world``, ``us``, ``news``, ``topics``, ``home``, ``video`` or
      ``index.html``.

    Section names are compared against the whole final path segment, not as
    a raw string suffix, so article slugs such as ``hello-world`` or
    ``coronavirus`` are no longer mistaken for the ``/world`` or ``/us``
    section pages.

    Args:
        url: The candidate link; ``None`` is tolerated.

    Returns:
        ``False`` for URLs that look like homepages or section indexes (or
        that are unusable), ``True`` otherwise.
    """
    if not isinstance(url, str):
        return False

    trimmed = url.strip().rstrip("/")
    if not trimmed:
        return False

    section_names = {
        "world", "us", "news", "topics", "home", "video", "index.html",
    }
    domain_suffixes = (".com", ".org", ".net")

    # A URL that ends right after the domain is a homepage.
    if trimmed.lower().endswith(domain_suffixes):
        return False

    # Same idea for any other TLD: scheme + host with no path and no query.
    _scheme, separator, remainder = trimmed.partition("://")
    if separator and "/" not in remainder and "?" not in remainder:
        return False

    # Compare the complete last segment so that slugs merely *ending* in a
    # section name ("hello-world", "coronavirus") are kept.
    last_segment = trimmed.rsplit("/", 1)[-1].lower()
    return last_segment not in section_names
|
49 |
+
|
50 |
|
51 |
def _summarise_article(content: str) -> str:
|
52 |
prompt = (
|
53 |
"You are a concise news assistant. Summarise the following article "
|
54 |
"in one sentence (<=25 words). Omit source and author names.\n\n"
|
55 |
+
"Some of these articles contain text which is not useful to the context so you can omit it."
|
56 |
f"ARTICLE:\n{content}"
|
57 |
)
|
58 |
raw_output = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
|
|
|
85 |
|
86 |
summaries = []
|
87 |
for item in raw_articles:
|
88 |
+
|
89 |
+
link = item.get("link")
|
90 |
+
|
91 |
if not item.get("content"):
|
92 |
+
continue
|
93 |
+
|
94 |
+
if not is_probably_article(link):
|
95 |
+
continue
|
96 |
|
97 |
summary = _summarise_article(item["content"])
|
98 |
|