raghavNCI
committed on
Commit
·
b029173
1
Parent(s):
601d9f9
headlines revamp
Browse files- nuse_modules/fetchHeadlines.py +16 -22
- nuse_modules/google_search.py +48 -0
- routes/headlines.py +5 -6
nuse_modules/fetchHeadlines.py
CHANGED
@@ -1,26 +1,20 @@
|
|
1 |
-
|
2 |
-
from models_initialization.mistral_registry import mistral_generate
|
3 |
-
|
4 |
-
def summarize_headlines_with_mistral(headlines: list[dict]) -> str:
    """Ask the Mistral model for a three-sentence digest of *headlines*.

    Each entry must provide a ``"title"`` key; ``"link"`` is optional and
    rendered as an empty string when absent. The entries are appended to a
    fixed instruction as one bullet line each, and the assembled prompt is
    passed to ``mistral_generate`` with ``max_new_tokens=200`` and
    ``temperature=0.5``. Whatever ``mistral_generate`` returns is returned
    unchanged.
    """
    instructions = (
        "Summarize the following news headlines into a short 3-sentence digest. "
        "Be factual and neutral. Mention the sources.\n\n"
    )
    # One bullet per headline, built in a single join rather than repeated +=.
    bullet_lines = "".join(
        f"- {entry['title']} – {entry.get('link', '')}\n" for entry in headlines
    )
    return mistral_generate(
        instructions + bullet_lines, max_new_tokens=200, temperature=0.5
    )
|
13 |
|
|
|
14 |
|
15 |
-
def
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
|
19 |
-
|
20 |
|
21 |
-
|
22 |
-
return
|
23 |
-
"summary": summary,
|
24 |
-
"sources": list({article["link"] for article in articles}),
|
25 |
-
"headlines": [article["title"] for article in articles]
|
26 |
-
}
|
|
|
1 |
+
# nuse_modules/fetchHeadlines.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
from nuse_modules.google_search import search_google_news
|
4 |
|
5 |
+
def fetch_headline_articles():
    """Fetch de-duplicated news articles for a fixed list of headline topics.

    Runs every topic below through ``search_google_news_batch`` (up to 30 raw
    results per topic) and returns the combined list it produces.

    Returns:
        The list returned by ``search_google_news_batch``: one dict per
        unique article link.
    """
    # Imported locally so this function carries its own dependency. The
    # previously used `search_google_news(keywords, num_results=5)` does not
    # accept `results_per_query`, so calling it with that keyword raised
    # TypeError; the batch variant is the one with the matching signature.
    from nuse_modules.google_search import search_google_news_batch

    queries = [
        "India news", "US politics", "UK elections", "China economy",
        "Tech layoffs", "Ukraine war", "AI regulation", "Africa development",
        "South America inflation", "Global stock markets", "Climate change",
        "Middle East", "EU summit", "Canada economy", "Australia news",
        "Russia sanctions", "Elections 2025", "Big tech", "Trade wars",
        "Global protests", "Public health", "Oil prices", "Space news",
        "Cryptocurrency", "Cybersecurity",
    ]

    print("[INFO] Fetching news articles from Google Custom Search...")
    # 30 per query x 25 queries = ~750 raw results before de-duplication.
    articles = search_google_news_batch(queries, results_per_query=30)

    print(f"[INFO] Retrieved {len(articles)} unique articles.")
    return articles
|
|
|
|
|
|
|
|
nuse_modules/google_search.py
CHANGED
@@ -2,10 +2,58 @@
|
|
2 |
|
3 |
import os
|
4 |
import requests
|
|
|
|
|
5 |
|
6 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
7 |
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
def search_google_news(keywords: list[str], num_results: int = 5):
|
10 |
query = " ".join(keywords)
|
11 |
url = (
|
|
|
2 |
|
3 |
import os
|
4 |
import requests
|
5 |
+
import time
|
6 |
+
from typing import List
|
7 |
|
8 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
9 |
GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
|
10 |
|
11 |
+
def search_google_news_batch(queries: List[str], results_per_query: int = 30) -> List[dict]:
    """Run each query against Google Custom Search and collect unique hits.

    Every query is paged through the Custom Search JSON API until
    ``results_per_query`` raw items have been fetched, a page comes back
    empty, the start offset passes 91, or a request fails. A failure only
    abandons the current query; later queries are still attempted.

    Args:
        queries: Search strings; each is issued as its own series of requests.
        results_per_query: Upper bound on raw items fetched per query (before
            de-duplication).

    Returns:
        A list of dicts with the keys ``"title"``, ``"link"``, ``"snippet"``
        and ``"query"``. Links are unique across the whole batch; the first
        query that surfaces a link keeps it.
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    page_size = 10   # largest page requested per call
    last_start = 91  # highest start offset tried (i.e. results 91-100)

    all_results = []
    seen_links = set()

    for query in queries:
        print(f"[SEARCH] Query: {query}")
        total_fetched = 0
        start_index = 1

        while total_fetched < results_per_query and start_index <= last_start:
            # Passing the values via `params` lets requests percent-encode
            # them, so queries containing '&', '#', '+' or spaces are sent
            # intact instead of corrupting the query string.
            params = {
                "key": GOOGLE_API_KEY,
                "cx": GOOGLE_CX_ID,
                "q": query,
                # Ask only for what is still needed so the per-query cap is
                # honoured when it is not a multiple of the page size.
                "num": min(page_size, results_per_query - total_fetched),
                "start": start_index,
            }

            try:
                res = requests.get(endpoint, params=params, timeout=10)
                res.raise_for_status()
                items = res.json().get("items", [])
            except requests.HTTPError as exc:
                # The exception text embeds the full request URL, which
                # contains the API key, so only the status code is reported.
                status = exc.response.status_code if exc.response is not None else "unknown"
                print(f"[ERROR] Query '{query}' failed at start={start_index}: HTTP {status}")
                break
            except (requests.RequestException, ValueError) as exc:
                # ValueError covers a response body that is not valid JSON.
                # Only the exception type is printed, for the same
                # key-leak reason as above.
                print(f"[ERROR] Query '{query}' failed at start={start_index}: {type(exc).__name__}")
                break

            if not items:
                break  # No more results

            for item in items:
                link = item.get("link")
                if link and link not in seen_links:
                    seen_links.add(link)
                    all_results.append({
                        "title": item.get("title"),
                        "link": link,
                        "snippet": item.get("snippet"),
                        "query": query,
                    })

            total_fetched += len(items)
            start_index += page_size
            time.sleep(0.5)  # Avoid rate limits

    return all_results
|
56 |
+
|
57 |
def search_google_news(keywords: list[str], num_results: int = 5):
|
58 |
query = " ".join(keywords)
|
59 |
url = (
|
routes/headlines.py
CHANGED
@@ -1,10 +1,9 @@
|
|
1 |
-
from fastapi import APIRouter
|
2 |
-
from nuse_modules.fetchHeadlines import
|
3 |
|
4 |
headlines = APIRouter()
|
5 |
|
6 |
@headlines.get("/headlines")
|
7 |
-
def get_headlines(
|
8 |
-
|
9 |
-
|
10 |
-
return result
|
|
|
1 |
+
from fastapi import APIRouter
|
2 |
+
from nuse_modules.fetchHeadlines import fetch_headline_articles
|
3 |
|
4 |
headlines = APIRouter()
|
5 |
|
6 |
@headlines.get("/headlines")
def get_headlines():
    """Serve GET /headlines with the fetched articles and their count.

    The response body has two keys: ``"total"`` (number of articles) and
    ``"articles"`` (the list returned by ``fetch_headline_articles``).
    """
    fetched = fetch_headline_articles()
    payload = {"total": len(fetched), "articles": fetched}
    return payload
|
|