raghavNCI committed
Commit b029173 · 1 Parent(s): 601d9f9

headlines revamp

nuse_modules/fetchHeadlines.py CHANGED
@@ -1,26 +1,20 @@
-from nuse_modules.google_search import search_google_news
-from models_initialization.mistral_registry import mistral_generate
-
-def summarize_headlines_with_mistral(headlines: list[dict]) -> str:
-    prompt = (
-        "Summarize the following news headlines into a short 3-sentence digest. "
-        "Be factual and neutral. Mention the sources.\n\n"
-    )
-    for item in headlines:
-        prompt += f"- {item['title']} – {item.get('link', '')}\n"
-
-    return mistral_generate(prompt, max_new_tokens=200, temperature=0.5)
-
-
-def generate_headline_digest(keywords: list[str], num_results: int = 5):
-    articles = search_google_news(keywords, num_results=num_results)
-
-    if isinstance(articles, dict) and "error" in articles:
-        return {"error": articles["error"]}
-
-    summary = summarize_headlines_with_mistral(articles)
-    return {
-        "summary": summary,
-        "sources": list({article["link"] for article in articles}),
-        "headlines": [article["title"] for article in articles]
-    }
+# nuse_modules/fetchHeadlines.py
+
+from nuse_modules.google_search import search_google_news_batch
+
+def fetch_headline_articles():
+    queries = [
+        "India news", "US politics", "UK elections", "China economy",
+        "Tech layoffs", "Ukraine war", "AI regulation", "Africa development",
+        "South America inflation", "Global stock markets", "Climate change",
+        "Middle East", "EU summit", "Canada economy", "Australia news",
+        "Russia sanctions", "Elections 2025", "Big tech", "Trade wars",
+        "Global protests", "Public health", "Oil prices", "Space news",
+        "Cryptocurrency", "Cybersecurity"
+    ]
+
+    print("[INFO] Fetching news articles from Google Custom Search...")
+    articles = search_google_news_batch(queries, results_per_query=30)  # 30 per query × 25 queries = ~750 raw
+
+    print(f"[INFO] Retrieved {len(articles)} unique articles.")
+    return articles
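As a rough sanity check on the fetch volume (an estimate, not something measured in this commit): the search helper pages through results 10 at a time, so ~30 results for each of the 25 queries works out to roughly 75 API calls and up to ~750 raw articles per run, before link de-duplication. A back-of-the-envelope sketch:

# Hypothetical back-of-the-envelope estimate of request volume per run.
# Assumes the 10-results-per-request paging used in search_google_news_batch.
queries = 25
results_per_query = 30
requests_per_query = -(-results_per_query // 10)   # ceil(30 / 10) = 3 pages per query
total_requests = queries * requests_per_query      # ~75 API calls per run
max_raw_articles = queries * results_per_query     # ~750 articles before de-duplication
print(total_requests, max_raw_articles)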
nuse_modules/google_search.py CHANGED
@@ -2,10 +2,58 @@
 
 import os
 import requests
+import time
+from typing import List
 
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
 
+def search_google_news_batch(queries: List[str], results_per_query: int = 30) -> List[dict]:
+    all_results = []
+    seen_links = set()
+
+    for query in queries:
+        print(f"[SEARCH] Query: {query}")
+        total_fetched = 0
+        start_index = 1
+
+        while total_fetched < results_per_query and start_index <= 91:
+            url = (
+                f"https://www.googleapis.com/customsearch/v1"
+                f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
+                f"&q={query}&num=10&start={start_index}"
+            )
+
+            try:
+                res = requests.get(url, timeout=10)
+                res.raise_for_status()
+                data = res.json()
+                items = data.get("items", [])
+
+                if not items:
+                    break  # No more results
+
+                for item in items:
+                    link = item.get("link")
+                    if link and link not in seen_links:
+                        seen_links.add(link)
+                        all_results.append({
+                            "title": item.get("title"),
+                            "link": link,
+                            "snippet": item.get("snippet"),
+                            "query": query,
+                        })
+
+                total_fetched += len(items)
+                start_index += 10
+                time.sleep(0.5)  # Avoid rate limits
+
+            except Exception as e:
+                print(f"[ERROR] Query '{query}' failed at start={start_index}: {e}")
+                break
+
+    return all_results
+
 def search_google_news(keywords: list[str], num_results: int = 5):
     query = " ".join(keywords)
     url = (
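For reference, a minimal way to exercise the new batch helper on its own (an illustrative snippet, not part of this commit; it assumes GOOGLE_API_KEY and GOOGLE_CX_ID are set in the environment):

# Hypothetical smoke test for search_google_news_batch (not part of this commit).
from nuse_modules.google_search import search_google_news_batch

articles = search_google_news_batch(["AI regulation", "Oil prices"], results_per_query=10)
print(f"{len(articles)} unique articles")
for a in articles[:3]:
    print(a["query"], "->", a["title"], a["link"])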
routes/headlines.py CHANGED
@@ -1,10 +1,9 @@
-from fastapi import APIRouter, Query
-from nuse_modules.fetchHeadlines import generate_headline_digest
+from fastapi import APIRouter
+from nuse_modules.fetchHeadlines import fetch_headline_articles
 
 headlines = APIRouter()
 
 @headlines.get("/headlines")
-def get_headlines(q: str = Query("India", description="Comma-separated keywords to search")):
-    keywords = [kw.strip() for kw in q.split(",")]
-    result = generate_headline_digest(keywords)
-    return result
+def get_headlines():
+    articles = fetch_headline_articles()
+    return {"total": len(articles), "articles": articles}
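A quick way to hit the revamped route locally (illustrative only; it assumes the headlines router is mounted on a FastAPI app, which happens outside this diff):

# Illustrative check of the revamped /headlines endpoint (assumes the router
# is included in a FastAPI app; app wiring here is a guess, not from this commit).
from fastapi import FastAPI
from fastapi.testclient import TestClient
from routes.headlines import headlines

app = FastAPI()
app.include_router(headlines)

client = TestClient(app)
resp = client.get("/headlines")
print(resp.status_code, resp.json()["total"])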