raghavNCI committed
Commit 9c1bffa · 1 Parent(s): 02e2d96

revamping the nuse modules

Dockerfile CHANGED
@@ -11,7 +11,7 @@ WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
-RUN pip install --no-cache-dir boto3
+RUN pip install --no-cache-dir trafilatura
 
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
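
The Dockerfile change swaps the ad-hoc `RUN pip install boto3` layer for a trafilatura install, which the revamped search module below imports at runtime. A quick way to confirm the dependency works inside the built image is a tiny extraction script; this is a hedged sketch, not part of the commit, and the file name and URL are placeholders:

# check_trafilatura.py — hypothetical smoke test, not shipped with this commit
from trafilatura import fetch_url, extract

if __name__ == "__main__":
    # Any publicly reachable page can be substituted for the example URL.
    downloaded = fetch_url("https://example.com")
    text = extract(downloaded) if downloaded else None
    print("trafilatura OK" if text else "no content extracted")

Pinning trafilatura in requirements.txt would also work and keep the install in the existing layer; the separate RUN line simply mirrors how boto3 was added before.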
nuse_modules/google_search.py CHANGED
@@ -4,64 +4,31 @@ import os
 import requests
 import time
 from typing import List
+from trafilatura import fetch_url, extract
 
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
 
-def search_google_news_batch(queries: List[str], results_per_query: int = 30) -> List[dict]:
-    all_results = []
-    seen_links = set()
-
-    for query in queries:
-        print(f"[SEARCH] Query: {query}")
-        total_fetched = 0
-        start_index = 1
-
-        while total_fetched < results_per_query and start_index <= 91:
-            url = (
-                f"https://www.googleapis.com/customsearch/v1"
-                f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
-                f"&q={query}&num=10&start={start_index}"
-            )
-
-            try:
-                res = requests.get(url, timeout=10)
-                res.raise_for_status()
-                data = res.json()
-                items = data.get("items", [])
-
-                if not items:
-                    break  # No more results
-
-                for item in items:
-                    link = item.get("link")
-                    if link and link not in seen_links:
-                        seen_links.add(link)
-                        all_results.append({
-                            "title": item.get("title"),
-                            "link": link,
-                            "snippet": item.get("snippet"),
-                            "query": query,
-                        })
-
-                total_fetched += len(items)
-                start_index += 10
-                time.sleep(0.5)  # Avoid rate limits
-
-            except Exception as e:
-                print(f"[ERROR] Query '{query}' failed at start={start_index}: {e}")
-                break
-
-    return all_results
 
-def search_google_news(keywords: list[str], num_results: int = 5):
+def extract_full_text(url: str) -> str:
+    try:
+        downloaded = fetch_url(url)
+        if downloaded:
+            content = extract(downloaded, include_comments=False, include_tables=False)
+            return content or ""
+    except Exception as e:
+        print(f"[SCRAPER ERROR] {url}: {e}")
+    return ""
+
+
+def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
     query = " ".join(keywords)
     url = (
         f"https://www.googleapis.com/customsearch/v1"
         f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
         f"&q={query}&num={num_results}"
     )
 
     try:
         res = requests.get(url, timeout=10)
         res.raise_for_status()
@@ -69,12 +36,18 @@ def search_google_news(keywords: list[str], num_results: int = 5):
         results = []
 
         for item in data.get("items", []):
+            link = item.get("link")
+            article_text = extract_full_text(link)
+
             results.append({
                 "title": item.get("title"),
-                "link": item.get("link"),
+                "link": link,
                 "snippet": item.get("snippet"),
+                "content": article_text
             })
 
         return results
+
     except Exception as e:
-        return {"error": str(e)}
+        print(f"[ERROR] Google search failed: {e}")
+        return []
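
With this revamp the batch search helper is dropped, and each remaining search hit is enriched with the article body scraped by extract_full_text, so callers receive a content field next to title, link, and snippet. A hedged usage sketch (the keywords are illustrative, and GOOGLE_API_KEY / GOOGLE_CX_ID must be set for the live Custom Search call to succeed):

# hypothetical caller, not part of this commit
from nuse_modules.google_search import search_google_news

articles = search_google_news(["openai", "funding"], num_results=3)
for article in articles:
    # "content" is the scraped body text, or "" when extraction failed
    print(article["title"], "-", len(article["content"]), "chars of body text")

Note that every hit now triggers an extra page download, so latency grows roughly linearly with num_results.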
nuse_modules/keyword_extracter.py CHANGED
@@ -1,38 +1,73 @@
 # nuse_modules/keyword_extractor.py
 
-import os
-import requests
-import json
+from __future__ import annotations
+import json, re, logging, itertools
+from collections import Counter
+from pathlib import Path
 
 from models_initialization.mistral_registry import mistral_generate
 
-def extract_last_keywords(raw: str, max_keywords: int = 8) -> list[str]:
-    segments = raw.strip().split("\n")
+STOPWORDS = set(Path(__file__).with_name("stopwords_en.txt").read_text().split())
 
-    for line in reversed(segments):
-        line = line.strip()
-        if line.lower().startswith("extract") or not line or len(line) < 10:
-            continue
+_JSON_RE = re.compile(r"\[[^\[\]]+\]", re.S)  # first [...] block
 
-        if line.count(",") >= 2:
-            parts = [kw.strip().strip('"') for kw in line.split(",") if kw.strip()]
-            if all(len(p.split()) <= 3 for p in parts) and 1 <= len(parts) <= max_keywords:
-                return parts
+def _dedupe_keep_order(seq):
+    seen = set()
+    for x in seq:
+        if x.lower() not in seen:
+            seen.add(x.lower())
+            yield x
 
-    return []
+def _extract_with_llm(question: str, k: int) -> list[str]:
+    prompt = (
+        "Extract the **most important keywords** (nouns or noun-phrases) from the question below.\n"
+        f"Return a **JSON list** of {k} or fewer lowercase keywords, no commentary.\n\n"
+        f"QUESTION:\n{question}"
+    )
+    raw = mistral_generate(prompt, max_new_tokens=48, temperature=0.3)
+    logging.debug("LLM raw output: %s", raw)
 
+    # find the first [...] JSON chunk
+    match = _JSON_RE.search(raw or "")
+    if not match:
+        raise ValueError("No JSON list detected in LLM output")
 
-def keywords_extractor(question: str) -> list[str]:
-    prompt = (
-        f"Extract the 3–6 most important keywords from the following question. "
-        f"Return only the keywords, comma-separated (no explanations):\n\n"
-        f"{question}"
+    try:
+        keywords = json.loads(match.group())
+        if not isinstance(keywords, list):
+            raise ValueError
+    except Exception as e:
+        raise ValueError("Invalid JSON list") from e
+
+    cleaned = list(
+        _dedupe_keep_order(
+            kw.lower().strip(" .,\"'") for kw in keywords if kw and kw.lower() not in STOPWORDS
+        )
     )
+    return cleaned[:k]
+
+
+_WORD_RE = re.compile(r"[A-Za-z][\w\-]+")
 
-    raw_output = mistral_generate(prompt, max_new_tokens=32)
-    keywords = extract_last_keywords(raw_output)
+def _fallback_keywords(text: str, k: int) -> list[str]:
+    tokens = [t.lower() for t in _WORD_RE.findall(text)]
+    tokens = [t for t in tokens if t not in STOPWORDS and len(t) > 2]
+    counts = Counter(tokens)
+    # remove very common words by frequency threshold
+    common_cut = (len(tokens) // 100) + 2
+    keywords, _ = zip(*counts.most_common(k + common_cut))
+    return list(keywords[:k])
 
-    print("Raw extracted keywords:", raw_output)
-    print("Parsed keywords:", keywords)
+def keywords_extractor(question: str, max_keywords: int = 6) -> list[str]:
+    """
+    Return ≤ `max_keywords` keywords for the given question.
+    """
+    try:
+        kw = _extract_with_llm(question, max_keywords)
+        if kw:
+            return kw
+    except Exception as exc:
+        logging.warning("LLM keyword extraction failed: %s. Falling back.", exc)
 
-    return keywords
+    # fallback heuristic
+    return _fallback_keywords(question, max_keywords)
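
The rewritten extractor asks Mistral for a JSON list of keywords and falls back to a stopword-filtered frequency heuristic when the LLM output cannot be parsed. The new STOPWORDS constant reads stopwords_en.txt from the module's directory, so that file has to ship alongside keyword_extracter.py. A hedged end-to-end sketch chaining the two revamped modules (the question is illustrative and the actual keywords depend on the loaded Mistral model):

# hypothetical end-to-end call, not part of this commit
from nuse_modules.keyword_extracter import keywords_extractor
from nuse_modules.google_search import search_google_news

question = "What did the latest IPCC report say about sea level rise?"
keywords = keywords_extractor(question, max_keywords=5)  # e.g. ["ipcc", "report", "sea", "level", "rise"]
articles = search_google_news(keywords, num_results=3)
print(keywords, len(articles))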