raghavNCI committed
Commit 9c1bffa · Parent: 02e2d96
revamping the nuse modules
Files changed:
- Dockerfile +1 -1
- nuse_modules/google_search.py +20 -47
- nuse_modules/keyword_extracter.py +59 -24
Dockerfile
CHANGED
@@ -11,7 +11,7 @@ WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
-RUN pip install --no-cache-dir
+RUN pip install --no-cache-dir trafilatura
 
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
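The new layer installs trafilatura on top of the requirements install; the revamped google_search module below imports it for article scraping. A minimal sanity check for the built image, with a hypothetical script name and image tag that are not part of the commit:

# smoke_test.py — hypothetical check that the scraping dependency resolves
# (illustrative only; trafilatura is the package added by this Dockerfile change)
from trafilatura import fetch_url, extract

print("trafilatura provides:", fetch_url.__name__, extract.__name__)

Running it inside the container (e.g. docker run --rm <image> python smoke_test.py) confirms the extra layer took effect; an alternative with one fewer image layer would be listing trafilatura in requirements.txt so the existing RUN pip install --no-cache-dir --upgrade -r requirements.txt line covers it.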
nuse_modules/google_search.py
CHANGED
@@ -4,64 +4,31 @@ import os
 import requests
 import time
 from typing import List
+from trafilatura import fetch_url, extract
 
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
 
-def search_google_news_batch(queries: List[str], results_per_query: int = 30) -> List[dict]:
-    all_results = []
-    seen_links = set()
 
-    for query in queries:
-        total_fetched = 0
-        start_index = 1
-
-        while total_fetched < results_per_query:
-            url = (
-                f"https://www.googleapis.com/customsearch/v1"
-                f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
-                f"&q={query}&num=10&start={start_index}"
-            )
-
-            try:
-                res = requests.get(url, timeout=10)
-                res.raise_for_status()
-                data = res.json()
-                items = data.get("items", [])
-
-                if not items:
-                    break  # No more results
-
-                for item in items:
-                    link = item.get("link")
-                    if link and link not in seen_links:
-                        seen_links.add(link)
-                        all_results.append({
-                            "title": item.get("title"),
-                            "link": link,
-                            "snippet": item.get("snippet"),
-                            "query": query,
-                        })
-
-                total_fetched += len(items)
-                start_index += 10
-                time.sleep(0.5)  # Avoid rate limits
-
-            except Exception as e:
-                print(f"[ERROR] Query '{query}' failed at start={start_index}: {e}")
-                break
+def extract_full_text(url: str) -> str:
+    try:
+        downloaded = fetch_url(url)
+        if downloaded:
+            content = extract(downloaded, include_comments=False, include_tables=False)
+            return content or ""
+    except Exception as e:
+        print(f"[SCRAPER ERROR] {url}: {e}")
+    return ""
 
-    return all_results
 
-def search_google_news(keywords: list[str], num_results: int = 5):
+def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
     query = " ".join(keywords)
     url = (
         f"https://www.googleapis.com/customsearch/v1"
         f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
         f"&q={query}&num={num_results}"
     )
 
     try:
         res = requests.get(url, timeout=10)
         res.raise_for_status()
@@ -69,12 +36,18 @@ def search_google_news(keywords: list[str], num_results: int = 5):
         results = []
 
         for item in data.get("items", []):
+            link = item.get("link")
+            article_text = extract_full_text(link)
+
             results.append({
                 "title": item.get("title"),
-                "link": item.get("link"),
+                "link": link,
                 "snippet": item.get("snippet"),
+                "content": article_text
             })
 
         return results
+
     except Exception as e:
-
+        print(f"[ERROR] Google search failed: {e}")
+        return []
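With these changes each Custom Search hit is enriched with the full article body scraped by trafilatura. A minimal usage sketch, assuming GOOGLE_API_KEY and GOOGLE_CX_ID are set in the environment; the keywords are an arbitrary example and not part of the commit:

# usage sketch for the revamped module (illustrative only)
from nuse_modules.google_search import search_google_news

results = search_google_news(["eu", "ai", "act"], num_results=3)
for r in results:
    # each dict now carries "content": the trafilatura-extracted article text ("" on scrape failure)
    print(r["title"], "->", r["link"])
    print(f'{len(r["content"])} characters of article text')

Note that extract_full_text is called synchronously for every hit inside the result loop, so num_results directly bounds both API quota use and scraping latency; a failed scrape degrades to an empty "content" string rather than dropping the result.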
nuse_modules/keyword_extracter.py
CHANGED
@@ -1,38 +1,73 @@
 # nuse_modules/keyword_extractor.py
 
-import
-import
-import
+from __future__ import annotations
+import json, re, logging, itertools
+from collections import Counter
+from pathlib import Path
 
 from models_initialization.mistral_registry import mistral_generate
 
-    segments = raw.strip().split("\n")
+STOPWORDS = set(Path(__file__).with_name("stopwords_en.txt").read_text().split())
 
-        line = line.strip()
-        if line.lower().startswith("extract") or not line or len(line) < 10:
-            continue
+_JSON_RE = re.compile(r"\[[^\[\]]+\]", re.S)  # first [...] block
 
+
+def _dedupe_keep_order(seq):
+    seen = set()
+    for x in seq:
+        if x.lower() not in seen:
+            seen.add(x.lower())
+            yield x
 
+
+def _extract_with_llm(question: str, k: int) -> list[str]:
+    prompt = (
+        "Extract the **most important keywords** (nouns or noun-phrases) from the question below.\n"
+        f"Return a **JSON list** of {k} or fewer lowercase keywords, no commentary.\n\n"
+        f"QUESTION:\n{question}"
+    )
+    raw = mistral_generate(prompt, max_new_tokens=48, temperature=0.3)
+    logging.debug("LLM raw output: %s", raw)
+
+    # find the first [...] JSON chunk
+    match = _JSON_RE.search(raw or "")
+    if not match:
+        raise ValueError("No JSON list detected in LLM output")
+
+    try:
+        keywords = json.loads(match.group())
+        if not isinstance(keywords, list):
+            raise ValueError
+    except Exception as e:
+        raise ValueError("Invalid JSON list") from e
+
+    cleaned = list(
+        _dedupe_keep_order(
+            kw.lower().strip(" .,\"'") for kw in keywords if kw and kw.lower() not in STOPWORDS
+        )
     )
+    return cleaned[:k]
+
+
+_WORD_RE = re.compile(r"[A-Za-z][\w\-]+")
 
+def _fallback_keywords(text: str, k: int) -> list[str]:
+    tokens = [t.lower() for t in _WORD_RE.findall(text)]
+    tokens = [t for t in tokens if t not in STOPWORDS and len(t) > 2]
+    counts = Counter(tokens)
+    # remove very common words by frequency threshold
+    common_cut = (len(tokens) // 100) + 2
+    keywords, _ = zip(*counts.most_common(k + common_cut))
+    return list(keywords[:k])
 
+def keywords_extractor(question: str, max_keywords: int = 6) -> list[str]:
+    """
+    Return ≤ `max_keywords` keywords for the given question.
+    """
+    try:
+        kw = _extract_with_llm(question, max_keywords)
+        if kw:
+            return kw
+    except Exception as exc:
+        logging.warning("LLM keyword extraction failed: %s. Falling back.", exc)
 
+    # fallback heuristic
+    return _fallback_keywords(question, max_keywords)
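The rewrite replaces the old line-by-line parsing of the model output (the removed segments / line.strip() filtering) with a structured path: prompt Mistral for a JSON list, pull the first [...] block with _JSON_RE, dedupe and stopword-filter it, and fall back to a plain frequency heuristic when anything in the LLM path fails. A minimal sketch of both paths, assuming the stopwords_en.txt file the module reads at import time is present; the question text is an arbitrary example, not part of the commit:

# usage sketch (illustrative only)
import logging
from nuse_modules.keyword_extracter import keywords_extractor, _fallback_keywords

logging.basicConfig(level=logging.INFO)

question = "How are central banks in Europe responding to persistent inflation?"

# primary path: Mistral returns a JSON list, cleaned and capped at max_keywords
print(keywords_extractor(question, max_keywords=5))

# fallback path, used when no parsable JSON list comes back from the model:
# stopword-filtered frequency counting over the question itself
print(_fallback_keywords(question, 5))

Two small caveats: the module file is keyword_extracter.py while its header comment says keyword_extractor.py, so imports elsewhere need the former spelling; and _fallback_keywords unpacks zip(*counts.most_common(...)), which raises ValueError when every token is filtered out (very short or all-stopword questions), so the fallback is not a backstop in that edge case.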