raghavNCI committed
Commit 9c1bffa · Parent: 02e2d96
revamping the nuse modules
Files changed:
- Dockerfile +1 -1
- nuse_modules/google_search.py +20 -47
- nuse_modules/keyword_extracter.py +59 -24
Dockerfile
CHANGED
@@ -11,7 +11,7 @@ WORKDIR /app
 
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
-RUN pip install --no-cache-dir
+RUN pip install --no-cache-dir trafilatura
 
 COPY --chown=user . /app
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
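The new layer installs trafilatura on top of the requirements install; the revamped google_search module below imports it for article scraping. A minimal sanity check for the built image, with a hypothetical script name and image tag that are not part of the commit:

# smoke_test.py — hypothetical check that the scraping dependency resolves
# (illustrative only; trafilatura is the package added by this Dockerfile change)
from trafilatura import fetch_url, extract

print("trafilatura provides:", fetch_url.__name__, extract.__name__)

Running it inside the container (e.g. docker run --rm <image> python smoke_test.py) confirms the extra layer took effect; an alternative with one fewer image layer would be listing trafilatura in requirements.txt so the existing RUN pip install --no-cache-dir --upgrade -r requirements.txt line covers it.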
nuse_modules/google_search.py
CHANGED
@@ -4,64 +4,31 @@ import os
 import requests
 import time
 from typing import List
+from trafilatura import fetch_url, extract
 
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 GOOGLE_CX_ID = os.getenv("GOOGLE_CX_ID")
 
-def search_google_news_batch(queries: List[str], results_per_query: int = 30) -> List[dict]:
-    all_results = []
-    seen_links = set()
 
-    for query in queries:
-        total_fetched = 0
-        start_index = 1
-
-        while total_fetched < results_per_query:
-            url = (
-                f"https://www.googleapis.com/customsearch/v1"
-                f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
-                f"&q={query}&num=10&start={start_index}"
-            )
-
-            try:
-                res = requests.get(url, timeout=10)
-                res.raise_for_status()
-                data = res.json()
-                items = data.get("items", [])
-
-                if not items:
-                    break  # No more results
-
-                for item in items:
-                    link = item.get("link")
-                    if link and link not in seen_links:
-                        seen_links.add(link)
-                        all_results.append({
-                            "title": item.get("title"),
-                            "link": link,
-                            "snippet": item.get("snippet"),
-                            "query": query,
-                        })
-
-                total_fetched += len(items)
-                start_index += 10
-                time.sleep(0.5)  # Avoid rate limits
-
-            except Exception as e:
-                print(f"[ERROR] Query '{query}' failed at start={start_index}: {e}")
-                break
+def extract_full_text(url: str) -> str:
+    try:
+        downloaded = fetch_url(url)
+        if downloaded:
+            content = extract(downloaded, include_comments=False, include_tables=False)
+            return content or ""
+    except Exception as e:
+        print(f"[SCRAPER ERROR] {url}: {e}")
+    return ""
 
-    return all_results
 
-def search_google_news(keywords: list[str], num_results: int = 5):
+def search_google_news(keywords: List[str], num_results: int = 5) -> List[dict]:
     query = " ".join(keywords)
     url = (
         f"https://www.googleapis.com/customsearch/v1"
         f"?key={GOOGLE_API_KEY}&cx={GOOGLE_CX_ID}"
         f"&q={query}&num={num_results}"
     )
 
     try:
         res = requests.get(url, timeout=10)
         res.raise_for_status()
@@ -69,12 +36,18 @@ def search_google_news(keywords: list[str], num_results: int = 5):
         results = []
 
         for item in data.get("items", []):
+            link = item.get("link")
+            article_text = extract_full_text(link)
+
             results.append({
                 "title": item.get("title"),
-                "link": item.get("link"),
+                "link": link,
                 "snippet": item.get("snippet"),
+                "content": article_text
             })
 
         return results
+
     except Exception as e:
-
+        print(f"[ERROR] Google search failed: {e}")
+        return []
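With these changes each Custom Search hit is enriched with the full article body scraped by trafilatura. A minimal usage sketch, assuming GOOGLE_API_KEY and GOOGLE_CX_ID are set in the environment; the keywords are an arbitrary example and not part of the commit:

# usage sketch for the revamped module (illustrative only)
from nuse_modules.google_search import search_google_news

results = search_google_news(["eu", "ai", "act"], num_results=3)
for r in results:
    # each dict now carries "content": the trafilatura-extracted article text ("" on scrape failure)
    print(r["title"], "->", r["link"])
    print(f'{len(r["content"])} characters of article text')

Note that extract_full_text is called synchronously for every hit inside the result loop, so num_results directly bounds both API quota use and scraping latency; a failed scrape degrades to an empty "content" string rather than dropping the result.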
nuse_modules/keyword_extracter.py
CHANGED
@@ -1,38 +1,73 @@
 # nuse_modules/keyword_extractor.py
 
-import
-import
-import
+from __future__ import annotations
+import json, re, logging, itertools
+from collections import Counter
+from pathlib import Path
 
 from models_initialization.mistral_registry import mistral_generate
 
-    segments = raw.strip().split("\n")
+STOPWORDS = set(Path(__file__).with_name("stopwords_en.txt").read_text().split())
 
-        line = line.strip()
-        if line.lower().startswith("extract") or not line or len(line) < 10:
-            continue
+_JSON_RE = re.compile(r"\[[^\[\]]+\]", re.S)  # first [...] block
 
+
+def _dedupe_keep_order(seq):
+    seen = set()
+    for x in seq:
+        if x.lower() not in seen:
+            seen.add(x.lower())
+            yield x
 
+
+def _extract_with_llm(question: str, k: int) -> list[str]:
+    prompt = (
+        "Extract the **most important keywords** (nouns or noun-phrases) from the question below.\n"
+        f"Return a **JSON list** of {k} or fewer lowercase keywords, no commentary.\n\n"
+        f"QUESTION:\n{question}"
+    )
+    raw = mistral_generate(prompt, max_new_tokens=48, temperature=0.3)
+    logging.debug("LLM raw output: %s", raw)
+
+    # find the first [...] JSON chunk
+    match = _JSON_RE.search(raw or "")
+    if not match:
+        raise ValueError("No JSON list detected in LLM output")
+
+    try:
+        keywords = json.loads(match.group())
+        if not isinstance(keywords, list):
+            raise ValueError
+    except Exception as e:
+        raise ValueError("Invalid JSON list") from e
+
+    cleaned = list(
+        _dedupe_keep_order(
+            kw.lower().strip(" .,\"'") for kw in keywords if kw and kw.lower() not in STOPWORDS
+        )
     )
+    return cleaned[:k]
+
+
+_WORD_RE = re.compile(r"[A-Za-z][\w\-]+")
 
+def _fallback_keywords(text: str, k: int) -> list[str]:
+    tokens = [t.lower() for t in _WORD_RE.findall(text)]
+    tokens = [t for t in tokens if t not in STOPWORDS and len(t) > 2]
+    counts = Counter(tokens)
+    # remove very common words by frequency threshold
+    common_cut = (len(tokens) // 100) + 2
+    keywords, _ = zip(*counts.most_common(k + common_cut))
+    return list(keywords[:k])
 
+def keywords_extractor(question: str, max_keywords: int = 6) -> list[str]:
+    """
+    Return ≤ `max_keywords` keywords for the given question.
+    """
+    try:
+        kw = _extract_with_llm(question, max_keywords)
+        if kw:
+            return kw
+    except Exception as exc:
+        logging.warning("LLM keyword extraction failed: %s. Falling back.", exc)
 
+    # fallback heuristic
+    return _fallback_keywords(question, max_keywords)
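The rewrite replaces the old line-by-line parsing of the model output (the removed segments / line.strip() filtering) with a structured path: prompt Mistral for a JSON list, pull the first [...] block with _JSON_RE, dedupe and stopword-filter it, and fall back to a plain frequency heuristic when anything in the LLM path fails. A minimal sketch of both paths, assuming the stopwords_en.txt file the module reads at import time is present; the question text is an arbitrary example, not part of the commit:

# usage sketch (illustrative only)
import logging
from nuse_modules.keyword_extracter import keywords_extractor, _fallback_keywords

logging.basicConfig(level=logging.INFO)

question = "How are central banks in Europe responding to persistent inflation?"

# primary path: Mistral returns a JSON list, cleaned and capped at max_keywords
print(keywords_extractor(question, max_keywords=5))

# fallback path, used when no parsable JSON list comes back from the model:
# stopword-filtered frequency counting over the question itself
print(_fallback_keywords(question, 5))

Two small caveats: the module file is keyword_extracter.py while its header comment says keyword_extractor.py, so imports elsewhere need the former spelling; and _fallback_keywords unpacks zip(*counts.most_common(...)), which raises ValueError when every token is filtered out (very short or all-stopword questions), so the fallback is not a backstop in that edge case.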