Spaces:

nuseAI
/

FastAPI

Sleeping

App Files Files Community

raghavNCI committed on May 22

Commit

aefa1e1

1 Parent(s): 326a8da

added keyword_extractor

Browse files

Files changed (2) hide show

nuse_modules/keyword_extracter.py +65 -0
question.py +2 -27

nuse_modules/keyword_extracter.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# nuse_modules/keyword_extracter.py
+import os
+import requests
+import json
+# Access token read from the HF_TOKEN environment variable (None when unset).
+HF_TOKEN = os.getenv("HF_TOKEN")
+# Endpoint that mistral_generate() POSTs its prompts to.
+HF_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
+# HTTP headers sent with every request to HF_API_URL.
+# NOTE(review): built once at import time, so an unset HF_TOKEN yields the
+# literal header value "Bearer None" - confirm the token is always configured.
+HEADERS = {
+    "Authorization": f"Bearer {HF_TOKEN}",
+    "Content-Type": "application/json"
+}
+def mistral_generate(prompt: str, max_new_tokens=128) -> str:
+    """Send ``prompt`` to the model at ``HF_API_URL`` and return its text.
+
+    The call is best-effort: failures are printed and reported to the caller
+    as an empty string instead of an exception.
+
+    Args:
+        prompt: Text sent as the request's ``inputs`` field.
+        max_new_tokens: Sent as ``parameters.max_new_tokens``.
+
+    Returns:
+        The stripped ``generated_text`` of the first result, or ``""`` when
+        the request fails or the response does not have that shape.
+    """
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_new_tokens,
+            "temperature": 0.7
+        }
+    }
+    try:
+        response = requests.post(HF_API_URL, headers=HEADERS, data=json.dumps(payload), timeout=30)
+        response.raise_for_status()
+        result = response.json()
+    except Exception as e:
+        # Deliberately broad: callers rely on "" rather than an exception
+        # for any transport, HTTP-status or JSON-decoding failure.
+        print("[mistral_generate error]", str(e))
+        return ""
+    # Check the shape explicitly so an error object, an empty list or a null
+    # "generated_text" is logged instead of silently becoming "".
+    first = result[0] if isinstance(result, list) and result else None
+    text = first.get("generated_text") if isinstance(first, dict) else None
+    if not isinstance(text, str):
+        print("[mistral_generate error]", "unexpected response:", repr(result)[:200])
+        return ""
+    return text.strip()
+def extract_last_keywords(raw: str, max_keywords: int = 8) -> list[str]:
+    """Return the keywords from the last comma-separated line of ``raw``.
+
+    Lines are scanned from the bottom up. A line is skipped when it is
+    shorter than 10 characters, starts with "extract" (the echoed prompt),
+    or contains fewer than two commas. The first remaining line whose items
+    are all at most three words long, and number at most ``max_keywords``,
+    is returned.
+
+    Args:
+        raw: Model output to parse; ``None`` or empty yields ``[]``.
+        max_keywords: Largest number of items a line may contain.
+
+    Returns:
+        The cleaned keywords in their original order, or ``[]`` when no line
+        qualifies.
+    """
+    if not raw:
+        return []
+    for line in reversed(raw.strip().split("\n")):
+        line = line.strip()
+        if len(line) < 10 or line.lower().startswith("extract"):
+            continue
+        if line.count(",") < 2:
+            continue
+        parts = []
+        for chunk in line.split(","):
+            # Drop surrounding quotes of either kind and a sentence-ending
+            # period, so "policy." and 'policy' both become policy.
+            kw = chunk.strip().lstrip("\"'").rstrip("\"'.").strip()
+            if kw:
+                parts.append(kw)
+        # Long items are sentence fragments, not keywords.
+        if parts and len(parts) <= max_keywords and all(len(p.split()) <= 3 for p in parts):
+            return parts
+    return []
+def keywords_extractor(question: str) -> list[str]:
+    """Ask the model for the key terms of ``question`` and parse its reply.
+
+    Args:
+        question: The user's question; it is appended to the prompt verbatim.
+
+    Returns:
+        The keywords parsed by ``extract_last_keywords``; ``[]`` when the
+        question is blank or no keyword line is found in the reply.
+    """
+    # A blank question has nothing to extract, so skip the remote call.
+    if not question or not question.strip():
+        return []
+    prompt = (
+        "Extract the 3–6 most important keywords from the following question. "
+        "Return only the keywords, comma-separated (no explanations):\n\n"
+        f"{question}"
+    )
+    raw_output = mistral_generate(prompt, max_new_tokens=32)
+    keywords = extract_last_keywords(raw_output)
+    print("Raw extracted keywords:", raw_output)
+    print("Parsed keywords:", keywords)
+    return keywords

question.py CHANGED Viewed

@@ -9,6 +9,7 @@ from dotenv import load_dotenv
 from urllib.parse import quote
 import json
 from nuse_modules.classifier import classify_question, REVERSE_MAP
 load_dotenv()
@@ -26,25 +27,6 @@ HEADERS = {
     "Content-Type": "application/json"
 }
-def extract_last_keywords(raw: str, max_keywords=8):
-    segments = raw.strip().split("\n")
-    # Ignore quoted or prompt lines
-    for line in reversed(segments):
-        line = line.strip()
-        if line.lower().startswith("extract") or not line or len(line) < 10:
-            continue
-        # Look for lines with multiple comma-separated items
-        if line.count(",") >= 2:
-            parts = [kw.strip().strip('"') for kw in line.split(",") if kw.strip()]
-            # Ensure they're not just long phrases or sentence fragments
-            if all(len(p.split()) <= 3 for p in parts) and 1 <= len(parts) <= max_keywords:
-                return parts
-    return []
 def is_relevant(article, keywords):
     text = f"{article.get('title', '')} {article.get('content', '')}".lower()
     return any(kw.lower() in text for kw in keywords)
@@ -97,14 +79,7 @@ async def ask_question(input: QuestionInput):
     print("Intent ID:", qid)
     print("Category:", REVERSE_MAP.get(qid, "unknown"))
-    # Step 1: Ask Mistral to extract keywords
-    keyword_prompt = (
-        f"Extract the 3–6 most important keywords from the following question. "
-        f"Return only the keywords, comma-separated (no explanations):\n\n"
-        f"{question}"
-    )
-    raw_keywords = mistral_generate(keyword_prompt, max_new_tokens=32)
-    keywords = extract_last_keywords(raw_keywords)
     print("Raw extracted keywords:", keywords)

 from urllib.parse import quote
 import json
 from nuse_modules.classifier import classify_question, REVERSE_MAP
+from nuse_modules.keyword_extracter import keywords_extractor
 load_dotenv()
     "Content-Type": "application/json"
 }
 def is_relevant(article, keywords):
     text = f"{article.get('title', '')} {article.get('content', '')}".lower()
     return any(kw.lower() in text for kw in keywords)
     print("Intent ID:", qid)
     print("Category:", REVERSE_MAP.get(qid, "unknown"))
+    keywords = keywords_extractor(question)
     print("Raw extracted keywords:", keywords)