Spaces:

drrobot9
/

FUTA_BIOMEDICALENGINEERING_AI

Running

App Files Files Community

drrobot9 committed 9 days ago

Commit

80c5aac

verified ·

1 Parent(s): 3791963

push updated backend changes and auto start building

Browse files

Files changed (15) hide show

.gitattributes +1 -0
Dockerfile +20 -0
__init__.py +0 -0
__pycache__/agents.cpython-311.pyc +0 -0
__pycache__/config.cpython-311.pyc +0 -0
__pycache__/main.cpython-311.pyc +0 -0
__pycache__/rag.cpython-311.pyc +0 -0
agents.py +99 -0
app.py +33 -0
config.py +39 -0
main.py +33 -0
rag.py +163 -0
requirements.txt +11 -0
vectorstore/index.faiss +3 -0
vectorstore/index.pkl +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+vectorstore/index.faiss filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+FROM python:3.10-slim
+# Install system dependencies
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+# Hugging Face Spaces runs Docker containers as UID 1000. Create that user so
+# /home/user exists and the cache path used by config.py / agents.py
+# (/home/user/.cache/huggingface) is writable at runtime.
+RUN useradd --create-home --uid 1000 user
+ENV HOME=/home/user
+# Set working directory
+WORKDIR /app
+# Install Python dependencies before copying the sources so this layer is
+# reused until requirements.txt itself changes
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+# Copy project files, owned by the runtime user
+COPY --chown=user:user . /app
+# Drop root privileges for the running application
+USER user
+# Expose the port Hugging Face Spaces expects
+EXPOSE 7860
+# Run FastAPI with Uvicorn
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

__init__.py ADDED Viewed

File without changes

__pycache__/agents.cpython-311.pyc ADDED Viewed

Binary file (7.23 kB). View file

__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (1.53 kB). View file

__pycache__/main.cpython-311.pyc ADDED Viewed

Binary file (1.81 kB). View file

__pycache__/rag.cpython-311.pyc ADDED Viewed

Binary file (9.98 kB). View file

agents.py ADDED Viewed

	@@ -0,0 +1,99 @@

+# bioinformatics_ai/agents.py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from autogen import AssistantAgent, UserProxyAgent
+from config import LLM_MODEL, CONFIDENCE_THRESHOLD, VECTORSTORE_DIR
+from rag import RAGAgent
+import os
+import sys
+#  Ensure Hugging Face cache is in a writable directory (important on HF Spaces)
+if "HF_HOME" not in os.environ:
+    hf_cache = "/home/user/.cache/huggingface"
+    os.environ["HF_HOME"] = hf_cache
+    os.environ["TRANSFORMERS_CACHE"] = os.path.join(hf_cache, "transformers")
+    os.environ["HF_HUB_CACHE"] = os.path.join(hf_cache, "hub")
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+if BASE_DIR not in sys.path:
+    sys.path.insert(0, BASE_DIR)
+# Load BioMistral once
+class BioMistralModel:
+    def __init__(self, model_name=LLM_MODEL, device=None):
+        print(f"[BioMistralModel] Loading model: {model_name}")
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto"
+        )
+    def generate_answer(self, query: str) -> str:
+        prompt = f"You are a helpful bioinformatics tutor. Answer clearly:\n\nQuestion: {query}\nAnswer:"
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                top_p=0.95,
+                temperature=0.7,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+        text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return text.split("Answer:", 1)[-1].strip()
+# Formatting Agent
+class FormattingAgent(AssistantAgent):
+    def __init__(self, name="FormattingAgent", **kwargs):
+        super().__init__(name=name, **kwargs)
+    def format_text(self, text: str) -> str:
+        cleaned = " ".join(text.split())
+        if cleaned:
+            cleaned = cleaned[0].upper() + cleaned[1:]
+        return cleaned
+# Tutor Agent
+class TutorAgent(AssistantAgent):
+    def __init__(self, name="TutorAgent", **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.model = BioMistralModel()
+        self.format_agent = FormattingAgent()
+        self.rag_agent = RAGAgent(vectorstore_dir=str(VECTORSTORE_DIR))  # safe conversion
+    def process_query(self, query: str) -> str:
+        print(f"[TutorAgent] Received query: {query}")
+        answer = self.model.generate_answer(query)
+        confidence = self.estimate_confidence(answer)
+        print(f"[TutorAgent] Confidence: {confidence:.2f}")
+        if confidence < CONFIDENCE_THRESHOLD:
+            print("[TutorAgent] Confidence low, but still using BioMistral (RAG unused).")
+        return self.format_agent.format_text(answer)
+    def estimate_confidence(self, answer: str) -> float:
+        length = len(answer.strip())
+        if length > 100:
+            return 0.9
+        elif length > 50:
+            return 0.75
+        else:
+            return 0.5
+# User Agent
+class BioUser(UserProxyAgent):
+    def __init__(self, name="BioUser", **kwargs):
+        #  disable docker-based execution (not available in HF Spaces)
+        kwargs.setdefault("code_execution_config", {"use_docker": False})
+        super().__init__(name=name, **kwargs)

app.py ADDED Viewed

	@@ -0,0 +1,33 @@

+# app.py
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+from agents import TutorAgent, BioUser
+# Initialize FastAPI (this `app` object is what the Dockerfile's `uvicorn app:app` serves)
+app = FastAPI(title="Bioinformatics Tutor API")
+# Initialize agents once at import time so every request reuses the same instances.
+# Constructing TutorAgent also constructs its BioMistralModel (see agents.py), so
+# importing this module loads the language model.
+# NOTE(review): user_agent is created but not referenced elsewhere in this module.
+user_agent = BioUser()
+tutor_agent = TutorAgent()
+# Request model: JSON body accepted by POST /ask
+class QueryRequest(BaseModel):
+    # The user's question, passed unchanged to tutor_agent.process_query
+    question: str
+# Response model: JSON body returned by POST /ask
+class QueryResponse(BaseModel):
+    # The formatted answer produced by tutor_agent.process_query
+    answer: str
+@app.post("/ask", response_model=QueryResponse)
+def ask_tutor(request: QueryRequest):
+    """
+    Ask the Bioinformatics Tutor a question.
+    """
+    answer = tutor_agent.process_query(request.question)
+    return QueryResponse(answer=answer)
+@app.get("/")
+def root():
+    return {"message": "Bioinformatics Tutor API is running."}

config.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# config.py
+import os
+import sys
+import logging
+from pathlib import Path
+BASE_DIR = Path(__file__).resolve().parent
+if str(BASE_DIR) not in sys.path:
+    sys.path.insert(0, str(BASE_DIR))
+try:
+    import google.colab
+    IN_COLAB = True
+except ImportError:
+    IN_COLAB = False
+if IN_COLAB:
+    VECTORSTORE_DIR = Path("/content/drive/MyDrive/bioinformatics_tutor_ai/vectorstore")
+else:
+    VECTORSTORE_DIR = BASE_DIR / "vectorstore"
+# Models
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+LLM_MODEL = "BioMistral/BioMistral-7B-DARE"
+# Confidence threshold for TutorAgent
+CONFIDENCE_THRESHOLD = 0.65
+if "HF_HOME" not in os.environ:
+    os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
+    os.environ["TRANSFORMERS_CACHE"] = os.path.join(os.environ["HF_HOME"], "transformers")
+    os.environ["HF_HUB_CACHE"] = os.path.join(os.environ["HF_HOME"], "hub")

main.py ADDED Viewed

	@@ -0,0 +1,33 @@

+# main.py
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+from agents import TutorAgent, BioUser
+# Initialize FastAPI
+app = FastAPI(title="Bioinformatics Tutor API")
+# Initialize agents
+user_agent = BioUser()
+tutor_agent = TutorAgent()
+# Request model
+class QueryRequest(BaseModel):
+    question: str
+# Response model
+class QueryResponse(BaseModel):
+    answer: str
+@app.post("/ask", response_model=QueryResponse)
+def ask_tutor(request: QueryRequest):
+    """
+    Ask the Bioinformatics Tutor a question.
+    """
+    answer = tutor_agent.process_query(request.question)
+    return QueryResponse(answer=answer)
+@app.get("/")
+def root():
+    return {"message": "Bioinformatics Tutor API is running."}

rag.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# rag.py
+import os
+import json
+import pickle
+import logging
+from typing import List, Tuple, Optional
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from config import VECTORSTORE_DIR, EMBEDDING_MODEL
+log = logging.getLogger(__name__)
+class RAGAgent:
+    """
+    Loads a FAISS index + metadata from VECTORSTORE_DIR (config).
+    Provides retrieve(query, k) -> (contexts: List[str], sources: List[dict])
+    """
+    def __init__(self, vectorstore_dir: Optional[str] = None, embedding_model: Optional[str] = None):
+        self.vectorstore_dir = vectorstore_dir or str(VECTORSTORE_DIR)
+        self.embedding_model_name = embedding_model or EMBEDDING_MODEL
+        self.index: Optional[faiss.Index] = None
+        self.metadata: Optional[List[dict]] = None
+        self._embedder: Optional[SentenceTransformer] = None
+        self._loaded = False
+    def _find_index_file(self) -> Optional[str]:
+        if not os.path.isdir(self.vectorstore_dir):
+            log.warning("Vectorstore dir not found: %s", self.vectorstore_dir)
+            return None
+        for fname in os.listdir(self.vectorstore_dir):
+            if fname.endswith((".faiss", ".index", ".bin")) or fname.startswith("index"):
+                return os.path.join(self.vectorstore_dir, fname)
+        return None
+    def _find_meta_file(self) -> Optional[str]:
+        if not os.path.isdir(self.vectorstore_dir):
+            return None
+        for candidate in ("index.pkl", "metadata.pkl", "index_meta.pkl", "metadata.json", "index.json"):
+            p = os.path.join(self.vectorstore_dir, candidate)
+            if os.path.exists(p):
+                return p
+        for fname in os.listdir(self.vectorstore_dir):
+            if fname.endswith(".pkl") or fname.endswith(".json"):
+                return os.path.join(self.vectorstore_dir, fname)
+        return None
+    @property
+    def embedder(self) -> SentenceTransformer:
+        if self._embedder is None:
+            log.info("Loading embedder: %s", self.embedding_model_name)
+            self._embedder = SentenceTransformer(self.embedding_model_name)
+        return self._embedder
+    def load(self) -> None:
+        """Load index and metadata into memory (idempotent)."""
+        if self._loaded:
+            return
+        idx_path = self._find_index_file()
+        meta_path = self._find_meta_file()
+        if not idx_path or not meta_path:
+            log.warning("No index/metadata found in %s — retrieval disabled.", self.vectorstore_dir)
+            return
+        log.info("Loading FAISS index from: %s", idx_path)
+        try:
+            self.index = faiss.read_index(idx_path)
+        except Exception as e:
+            log.error("Failed to read FAISS index: %s", e)
+            return
+        log.info("Loading metadata from: %s", meta_path)
+        try:
+            if meta_path.endswith(".json"):
+                with open(meta_path, "r", encoding="utf-8") as f:
+                    self.metadata = json.load(f)
+            else:
+                with open(meta_path, "rb") as f:
+                    self.metadata = pickle.load(f)
+        except Exception as e:
+            log.error("Failed to read metadata: %s", e)
+            return
+        # Normalize metadata type
+        if not isinstance(self.metadata, list):
+            if isinstance(self.metadata, dict):
+                try:
+                    self.metadata = [self.metadata[k] for k in sorted(self.metadata.keys())]
+                except Exception:
+                    self.metadata = list(self.metadata.values())
+            else:
+                self.metadata = list(self.metadata)
+        log.info("Loaded index and metadata: metadata length=%d", len(self.metadata))
+        self._loaded = True
+    def retrieve(self, query: str, k: int = 3) -> Tuple[List[str], List[dict]]:
+        """
+        Return two lists:
+        - contexts: [str, ...] top-k chunk texts (may be fewer)
+        - sources: [ {meta..., "score": float}, ... ]
+        """
+        if not self._loaded:
+            self.load()
+        if self.index is None or self.metadata is None:
+            return [], []
+        q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")
+        # try normalize if index uses normalized vectors
+        try:
+            faiss.normalize_L2(q_emb)
+        except Exception:
+            pass
+        try:
+            D, I = self.index.search(q_emb, k)
+        except Exception as e:
+            log.warning("FAISS search error: %s", e)
+            return [], []
+        if I is None or D is None:
+            return [], []
+        indices = np.array(I).reshape(-1)[:k].tolist()
+        scores = np.array(D).reshape(-1)[:k].tolist()
+        contexts, sources = [], []
+        for idx, score in zip(indices, scores):
+            if int(idx) < 0 or idx >= len(self.metadata):
+                continue
+            meta = self.metadata[int(idx)]
+            text = None
+            if isinstance(meta, dict):
+                for key in ("text", "page_content", "content", "chunk_text", "source_text"):
+                    if key in meta and meta[key]:
+                        text = meta[key]
+                        break
+                if text is None and "metadata" in meta and isinstance(meta["metadata"], dict):
+                    text = meta["metadata"].get("text") or meta["metadata"].get("page_content")
+            elif isinstance(meta, str):
+                text = meta
+            if text is None:
+                text = str(meta)
+            contexts.append(text)
+            sources.append({"meta": meta, "score": float(score)})
+        return contexts, sources

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+autogen
+transformers
+torch
+faiss-cpu
+sentence-transformers
+langchain
+python-dotenv
+langchain-community
+fastapi
+uvicorn
+joblib

vectorstore/index.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de46c50d583b049d79f8125f981c9408a7fbf3cfc60ec0a18d52f563050f1bf9
+size 2334765

vectorstore/index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d050b9c4e1cadccd49b6eeff93d27d27b7fa2d93ac53f6be17714bcee7650f0e
+size 1516917