drrobot9 committed on
Commit
80c5aac
·
verified ·
1 Parent(s): 3791963

push updated backend changes and auto-start building

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vectorstore/index.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Install system dependencies
RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces runs the container as UID 1000. Create that user so its
# home directory (and the model cache below it) exists and is writable.
RUN useradd -m -u 1000 user
USER user

# Set the Hugging Face cache location at image level so it is already in the
# environment before any Python module is imported.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    HF_HOME=/home/user/.cache/huggingface

# Set working directory
WORKDIR /home/user/app

# Install Python dependencies first so this layer stays cached when only the
# application code changes.
COPY --chown=user requirements.txt ./
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy project files
COPY --chown=user . .

# Expose the port Hugging Face Spaces expects
EXPOSE 7860

# Run FastAPI with Uvicorn
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
__init__.py ADDED
File without changes
__pycache__/agents.cpython-311.pyc ADDED
Binary file (7.23 kB). View file
 
__pycache__/config.cpython-311.pyc ADDED
Binary file (1.53 kB). View file
 
__pycache__/main.cpython-311.pyc ADDED
Binary file (1.81 kB). View file
 
__pycache__/rag.cpython-311.pyc ADDED
Binary file (9.98 kB). View file
 
agents.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# bioinformatics_ai/agents.py
import os
import sys

# Ensure Hugging Face cache is in a writable directory (important on HF Spaces).
# This has to run BEFORE transformers is imported: the Hugging Face libraries
# resolve their cache locations at import time, so values set afterwards are
# not picked up.
if "HF_HOME" not in os.environ:
    hf_cache = "/home/user/.cache/huggingface"
    os.environ["HF_HOME"] = hf_cache
    os.environ["TRANSFORMERS_CACHE"] = os.path.join(hf_cache, "transformers")
    os.environ["HF_HUB_CACHE"] = os.path.join(hf_cache, "hub")

# Make the sibling modules (config, rag) importable regardless of the current
# working directory; this must precede the local imports below to be useful.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)

import torch  # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402
from autogen import AssistantAgent, UserProxyAgent  # noqa: E402

from config import LLM_MODEL, CONFIDENCE_THRESHOLD, VECTORSTORE_DIR  # noqa: E402
from rag import RAGAgent  # noqa: E402
20
+
21
+
22
# Load BioMistral once
class BioMistralModel:
    """Causal language model plus tokenizer used to answer tutor questions."""

    def __init__(self, model_name=LLM_MODEL, device=None):
        """Load the tokenizer and model weights for ``model_name``.

        Args:
            model_name: Hugging Face model identifier to load.
            device: Preferred device string, stored on ``self.device``.
                Defaults to "cuda" when CUDA is available, otherwise "cpu".
                Weight placement itself is delegated to ``device_map="auto"``.
        """
        print(f"[BioMistralModel] Loading model: {model_name}")
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # Half precision only when a GPU is present; CPU stays in float32.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
        )

    def generate_answer(self, query: str) -> str:
        """Generate a sampled answer to ``query``.

        Args:
            query: The user's question, inserted verbatim into the prompt.

        Returns:
            The generated continuation with surrounding whitespace stripped.
        """
        prompt = f"You are a helpful bioinformatics tutor. Answer clearly:\n\nQuestion: {query}\nAnswer:"
        # Send the inputs to wherever the model actually lives. With
        # device_map="auto" that is not necessarily self.device.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                top_p=0.95,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. Splitting the full text on
        # "Answer:" picks the wrong segment whenever the question itself
        # contains that marker.
        prompt_len = inputs["input_ids"].shape[-1]
        text = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        return text.strip()
50
+
51
+
52
# Formatting Agent
class FormattingAgent(AssistantAgent):
    """Assistant agent that tidies raw answer text before it is returned."""

    def __init__(self, name="FormattingAgent", **kwargs):
        """Forward ``name`` and any extra keyword arguments to the base agent."""
        super().__init__(name=name, **kwargs)

    def format_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and upper-case the first character.

        An input that is empty or whitespace-only yields an empty string.
        """
        collapsed = " ".join(text.split())
        if not collapsed:
            return collapsed
        first, rest = collapsed[0], collapsed[1:]
        return first.upper() + rest
62
+
63
+
64
# Tutor Agent
class TutorAgent(AssistantAgent):
    """Agent that answers a question with the language model and formats the result."""

    def __init__(self, name="TutorAgent", **kwargs):
        """Create the agent together with its model, formatter and RAG helper."""
        super().__init__(name=name, **kwargs)
        self.model = BioMistralModel()
        self.format_agent = FormattingAgent()
        # The RAG agent is constructed here but not consulted by process_query.
        self.rag_agent = RAGAgent(vectorstore_dir=str(VECTORSTORE_DIR))  # safe conversion

    def process_query(self, query: str) -> str:
        """Answer ``query`` and return the formatted text.

        The confidence estimate is only logged; a low value does not change
        which answer is returned.
        """
        print(f"[TutorAgent] Received query: {query}")

        raw_answer = self.model.generate_answer(query)
        score = self.estimate_confidence(raw_answer)
        print(f"[TutorAgent] Confidence: {score:.2f}")

        if score < CONFIDENCE_THRESHOLD:
            print("[TutorAgent] Confidence low, but still using BioMistral (RAG unused).")

        return self.format_agent.format_text(raw_answer)

    def estimate_confidence(self, answer: str) -> float:
        """Map the stripped answer length to a coarse confidence score.

        Returns 0.9 above 100 characters, 0.75 above 50 characters, else 0.5.
        """
        stripped_len = len(answer.strip())
        if stripped_len <= 50:
            return 0.5
        if stripped_len <= 100:
            return 0.75
        return 0.9
92
+
93
+
94
# User Agent
class BioUser(UserProxyAgent):
    """User proxy agent with Docker-based code execution turned off by default."""

    def __init__(self, name="BioUser", **kwargs):
        """Build the proxy; a caller-supplied ``code_execution_config`` takes precedence."""
        # disable docker-based execution (not available in HF Spaces)
        if "code_execution_config" not in kwargs:
            kwargs["code_execution_config"] = {"use_docker": False}
        super().__init__(name=name, **kwargs)
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from agents import TutorAgent, BioUser

# FastAPI application object (served as "app:app" by the Dockerfile CMD).
app = FastAPI(title="Bioinformatics Tutor API")

# Agents are created once at import time and shared by all requests.
user_agent = BioUser()
tutor_agent = TutorAgent()


# Request body for POST /ask: the question to put to the tutor.
class QueryRequest(BaseModel):
    question: str


# Response body for POST /ask: the tutor's formatted answer.
class QueryResponse(BaseModel):
    answer: str


@app.post("/ask", response_model=QueryResponse)
def ask_tutor(request: QueryRequest):
    """
    Ask the Bioinformatics Tutor a question.
    """
    return QueryResponse(answer=tutor_agent.process_query(request.question))


# Simple status endpoint confirming the service is up.
@app.get("/")
def root():
    return {"message": "Bioinformatics Tutor API is running."}
33
+
config.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config.py
import os
import sys
import logging
from pathlib import Path


# Directory that contains this file; other paths are derived from it.
BASE_DIR = Path(__file__).resolve().parent

# Put the project directory on the import path if it is not there already.
if sys.path.count(str(BASE_DIR)) == 0:
    sys.path.insert(0, str(BASE_DIR))


# Detect Google Colab by whether its helper package can be imported.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True


# On Colab the vector store is read from a fixed Drive path; elsewhere it sits
# next to this file.
VECTORSTORE_DIR = (
    Path("/content/drive/MyDrive/bioinformatics_tutor_ai/vectorstore")
    if IN_COLAB
    else BASE_DIR / "vectorstore"
)

# Models
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "BioMistral/BioMistral-7B-DARE"

# Confidence threshold for TutorAgent
CONFIDENCE_THRESHOLD = 0.65


# Fall back to a default Hugging Face cache location only when the
# environment does not already define HF_HOME.
if "HF_HOME" not in os.environ:
    os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
    for _var, _subdir in (("TRANSFORMERS_CACHE", "transformers"), ("HF_HUB_CACHE", "hub")):
        os.environ[_var] = os.path.join(os.environ["HF_HOME"], _subdir)
main.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# main.py
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from agents import TutorAgent, BioUser

# Initialize FastAPI
app = FastAPI(title="Bioinformatics Tutor API")

# Initialize agents (created once at import time and shared by all requests)
user_agent = BioUser()
tutor_agent = TutorAgent()


# Request model
class QueryRequest(BaseModel):
    question: str


# Response model
class QueryResponse(BaseModel):
    answer: str


@app.post("/ask", response_model=QueryResponse)
def ask_tutor(request: QueryRequest):
    """
    Ask the Bioinformatics Tutor a question.
    """
    answer = tutor_agent.process_query(request.question)
    return QueryResponse(answer=answer)


@app.get("/")
def root():
    return {"message": "Bioinformatics Tutor API is running."}


if __name__ == "__main__":
    # Without this guard `python main.py` built the app and exited without
    # serving it. Host and port match the Dockerfile's uvicorn command.
    uvicorn.run(app, host="0.0.0.0", port=7860)
33
+
rag.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag.py
2
+ import os
3
+ import json
4
+ import pickle
5
+ import logging
6
+ from typing import List, Tuple, Optional
7
+
8
+ import numpy as np
9
+ import faiss
10
+ from sentence_transformers import SentenceTransformer
11
+
12
+ from config import VECTORSTORE_DIR, EMBEDDING_MODEL
13
+
14
+ log = logging.getLogger(__name__)
15
+
16
+
17
class RAGAgent:
    """
    Loads a FAISS index + metadata from VECTORSTORE_DIR (config).
    Provides retrieve(query, k) -> (contexts: List[str], sources: List[dict])
    """

    # File-name suffixes recognised as a FAISS index.
    _INDEX_SUFFIXES = (".faiss", ".index", ".bin")
    # File-name suffixes recognised as metadata; never treated as an index.
    _META_SUFFIXES = (".pkl", ".json")

    def __init__(self, vectorstore_dir: Optional[str] = None, embedding_model: Optional[str] = None):
        """Record where to load from; nothing is read until load()/retrieve().

        Args:
            vectorstore_dir: Directory holding the index and metadata files.
                Falls back to VECTORSTORE_DIR from config.
            embedding_model: Sentence-transformers model name used to embed
                queries. Falls back to EMBEDDING_MODEL from config.
        """
        self.vectorstore_dir = vectorstore_dir or str(VECTORSTORE_DIR)
        self.embedding_model_name = embedding_model or EMBEDDING_MODEL
        self.index: Optional[faiss.Index] = None
        self.metadata: Optional[List[dict]] = None
        self._embedder: Optional[SentenceTransformer] = None
        self._loaded = False

    def _find_index_file(self) -> Optional[str]:
        """Return the path of the FAISS index file, or None if none is found.

        Files with a known index suffix are preferred. A name that merely
        starts with "index" is accepted only when it is not a metadata file,
        so "index.pkl" can no longer be mistaken for the index.
        """
        if not os.path.isdir(self.vectorstore_dir):
            log.warning("Vectorstore dir not found: %s", self.vectorstore_dir)
            return None

        # Sorted so the choice does not depend on directory listing order.
        names = sorted(os.listdir(self.vectorstore_dir))

        for fname in names:
            if fname.endswith(self._INDEX_SUFFIXES):
                return os.path.join(self.vectorstore_dir, fname)

        for fname in names:
            if fname.startswith("index") and not fname.endswith(self._META_SUFFIXES):
                return os.path.join(self.vectorstore_dir, fname)

        return None

    def _find_meta_file(self) -> Optional[str]:
        """Return the path of the metadata file, or None if none is found.

        Well-known names are tried first, then any ``.pkl``/``.json`` file.
        """
        if not os.path.isdir(self.vectorstore_dir):
            return None

        for candidate in ("index.pkl", "metadata.pkl", "index_meta.pkl", "metadata.json", "index.json"):
            p = os.path.join(self.vectorstore_dir, candidate)
            if os.path.exists(p):
                return p

        for fname in sorted(os.listdir(self.vectorstore_dir)):
            if fname.endswith(self._META_SUFFIXES):
                return os.path.join(self.vectorstore_dir, fname)

        return None

    @property
    def embedder(self) -> "SentenceTransformer":
        """Embedding model, created on first access and reused afterwards."""
        if self._embedder is None:
            log.info("Loading embedder: %s", self.embedding_model_name)
            self._embedder = SentenceTransformer(self.embedding_model_name)
        return self._embedder

    @staticmethod
    def _normalize_metadata(raw) -> list:
        """Turn loaded metadata into a list addressable by FAISS row id.

        Handles three shapes:
          * a list: returned unchanged;
          * a dict: values ordered by sorted key (insertion order when the
            keys cannot be sorted);
          * a ``(docstore, id_map)`` tuple where ``docstore._dict`` maps ids
            to documents and ``id_map`` maps row positions to those ids.
            NOTE(review): this looks like the layout LangChain's FAISS store
            pickles; confirm against the code that built the vector store.
            Missing ids become None so positions stay aligned with the index.
        Anything else is passed through ``list()``.
        """
        if isinstance(raw, list):
            return raw

        if (
            isinstance(raw, tuple)
            and len(raw) == 2
            and isinstance(raw[1], dict)
            and isinstance(getattr(raw[0], "_dict", None), dict)
        ):
            docs, id_map = raw[0]._dict, raw[1]
            # assumes id_map keys are the contiguous row positions 0..n-1 (TODO confirm)
            return [docs.get(id_map[pos]) for pos in sorted(id_map)]

        if isinstance(raw, dict):
            try:
                return [raw[key] for key in sorted(raw)]
            except TypeError:
                # Keys of mixed, unorderable types: keep insertion order.
                return list(raw.values())

        return list(raw)

    def load(self) -> None:
        """Load index and metadata into memory (idempotent)."""
        if self._loaded:
            return

        idx_path = self._find_index_file()
        meta_path = self._find_meta_file()

        if not idx_path or not meta_path:
            log.warning("No index/metadata found in %s — retrieval disabled.", self.vectorstore_dir)
            return

        log.info("Loading FAISS index from: %s", idx_path)
        try:
            self.index = faiss.read_index(idx_path)
        except Exception as e:
            log.error("Failed to read FAISS index: %s", e)
            return

        log.info("Loading metadata from: %s", meta_path)
        try:
            if meta_path.endswith(".json"):
                with open(meta_path, "r", encoding="utf-8") as f:
                    raw_meta = json.load(f)
            else:
                # SECURITY: pickle.load executes code embedded in the file.
                # Only point vectorstore_dir at files from a trusted source.
                with open(meta_path, "rb") as f:
                    raw_meta = pickle.load(f)
        except Exception as e:
            log.error("Failed to read metadata: %s", e)
            return

        # Normalize metadata type
        self.metadata = self._normalize_metadata(raw_meta)

        log.info("Loaded index and metadata: metadata length=%d", len(self.metadata))
        self._loaded = True

    def retrieve(self, query: str, k: int = 3) -> Tuple[List[str], List[dict]]:
        """
        Return two lists:
        - contexts: [str, ...] top-k chunk texts (may be fewer)
        - sources: [ {"meta": ..., "score": float}, ... ]

        Both lists are empty when the store could not be loaded or the
        search fails.
        """
        if not self._loaded:
            self.load()

        if self.index is None or self.metadata is None:
            return [], []

        q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")

        # Best-effort L2 normalization of the query vector; a failure is
        # logged and the search proceeds with the raw embedding.
        try:
            faiss.normalize_L2(q_emb)
        except Exception as e:
            log.debug("Query normalization skipped: %s", e)

        try:
            D, I = self.index.search(q_emb, k)
        except Exception as e:
            log.warning("FAISS search error: %s", e)
            return [], []

        if I is None or D is None:
            return [], []

        indices = np.array(I).reshape(-1)[:k].tolist()
        scores = np.array(D).reshape(-1)[:k].tolist()

        contexts, sources = [], []
        for idx, score in zip(indices, scores):
            # Skip row ids that fall outside the metadata list.
            if int(idx) < 0 or idx >= len(self.metadata):
                continue

            meta = self.metadata[int(idx)]
            if meta is None:
                # Placeholder for a row whose document could not be resolved.
                continue

            text = None
            if isinstance(meta, dict):
                for key in ("text", "page_content", "content", "chunk_text", "source_text"):
                    if key in meta and meta[key]:
                        text = meta[key]
                        break
                if text is None and "metadata" in meta and isinstance(meta["metadata"], dict):
                    text = meta["metadata"].get("text") or meta["metadata"].get("page_content")
            elif isinstance(meta, str):
                text = meta
            else:
                # Document-like objects expose their text as an attribute.
                text = getattr(meta, "page_content", None)

            if text is None:
                text = str(meta)

            contexts.append(text)
            sources.append({"meta": meta, "score": float(score)})

        return contexts, sources
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ autogen
2
+ transformers
3
+ torch
4
+ faiss-cpu
5
+ sentence-transformers
6
+ langchain
7
+ python-dotenv
8
+ langchain-community
9
+ fastapi
10
+ uvicorn
11
+ joblib
vectorstore/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de46c50d583b049d79f8125f981c9408a7fbf3cfc60ec0a18d52f563050f1bf9
3
+ size 2334765
vectorstore/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d050b9c4e1cadccd49b6eeff93d27d27b7fa2d93ac53f6be17714bcee7650f0e
3
+ size 1516917