push updated backend changes and auto start building
Browse files
agents.py
CHANGED
@@ -7,6 +7,13 @@ from rag import RAGAgent
|
|
7 |
import os
|
8 |
import sys
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
11 |
if BASE_DIR not in sys.path:
|
12 |
sys.path.insert(0, BASE_DIR)
|
@@ -87,6 +94,6 @@ class TutorAgent(AssistantAgent):
|
|
87 |
# User Agent
|
88 |
class BioUser(UserProxyAgent):
|
89 |
def __init__(self, name="BioUser", **kwargs):
|
90 |
-
|
91 |
kwargs.setdefault("code_execution_config", {"use_docker": False})
|
92 |
super().__init__(name=name, **kwargs)
|
|
|
import os
import sys

# Ensure the Hugging Face cache lives in a writable directory (important on
# HF Spaces, where only the user's home directory is guaranteed writable).
# NOTE: use the actual home directory instead of the hardcoded
# "/home/user/.cache/huggingface" — identical on HF Spaces, but correct on
# any other machine too.
if "HF_HOME" not in os.environ:
    hf_cache = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    os.environ["HF_HOME"] = hf_cache
    # TRANSFORMERS_CACHE is deprecated in favour of HF_HOME but is still read
    # by older transformers releases, so set both for compatibility.
    os.environ["TRANSFORMERS_CACHE"] = os.path.join(hf_cache, "transformers")
    os.environ["HF_HUB_CACHE"] = os.path.join(hf_cache, "hub")

# Make sibling modules importable regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)
|
|
|
# User Agent
class BioUser(UserProxyAgent):
    """Proxy agent standing in for the human user of the tutor."""

    def __init__(self, name="BioUser", **kwargs):
        # Docker is not available on HF Spaces, so fall back to local code
        # execution unless the caller explicitly configured otherwise.
        if "code_execution_config" not in kwargs:
            kwargs["code_execution_config"] = {"use_docker": False}
        super().__init__(name=name, **kwargs)
|
config.py
CHANGED
@@ -1,30 +1,42 @@
|
|
1 |
# config.py
|
2 |
import os
|
3 |
import sys
|
|
|
4 |
from pathlib import Path
|
5 |
|
6 |
-
|
7 |
BASE_DIR = Path(__file__).resolve().parent
|
8 |
|
9 |
if str(BASE_DIR) not in sys.path:
|
10 |
sys.path.insert(0, str(BASE_DIR))
|
11 |
|
|
|
12 |
try:
|
13 |
-
import google.colab
|
14 |
IN_COLAB = True
|
15 |
except ImportError:
|
16 |
IN_COLAB = False
|
17 |
|
|
|
18 |
if IN_COLAB:
|
19 |
VECTORSTORE_DIR = Path("/content/drive/MyDrive/bioinformatics_tutor_ai/vectorstore")
|
20 |
else:
|
21 |
VECTORSTORE_DIR = BASE_DIR / "vectorstore"
|
22 |
|
|
|
23 |
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
24 |
LLM_MODEL = "BioMistral/BioMistral-7B-DARE"
|
25 |
|
26 |
-
|
|
|
27 |
|
28 |
-
|
|
|
29 |
if not OPENAI_API_KEY:
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# config.py
#
# Central configuration for the bioinformatics tutor: paths, model names,
# thresholds, and environment setup (HF cache, OpenAI key).
import os
import sys
import logging
from pathlib import Path

# Module-level logger instead of the root logger so warnings are attributable.
log = logging.getLogger(__name__)

BASE_DIR = Path(__file__).resolve().parent

# Make sibling modules importable regardless of the working directory.
if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))

# Configure the Hugging Face cache early, before anything that imports
# transformers/hub can read it (important on HF Spaces, where only $HOME is
# guaranteed writable). Path.home() equals /home/user on Spaces but stays
# correct on any other machine.
if "HF_HOME" not in os.environ:
    hf_cache = str(Path.home() / ".cache" / "huggingface")
    os.environ["HF_HOME"] = hf_cache
    # TRANSFORMERS_CACHE is deprecated in favour of HF_HOME but still read by
    # older transformers releases; set both for compatibility.
    os.environ["TRANSFORMERS_CACHE"] = os.path.join(hf_cache, "transformers")
    os.environ["HF_HUB_CACHE"] = os.path.join(hf_cache, "hub")

# Detect Google Colab so the vectorstore can live on Drive there.
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    VECTORSTORE_DIR = Path("/content/drive/MyDrive/bioinformatics_tutor_ai/vectorstore")
else:
    VECTORSTORE_DIR = BASE_DIR / "vectorstore"

# Models
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "BioMistral/BioMistral-7B-DARE"

# Confidence threshold for TutorAgent
CONFIDENCE_THRESHOLD = 0.65

# OpenAI key for the AutoGen tutor agent; warn loudly (but don't crash) when
# it is missing so the rest of the app can still start.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    log.warning("OPENAI_API_KEY not set! AutoGen Tutor Agent will not work without it.")
|
main.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# main.py
#
# FastAPI entry point exposing the bioinformatics tutor over HTTP.
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from agents import TutorAgent, BioUser

# Initialize FastAPI
app = FastAPI(title="Bioinformatics Tutor API")

# Initialize agents once at import time so they are reused across requests.
user_agent = BioUser()
tutor_agent = TutorAgent()

# Request model
class QueryRequest(BaseModel):
    question: str

# Response model
class QueryResponse(BaseModel):
    answer: str

@app.post("/ask", response_model=QueryResponse)
def ask_tutor(request: QueryRequest):
    """
    Ask the Bioinformatics Tutor a question.
    """
    answer = tutor_agent.process_query(request.question)
    return QueryResponse(answer=answer)

@app.get("/")
def root():
    """Health-check endpoint confirming the API is up."""
    return {"message": "Bioinformatics Tutor API is running."}

if __name__ == "__main__":
    # The original imported uvicorn but never used it, leaving no way to
    # start the server with `python main.py`. Port 7860 is the port HF
    # Spaces expects an app to listen on — TODO confirm against the Space
    # configuration.
    uvicorn.run(app, host="0.0.0.0", port=7860)
rag.py
CHANGED
@@ -12,7 +12,6 @@ from sentence_transformers import SentenceTransformer
|
|
12 |
from config import VECTORSTORE_DIR, EMBEDDING_MODEL
|
13 |
|
14 |
log = logging.getLogger(__name__)
|
15 |
-
logging.basicConfig(level=logging.INFO)
|
16 |
|
17 |
|
18 |
class RAGAgent:
|
@@ -29,26 +28,30 @@ class RAGAgent:
|
|
29 |
self._embedder: Optional[SentenceTransformer] = None
|
30 |
self._loaded = False
|
31 |
|
32 |
-
def _find_index_file(self) -> str:
|
33 |
if not os.path.isdir(self.vectorstore_dir):
|
34 |
-
|
|
|
35 |
|
36 |
for fname in os.listdir(self.vectorstore_dir):
|
37 |
-
if fname.endswith(".faiss"
|
38 |
return os.path.join(self.vectorstore_dir, fname)
|
|
|
39 |
|
40 |
-
|
|
|
|
|
41 |
|
42 |
-
def _find_meta_file(self) -> str:
|
43 |
for candidate in ("index.pkl", "metadata.pkl", "index_meta.pkl", "metadata.json", "index.json"):
|
44 |
p = os.path.join(self.vectorstore_dir, candidate)
|
45 |
if os.path.exists(p):
|
46 |
return p
|
47 |
-
|
48 |
for fname in os.listdir(self.vectorstore_dir):
|
49 |
-
if fname.endswith(".pkl"):
|
50 |
return os.path.join(self.vectorstore_dir, fname)
|
51 |
-
|
|
|
52 |
|
53 |
@property
|
54 |
def embedder(self) -> SentenceTransformer:
|
@@ -61,32 +64,39 @@ class RAGAgent:
|
|
61 |
"""Load index and metadata into memory (idempotent)."""
|
62 |
if self._loaded:
|
63 |
return
|
|
|
64 |
idx_path = self._find_index_file()
|
65 |
meta_path = self._find_meta_file()
|
66 |
|
|
|
|
|
|
|
|
|
67 |
log.info("Loading FAISS index from: %s", idx_path)
|
68 |
try:
|
69 |
self.index = faiss.read_index(idx_path)
|
70 |
except Exception as e:
|
71 |
-
|
|
|
72 |
|
73 |
log.info("Loading metadata from: %s", meta_path)
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
80 |
|
|
|
81 |
if not isinstance(self.metadata, list):
|
82 |
-
|
83 |
if isinstance(self.metadata, dict):
|
84 |
-
|
85 |
-
keys = sorted(self.metadata.keys())
|
86 |
try:
|
87 |
-
self.metadata = [self.metadata[k] for k in keys]
|
88 |
except Exception:
|
89 |
-
|
90 |
self.metadata = list(self.metadata.values())
|
91 |
else:
|
92 |
self.metadata = list(self.metadata)
|
@@ -106,56 +116,46 @@ class RAGAgent:
|
|
106 |
if self.index is None or self.metadata is None:
|
107 |
return [], []
|
108 |
|
109 |
-
|
110 |
-
|
111 |
# try normalize if index uses normalized vectors
|
112 |
try:
|
113 |
faiss.normalize_L2(q_emb)
|
114 |
except Exception:
|
115 |
pass
|
116 |
-
q_emb = q_emb.astype("float32")
|
117 |
|
118 |
-
# safe search call
|
119 |
try:
|
120 |
D, I = self.index.search(q_emb, k)
|
121 |
except Exception as e:
|
122 |
log.warning("FAISS search error: %s", e)
|
123 |
return [], []
|
124 |
|
125 |
-
# ensure shapes
|
126 |
if I is None or D is None:
|
127 |
return [], []
|
128 |
|
129 |
-
|
130 |
indices = np.array(I).reshape(-1)[:k].tolist()
|
131 |
scores = np.array(D).reshape(-1)[:k].tolist()
|
132 |
|
133 |
-
contexts = []
|
134 |
-
sources = []
|
135 |
for idx, score in zip(indices, scores):
|
136 |
-
if int(idx) < 0:
|
137 |
continue
|
138 |
-
# guard against idx out of metadata bounds
|
139 |
-
if idx >= len(self.metadata):
|
140 |
-
log.debug("Index %s >= metadata length %d — skipping", idx, len(self.metadata))
|
141 |
-
continue
|
142 |
-
meta = self.metadata[int(idx)]
|
143 |
|
144 |
-
|
145 |
text = None
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
if isinstance(meta,
|
153 |
-
text = meta
|
154 |
-
elif isinstance(meta, dict) and "metadata" in meta and isinstance(meta["metadata"], dict):
|
155 |
-
# sometimes nested
|
156 |
text = meta["metadata"].get("text") or meta["metadata"].get("page_content")
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
159 |
|
160 |
contexts.append(text)
|
161 |
sources.append({"meta": meta, "score": float(score)})
|
|
|
12 |
from config import VECTORSTORE_DIR, EMBEDDING_MODEL
|
13 |
|
14 |
log = logging.getLogger(__name__)
|
|
|
15 |
|
16 |
|
17 |
class RAGAgent:
|
|
|
28 |
self._embedder: Optional[SentenceTransformer] = None
|
29 |
self._loaded = False
|
30 |
|
31 |
+
def _find_index_file(self) -> Optional[str]:
    """Locate the FAISS index file inside the vectorstore directory.

    Returns the full path of the first plausible index file, or None when
    the directory is missing or contains no index.
    """
    if not os.path.isdir(self.vectorstore_dir):
        log.warning("Vectorstore dir not found: %s", self.vectorstore_dir)
        return None

    for fname in os.listdir(self.vectorstore_dir):
        # Never treat metadata files as the index: a bare startswith("index")
        # check matches "index.pkl"/"index.json" — exactly the files
        # _find_meta_file looks for — and os.listdir order is arbitrary, so
        # faiss.read_index could be handed a pickle.
        if fname.endswith((".pkl", ".json")):
            continue
        if fname.endswith((".faiss", ".index", ".bin")) or fname.startswith("index"):
            return os.path.join(self.vectorstore_dir, fname)
    return None
|
40 |
|
41 |
+
def _find_meta_file(self) -> Optional[str]:
    """Return the path of the vectorstore metadata file, or None if absent."""
    if not os.path.isdir(self.vectorstore_dir):
        return None

    directory = self.vectorstore_dir

    # Well-known metadata filenames take priority.
    preferred = ("index.pkl", "metadata.pkl", "index_meta.pkl", "metadata.json", "index.json")
    for candidate in preferred:
        path = os.path.join(directory, candidate)
        if os.path.exists(path):
            return path

    # Fall back to the first pickle/JSON file found in the directory.
    for entry in os.listdir(directory):
        if entry.endswith((".pkl", ".json")):
            return os.path.join(directory, entry)

    return None
|
55 |
|
56 |
@property
|
57 |
def embedder(self) -> SentenceTransformer:
|
|
|
64 |
"""Load index and metadata into memory (idempotent)."""
|
65 |
if self._loaded:
|
66 |
return
|
67 |
+
|
68 |
idx_path = self._find_index_file()
|
69 |
meta_path = self._find_meta_file()
|
70 |
|
71 |
+
if not idx_path or not meta_path:
|
72 |
+
log.warning("No index/metadata found in %s — retrieval disabled.", self.vectorstore_dir)
|
73 |
+
return
|
74 |
+
|
75 |
log.info("Loading FAISS index from: %s", idx_path)
|
76 |
try:
|
77 |
self.index = faiss.read_index(idx_path)
|
78 |
except Exception as e:
|
79 |
+
log.error("Failed to read FAISS index: %s", e)
|
80 |
+
return
|
81 |
|
82 |
log.info("Loading metadata from: %s", meta_path)
|
83 |
+
try:
|
84 |
+
if meta_path.endswith(".json"):
|
85 |
+
with open(meta_path, "r", encoding="utf-8") as f:
|
86 |
+
self.metadata = json.load(f)
|
87 |
+
else:
|
88 |
+
with open(meta_path, "rb") as f:
|
89 |
+
self.metadata = pickle.load(f)
|
90 |
+
except Exception as e:
|
91 |
+
log.error("Failed to read metadata: %s", e)
|
92 |
+
return
|
93 |
|
94 |
+
# Normalize metadata type
|
95 |
if not isinstance(self.metadata, list):
|
|
|
96 |
if isinstance(self.metadata, dict):
|
|
|
|
|
97 |
try:
|
98 |
+
self.metadata = [self.metadata[k] for k in sorted(self.metadata.keys())]
|
99 |
except Exception:
|
|
|
100 |
self.metadata = list(self.metadata.values())
|
101 |
else:
|
102 |
self.metadata = list(self.metadata)
|
|
|
116 |
if self.index is None or self.metadata is None:
|
117 |
return [], []
|
118 |
|
119 |
+
q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")
|
120 |
+
|
121 |
# try normalize if index uses normalized vectors
|
122 |
try:
|
123 |
faiss.normalize_L2(q_emb)
|
124 |
except Exception:
|
125 |
pass
|
|
|
126 |
|
|
|
127 |
try:
|
128 |
D, I = self.index.search(q_emb, k)
|
129 |
except Exception as e:
|
130 |
log.warning("FAISS search error: %s", e)
|
131 |
return [], []
|
132 |
|
|
|
133 |
if I is None or D is None:
|
134 |
return [], []
|
135 |
|
|
|
136 |
indices = np.array(I).reshape(-1)[:k].tolist()
|
137 |
scores = np.array(D).reshape(-1)[:k].tolist()
|
138 |
|
139 |
+
contexts, sources = [], []
|
|
|
140 |
for idx, score in zip(indices, scores):
|
141 |
+
if int(idx) < 0 or idx >= len(self.metadata):
|
142 |
continue
|
|
|
|
|
|
|
|
|
|
|
143 |
|
144 |
+
meta = self.metadata[int(idx)]
|
145 |
text = None
|
146 |
+
|
147 |
+
if isinstance(meta, dict):
|
148 |
+
for key in ("text", "page_content", "content", "chunk_text", "source_text"):
|
149 |
+
if key in meta and meta[key]:
|
150 |
+
text = meta[key]
|
151 |
+
break
|
152 |
+
if text is None and "metadata" in meta and isinstance(meta["metadata"], dict):
|
|
|
|
|
|
|
153 |
text = meta["metadata"].get("text") or meta["metadata"].get("page_content")
|
154 |
+
elif isinstance(meta, str):
|
155 |
+
text = meta
|
156 |
+
|
157 |
+
if text is None:
|
158 |
+
text = str(meta)
|
159 |
|
160 |
contexts.append(text)
|
161 |
sources.append({"meta": meta, "score": float(score)})
|