drrobot9 committed
Commit 3fc834d · verified · 1 Parent(s): ddd60c5

push updated backend changes and auto-start building

Files changed (4)
  1. agents.py +8 -1
  2. config.py +17 -5
  3. main.py +33 -0
  4. rag.py +48 -48
agents.py CHANGED
@@ -7,6 +7,13 @@ from rag import RAGAgent
 import os
 import sys
 
+# Ensure Hugging Face cache is in a writable directory (important on HF Spaces)
+if "HF_HOME" not in os.environ:
+    hf_cache = "/home/user/.cache/huggingface"
+    os.environ["HF_HOME"] = hf_cache
+    os.environ["TRANSFORMERS_CACHE"] = os.path.join(hf_cache, "transformers")
+    os.environ["HF_HUB_CACHE"] = os.path.join(hf_cache, "hub")
+
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 if BASE_DIR not in sys.path:
     sys.path.insert(0, BASE_DIR)
@@ -87,6 +94,6 @@ class TutorAgent(AssistantAgent):
 # User Agent
 class BioUser(UserProxyAgent):
     def __init__(self, name="BioUser", **kwargs):
-
+        # disable docker-based execution (not available in HF Spaces)
         kwargs.setdefault("code_execution_config", {"use_docker": False})
         super().__init__(name=name, **kwargs)
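Note on the cache block in agents.py: the redirection only helps if the environment variables are set before any Hugging Face library is imported, since transformers and huggingface_hub generally resolve their cache paths at import time. A minimal local sanity check, as a sketch (the directory path mirrors the one hard-coded above; the makedirs call and assertion are illustrative, not part of this commit):

import os

# mirror the cache redirection from agents.py / config.py
hf_cache = os.environ.setdefault("HF_HOME", "/home/user/.cache/huggingface")
os.environ.setdefault("TRANSFORMERS_CACHE", os.path.join(hf_cache, "transformers"))
os.environ.setdefault("HF_HUB_CACHE", os.path.join(hf_cache, "hub"))

# the cache directory must exist and be writable, or model downloads fail at runtime
os.makedirs(os.environ["HF_HUB_CACHE"], exist_ok=True)
assert os.access(os.environ["HF_HUB_CACHE"], os.W_OK), "HF cache dir is not writable"

# import model libraries only after the variables are in place
from sentence_transformers import SentenceTransformer  # noqa: E402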
config.py CHANGED
@@ -1,30 +1,42 @@
 # config.py
 import os
 import sys
+import logging
 from pathlib import Path
 
-# Ensure BASE_DIR is a Path, not a string
+
 BASE_DIR = Path(__file__).resolve().parent
 
 if str(BASE_DIR) not in sys.path:
     sys.path.insert(0, str(BASE_DIR))
 
+
 try:
     import google.colab
     IN_COLAB = True
 except ImportError:
     IN_COLAB = False
 
+
 if IN_COLAB:
     VECTORSTORE_DIR = Path("/content/drive/MyDrive/bioinformatics_tutor_ai/vectorstore")
 else:
     VECTORSTORE_DIR = BASE_DIR / "vectorstore"
 
+# Models
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 LLM_MODEL = "BioMistral/BioMistral-7B-DARE"
 
-CONFIDENCE_THRESHOLD = 0.65
+# Confidence threshold for TutorAgent
+CONFIDENCE_THRESHOLD = 0.65
 
-OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None)
+
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
-    print(" WARNING: OPENAI_API_KEY not set! AutoGen Tutor Agent will not work without it.")
+    logging.warning("OPENAI_API_KEY not set! AutoGen Tutor Agent will not work without it.")
+
+
+if "HF_HOME" not in os.environ:
+    os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
+    os.environ["TRANSFORMERS_CACHE"] = os.path.join(os.environ["HF_HOME"], "transformers")
+    os.environ["HF_HUB_CACHE"] = os.path.join(os.environ["HF_HOME"], "hub")
main.py ADDED
@@ -0,0 +1,33 @@
+# main.py
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
+from agents import TutorAgent, BioUser
+
+# Initialize FastAPI
+app = FastAPI(title="Bioinformatics Tutor API")
+
+# Initialize agents
+user_agent = BioUser()
+tutor_agent = TutorAgent()
+
+# Request model
+class QueryRequest(BaseModel):
+    question: str
+
+# Response model
+class QueryResponse(BaseModel):
+    answer: str
+
+@app.post("/ask", response_model=QueryResponse)
+def ask_tutor(request: QueryRequest):
+    """
+    Ask the Bioinformatics Tutor a question.
+    """
+    answer = tutor_agent.process_query(request.question)
+    return QueryResponse(answer=answer)
+
+@app.get("/")
+def root():
+    return {"message": "Bioinformatics Tutor API is running."}
+
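main.py imports uvicorn but has no __main__ block, so the server is presumably started externally (for example `uvicorn main:app --host 0.0.0.0 --port 7860` on a Space). A quick way to exercise the new endpoints once it is running, as a sketch; the base URL/port and the example question are assumptions:

import requests

BASE_URL = "http://localhost:7860"  # adjust to the actual Space URL / port

# health check against the root endpoint
print(requests.get(f"{BASE_URL}/", timeout=10).json())

# ask the tutor a question via the /ask endpoint
resp = requests.post(
    f"{BASE_URL}/ask",
    json={"question": "What is the difference between BLASTN and BLASTP?"},
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["answer"])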
rag.py CHANGED
@@ -12,7 +12,6 @@ from sentence_transformers import SentenceTransformer
 from config import VECTORSTORE_DIR, EMBEDDING_MODEL
 
 log = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
 
 
 class RAGAgent:
@@ -29,26 +28,30 @@ class RAGAgent:
         self._embedder: Optional[SentenceTransformer] = None
         self._loaded = False
 
-    def _find_index_file(self) -> str:
+    def _find_index_file(self) -> Optional[str]:
         if not os.path.isdir(self.vectorstore_dir):
-            raise FileNotFoundError(f"Vectorstore dir not found: {self.vectorstore_dir}")
+            log.warning("Vectorstore dir not found: %s", self.vectorstore_dir)
+            return None
 
         for fname in os.listdir(self.vectorstore_dir):
-            if fname.endswith(".faiss") or fname.endswith(".index") or fname.endswith(".bin") or fname.startswith("index"):
+            if fname.endswith((".faiss", ".index", ".bin")) or fname.startswith("index"):
                 return os.path.join(self.vectorstore_dir, fname)
+        return None
 
-        raise FileNotFoundError(f"No FAISS index file (.faiss/.index/.bin) found in {self.vectorstore_dir}")
-
-    def _find_meta_file(self) -> str:
+    def _find_meta_file(self) -> Optional[str]:
+        if not os.path.isdir(self.vectorstore_dir):
+            return None
+
         for candidate in ("index.pkl", "metadata.pkl", "index_meta.pkl", "metadata.json", "index.json"):
             p = os.path.join(self.vectorstore_dir, candidate)
            if os.path.exists(p):
                 return p
 
         for fname in os.listdir(self.vectorstore_dir):
-            if fname.endswith(".pkl"):
+            if fname.endswith(".pkl") or fname.endswith(".json"):
                 return os.path.join(self.vectorstore_dir, fname)
-        raise FileNotFoundError(f"No metadata (.pkl/.json) found in {self.vectorstore_dir}")
+
+        return None
 
     @property
     def embedder(self) -> SentenceTransformer:
@@ -61,32 +64,39 @@
         """Load index and metadata into memory (idempotent)."""
         if self._loaded:
             return
+
         idx_path = self._find_index_file()
         meta_path = self._find_meta_file()
 
+        if not idx_path or not meta_path:
+            log.warning("No index/metadata found in %s — retrieval disabled.", self.vectorstore_dir)
+            return
+
         log.info("Loading FAISS index from: %s", idx_path)
         try:
             self.index = faiss.read_index(idx_path)
         except Exception as e:
-            raise RuntimeError(f"Failed to read faiss index {idx_path}: {e}")
+            log.error("Failed to read FAISS index: %s", e)
+            return
 
         log.info("Loading metadata from: %s", meta_path)
-        if meta_path.endswith(".json"):
-            with open(meta_path, "r", encoding="utf-8") as f:
-                self.metadata = json.load(f)
-        else:
-            with open(meta_path, "rb") as f:
-                self.metadata = pickle.load(f)
+        try:
+            if meta_path.endswith(".json"):
+                with open(meta_path, "r", encoding="utf-8") as f:
+                    self.metadata = json.load(f)
+            else:
+                with open(meta_path, "rb") as f:
+                    self.metadata = pickle.load(f)
+        except Exception as e:
+            log.error("Failed to read metadata: %s", e)
+            return
 
+        # Normalize metadata type
         if not isinstance(self.metadata, list):
-
             if isinstance(self.metadata, dict):
-
-                keys = sorted(self.metadata.keys())
                 try:
-                    self.metadata = [self.metadata[k] for k in keys]
+                    self.metadata = [self.metadata[k] for k in sorted(self.metadata.keys())]
                 except Exception:
-
                     self.metadata = list(self.metadata.values())
             else:
                 self.metadata = list(self.metadata)
@@ -106,56 +116,46 @@
         if self.index is None or self.metadata is None:
             return [], []
 
-
-        q_emb = self.embedder.encode([query], convert_to_numpy=True)
+        q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")
+
         # try normalize if index uses normalized vectors
         try:
             faiss.normalize_L2(q_emb)
         except Exception:
             pass
-        q_emb = q_emb.astype("float32")
 
-        # safe search call
         try:
             D, I = self.index.search(q_emb, k)
         except Exception as e:
             log.warning("FAISS search error: %s", e)
             return [], []
 
-        # ensure shapes
         if I is None or D is None:
             return [], []
 
-
         indices = np.array(I).reshape(-1)[:k].tolist()
         scores = np.array(D).reshape(-1)[:k].tolist()
 
-        contexts = []
-        sources = []
+        contexts, sources = [], []
         for idx, score in zip(indices, scores):
-            if int(idx) < 0:
+            if int(idx) < 0 or idx >= len(self.metadata):
                 continue
-            # guard against idx out of metadata bounds
-            if idx >= len(self.metadata):
-                log.debug("Index %s >= metadata length %d — skipping", idx, len(self.metadata))
-                continue
-            meta = self.metadata[int(idx)]
 
-            # extract text from common keys
+            meta = self.metadata[int(idx)]
             text = None
-            for key in ("text", "page_content", "content", "chunk_text", "source_text"):
-                if isinstance(meta, dict) and key in meta and meta[key]:
-                    text = meta[key]
-                    break
-            if text is None:
-                # fallbac if metadata itself is a string or has 'text' attribute
-                if isinstance(meta, str):
-                    text = meta
-                elif isinstance(meta, dict) and "metadata" in meta and isinstance(meta["metadata"], dict):
-                    # sometimes nested
+
+            if isinstance(meta, dict):
+                for key in ("text", "page_content", "content", "chunk_text", "source_text"):
+                    if key in meta and meta[key]:
+                        text = meta[key]
+                        break
+                if text is None and "metadata" in meta and isinstance(meta["metadata"], dict):
                     text = meta["metadata"].get("text") or meta["metadata"].get("page_content")
-                else:
-                    text = str(meta)
+            elif isinstance(meta, str):
+                text = meta
+
+            if text is None:
+                text = str(meta)
 
             contexts.append(text)
             sources.append({"meta": meta, "score": float(score)})
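The net effect of the rag.py changes is that a missing or unreadable vectorstore now degrades to empty retrieval results instead of raising at load time. A sketch of how a caller might exercise that behaviour; the constructor defaults and the load/retrieve method names are assumptions, since only their bodies appear in this diff:

from rag import RAGAgent

agent = RAGAgent()   # assumed to default to VECTORSTORE_DIR from config.py
agent.load()         # logs a warning and returns (instead of raising) if index/metadata are missing

contexts, sources = agent.retrieve("How do I interpret a FASTQ quality score?", k=5)
if not contexts:
    print("retrieval disabled or no hits; caller should fall back to the bare LLM")
else:
    for text, src in zip(contexts, sources):
        print(f"{src['score']:.3f}  {text[:80]}")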