broadfield-dev commited on
Commit
87ca86e
·
verified ·
1 Parent(s): 9e89af0

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +27 -1
database.py CHANGED
@@ -165,7 +165,7 @@ def generate_description_tokens(sequence, vectors):
165
  tokens.append(f"span:{vec[3]:.2f}")
166
  return tokens
167
 
168
- def generate_semantic_vector(description, total_lines=100):
169
  """Generate a 6D semantic vector for a textual description, matching our vector format."""
170
  # Use a simplified heuristic to map description to our 6D vector format
171
  category_map = {
@@ -190,6 +190,32 @@ def generate_semantic_vector(description, total_lines=100):
190
 
191
  return vector
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
194
  """Save ChromaDB data to Hugging Face Dataset."""
195
  client = init_chromadb()
 
165
  tokens.append(f"span:{vec[3]:.2f}")
166
  return tokens
167
 
168
+ def generate_semantic_vector_og(description, total_lines=100):
169
  """Generate a 6D semantic vector for a textual description, matching our vector format."""
170
  # Use a simplified heuristic to map description to our 6D vector format
171
  category_map = {
 
190
 
191
  return vector
192
 
193
def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT.

    The description is embedded with microsoft/codebert-base (mean-pooled last
    hidden states, 768D) and then projected to 6D by simple truncation so the
    result matches the project's 6D vector format.

    Args:
        description: Text to embed.
        total_lines: Unused; kept for interface compatibility with the
            original heuristic implementation.
        use_gpu: If True and CUDA is available, run the model on GPU.

    Returns:
        list[float]: A 6-element embedding vector.
    """
    model_name = "microsoft/codebert-base"
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

    # Cache the tokenizer/model on the function object (keyed by device) so
    # repeated calls do not reload the large pretrained weights every time.
    cache = getattr(generate_semantic_vector, "_codebert_cache", None)
    if cache is None or cache[0] != str(device):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        model.eval()  # disable dropout for deterministic inference
        generate_semantic_vector._codebert_cache = (str(device), tokenizer, model)
    else:
        _, tokenizer, model = cache

    # Tokenize and move inputs to the model's device.
    inputs = tokenizer(description, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate embeddings without tracking gradients (inference only).
    with torch.no_grad():
        outputs = model(**inputs)
        # Mean pooling over the sequence dimension of the last hidden states.
        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()

    # Project to exactly 6 dimensions: zero-pad if short, truncate if long.
    # NOTE(review): truncating 768D -> 6D discards most of the embedding;
    # a learned/PCA projection would preserve more signal — confirm intent.
    if len(vector) < 6:
        vector.extend([0] * (6 - len(vector)))
    elif len(vector) > 6:
        vector = vector[:6]

    return vector
218
+
219
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
220
  """Save ChromaDB data to Hugging Face Dataset."""
221
  client = init_chromadb()