Spaces:
Running
Running
Update database.py
Browse files- database.py +27 -1
database.py
CHANGED
@@ -165,7 +165,7 @@ def generate_description_tokens(sequence, vectors):
|
|
165 |
tokens.append(f"span:{vec[3]:.2f}")
|
166 |
return tokens
|
167 |
|
168 |
-
def generate_semantic_vector(description, total_lines=100):
|
169 |
"""Generate a 6D semantic vector for a textual description, matching our vector format."""
|
170 |
# Use a simplified heuristic to map description to our 6D vector format
|
171 |
category_map = {
|
@@ -190,6 +190,32 @@ def generate_semantic_vector(description, total_lines=100):
|
|
190 |
|
191 |
return vector
|
192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
194 |
"""Save ChromaDB data to Hugging Face Dataset."""
|
195 |
client = init_chromadb()
|
|
|
165 |
tokens.append(f"span:{vec[3]:.2f}")
|
166 |
return tokens
|
167 |
|
168 |
+
def generate_semantic_vector_og(description, total_lines=100):
|
169 |
"""Generate a 6D semantic vector for a textual description, matching our vector format."""
|
170 |
# Use a simplified heuristic to map description to our 6D vector format
|
171 |
category_map = {
|
|
|
190 |
|
191 |
return vector
|
192 |
|
193 |
+
# Loaded CodeBERT (tokenizer, model) pairs keyed by device string, so the
# weights are loaded once per device instead of on every call.
_CODEBERT_CACHE = {}


def _load_codebert(device):
    """Return a cached (tokenizer, model) pair for CodeBERT placed on *device*."""
    key = str(device)
    if key not in _CODEBERT_CACHE:
        model_name = "microsoft/codebert-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        _CODEBERT_CACHE[key] = (tokenizer, model)
    return _CODEBERT_CACHE[key]


def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT.

    The description is encoded with ``microsoft/codebert-base``, the last
    hidden states are mean-pooled over the token axis, and the result is
    reduced to six values by keeping the first six components (a simplified
    projection, not a learned one).

    Args:
        description: Text to embed. Truncated by the tokenizer to 512 tokens.
        total_lines: Unused by this implementation; kept so existing callers
            that pass it keep working.
        use_gpu: When true and CUDA is available, run the model on the GPU;
            otherwise run on the CPU.

    Returns:
        A list of exactly six numbers.
    """
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
    # Loading the tokenizer/model is by far the most expensive step, so reuse
    # the instances across calls rather than calling from_pretrained each time.
    tokenizer, model = _load_codebert(device)

    # Tokenize and move every input tensor to the model's device.
    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool the last hidden states over the token dimension.
    vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()

    # Pad with zeros if the embedding is shorter than 6 (not expected for
    # CodeBERT, presumably 768-wide -- TODO confirm), then keep the first 6.
    if len(vector) < 6:
        vector.extend([0] * (6 - len(vector)))
    return vector[:6]
|
218 |
+
|
219 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
220 |
"""Save ChromaDB data to Hugging Face Dataset."""
|
221 |
client = init_chromadb()
|