broadfield-dev committed on
Commit
4058ab2
·
verified ·
1 Parent(s): 927956e

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +26 -1
process_hf_dataset.py CHANGED
@@ -95,7 +95,7 @@ def generate_description_tokens(sequence, vectors, var_map=None):
95
 
96
  return tokens
97
 
98
- def generate_semantic_vector(description, total_lines=100):
99
  """Generate a 6D semantic vector for a textual description, matching our vector format."""
100
  # Use a simplified heuristic to map description to our 6D vector format
101
  category_map = {
@@ -120,6 +120,31 @@ def generate_semantic_vector(description, total_lines=100):
120
 
121
  return vector
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def process_hf_dataset():
124
  """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
125
  # Load the dataset
 
95
 
96
  return tokens
97
 
98
+ def generate_semantic_vector_og(description, total_lines=100):
99
  """Generate a 6D semantic vector for a textual description, matching our vector format."""
100
  # Use a simplified heuristic to map description to our 6D vector format
101
  category_map = {
 
120
 
121
  return vector
122
 
123
def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT.

    Encodes the description with ``microsoft/codebert-base``, mean-pools the
    last hidden states into one sentence embedding, and projects the 768-d
    result down to 6 dimensions by truncation so it matches the 6D vector
    format used elsewhere in this module.

    Args:
        description: Text to embed.
        total_lines: Unused; retained for backward compatibility with the
            original heuristic implementation's signature.
        use_gpu: If True and CUDA is available, run the model on the GPU.

    Returns:
        list[float]: A 6-element embedding vector.
    """
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

    # Cache the tokenizer/model per device on the function object so repeated
    # calls do not reload CodeBERT from disk every time. The original body
    # re-instantiated both the tokenizer and the model on every single call,
    # which dominates runtime when vectorizing a whole dataset.
    cache = getattr(generate_semantic_vector, "_model_cache", None)
    if cache is None:
        cache = generate_semantic_vector._model_cache = {}
    cache_key = str(device)
    if cache_key not in cache:
        model_name = "microsoft/codebert-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        model.eval()  # ensure eval mode (no dropout) for deterministic embeddings
        cache[cache_key] = (tokenizer, model)
    tokenizer, model = cache[cache_key]

    # Tokenize and move input tensors to the model's device.
    inputs = tokenizer(description, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Mean-pool the final hidden states into a single embedding vector.
    with torch.no_grad():
        outputs = model(**inputs)
        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()

    # Project to 6D: pad with zeros if too short, truncate if too long.
    # (CodeBERT's hidden size is 768, so truncation is the normal path.)
    if len(vector) < 6:
        vector.extend([0] * (6 - len(vector)))
    elif len(vector) > 6:
        vector = vector[:6]

    return vector
148
  def process_hf_dataset():
149
  """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
150
  # Load the dataset