Spaces:
Running
Running
Update process_hf_dataset.py
Browse files — process_hf_dataset.py (+34 lines, −5 lines)
process_hf_dataset.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
# process_hf_dataset.py
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
-
from parser import parse_python_code
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
|
6 |
import chromadb
|
7 |
import os
|
@@ -95,6 +95,31 @@ def generate_description_tokens(sequence, vectors, var_map=None):
|
|
95 |
|
96 |
return tokens
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
def process_hf_dataset():
|
99 |
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
|
100 |
# Load the dataset
|
@@ -111,16 +136,19 @@ def process_hf_dataset():
|
|
111 |
# Rename variables to align with vector categories
|
112 |
processed_code, var_map = rename_variables(output)
|
113 |
|
114 |
-
# Parse the code to get parts and sequence
|
115 |
parts, sequence = parse_python_code(processed_code)
|
116 |
-
vectors = [part['vector'] for part in parts]
|
117 |
|
118 |
# Generate description tokens including variable roles
|
119 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
120 |
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
121 |
description_tokens += " " + " ".join(description_tokens_list)
|
122 |
|
123 |
-
#
|
|
|
|
|
|
|
124 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
125 |
|
126 |
# Update metadata with instruction and variable roles as description
|
@@ -128,7 +156,8 @@ def process_hf_dataset():
|
|
128 |
program_id = str(hash(processed_code))
|
129 |
collection.update(
|
130 |
ids=[program_id],
|
131 |
-
metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
|
|
|
132 |
)
|
133 |
|
134 |
# Save to Hugging Face Dataset
|
|
|
1 |
# process_hf_dataset.py
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
+
from parser import parse_python_code, create_vector
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
|
6 |
import chromadb
|
7 |
import os
|
|
|
95 |
|
96 |
return tokens
|
97 |
|
98 |
+
def generate_semantic_vector(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description, matching our vector format.

    A lightweight keyword heuristic: scan the description for category
    keywords ('import', 'function', 'return', ...) and fill the 6D vector
    [category_id, level, center_pos, span, parent_depth, parent_weight]
    from the last keyword that matches. Returns the all-zero vector when
    no keyword is found.

    Args:
        description: Free-text instruction/description to map to a vector.
        total_lines: Reserved for future position scaling — currently
            unused (kept for backward compatibility with existing callers).

    Returns:
        list: A 6-element list of numbers; parent_weight is in (0, 1].
    """
    # Keyword -> category_id mapping, mirroring the parser's category ids.
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }
    # Highest category id, used to keep parent_weight genuinely in (0, 1].
    # (The previous divisor, len(category_map) == 8, produced values up to
    # 19/8 = 2.375 despite the "normalized" intent.)
    max_category_id = max(category_map.values())

    # Tokenize the description; matching is case-insensitive substring search.
    tokens = description.lower().split()
    vector = [0] * 6  # Initialize 6D vector

    # Map description tokens to categories and assign basic vector values.
    # NOTE: if several tokens match, the LAST matching token wins (each
    # match overwrites the whole vector) — preserved from the original.
    for token in tokens:
        for cat, cat_id in category_map.items():
            if cat in token:
                vector[0] = cat_id             # category_id
                vector[1] = 1                  # level (assume top-level for simplicity)
                vector[2] = 0.5                # center_pos (midpoint of code)
                vector[3] = 0.1                # span (small for simplicity)
                vector[4] = 1                  # parent_depth (shallow)
                vector[5] = cat_id / max_category_id  # parent_weight, now truly in (0, 1]
                break

    return vector
123 |
def process_hf_dataset():
|
124 |
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
|
125 |
# Load the dataset
|
|
|
136 |
# Rename variables to align with vector categories
|
137 |
processed_code, var_map = rename_variables(output)
|
138 |
|
139 |
+
# Parse the code to get parts and sequence, generating our 6D vectors
|
140 |
parts, sequence = parse_python_code(processed_code)
|
141 |
+
vectors = [part['vector'] for part in parts] # Use parser's 6D vectors
|
142 |
|
143 |
# Generate description tokens including variable roles
|
144 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
145 |
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
146 |
description_tokens += " " + " ".join(description_tokens_list)
|
147 |
|
148 |
+
# Generate a 6D semantic vector for the instruction
|
149 |
+
semantic_vector = generate_semantic_vector(instruction)
|
150 |
+
|
151 |
+
# Store in ChromaDB with description and semantic vector
|
152 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
153 |
|
154 |
# Update metadata with instruction and variable roles as description
|
|
|
156 |
program_id = str(hash(processed_code))
|
157 |
collection.update(
|
158 |
ids=[program_id],
|
159 |
+
metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
|
160 |
+
embeddings=[semantic_vector] # Update with 6D semantic vector for semantic search
|
161 |
)
|
162 |
|
163 |
# Save to Hugging Face Dataset
|