Spaces:

broadfield-dev
/

parse_py

Sleeping

App Files Files Community

broadfield-dev committed on Mar 5

Commit

ecb205b

verified ·

1 Parent(s): 275730d

Update process_hf_dataset.py

Browse files

Files changed (1) hide show

process_hf_dataset.py +83 -41

process_hf_dataset.py CHANGED Viewed

@@ -8,10 +8,32 @@ import os
 from dotenv import load_dotenv
 from transformers import AutoTokenizer, AutoModel
 import torch
 # Load environment variables
 load_dotenv()
 def rename_variables(code, variable_prefixes=None):
     """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
     if variable_prefixes is None:
@@ -99,11 +121,9 @@ def generate_description_tokens(sequence, vectors, var_map=None):
 def generate_semantic_vector(description, total_lines=100, use_gpu=False):
     """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
-    # Load CodeBERT model and tokenizer
-    model_name = "microsoft/codebert-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-    model = AutoModel.from_pretrained(model_name).to(device)
     # Tokenize and encode the description
     inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
@@ -123,49 +143,71 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
     return vector
-def process_hf_dataset():
-    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
     # Initialize ChromaDB client
     client = init_chromadb()
-    # Process each entry one at a time (one store_program call and one collection.update call per program)
-    for entry in dataset:
-        instruction = entry['instruction']
-        output = entry['output']
-        # Rename variables to align with vector categories
-        processed_code, var_map = rename_variables(output)
-        # Parse the code to get parts and sequence, generating our 6D vectors
-        parts, sequence = parse_python_code(processed_code)
-        program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
-        # Generate description tokens including variable roles
-        description_tokens = f"task:{instruction.replace(' ', '_')}"
-        description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
-        description_tokens += " " + " ".join(description_tokens_list)
-        # Generate a 6D semantic vector for the instruction
-        semantic_vector = generate_semantic_vector(instruction)
-        # Only the instruction's semantic vector is used as the ChromaDB embedding;
-        # the parser's program vectors are not combined into it and are kept in the metadata written below
-        combined_vector = semantic_vector  # Use semantic vector for ChromaDB embedding (6D)
-        # Store in ChromaDB with description and combined vector
-        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
-        # Update metadata with instruction and variable roles as description, and store program vectors
-        collection = client.get_collection(DB_NAME)
-        program_id = str(hash(processed_code))  # NOTE(review): presumably must match the id store_program assigned - confirm
-        collection.update(
-            ids=[program_id],
-            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)}],
-            embeddings=[combined_vector]  # Ensure 6D embedding for semantic search
-        )
     # Save to Hugging Face Dataset
     save_chromadb_to_hf()
@@ -193,4 +235,4 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
     print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
 if __name__ == "__main__":
-    process_hf_dataset()  # script entry point: run the full dataset import

 from dotenv import load_dotenv
 from transformers import AutoTokenizer, AutoModel
 import torch
+from tqdm import tqdm  # For progress bar
+import time
 # Load environment variables
 load_dotenv()
+# Module-level cache for CodeBERT so the tokenizer and model are not reloaded on every call
+model_name = "microsoft/codebert-base"  # checkpoint name passed to from_pretrained()
+tokenizer = None  # populated by load_codebert_model() on first use
+model = None  # populated by load_codebert_model() on first use
+device = None  # torch.device selected by load_codebert_model()
+def load_codebert_model(use_gpu=False):
+    """Load and cache the CodeBERT model, handling GPU/CPU options."""
+    global tokenizer, model, device
+    if tokenizer is None or model is None:
+        try:
+            device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModel.from_pretrained(model_name).to(device)
+            print(f"CodeBERT model loaded on {device}")
+        except Exception as e:
+            print(f"Error loading CodeBERT model: {e}")
+            raise
+    return tokenizer, model, device
 def rename_variables(code, variable_prefixes=None):
     """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
     if variable_prefixes is None:
 def generate_semantic_vector(description, total_lines=100, use_gpu=False):
     """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
+    global tokenizer, model, device
+    if tokenizer is None or model is None:
+        tokenizer, model, device = load_codebert_model(use_gpu)
     # Tokenize and encode the description
     inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
     return vector
+def process_hf_dataset(batch_size=100, use_gpu=False):
+    """Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
     # Initialize ChromaDB client
     client = init_chromadb()
+    # Clear existing collection (fresh install) if needed
+    try:
+        client.delete_collection(DB_NAME)
+    except:
+        pass  # Collection may not exist
+    collection = client.create_collection(DB_NAME)
+    # Process in batches with progress bar
+    total_entries = len(dataset)
+    for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
+        batch = dataset[i:i + batch_size]
+        batch_programs = []
+        batch_ids = []
+        batch_documents = []
+        batch_metadatas = []
+        batch_embeddings = []
+        for entry in batch:
+            instruction = entry['instruction']
+            output = entry['output']
+            # Rename variables to align with vector categories
+            processed_code, var_map = rename_variables(output)
+            # Parse the code to get parts and sequence, generating our 6D vectors
+            parts, sequence = parse_python_code(processed_code)
+            program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
+            # Generate description tokens including variable roles
+            description_tokens = f"task:{instruction.replace(' ', '_')}"
+            description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
+            description_tokens += " " + " ".join(description_tokens_list)
+            # Generate a 6D semantic vector for the instruction
+            semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
+            # Store program data
+            program_id = str(hash(processed_code))
+            batch_ids.append(program_id)
+            batch_documents.append(processed_code)
+            batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
+            batch_embeddings.append(semantic_vector)
+            # Add small delay to prevent freezing (optional, adjust as needed)
+            time.sleep(0.01)
+        # Batch add to ChromaDB
+        try:
+            collection.add(
+                documents=batch_documents,
+                metadatas=batch_metadatas,
+                ids=batch_ids,
+                embeddings=batch_embeddings
+            )
+        except Exception as e:
+            print(f"Error adding batch to ChromaDB: {e}")
+            raise
     # Save to Hugging Face Dataset
     save_chromadb_to_hf()
     print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
 if __name__ == "__main__":
+    process_hf_dataset(batch_size=100, use_gpu=False)  # script entry point: CPU-only run with batch_size=100