broadfield-dev committed
Commit b284540 · verified · 1 Parent(s): 2143f28

Create process_hf_datasets.py

Files changed (1)
  1. process_hf_datasets.py +92 -0
process_hf_datasets.py ADDED
@@ -0,0 +1,92 @@
+ # process_hf_datasets.py
+ from datasets import load_dataset, Dataset
+ import re
+ from parser import parse_python_code
+ from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
+
+ def rename_variables(code):
+     """Rename variables in Python code to input_var1, input_var2, etc."""
+     # Simple variable name detection and renaming
+     pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifier-like tokens (simple heuristic)
+     variables = set()
+     code_lines = code.split('\n')
+
+     # Find all candidate names (simplified approach)
+     for line in code_lines:
+         matches = re.findall(pattern, line)
+         for match in matches:
+             # Exclude a small set of keywords and builtins
+             if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:
+                 variables.add(match)
+
+     # Map names to input_var1, input_var2, ... in alphabetical order
+     # (deterministic, though not order of first appearance; an AST pass would improve this)
+     sorted_vars = sorted(variables)
+     var_map = {var: f"input_var{i+1}" for i, var in enumerate(sorted_vars)}
+
+     # Replace whole-word occurrences in the code
+     new_code = code
+     for old_var, new_var in var_map.items():
+         new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)
+
+     return new_code
+
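+ # Note: the regex pass above also renames any function or builtin name that is
+ # not in the small exclusion list (e.g. len, range, append), which can break the
+ # code it rewrites; an AST-based rename (sketched after the diff) avoids this.
+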
+ def process_hf_dataset():
+     """Process the Hugging Face dataset and store programs in ChromaDB."""
+     # Load the dataset
+     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
+
+     # Initialize ChromaDB client
+     client = init_chromadb()
+
+     # Process each entry
+     for entry in dataset:
+         instruction = entry['instruction']
+         output = entry['output']
+
+         # Rename variables in the output code
+         processed_code = rename_variables(output)
+
+         # Parse the code to get parts and sequence
+         parts, sequence = parse_python_code(processed_code)
+         vectors = [part['vector'] for part in parts]
+
+         # Generate description tokens from instruction
+         description_tokens = f"task:{instruction.replace(' ', '_')}"
+
+         # Store in ChromaDB with description
+         store_program(client, processed_code, sequence, vectors, DB_NAME)
+
+         # Update metadata with instruction as description
+         collection = client.get_collection(DB_NAME)
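+         # Assumes store_program derives its document id the same way; note that
+         # Python's built-in hash() is salted per process, so this id is not
+         # stable across runs (a hashlib digest would be reproducible)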
+         program_id = str(hash(processed_code))
+         collection.update(
+             ids=[program_id],
+             metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
+         )
+
+     # Save to Hugging Face Dataset
+     save_chromadb_to_hf()
+
+ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
+     """Save ChromaDB data to Hugging Face Dataset."""
+     client = init_chromadb()
+     collection = client.get_collection(DB_NAME)
+
+     # Fetch all data from ChromaDB
+     results = collection.get(include=["documents", "metadatas", "embeddings"])
+     data = {
+         "code": results["documents"],
+         "sequence": [meta["sequence"] for meta in results["metadatas"]],
+         "vectors": results["embeddings"],  # one embedding per stored program
+         "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
+     }
+
+     # Create a Hugging Face Dataset
+     dataset = Dataset.from_dict(data)
+
+     # Push to Hugging Face Hub
+     dataset.push_to_hub(dataset_name, token=token)
+     print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
+
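+ # The pushed dataset can later be reloaded with, e.g.:
+ #   load_dataset(HF_DATASET_NAME, token=HF_KEY)
+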
+ if __name__ == "__main__":
+     process_hf_dataset()
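
The comment in rename_variables points to an AST pass as the more robust approach. A minimal sketch of that idea (hypothetical code, not part of this commit; rename_variables_ast is an illustrative name, ast.unparse requires Python 3.9+, and it is still simplified in that keyword arguments at call sites are not remapped):

import ast
import builtins

_BUILTINS = set(dir(builtins))

def rename_variables_ast(code):
    """Rename only real variables/parameters, in order of first appearance."""
    tree = ast.parse(code)

    # Record the first source position of each plain name or function argument,
    # skipping builtins so calls like print() or len() keep working
    first_seen = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.Name) and node.id not in _BUILTINS:
            name, pos = node.id, (node.lineno, node.col_offset)
        elif isinstance(node, ast.arg):
            name, pos = node.arg, (node.lineno, node.col_offset)
        else:
            continue
        if name not in first_seen or pos < first_seen[name]:
            first_seen[name] = pos

    # Number names by where they first appear in the source
    order = sorted(first_seen, key=first_seen.get)
    var_map = {name: f"input_var{i + 1}" for i, name in enumerate(order)}

    class Renamer(ast.NodeTransformer):
        def visit_Name(self, node):
            node.id = var_map.get(node.id, node.id)
            return node
        def visit_arg(self, node):
            node.arg = var_map.get(node.arg, node.arg)
            return node

    return ast.unparse(Renamer().visit(tree))

Unlike the regex version, this touches only genuine variables and parameters, leaves builtins and attribute names intact, and assigns input_varN numbers by first appearance rather than alphabetically.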