broadfield-dev committed
Commit b284540 · verified · 1 Parent(s): 2143f28

Create process_hf_datasets.py

Files changed (1)
  1. process_hf_datasets.py +92 -0
process_hf_datasets.py ADDED
@@ -0,0 +1,92 @@
+ # process_hf_datasets.py
+ from datasets import load_dataset, Dataset
+ import re
+ from parser import parse_python_code
+ from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
+
+ def rename_variables(code):
+     """Rename variables in Python code to input_var1, input_var2, etc."""
+     # Simple variable name detection and renaming
+     pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifier-like tokens (simple heuristic)
+     variables = set()
+     code_lines = code.split('\n')
+
+     # Find all candidate names (simplified approach)
+     for line in code_lines:
+         matches = re.findall(pattern, line)
+         for match in matches:
+             # Exclude a small set of keywords and builtins
+             if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:
+                 variables.add(match)
+
+     # Map names to input_var1, input_var2, ... in alphabetical order
+     # (deterministic, though not order of first appearance; an AST pass would improve this)
+     sorted_vars = sorted(variables)
+     var_map = {var: f"input_var{i+1}" for i, var in enumerate(sorted_vars)}
+
+     # Replace whole-word occurrences in the code
+     new_code = code
+     for old_var, new_var in var_map.items():
+         new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)
+
+     return new_code
+
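+ # Note: the regex pass above also renames any function or builtin name that is
+ # not in the small exclusion list (e.g. len, range, append), which can break the
+ # code it rewrites; an AST-based rename (sketched after the diff) avoids this.
+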
+ def process_hf_dataset():
+     """Process the Hugging Face dataset and store programs in ChromaDB."""
+     # Load the dataset
+     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
+
+     # Initialize ChromaDB client
+     client = init_chromadb()
+
+     # Process each entry
+     for entry in dataset:
+         instruction = entry['instruction']
+         output = entry['output']
+
+         # Rename variables in the output code
+         processed_code = rename_variables(output)
+
+         # Parse the code to get parts and sequence
+         parts, sequence = parse_python_code(processed_code)
+         vectors = [part['vector'] for part in parts]
+
+         # Generate description tokens from instruction
+         description_tokens = f"task:{instruction.replace(' ', '_')}"
+
+         # Store in ChromaDB with description
+         store_program(client, processed_code, sequence, vectors, DB_NAME)
+
+         # Update metadata with instruction as description
+         collection = client.get_collection(DB_NAME)
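+         # Assumes store_program derives its document id the same way; note that
+         # Python's built-in hash() is salted per process, so this id is not
+         # stable across runs (a hashlib digest would be reproducible)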
+         program_id = str(hash(processed_code))
+         collection.update(
+             ids=[program_id],
+             metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
+         )
+
+     # Save to Hugging Face Dataset
+     save_chromadb_to_hf()
+
+ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
+     """Save ChromaDB data to Hugging Face Dataset."""
+     client = init_chromadb()
+     collection = client.get_collection(DB_NAME)
+
+     # Fetch all data from ChromaDB
+     results = collection.get(include=["documents", "metadatas", "embeddings"])
+     data = {
+         "code": results["documents"],
+         "sequence": [meta["sequence"] for meta in results["metadatas"]],
+         "vectors": results["embeddings"],  # one embedding per stored program
+         "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
+     }
+
+     # Create a Hugging Face Dataset
+     dataset = Dataset.from_dict(data)
+
+     # Push to Hugging Face Hub
+     dataset.push_to_hub(dataset_name, token=token)
+     print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
+
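+ # The pushed dataset can later be reloaded with, e.g.:
+ #   load_dataset(HF_DATASET_NAME, token=HF_KEY)
+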
+ if __name__ == "__main__":
+     process_hf_dataset()
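
The comment in rename_variables points to an AST pass as the more robust approach. A minimal sketch of that idea (hypothetical code, not part of this commit; rename_variables_ast is an illustrative name, ast.unparse requires Python 3.9+, and it is still simplified in that keyword arguments at call sites are not remapped):

import ast
import builtins

_BUILTINS = set(dir(builtins))

def rename_variables_ast(code):
    """Rename only real variables/parameters, in order of first appearance."""
    tree = ast.parse(code)

    # Record the first source position of each plain name or function argument,
    # skipping builtins so calls like print() or len() keep working
    first_seen = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.Name) and node.id not in _BUILTINS:
            name, pos = node.id, (node.lineno, node.col_offset)
        elif isinstance(node, ast.arg):
            name, pos = node.arg, (node.lineno, node.col_offset)
        else:
            continue
        if name not in first_seen or pos < first_seen[name]:
            first_seen[name] = pos

    # Number names by where they first appear in the source
    order = sorted(first_seen, key=first_seen.get)
    var_map = {name: f"input_var{i + 1}" for i, name in enumerate(order)}

    class Renamer(ast.NodeTransformer):
        def visit_Name(self, node):
            node.id = var_map.get(node.id, node.id)
            return node
        def visit_arg(self, node):
            node.arg = var_map.get(node.arg, node.arg)
            return node

    return ast.unparse(Renamer().visit(tree))

Unlike the regex version, this touches only genuine variables and parameters, leaves builtins and attribute names intact, and assigns input_varN numbers by first appearance rather than alphabetically.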