Create process_hf_datasets.py
process_hf_datasets.py ADDED (+92, -0)
# process_hf_datasets.py
from datasets import load_dataset, Dataset
import re
import keyword
import builtins
from parser import parse_python_code
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY

def rename_variables(code):
    """Rename variables in Python code to input_var1, input_var2, etc."""
    # Simple regex-based name detection; a real implementation would use the AST
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifier-like tokens (simple heuristic)
    variables = set()
    code_lines = code.split('\n')

    # Collect candidate names, skipping Python keywords and builtins
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if not keyword.iskeyword(match) and match not in dir(builtins):
                variables.add(match)

    # Sort names alphabetically (first-appearance order would need an AST pass)
    sorted_vars = sorted(variables)
    var_map = {var: f"input_var{i+1}" for i, var in enumerate(sorted_vars)}

    # Replace each name throughout the code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)

    return new_code

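For illustration, here is what the heuristic does to a small hypothetical snippet (run in a REPL after importing rename_variables); function names get renamed too, since the regex cannot tell them apart from variables:

snippet = "def add(a, b):\n    total = a + b\n    return total"
print(rename_variables(snippet))
# a, add, b, total are renamed in alphabetical order:
# def input_var2(input_var1, input_var3):
#     input_var4 = input_var1 + input_var3
#     return input_var4
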
def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")

    # Initialize ChromaDB client
    client = init_chromadb()

    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']

        # Rename variables in the output code
        processed_code = rename_variables(output)

        # Parse the code to get parts and sequence
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]

        # Generate description tokens from the instruction text
        description_tokens = f"task:{instruction.replace(' ', '_')}"

        # Store in ChromaDB
        store_program(client, processed_code, sequence, vectors, DB_NAME)

        # Update metadata with the instruction as description; this assumes
        # store_program used str(hash(code)) as the document ID (hash() is only
        # stable within one process, so store and update must share a run)
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
        )

    # Save to a Hugging Face Dataset
    save_chromadb_to_hf()

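The database module imported at the top is not part of this commit. For context, here is a minimal sketch of what the script appears to assume from it; the collection name, repo id, and mean-pooling of the per-part vectors are all assumptions, not the actual implementation:

# Hypothetical sketch of the assumed database.py interface (not in this
# commit; names and behavior are guesses based on how the script uses them).
import os
import chromadb

DB_NAME = "python_programs"                        # assumed collection name
HF_DATASET_NAME = "your-username/python-programs"  # placeholder repo id
HF_KEY = os.environ.get("HF_KEY")                  # token read from the environment

def init_chromadb(path="./chroma_db"):
    # Persistent local client so stored programs survive between runs
    return chromadb.PersistentClient(path=path)

def store_program(client, code, sequence, vectors, db_name=DB_NAME):
    collection = client.get_or_create_collection(name=db_name)
    # ChromaDB takes one embedding per document, so the per-part vectors
    # are mean-pooled here (an assumption; any fixed-length reduction works)
    dim = len(vectors[0])
    embedding = [sum(v[i] for v in vectors) / len(vectors) for i in range(dim)]
    collection.add(
        ids=[str(hash(code))],  # matches the ID scheme the caller relies on
        documents=[code],
        metadatas=[{"sequence": ",".join(sequence)}],
        embeddings=[embedding],
    )
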
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    """Save ChromaDB data to a Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)

    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # one embedding per stored program
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }

    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)

    # Push to the Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")

if __name__ == "__main__":
    process_hf_dataset()
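Once pushed, the dataset can be pulled back down with load_dataset; a small usage sketch, where the repo id stands in for whatever HF_DATASET_NAME was set to:

from datasets import load_dataset

ds = load_dataset("your-username/python-programs", split="train")  # hypothetical repo id
print(ds.column_names)     # ['code', 'sequence', 'vectors', 'description_tokens']
print(ds[0]["sequence"])   # comma-joined part sequence for the first program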