Spaces:
Running
Running
Update process_hf_dataset.py
Browse files - process_hf_dataset.py +18 -7
process_hf_dataset.py
CHANGED
@@ -2,8 +2,13 @@
|
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
from parser import parse_python_code
|
5 |
-
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
|
6 |
import chromadb
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def rename_variables(code, variable_prefixes=None):
|
9 |
"""Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
|
@@ -29,17 +34,23 @@ def rename_variables(code, variable_prefixes=None):
|
|
29 |
# Sort variables with sorted() (alphabetical for string names; simplified — ordering by first appearance would need an AST pass)
|
30 |
sorted_vars = sorted(list(variables))
|
31 |
var_map = {}
|
32 |
-
var_count = {
|
33 |
|
34 |
# Assign variables based on context (simplified heuristic)
|
35 |
for var in sorted_vars:
|
36 |
-
# Determine variable role based on context
|
37 |
-
|
|
|
|
|
|
|
|
|
38 |
role = 'input_variable'
|
39 |
-
elif
|
40 |
role = 'returned_variable'
|
41 |
-
|
42 |
role = 'assigned_variable'
|
|
|
|
|
43 |
|
44 |
new_name = f"{role}{var_count[role]}"
|
45 |
var_map[var] = new_name
|
@@ -123,7 +134,7 @@ def process_hf_dataset():
|
|
123 |
# Save to Hugging Face Dataset
|
124 |
save_chromadb_to_hf()
|
125 |
|
126 |
-
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
|
127 |
"""Save ChromaDB data to Hugging Face Dataset."""
|
128 |
client = init_chromadb()
|
129 |
collection = client.get_collection(DB_NAME)
|
|
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
from parser import parse_python_code
|
5 |
+
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
|
6 |
import chromadb
|
7 |
+
import os
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
|
10 |
+
# Load environment variables
|
11 |
+
load_dotenv()
|
12 |
|
13 |
def rename_variables(code, variable_prefixes=None):
|
14 |
"""Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
|
|
|
34 |
# Sort variables with sorted() (alphabetical for string names; simplified — ordering by first appearance would need an AST pass)
|
35 |
sorted_vars = sorted(list(variables))
|
36 |
var_map = {}
|
37 |
+
var_count = {'input_variable': 1, 'assigned_variable': 1, 'returned_variable': 1}
|
38 |
|
39 |
# Assign variables based on context (simplified heuristic)
|
40 |
for var in sorted_vars:
|
41 |
+
# Determine variable role based on context
|
42 |
+
is_input = any(var in line and 'def' in line for line in code_lines) # Check if in function definition (input parameter)
|
43 |
+
is_returned = any('return' in line and var in line for line in code_lines) # Check if used in return statement
|
44 |
+
is_assigned = any('=' in line and var in line.split('=')[0].strip() for line in code_lines) # Check if assigned
|
45 |
+
|
46 |
+
if is_input:
|
47 |
role = 'input_variable'
|
48 |
+
elif is_returned:
|
49 |
role = 'returned_variable'
|
50 |
+
elif is_assigned:
|
51 |
role = 'assigned_variable'
|
52 |
+
else:
|
53 |
+
role = 'assigned_variable' # Default to assigned if unclear
|
54 |
|
55 |
new_name = f"{role}{var_count[role]}"
|
56 |
var_map[var] = new_name
|
|
|
134 |
# Save to Hugging Face Dataset
|
135 |
save_chromadb_to_hf()
|
136 |
|
137 |
+
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
138 |
"""Save ChromaDB data to Hugging Face Dataset."""
|
139 |
client = init_chromadb()
|
140 |
collection = client.get_collection(DB_NAME)
|