broadfield-dev committed on
Commit
e77acbf
·
verified ·
1 Parent(s): a1b1b8b

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +18 -7
process_hf_dataset.py CHANGED
@@ -2,8 +2,13 @@
2
  from datasets import load_dataset
3
  import re
4
  from parser import parse_python_code
5
- from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
6
  import chromadb
 
 
 
 
 
7
 
8
  def rename_variables(code, variable_prefixes=None):
9
  """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
@@ -29,17 +34,23 @@ def rename_variables(code, variable_prefixes=None):
29
  # Sort variables by first appearance (simplified, could improve with AST)
30
  sorted_vars = sorted(list(variables))
31
  var_map = {}
32
- var_count = {prefix: 1 for prefix in variable_prefixes.values()}
33
 
34
  # Assign variables based on context (simplified heuristic)
35
  for var in sorted_vars:
36
- # Determine variable role based on context (simplified)
37
- if var in ['expression', 'input']: # Assume input parameters or initial variables
 
 
 
 
38
  role = 'input_variable'
39
- elif var in code.split() and 'return' in line[x] for x in code_lines if var in line[x]: # Returned variables
40
  role = 'returned_variable'
41
- else: # Default to assigned variables
42
  role = 'assigned_variable'
 
 
43
 
44
  new_name = f"{role}{var_count[role]}"
45
  var_map[var] = new_name
@@ -123,7 +134,7 @@ def process_hf_dataset():
123
  # Save to Hugging Face Dataset
124
  save_chromadb_to_hf()
125
 
126
- def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
127
  """Save ChromaDB data to Hugging Face Dataset."""
128
  client = init_chromadb()
129
  collection = client.get_collection(DB_NAME)
 
2
  from datasets import load_dataset
3
  import re
4
  from parser import parse_python_code
5
+ from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
6
  import chromadb
7
+ import os
8
+ from dotenv import load_dotenv
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
 
13
  def rename_variables(code, variable_prefixes=None):
14
  """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
 
34
  # Sort variables by first appearance (simplified, could improve with AST)
35
  sorted_vars = sorted(list(variables))
36
  var_map = {}
37
+ var_count = {'input_variable': 1, 'assigned_variable': 1, 'returned_variable': 1}
38
 
39
  # Assign variables based on context (simplified heuristic)
40
  for var in sorted_vars:
41
+ # Determine variable role based on context
42
+ is_input = any(var in line and 'def' in line for line in code_lines) # Check if in function definition (input parameter)
43
+ is_returned = any('return' in line and var in line for line in code_lines) # Check if used in return statement
44
+ is_assigned = any('=' in line and var in line.split('=')[0].strip() for line in code_lines) # Check if assigned
45
+
46
+ if is_input:
47
  role = 'input_variable'
48
+ elif is_returned:
49
  role = 'returned_variable'
50
+ elif is_assigned:
51
  role = 'assigned_variable'
52
+ else:
53
+ role = 'assigned_variable' # Default to assigned if unclear
54
 
55
  new_name = f"{role}{var_count[role]}"
56
  var_map[var] = new_name
 
134
  # Save to Hugging Face Dataset
135
  save_chromadb_to_hf()
136
 
137
+ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
138
  """Save ChromaDB data to Hugging Face Dataset."""
139
  client = init_chromadb()
140
  collection = client.get_collection(DB_NAME)