broadfield-dev committed on
Commit
64b5eaa
·
verified ·
1 Parent(s): 83ff077

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +79 -18
database.py CHANGED
@@ -2,36 +2,55 @@
2
  import chromadb
3
  from parser import parse_python_code
4
  import os
 
 
5
 
6
- def init_chromadb():
7
- # Initialize ChromaDB client (in-memory for now, can persist to disk)
8
- client = chromadb.Client()
9
- return client
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- def create_collection(client, collection_name="python_programs"):
12
- # Create or get a collection for Python programs
13
  try:
14
  collection = client.get_collection(name=collection_name)
15
  except:
16
  collection = client.create_collection(name=collection_name)
17
  return collection
18
 
19
- def store_program(client, code, sequence, vectors, collection_name="python_programs"):
20
- # Create or get collection
21
  collection = create_collection(client, collection_name)
22
 
 
 
 
23
  # Store program data (ID, code, sequence, vectors)
24
  program_id = str(hash(code)) # Use hash of code as ID for uniqueness
25
  collection.add(
26
  documents=[code],
27
  metadatas=[{"sequence": ",".join(sequence)}],
28
  ids=[program_id],
29
- embeddings=[vectors] # Store vectors as embeddings
30
  )
31
  return program_id
32
 
33
  def populate_sample_db(client):
34
- # Sample programs for testing
35
  samples = [
36
  """
37
  import os
@@ -52,12 +71,12 @@ def populate_sample_db(client):
52
  vectors = [part['vector'] for part in parts]
53
  store_program(client, code, sequence, vectors)
54
 
55
- def query_programs(client, operations, collection_name="python_programs", top_k=5):
56
- """Query the database for programs matching the operations sequence."""
57
  collection = create_collection(client, collection_name)
58
 
59
  # Convert operations to a query vector (average of operation vectors)
60
- query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0, 0, 0, 0, 0, 0]
61
 
62
  # Perform similarity search
63
  results = collection.query(
@@ -71,14 +90,17 @@ def query_programs(client, operations, collection_name="python_programs", top_k=
71
  for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
72
  sequence = meta['sequence'].split(',')
73
  if is_subsequence(operations, sequence):
74
- similarity = cosine_similarity([query_vector], [np.mean(eval(doc['vectors']), axis=0) if doc['vectors'] else [0, 0, 0, 0, 0, 0]])[0][0]
 
 
 
 
 
 
75
  matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity})
76
 
77
  return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
78
 
79
- from sklearn.metrics.pairwise import cosine_similarity
80
- import numpy as np
81
-
82
  def create_vector(category, level, location, total_lines, parent_path):
83
  """Helper to create a vector for query (matches parser's create_vector)."""
84
  category_map = {
@@ -101,6 +123,45 @@ def is_subsequence(subseq, seq):
101
  it = iter(seq)
102
  return all(item in it for item in subseq)
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  if __name__ == '__main__':
105
  client = init_chromadb()
106
- populate_sample_db(client)
 
 
 
2
  import chromadb
3
  from parser import parse_python_code
4
  import os
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import numpy as np
7
 
8
# User-configurable settings.
DB_NAME = "python_programs"  # ChromaDB collection name
HF_DATASET_NAME = "python_program_vectors"  # Hugging Face Dataset name
# Never commit real credentials: read the token from the environment when set.
# The placeholder default keeps backward compatibility for anyone who edited
# this file in place.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HUGGINGFACE_TOKEN")
PERSIST_DIR = "./chroma_data"  # Directory for persistent storage (optional)
13
+
14
def init_chromadb(persist_dir=PERSIST_DIR):
    """Create and return a ChromaDB client.

    Storage is persistent when *persist_dir* already exists on disk;
    otherwise — or if initialization fails for any reason — an
    in-memory client is returned instead.
    """
    try:
        # NOTE(review): persistence only kicks in when the directory was
        # created beforehand; a fresh checkout always runs in-memory.
        on_disk = os.path.exists(persist_dir)
        return chromadb.PersistentClient(path=persist_dir) if on_disk else chromadb.Client()
    except Exception as e:
        print(f"Error initializing ChromaDB: {e}")
        return chromadb.Client()  # Fallback to in-memory
26
 
27
def create_collection(client, collection_name=DB_NAME):
    """Return the named ChromaDB collection, creating it if absent.

    Args:
        client: An initialized ChromaDB client.
        collection_name: Name of the collection (defaults to DB_NAME).

    Returns:
        The existing or newly created collection.
    """
    try:
        collection = client.get_collection(name=collection_name)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # chromadb's "not found" error type varies by version, so catch
        # Exception and create the collection on any lookup failure.
        collection = client.create_collection(name=collection_name)
    return collection
34
 
35
def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
    """Store a program in ChromaDB with its code, sequence, and vector.

    ChromaDB requires every embedding in a collection to have the same,
    fixed dimensionality, and queries are made with a single 6-dim vector.
    Flattening the per-part vectors would produce a 6*len(vectors)-dim
    embedding whose size varies per program, so we store the element-wise
    mean of the part vectors (always 6-dim) instead.

    Args:
        client: An initialized ChromaDB client.
        code: The program's source text (also used to derive its ID).
        sequence: Ordered list of operation names for the program.
        vectors: List of per-part numeric vectors (each length 6).
        collection_name: Target collection (defaults to DB_NAME).

    Returns:
        The program's ID (string hash of the code).
    """
    collection = create_collection(client, collection_name)

    # Element-wise mean across parts; zero vector for an empty program.
    if vectors:
        embedding = [sum(col) / len(vectors) for col in zip(*vectors)]
    else:
        embedding = [0.0] * 6

    # Use hash of code as ID for uniqueness.
    program_id = str(hash(code))
    collection.add(
        documents=[code],
        metadatas=[{"sequence": ",".join(sequence)}],
        ids=[program_id],
        embeddings=[embedding],
    )
    return program_id
51
 
52
  def populate_sample_db(client):
53
+ """Populate ChromaDB with sample Python programs."""
54
  samples = [
55
  """
56
  import os
 
71
  vectors = [part['vector'] for part in parts]
72
  store_program(client, code, sequence, vectors)
73
 
74
+ def query_programs(client, operations, collection_name=DB_NAME, top_k=5):
75
+ """Query ChromaDB for programs matching the operations sequence."""
76
  collection = create_collection(client, collection_name)
77
 
78
  # Convert operations to a query vector (average of operation vectors)
79
+ query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
80
 
81
  # Perform similarity search
82
  results = collection.query(
 
90
  for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
91
  sequence = meta['sequence'].split(',')
92
  if is_subsequence(operations, sequence):
93
+ # Extract and flatten vectors from the document (assuming stored as string or list)
94
+ try:
95
+ doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
96
+ program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
97
+ except:
98
+ program_vector = [0] * 6 # Fallback for malformed vectors
99
+ similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
100
  matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity})
101
 
102
  return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
103
 
 
 
 
104
  def create_vector(category, level, location, total_lines, parent_path):
105
  """Helper to create a vector for query (matches parser's create_vector)."""
106
  category_map = {
 
123
  it = iter(seq)
124
  return all(item in it for item in subseq)
125
 
126
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
    """Export the ChromaDB collection to a Hugging Face Dataset.

    Args:
        dataset_name: Hub dataset name to push to (defaults to HF_DATASET_NAME).
        token: Hugging Face API token (defaults to HF_TOKEN).
    """
    from datasets import Dataset
    client = init_chromadb()
    collection = create_collection(client)

    # Fetch all data from ChromaDB.
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        # Each embedding is already a flat sequence of numbers (see
        # store_program); coerce to a plain list. Re-flattening with a nested
        # comprehension would try to iterate individual floats and raise
        # TypeError.
        "vectors": [list(vec) for vec in results["embeddings"]],
    }

    # Create a Hugging Face Dataset and push it to the Hub.
    dataset = Dataset.from_dict(data)
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
146
+
147
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
    """Rebuild the ChromaDB collection from a Hugging Face Dataset.

    Downloads the ``train`` split of *dataset_name* and inserts each record
    (code, operation sequence, embedding) into the collection, deriving each
    ID from the code's hash exactly as ``store_program`` does.

    Returns:
        The ChromaDB client holding the repopulated collection.
    """
    from datasets import load_dataset
    client = init_chromadb()
    collection = create_collection(client)

    dataset = load_dataset(dataset_name, split="train", token=token)
    for record in dataset:
        source = record["code"]
        collection.add(
            documents=[source],
            metadatas=[{"sequence": record["sequence"]}],
            ids=[str(hash(source))],
            embeddings=[record["vectors"]],
        )
    return client
162
+
163
def _main():
    """Build the database and seed it with the sample programs."""
    client = init_chromadb()
    populate_sample_db(client)
    # Uncomment to save to Hugging Face
    # save_chromadb_to_hf()


if __name__ == '__main__':
    _main()