Spaces:
Running
Running
Update database.py
Browse files- database.py +79 -18
database.py
CHANGED
@@ -2,36 +2,55 @@
|
|
2 |
import chromadb
|
3 |
from parser import parse_python_code
|
4 |
import os
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
def create_collection(client, collection_name=
|
12 |
-
|
13 |
try:
|
14 |
collection = client.get_collection(name=collection_name)
|
15 |
except:
|
16 |
collection = client.create_collection(name=collection_name)
|
17 |
return collection
|
18 |
|
19 |
-
def store_program(client, code, sequence, vectors, collection_name=
|
20 |
-
|
21 |
collection = create_collection(client, collection_name)
|
22 |
|
|
|
|
|
|
|
23 |
# Store program data (ID, code, sequence, vectors)
|
24 |
program_id = str(hash(code)) # Use hash of code as ID for uniqueness
|
25 |
collection.add(
|
26 |
documents=[code],
|
27 |
metadatas=[{"sequence": ",".join(sequence)}],
|
28 |
ids=[program_id],
|
29 |
-
embeddings=[
|
30 |
)
|
31 |
return program_id
|
32 |
|
33 |
def populate_sample_db(client):
|
34 |
-
|
35 |
samples = [
|
36 |
"""
|
37 |
import os
|
@@ -52,12 +71,12 @@ def populate_sample_db(client):
|
|
52 |
vectors = [part['vector'] for part in parts]
|
53 |
store_program(client, code, sequence, vectors)
|
54 |
|
55 |
-
def query_programs(client, operations, collection_name=
|
56 |
-
"""Query
|
57 |
collection = create_collection(client, collection_name)
|
58 |
|
59 |
# Convert operations to a query vector (average of operation vectors)
|
60 |
-
query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0
|
61 |
|
62 |
# Perform similarity search
|
63 |
results = collection.query(
|
@@ -71,14 +90,17 @@ def query_programs(client, operations, collection_name="python_programs", top_k=
|
|
71 |
for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
|
72 |
sequence = meta['sequence'].split(',')
|
73 |
if is_subsequence(operations, sequence):
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity})
|
76 |
|
77 |
return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
|
78 |
|
79 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
80 |
-
import numpy as np
|
81 |
-
|
82 |
def create_vector(category, level, location, total_lines, parent_path):
|
83 |
"""Helper to create a vector for query (matches parser's create_vector)."""
|
84 |
category_map = {
|
@@ -101,6 +123,45 @@ def is_subsequence(subseq, seq):
|
|
101 |
it = iter(seq)
|
102 |
return all(item in it for item in subseq)
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
if __name__ == '__main__':
|
105 |
client = init_chromadb()
|
106 |
-
populate_sample_db(client)
|
|
|
|
|
|
2 |
import chromadb
|
3 |
from parser import parse_python_code
|
4 |
import os
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
import numpy as np
|
7 |
|
8 |
+
# User-configurable variables
DB_NAME = "python_programs"  # ChromaDB collection name
HF_DATASET_NAME = "python_program_vectors"  # Hugging Face Dataset name
# SECURITY: never commit a real API token to source control.  Read it from
# the environment; the placeholder keeps prior behavior when unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HUGGINGFACE_TOKEN")
PERSIST_DIR = "./chroma_data"  # Directory for persistent storage (optional)
13 |
+
|
14 |
+
def init_chromadb(persist_dir=PERSIST_DIR):
    """Initialize a ChromaDB client.

    Persists to *persist_dir* when that directory already exists; otherwise
    an in-memory client is used.  Any initialization failure is reported
    and an in-memory client is returned as a fallback.
    """
    try:
        # Persistent storage only when the target directory is present.
        if os.path.exists(persist_dir):
            return chromadb.PersistentClient(path=persist_dir)
        return chromadb.Client()
    except Exception as e:
        print(f"Error initializing ChromaDB: {e}")
        return chromadb.Client()  # Fallback to in-memory
|
26 |
|
27 |
+
def create_collection(client, collection_name=DB_NAME):
    """Create or get a ChromaDB collection for Python programs.

    Args:
        client: An initialized ChromaDB client.
        collection_name: Name of the collection (defaults to ``DB_NAME``).

    Returns:
        The existing collection, or a freshly created one if absent.
    """
    try:
        collection = client.get_collection(name=collection_name)
    except Exception:
        # get_collection raises when the collection does not exist yet.
        # Catch Exception rather than a bare `except:`, which would also
        # swallow KeyboardInterrupt/SystemExit.
        collection = client.create_collection(name=collection_name)
    return collection
|
34 |
|
35 |
+
def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
    """Store a program in ChromaDB with its code, sequence, and vectors.

    Args:
        client: An initialized ChromaDB client.
        code: Program source text (stored as the document).
        sequence: Iterable of operation names; stored comma-joined in metadata.
        vectors: Per-part vectors; flattened into one embedding.
        collection_name: Target collection (defaults to ``DB_NAME``).

    Returns:
        The document ID under which the program was stored.
    """
    import hashlib

    collection = create_collection(client, collection_name)

    # Flatten to a single list of numbers: ChromaDB expects one flat
    # embedding per document.  Tolerate elements that are already scalars.
    flattened_vectors = []
    for part in vectors:
        if isinstance(part, (list, tuple)):
            flattened_vectors.extend(part)
        else:
            flattened_vectors.append(part)

    # Derive a deterministic ID from the code.  The previous str(hash(code))
    # changes between interpreter runs (string-hash randomization), which
    # would create duplicate entries in a persistent store.
    program_id = hashlib.sha256(code.encode("utf-8")).hexdigest()

    collection.add(
        documents=[code],
        metadatas=[{"sequence": ",".join(sequence)}],
        ids=[program_id],
        embeddings=[flattened_vectors],  # Pass as flat list
    )
    return program_id
|
51 |
|
52 |
def populate_sample_db(client):
|
53 |
+
"""Populate ChromaDB with sample Python programs."""
|
54 |
samples = [
|
55 |
"""
|
56 |
import os
|
|
|
71 |
vectors = [part['vector'] for part in parts]
|
72 |
store_program(client, code, sequence, vectors)
|
73 |
|
74 |
+
def query_programs(client, operations, collection_name=DB_NAME, top_k=5):
|
75 |
+
"""Query ChromaDB for programs matching the operations sequence."""
|
76 |
collection = create_collection(client, collection_name)
|
77 |
|
78 |
# Convert operations to a query vector (average of operation vectors)
|
79 |
+
query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
|
80 |
|
81 |
# Perform similarity search
|
82 |
results = collection.query(
|
|
|
90 |
for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
|
91 |
sequence = meta['sequence'].split(',')
|
92 |
if is_subsequence(operations, sequence):
|
93 |
+
# Extract and flatten vectors from the document (assuming stored as string or list)
|
94 |
+
try:
|
95 |
+
doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
|
96 |
+
program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
|
97 |
+
except:
|
98 |
+
program_vector = [0] * 6 # Fallback for malformed vectors
|
99 |
+
similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
|
100 |
matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity})
|
101 |
|
102 |
return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
|
103 |
|
|
|
|
|
|
|
104 |
def create_vector(category, level, location, total_lines, parent_path):
|
105 |
"""Helper to create a vector for query (matches parser's create_vector)."""
|
106 |
category_map = {
|
|
|
123 |
it = iter(seq)
|
124 |
return all(item in it for item in subseq)
|
125 |
|
126 |
+
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
    """Save ChromaDB data to a Hugging Face Dataset.

    Fetches every document, its operation sequence, and its embedding from
    the collection and pushes them to the Hub under *dataset_name*.

    Args:
        dataset_name: Target Hub dataset name.
        token: Hugging Face API token.
    """
    from datasets import Dataset

    client = init_chromadb()
    collection = create_collection(client)

    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])

    def _flatten(vec):
        # Embeddings come back as flat numeric sequences; flatten one level
        # only when an element is itself a sequence.  The previous
        # unconditional double-flatten raised TypeError on the flat vectors
        # that store_program actually writes.
        flat = []
        for item in vec:
            if isinstance(item, (list, tuple)):
                flat.extend(item)
            else:
                flat.append(item)
        return flat

    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": [_flatten(vec) for vec in results["embeddings"]],
    }

    # Create a Hugging Face Dataset and push it to the Hub
    dataset = Dataset.from_dict(data)
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
146 |
+
|
147 |
+
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
    """Load ChromaDB data from a Hugging Face Dataset.

    Args:
        dataset_name: Hub dataset to pull (``train`` split).
        token: Hugging Face API token.

    Returns:
        The initialized ChromaDB client with the collection populated.
    """
    import hashlib

    from datasets import load_dataset

    client = init_chromadb()
    collection = create_collection(client)

    dataset = load_dataset(dataset_name, split="train", token=token)
    for item in dataset:
        code = item["code"]
        collection.add(
            documents=[code],
            metadatas=[{"sequence": item["sequence"]}],
            # sha256 gives a run-stable ID; str(hash(code)) varies per
            # interpreter run due to string-hash randomization.
            ids=[hashlib.sha256(code.encode("utf-8")).hexdigest()],
            embeddings=[item["vectors"]],
        )
    return client
|
162 |
+
|
163 |
if __name__ == '__main__':
    # Build the client and seed it with the bundled sample programs.
    chroma_client = init_chromadb()
    populate_sample_db(chroma_client)
    # Uncomment to save to Hugging Face
    # save_chromadb_to_hf()
|