Spaces:
Sleeping
Sleeping
Create add_embeddings.py
Browse files- add_embeddings.py +108 -0
add_embeddings.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import chromadb
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
from loguru import logger
|
5 |
+
|
6 |
+
class SentenceTransformerEmbeddings:
    """Callable adapter that embeds text with a SentenceTransformer model.

    An instance is passed as the ``embedding_function`` of a ChromaDB
    collection elsewhere in this file.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-transformers model identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    # NOTE(review): the parameter is named `input` (shadowing the builtin)
    # presumably because ChromaDB's embedding-function interface expects that
    # exact name -- confirm before renaming.
    def __call__(self, input: list[str]) -> list[list[float]]:
        """Return one embedding per string in *input*.

        The model's ``encode`` result is converted with ``tolist()`` so the
        caller receives plain nested Python lists.
        """
        embeddings = self.model.encode(input)
        return embeddings.tolist()
|
13 |
+
|
14 |
+
def _split_sections(document: str, index_content: list[str]) -> list[dict[str, str]]:
    """Split *document* into titled sections, using index entries as headings.

    A line starts a new section when any entry of *index_content* occurs in it
    (substring match on the stripped line). The non-empty lines that follow
    become that section's content, joined with newlines. Text that appears
    before the first heading, and headings with no content under them, are
    dropped.
    """
    sections = []
    current_title = ""
    current_lines = []

    for raw_line in document.split('\n'):
        line = raw_line.strip()
        if any(index_line in line for index_line in index_content):
            # A heading closes the section collected so far (if it is complete).
            if current_lines and current_title:
                sections.append({
                    "title": current_title,
                    "content": "\n".join(current_lines),
                })
            current_title = line
            current_lines = []
        elif line:
            current_lines.append(line)

    # Flush the section that was still open at end of document.
    if current_lines and current_title:
        sections.append({
            "title": current_title,
            "content": "\n".join(current_lines),
        })

    return sections


def load_documents():
    """Load and process documents into ChromaDB.

    Reads ``a2023-45.txt`` and ``index.txt`` from the directory containing
    this file, splits the document into sections at lines that contain an
    index entry, and stores the sections in a freshly created
    ``legal_documents`` collection under ``chroma_db``.

    The input files are read and parsed, and the embedding model is loaded,
    *before* any existing collection is deleted, so a missing file or an
    empty parse leaves the previous index intact.

    Returns:
        True if the sections were stored, False if anything failed (the
        failure is logged rather than raised).
    """
    try:
        # Set up paths
        base_path = os.path.dirname(os.path.abspath(__file__))
        doc_path = os.path.join(base_path, 'a2023-45.txt')
        index_path = os.path.join(base_path, 'index.txt')
        chroma_path = os.path.join(base_path, 'chroma_db')

        logger.info(f"Loading documents from {doc_path} and {index_path}")

        # Read and validate files
        with open(doc_path, 'r', encoding='utf-8') as f:
            document = f.read().strip()

        with open(index_path, 'r', encoding='utf-8') as f:
            index_content = [line.strip() for line in f if line.strip()]

        # Process document into sections
        sections = _split_sections(document, index_content)
        if not sections:
            # Bail out before touching the database: there is nothing to store,
            # and replacing an existing collection with nothing would lose data.
            logger.error("Error loading documents: no sections found in the source document")
            return False

        # Prepare data for ChromaDB
        documents = [section["content"] for section in sections]
        metadatas = [
            {
                "title": section["title"],
                "source": "a2023-45.txt",
                "section_number": number,
            }
            for number, section in enumerate(sections, start=1)
        ]
        ids = [f"section_{number}" for number in range(1, len(sections) + 1)]

        # Ensure ChromaDB directory exists
        os.makedirs(chroma_path, exist_ok=True)

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)
        embedding_function = SentenceTransformerEmbeddings()

        # Create new collection (delete if exists). list_collections() is
        # believed to return Collection objects in some ChromaDB releases and
        # plain name strings in others (to be confirmed against the release
        # notes), so both shapes are accepted here.
        existing_names = {
            getattr(col, "name", col) for col in chroma_client.list_collections()
        }
        if "legal_documents" in existing_names:
            chroma_client.delete_collection("legal_documents")

        collection = chroma_client.create_collection(
            name="legal_documents",
            embedding_function=embedding_function
        )

        # Add to ChromaDB
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

        logger.info(f"Successfully loaded {len(documents)} sections into ChromaDB")
        return True

    except Exception as e:
        # Top-level boundary: report the failure with its traceback and signal
        # it through the return value instead of propagating.
        logger.exception(f"Error loading documents: {e}")
        return False
|
102 |
+
|
103 |
+
if __name__ == "__main__":
    # Script entry point: build the index and report the outcome.
    success = load_documents()
    if success:
        print("Documents successfully loaded into ChromaDB")
    else:
        print("Failed to load documents into ChromaDB")
        # Exit non-zero so a calling shell or build step can detect the failure.
        raise SystemExit(1)
|