veerukhannan committed on
Commit
6702977
·
verified ·
1 Parent(s): 58e4f0b

Create add_embeddings.py

Browse files
Files changed (1) hide show
  1. add_embeddings.py +108 -0
add_embeddings.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+ from sentence_transformers import SentenceTransformer
4
+ from loguru import logger
5
+
6
class SentenceTransformerEmbeddings:
    """Callable adapter that turns texts into embedding vectors.

    Wraps a ``SentenceTransformer`` model so an instance can be passed
    wherever a plain "texts in, vectors out" callable is expected.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Instantiate the ``SentenceTransformer`` named by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: list[str]) -> list[list[float]]:
        """Encode ``input`` with the wrapped model and return nested lists.

        The model's ``encode`` result is converted with ``.tolist()`` so the
        caller receives plain Python lists rather than an array object.
        """
        # NOTE(review): the parameter name ``input`` shadows the builtin; it is
        # presumably dictated by the ChromaDB embedding-function interface, so
        # it is left unchanged — confirm before renaming.
        return self.model.encode(input).tolist()
13
+
14
def load_documents() -> bool:
    """Load the source text into ChromaDB, one entry per titled section.

    Reads ``a2023-45.txt`` and ``index.txt`` from the directory containing
    this file, splits the document at every line that contains an index
    entry, and stores the resulting sections in a freshly created
    ``legal_documents`` collection persisted under ``chroma_db``.

    The input files are read and parsed *before* the existing collection is
    touched, so a missing/unreadable input or a document that yields no
    sections does not wipe previously stored data.

    Returns:
        True when the sections were stored. False when any step failed or no
        sections were found; the error is logged rather than raised.
    """
    try:
        # Set up paths
        base_path = os.path.dirname(os.path.abspath(__file__))
        doc_path = os.path.join(base_path, 'a2023-45.txt')
        index_path = os.path.join(base_path, 'index.txt')
        chroma_path = os.path.join(base_path, 'chroma_db')

        logger.info(f"Loading documents from {doc_path} and {index_path}")

        # Read and validate files first: failing here must not cost us the
        # collection that is already stored.
        with open(doc_path, 'r', encoding='utf-8') as f:
            document = f.read().strip()

        with open(index_path, 'r', encoding='utf-8') as f:
            index_content = [line.strip() for line in f if line.strip()]

        sections = _split_into_sections(document, index_content)

        # Prepare data for ChromaDB
        documents = []
        metadatas = []
        ids = []

        for number, section in enumerate(sections, start=1):
            if section["content"].strip():
                documents.append(section["content"])
                metadatas.append({
                    "title": section["title"],
                    "source": "a2023-45.txt",
                    "section_number": number,
                })
                ids.append(f"section_{number}")

        if not documents:
            # Nothing to index; leave any existing collection intact.
            logger.error(
                f"Error loading documents: no sections found in {doc_path}"
            )
            return False

        # Ensure ChromaDB directory exists
        os.makedirs(chroma_path, exist_ok=True)

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)
        embedding_function = SentenceTransformerEmbeddings()

        # Create new collection (delete if exists).
        # NOTE(review): list_collections() appears to return Collection
        # objects in some chromadb releases and bare name strings in others
        # (0.6.x, as far as I recall) — confirm against the pinned version.
        # getattr() accepts both shapes.
        existing_names = {
            getattr(col, "name", col)
            for col in chroma_client.list_collections()
        }
        if "legal_documents" in existing_names:
            chroma_client.delete_collection("legal_documents")

        collection = chroma_client.create_collection(
            name="legal_documents",
            embedding_function=embedding_function,
        )

        # Add to ChromaDB
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )

        logger.info(f"Successfully loaded {len(documents)} sections into ChromaDB")
        return True

    except Exception as e:
        # Top-level boundary of the script: report the failure via the return
        # value instead of propagating.
        logger.error(f"Error loading documents: {str(e)}")
        return False


def _split_into_sections(
    document: str, index_content: list[str]
) -> list[dict[str, str]]:
    """Split ``document`` into titled sections.

    A line that contains any entry of ``index_content`` as a substring starts
    a new section and becomes its title. The non-blank lines that follow
    (stripped, joined with newlines) form the section's content. Text that
    precedes the first title, and titles with no content, are discarded.
    """
    sections = []
    title = ""
    body = []

    for raw_line in document.split('\n'):
        line = raw_line.strip()
        if any(entry in line for entry in index_content):
            if title and body:
                sections.append({"title": title, "content": "\n".join(body)})
            title = line
            # Also drops any untitled text collected before the first title.
            body = []
        elif line:
            body.append(line)

    # Add final section
    if title and body:
        sections.append({"title": title, "content": "\n".join(body)})

    return sections
102
+
103
if __name__ == "__main__":
    # Run the import once and report the outcome on stdout.
    outcome = (
        "Documents successfully loaded into ChromaDB"
        if load_documents()
        else "Failed to load documents into ChromaDB"
    )
    print(outcome)