veerukhannan committed on
Commit
e55cfd3
·
verified ·
1 Parent(s): 97a7638

Update test_embeddings.py

Browse files
Files changed (1) hide show
  1. test_embeddings.py +117 -15
test_embeddings.py CHANGED
@@ -11,26 +11,134 @@ class SentenceTransformerEmbeddings:
11
  embeddings = self.model.encode(input)
12
  return embeddings.tolist()
13
 
14
- def test_chromadb_content():
15
- """Test if ChromaDB has the required content"""
16
  try:
17
- # Set up ChromaDB path
18
  base_path = os.path.dirname(os.path.abspath(__file__))
 
 
19
  chroma_path = os.path.join(base_path, 'chroma_db')
20
 
21
- if not os.path.exists(chroma_path):
22
- logger.error(f"ChromaDB directory not found at {chroma_path}")
 
 
 
 
23
  return False
24
 
 
 
 
25
  # Initialize ChromaDB
26
  chroma_client = chromadb.PersistentClient(path=chroma_path)
 
27
 
28
- # Check if collection exists
29
  collections = chroma_client.list_collections()
30
- if not any(col.name == "legal_documents" for col in collections):
31
- logger.error("Legal documents collection not found in ChromaDB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  return False
33
 
 
 
 
 
 
 
 
34
  # Get collection
35
  collection = chroma_client.get_collection(
36
  name="legal_documents",
@@ -55,13 +163,7 @@ def test_chromadb_content():
55
  logger.error("Test query returned no results")
56
  return False
57
 
58
- # Print sample content
59
- logger.info("Sample content from ChromaDB:")
60
- for i, (doc, metadata) in enumerate(zip(test_results['documents'][0], test_results['metadatas'][0])):
61
- logger.info(f"\nDocument {i+1}:")
62
- logger.info(f"Title: {metadata['title']}")
63
- logger.info(f"Content preview: {doc[:200]}...")
64
-
65
  return True
66
 
67
  except Exception as e:
 
11
  embeddings = self.model.encode(input)
12
  return embeddings.tolist()
13
 
14
def _split_into_sections(document, index_content):
    """Split *document* into titled sections.

    A line is treated as a section title when any entry of *index_content*
    occurs in it as a substring. The non-blank lines that follow, up to the
    next title, form that section's content. Text that appears before the
    first title, and titles that have no content lines, are dropped.

    Args:
        document: Full document text.
        index_content: Non-empty, stripped title strings to look for.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts in document order.
    """
    sections = []
    title = ""
    body_lines = []

    def flush():
        # Only keep a section that has both a title and some content.
        if title and body_lines:
            sections.append({
                "title": title,
                "content": "\n".join(body_lines),
            })

    for raw_line in document.split('\n'):
        line = raw_line.strip()
        if any(index_line in line for index_line in index_content):
            flush()
            title = line
            body_lines = []
        elif line:
            body_lines.append(line)

    flush()
    return sections


def initialize_chromadb() -> bool:
    """Initialize ChromaDB and load documents if needed.

    Looks for ``a2023-45.txt`` and ``index.txt`` next to this file and a
    persistent ChromaDB store in the sibling ``chroma_db`` directory. If the
    ``legal_documents`` collection already holds data nothing is loaded;
    otherwise the document is split into sections (using the index lines as
    section titles) and the collection is (re)created and populated.

    Returns:
        True if the collection already had content or was populated
        successfully; False if a required file is missing, no sections could
        be extracted, or any exception occurred (the exception is logged).
    """
    collection_name = "legal_documents"
    source_name = "a2023-45.txt"

    try:
        # Set up paths
        base_path = os.path.dirname(os.path.abspath(__file__))
        doc_path = os.path.join(base_path, source_name)
        index_path = os.path.join(base_path, 'index.txt')
        chroma_path = os.path.join(base_path, 'chroma_db')

        # Check if required files exist
        if not os.path.exists(doc_path):
            logger.error(f"Document file not found at {doc_path}")
            return False
        if not os.path.exists(index_path):
            logger.error(f"Index file not found at {index_path}")
            return False

        # Ensure ChromaDB directory exists
        os.makedirs(chroma_path, exist_ok=True)

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)
        embedding_function = SentenceTransformerEmbeddings()

        # Depending on the installed chromadb release, list_collections()
        # yields Collection objects or plain name strings; accept both.
        collection_exists = any(
            getattr(col, "name", col) == collection_name
            for col in chroma_client.list_collections()
        )

        if collection_exists:
            collection = chroma_client.get_collection(
                name=collection_name,
                embedding_function=embedding_function
            )
            if collection.count() > 0:
                logger.info("ChromaDB collection already exists and has content")
                return True

        # If we get here, we need to create or repopulate the collection
        logger.info("Loading documents into ChromaDB...")

        # Read and parse the sources BEFORE touching the collection, so a
        # read/parse failure does not needlessly drop and recreate it.
        with open(doc_path, 'r', encoding='utf-8') as f:
            document = f.read().strip()

        with open(index_path, 'r', encoding='utf-8') as f:
            index_content = [line.strip() for line in f if line.strip()]

        sections = _split_into_sections(document, index_content)
        if not sections:
            logger.error("No valid sections found in document")
            return False

        # Replace the existing (empty) collection with a fresh one
        if collection_exists:
            chroma_client.delete_collection(collection_name)

        collection = chroma_client.create_collection(
            name=collection_name,
            embedding_function=embedding_function
        )

        # Prepare and add data to ChromaDB; section numbers are 1-based
        documents = [section["content"] for section in sections]
        metadatas = [
            {
                "title": section["title"],
                "source": source_name,
                "section_number": number,
            }
            for number, section in enumerate(sections, start=1)
        ]
        ids = [f"section_{number}" for number in range(1, len(sections) + 1)]

        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

        logger.info(f"Successfully loaded {len(documents)} sections into ChromaDB")
        return True

    except Exception as e:
        # Top-level boundary: report failure to the caller, but keep the
        # traceback in the log so the root cause is not lost.
        logger.exception(f"Error initializing ChromaDB: {e}")
        return False
127
+
128
+ def test_chromadb_content():
129
+ """Test if ChromaDB has the required content"""
130
+ try:
131
+ # First ensure ChromaDB is initialized
132
+ if not initialize_chromadb():
133
  return False
134
 
135
+ # Set up ChromaDB path
136
+ base_path = os.path.dirname(os.path.abspath(__file__))
137
+ chroma_path = os.path.join(base_path, 'chroma_db')
138
+
139
+ # Initialize ChromaDB
140
+ chroma_client = chromadb.PersistentClient(path=chroma_path)
141
+
142
  # Get collection
143
  collection = chroma_client.get_collection(
144
  name="legal_documents",
 
163
  logger.error("Test query returned no results")
164
  return False
165
 
166
+ logger.info("ChromaDB content verification successful")
 
 
 
 
 
 
167
  return True
168
 
169
  except Exception as e: