veerukhannan committed on
Commit
8f0efdf
·
verified ·
1 Parent(s): 7f37b5a

Create add_embeddings.py

Browse files
Files changed (1) hide show
  1. add_embeddings.py +101 -0
add_embeddings.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
from typing import Dict, List, Optional

import chromadb
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
7
+
8
class LegalDocumentProcessor:
    """Embed Indian criminal-law PDFs and store/search them in ChromaDB.

    Workflow: extract text from each PDF, split it into section-aware
    chunks, embed the chunks with a SentenceTransformer model, and index
    them in a ChromaDB collection for semantic search.
    """

    def __init__(self):
        # Small, fast general-purpose sentence-embedding model.
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        # NOTE(review): chromadb.Client() is in-memory by default here, so the
        # index is rebuilt each run — confirm whether persistence is wanted.
        self.chroma_client = chromadb.Client()
        # create_collection raises if the collection already exists (original
        # behavior, kept intentionally).
        self.collection = self.chroma_client.create_collection(
            name="indian_legal_docs",
            metadata={"description": "Indian Criminal Law Documents"}
        )

    def process_pdf(self, pdf_path: str) -> List[str]:
        """Extract all text from the PDF at ``pdf_path`` and chunk it.

        Args:
            pdf_path: Path to a readable PDF file.

        Returns:
            List of text chunks (may be empty for an image-only PDF).
        """
        reader = PdfReader(pdf_path)
        # extract_text() can return None for image-only or empty pages;
        # the original `text += page.extract_text()` would raise TypeError.
        text = "".join(page.extract_text() or "" for page in reader.pages)

        # Split into meaningful chunks (by sections/paragraphs).
        return self._split_into_chunks(text)

    def _split_into_chunks(self, text: str, max_chunk_size: int = 1000) -> List[str]:
        """Split ``text`` into chunks of roughly ``max_chunk_size`` characters.

        Splits on chapter/section headings or blank lines so that legal
        context is preserved. A single section longer than
        ``max_chunk_size`` is kept whole rather than cut mid-sentence.
        """
        # The capturing group keeps the delimiters ("Section 12", "\n\n", ...)
        # as their own list elements so they are not lost from the chunks.
        sections = re.split(r'(Chapter \d+|Section \d+|\n\n)', text)

        chunks: List[str] = []
        current_chunk = ""

        for section in sections:
            if len(current_chunk) + len(section) < max_chunk_size:
                current_chunk += section
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = section

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def process_and_store_documents(self, pdf_files: Optional[Dict[str, str]] = None) -> None:
        """Chunk, embed and index every configured legal PDF.

        Args:
            pdf_files: Mapping of law code (e.g. "BNS") to PDF path. Defaults
                to the original hard-coded locations for backward
                compatibility; pass your own mapping to index other files.
        """
        if pdf_files is None:
            pdf_files = {
                "BNS": "c:/Users/veeru/hf_env/BNS.pdf",
                "BNSS": "c:/Users/veeru/hf_env/BNSS.pdf",
                "BSA": "c:/Users/veeru/hf_env/BSA.pdf"
            }

        for law_code, pdf_path in pdf_files.items():
            chunks = self.process_pdf(pdf_path)
            if not chunks:
                continue  # nothing extractable from this PDF

            # Encode all chunks in one batch — one model call per document
            # instead of one per chunk (same vectors, much faster).
            embeddings = self.embedding_model.encode(chunks).tolist()
            ids = [f"{law_code}_chunk_{i}" for i in range(len(chunks))]

            self.collection.add(
                documents=chunks,
                embeddings=embeddings,
                metadatas=[{
                    "law_code": law_code,
                    "chunk_id": chunk_id,
                    "source": os.path.basename(pdf_path)
                } for chunk_id in ids],
                ids=ids
            )

    def search_documents(self, query: str, n_results: int = 3) -> Dict[str, List]:
        """Return the ``n_results`` chunks most similar to ``query``.

        Returns:
            Dict with "documents" (chunk texts) and "metadatas" (their
            metadata dicts), each a list of length <= ``n_results``.
        """
        query_embedding = self.embedding_model.encode([query]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=n_results
        )

        # collection.query nests results per query; unwrap the single query.
        # (Original annotation said List[Dict]; the value is a dict of lists.)
        return {
            "documents": results["documents"][0],
            "metadatas": results["metadatas"][0]
        }
88
+
89
def _main() -> None:
    """Build the legal-document vector store, then run one sample query."""
    processor = LegalDocumentProcessor()
    processor.process_and_store_documents()

    # Smoke-test the retrieval path with a representative legal question.
    test_query = "What are the provisions for digital evidence?"
    results = processor.search_documents(test_query)

    print(f"Query: {test_query}")
    print("\nResults:")
    for doc, metadata in zip(results["documents"], results["metadatas"]):
        print(f"\nFrom {metadata['source']}:")
        print(doc[:200] + "...")


if __name__ == "__main__":
    _main()