Spaces:

veerukhannan
/

Nyaya-Mitra

Runtime error

App Files Files Community

veerukhannan committed on Nov 24, 2024

Commit

8f0efdf

verified ·

1 Parent(s): 7f37b5a

Create add_embeddings.py

Browse files

Files changed (1) hide show

add_embeddings.py +101 -0

add_embeddings.py ADDED Viewed

	@@ -0,0 +1,101 @@

import os
import re
from typing import Dict, List, Optional

import chromadb
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
class LegalDocumentProcessor:
    """Chunk legal PDFs, embed the chunks and index them in a ChromaDB collection.

    Typical use: construct the processor, call
    ``process_and_store_documents()`` once to build the index, then call
    ``search_documents()`` for each query.
    """

    # Default corpus: law code -> path of the PDF holding that law.
    DEFAULT_PDF_FILES: Dict[str, str] = {
        "BNS": "c:/Users/veeru/hf_env/BNS.pdf",
        "BNSS": "c:/Users/veeru/hf_env/BNSS.pdf",
        "BSA": "c:/Users/veeru/hf_env/BSA.pdf",
    }

    # Structural boundaries. The capture group makes ``re.split`` return the
    # delimiters too, so no text is lost when the pieces are re-joined.
    _BOUNDARY_RE = re.compile(r'(Chapter \d+|Section \d+|\n\n)')

    # Number of chunks encoded and written to the collection per call.
    _STORE_BATCH_SIZE = 64

    def __init__(self):
        """Load the embedding model and open (or create) the collection."""
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.chroma_client = chromadb.Client()
        # get_or_create_collection instead of create_collection: the latter
        # raises when a collection with this name already exists, e.g. when a
        # second processor is constructed against the same client state.
        self.collection = self.chroma_client.get_or_create_collection(
            name="indian_legal_docs",
            metadata={"description": "Indian Criminal Law Documents"}
        )

    def process_pdf(self, pdf_path: str) -> List[str]:
        """Extract the text of a PDF and split it into chunks.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The non-empty text chunks produced by ``_split_into_chunks``.
        """
        reader = PdfReader(pdf_path)
        # A page may yield no text (None or ""); treat that as empty rather
        # than failing on string concatenation.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next.
        return self._split_into_chunks("\n".join(page_texts))

    def _split_into_chunks(self, text: str, max_chunk_size: int = 1000) -> List[str]:
        """Split text into chunks of at most ``max_chunk_size`` characters.

        The text is cut at "Chapter N" / "Section N" headings and blank
        lines. Each heading or blank line is kept attached to the text that
        follows it, so a heading is never stranded at the end of the previous
        chunk. Consecutive units are packed into one chunk while they fit; a
        single unit longer than the limit is sliced, preferably at whitespace.

        Args:
            text: The text to split.
            max_chunk_size: Maximum length of a returned chunk, in characters.

        Returns:
            Stripped, non-empty chunks in document order.

        Raises:
            ValueError: If ``max_chunk_size`` is not positive.
        """
        if max_chunk_size <= 0:
            raise ValueError("max_chunk_size must be a positive integer")
        if not text:
            return []

        # re.split with one capture group alternates text / delimiter, so the
        # odd positions are the delimiters. Glue each one to the next text.
        units: List[str] = []
        prefix = ""
        for index, piece in enumerate(self._BOUNDARY_RE.split(text)):
            if index % 2:
                prefix += piece
            elif piece:
                units.append(prefix + piece)
                prefix = ""
        if prefix:
            units.append(prefix)

        chunks: List[str] = []

        def flush(candidate: str) -> None:
            # Whitespace-only chunks carry no information; never emit them.
            cleaned = candidate.strip()
            if cleaned:
                chunks.append(cleaned)

        current = ""
        for unit in units:
            if len(current) + len(unit) <= max_chunk_size:
                current += unit
                continue
            flush(current)
            # Slice a unit that cannot fit in any chunk on its own.
            while len(unit) > max_chunk_size:
                window = unit[:max_chunk_size]
                cut = max(window.rfind(" "), window.rfind("\n"))
                if cut <= 0:
                    # No usable whitespace: fall back to a hard cut.
                    cut = max_chunk_size
                flush(unit[:cut])
                unit = unit[cut:]
            current = unit
        flush(current)
        return chunks

    def process_and_store_documents(self, pdf_files: Optional[Dict[str, str]] = None) -> None:
        """Chunk, embed and store every configured PDF in the collection.

        Args:
            pdf_files: Mapping of law code to PDF path. When omitted, the
                paths in ``DEFAULT_PDF_FILES`` are used.

        Raises:
            Whatever ``PdfReader`` raises for an unreadable or missing file.
        """
        sources = self.DEFAULT_PDF_FILES if pdf_files is None else pdf_files
        for law_code, pdf_path in sources.items():
            chunks = self.process_pdf(pdf_path)
            source_name = os.path.basename(pdf_path)
            # Encode and write in batches rather than one chunk per call.
            for start in range(0, len(chunks), self._STORE_BATCH_SIZE):
                batch = chunks[start:start + self._STORE_BATCH_SIZE]
                ids = [f"{law_code}_chunk_{start + offset}" for offset in range(len(batch))]
                embeddings = self.embedding_model.encode(batch).tolist()
                # upsert keeps a re-run from colliding with ids stored earlier.
                self.collection.upsert(
                    ids=ids,
                    documents=batch,
                    embeddings=embeddings,
                    metadatas=[
                        {
                            "law_code": law_code,
                            "chunk_id": chunk_id,
                            "source": source_name,
                        }
                        for chunk_id in ids
                    ],
                )

    def search_documents(self, query: str, n_results: int = 3) -> Dict[str, List]:
        """Return the stored chunks most similar to ``query``.

        Args:
            query: Free-text question.
            n_results: Maximum number of chunks to return.

        Returns:
            A dict with two parallel lists: ``"documents"`` (chunk texts) and
            ``"metadatas"`` (the metadata dict stored with each chunk). Both
            lists are empty for a blank query or an empty collection.
        """
        if not query or not query.strip():
            return {"documents": [], "metadatas": []}
        available = self.collection.count()
        if available == 0:
            return {"documents": [], "metadatas": []}

        query_embedding = self.embedding_model.encode([query]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never ask for more neighbours than the collection holds.
            n_results=min(n_results, available)
        )
        # One query was sent, so take the first (only) result row.
        return {
            "documents": results["documents"][0],
            "metadatas": results["metadatas"][0]
        }
if __name__ == "__main__":
    # Build the index from the configured PDFs.
    processor = LegalDocumentProcessor()
    processor.process_and_store_documents()

    # Smoke-test retrieval with one sample question.
    test_query = "What are the provisions for digital evidence?"
    results = processor.search_documents(test_query)
    documents, metadatas = results["documents"], results["metadatas"]

    print(f"Query: {test_query}")
    print("\nResults:")
    # Show each hit's source file and the first 200 characters of its text.
    for metadata, doc in zip(metadatas, documents):
        print(f"\nFrom {metadata['source']}:")
        print(f"{doc[:200]}...")