Spaces:

veerukhannan
/

advisor

Sleeping

File size: 6,782 Bytes

5f5f8de
6404fd8
2b58400
5f5f8de
c784c97
6404fd8
 
2105dc2
5f5f8de
a8dcc53
c784c97
5f5f8de
c784c97
2105dc2
a8dcc53
6404fd8
 
2105dc2
6404fd8
c784c97
 
6404fd8
 
c784c97
 
 
 
 
 
 
 
6404fd8
 
2105dc2
6404fd8
 
 
c784c97
 
6404fd8
 
 
c784c97
 
 
2105dc2
c784c97
 
 
 
 
 
6404fd8
c784c97
 
 
 
 
 
 
 
 
 
2105dc2
121ef90
c784c97
 
121ef90
2105dc2
c784c97
 
 
2105dc2
c784c97
2105dc2
 
 
c784c97
2105dc2
 
6404fd8
2105dc2
 
6404fd8
2105dc2
 
6404fd8
 
c784c97
 
 
 
 
 
 
 
 
2105dc2
a8dcc53
c784c97
 
 
a8dcc53
a53e1b6
2105dc2
c784c97
2105dc2
 
c784c97
 
2105dc2
c784c97
 
105179a
6404fd8
 
c784c97
a8dcc53
6404fd8
a8dcc53
 
 
 
 
c784c97
a8dcc53
 
 
 
 
 
 
a53e1b6
c784c97
 
a8dcc53
6404fd8
c784c97
cb7bbf3
c784c97
 
 
cb7bbf3
a8dcc53
c784c97
5f5f8de
c784c97
 
121ef90
c784c97
2105dc2
 
 
 
5f5f8de
c784c97
 
 
6404fd8
c784c97
6404fd8
 
121ef90
6404fd8
c784c97
a8dcc53
c784c97
5f5f8de
 
6404fd8
5f5f8de
c784c97
 
105179a
c784c97
 
 
 
 
6404fd8
c784c97
 
 
 
6404fd8
 
 
5f5f8de
6404fd8
38dd749
c784c97
 
2b58400
c784c97

import gradio as gr
from typing import List, Dict
from langchain_huggingface import HuggingFacePipeline  # Fixed import
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import chromadb
from chromadb.utils import embedding_functions
import torch
import os

class LegalChatbot:
    """Retrieval-augmented chatbot over the Bharatiya Nyaya Sanhita, 2023 text.

    The constructor builds a Chroma collection and a CPU text-generation
    pipeline. The source documents are loaded into the collection lazily, on
    the first call to :meth:`chat`.
    """

    # Number of most recent user/assistant exchanges rendered into the prompt.
    # Bounding this keeps the prompt from growing with the conversation length.
    MAX_HISTORY_TURNS = 5

    def __init__(self):
        print("Initializing Legal Chatbot...")

        # Initialize ChromaDB
        self.chroma_client = chromadb.Client()

        # Initialize embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2",
            device="cpu"
        )

        # get_or_create (rather than create) so that building a second
        # instance in the same process does not fail because the collection
        # name is already taken.
        self.collection = self.chroma_client.get_or_create_collection(
            name="text_collection",
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"}
        )

        # Initialize the model - using a smaller model suitable for CPU
        pipe = pipeline(
            "text-generation",
            model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            max_new_tokens=512,
            # temperature/top_p only take effect in sampling mode.
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.15,
            device="cpu"
        )
        self.llm = HuggingFacePipeline(pipeline=pipe)

        # Create prompt template
        self.template = """
        IMPORTANT: You are a helpful assistant that provides information about the Bharatiya Nyaya Sanhita, 2023 based on the retrieved context.
        
        STRICT RULES:
        1. Base your response ONLY on the provided context
        2. If you cannot find relevant information, respond with: "I apologize, but I cannot find information about that in the database."
        3. Do not make assumptions or use external knowledge
        4. Be concise and accurate in your responses
        5. If quoting from the context, clearly indicate it
        
        Context: {context}
        
        Chat History: {chat_history}
        
        Question: {question}
        
        Answer:"""

        self.prompt = ChatPromptTemplate.from_template(self.template)
        # Transcript used only when chat() is called with history=None, i.e.
        # by callers that do not track the conversation themselves.
        self.chat_history = ""
        self.initialized = False

    def _initialize_database(self) -> bool:
        """Load the document text into the collection in 512-character chunks.

        Returns:
            True if the collection is (or already was) populated, False if
            loading failed; the error is printed rather than raised.
        """
        try:
            if self.initialized:
                return True

            print("Loading documents into database...")

            # Read the main text file
            with open('a2023-45.txt', 'r', encoding='utf-8') as f:
                text_content = f.read()

            # The index only supplies display labels for the chunks, and a
            # "Chunk N" fallback exists below, so a missing index file should
            # not prevent the documents from loading.
            try:
                with open('index.txt', 'r', encoding='utf-8') as f:
                    index_lines = f.readlines()
            except FileNotFoundError:
                print("index.txt not found; using generic chunk labels.")
                index_lines = []

            # Create fixed-size, non-overlapping chunks
            chunk_size = 512
            chunks = [
                text_content[start:start + chunk_size]
                for start in range(0, len(text_content), chunk_size)
            ]

            # Add documents in batches
            batch_size = 50
            for start in range(0, len(chunks), batch_size):
                batch = chunks[start:start + batch_size]
                positions = range(start, start + len(batch))
                batch_ids = [f"doc_{j}" for j in positions]
                batch_metadata = [{
                    "index": index_lines[j].strip() if j < len(index_lines) else f"Chunk {j+1}",
                    "chunk_number": j
                } for j in positions]

                # upsert instead of add: if an earlier attempt failed part-way
                # through, the retry overwrites the chunks that were already
                # stored instead of re-submitting duplicate IDs.
                self.collection.upsert(
                    documents=batch,
                    ids=batch_ids,
                    metadatas=batch_metadata
                )

            self.initialized = True
            return True

        except Exception as e:
            print(f"Error initializing database: {str(e)}")
            return False

    def _search_database(self, query: str) -> List[Dict]:
        """Return up to three chunks most similar to ``query``.

        Each result is a dict with ``content`` (chunk text), ``metadata``
        (as stored at load time) and ``score`` (``1 - distance``). Any error
        is printed and an empty list is returned.
        """
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=3,
                include=["documents", "metadatas", "distances"]
            )

            # Results are nested per query text; only one query was sent.
            return [
                {
                    "content": doc,
                    "metadata": meta,
                    "score": 1 - dist
                }
                for doc, meta, dist in zip(
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0]
                )
            ]
        except Exception as e:
            print(f"Error searching database: {str(e)}")
            return []

    @classmethod
    def _format_history(cls, history) -> str:
        """Render a chat history as a bounded ``User:`` / ``AI:`` transcript.

        Accepts a sequence whose items are either ``(user, assistant)`` pairs
        or message objects/dicts carrying ``role`` and ``content``. Entries
        with non-string or blank text, or with a role other than ``user`` /
        ``assistant``, are skipped. Only the most recent
        ``2 * MAX_HISTORY_TURNS`` lines are kept.

        Returns:
            The transcript wrapped in leading/trailing newlines, or ``""``
            when there is nothing to render.
        """
        if not history:
            return ""

        speakers = {"user": "User", "assistant": "AI"}
        entries = []
        for item in history:
            if isinstance(item, (list, tuple)):
                # Pair format: first element is the user, second the reply.
                labelled = list(zip(("User", "AI"), item))
            else:
                if isinstance(item, dict):
                    role, content = item.get("role"), item.get("content")
                else:
                    role = getattr(item, "role", None)
                    content = getattr(item, "content", None)
                labelled = [(speakers.get(role), content)]

            for speaker, text in labelled:
                if speaker and isinstance(text, str) and text.strip():
                    entries.append(f"{speaker}: {text}")

        recent = entries[-2 * cls.MAX_HISTORY_TURNS:]
        if not recent:
            return ""
        return "\n" + "\n".join(recent) + "\n"

    def chat(self, query: str, history=None) -> str:
        """Answer ``query`` from retrieved context and return the reply text.

        Args:
            query: The user's question.
            history: The conversation so far, as supplied by the caller (for
                example by the chat UI). When given, it is the only source of
                chat history in the prompt, so separate conversations do not
                leak into each other. When ``None``, the instance-level
                ``self.chat_history`` transcript is used and updated instead.

        Returns:
            The model output, a fixed apology when nothing relevant is
            retrieved, or an ``Error ...`` message; exceptions are not
            propagated.
        """
        try:
            # Initialize database if needed
            if not self.initialized and not self._initialize_database():
                return "Error: Unable to initialize the database. Please try again."

            # Search for relevant content
            search_results = self._search_database(query)

            if not search_results:
                return "I apologize, but I cannot find information about that in the database."

            # Extract and combine relevant content
            context = "\n\n".join(
                f"[Section {r['metadata']['index']}]\n{r['content']}"
                for r in search_results
            )

            use_internal_history = history is None
            if use_internal_history:
                chat_history = self.chat_history
            else:
                chat_history = self._format_history(history)

            # Generate response using LLM
            chain = self.prompt | self.llm
            result = chain.invoke({
                "context": context,
                "chat_history": chat_history,
                "question": query
            })

            # Only callers that do not pass their own history rely on the
            # instance-level transcript.
            if use_internal_history:
                self.chat_history += f"\nUser: {query}\nAI: {result}\n"

            return result

        except Exception as e:
            return f"Error processing query: {str(e)}"

# Build the single chatbot instance that the UI callback is bound to.
chatbot = LegalChatbot()

# Starter questions shown in the chat UI.
_example_questions = [
    "What is criminal conspiracy?",
    "What are the punishments for corruption?",
    "Explain the concept of culpable homicide",
    "What constitutes theft under the act?",
]

# Wire the chatbot's chat method into a Gradio chat interface.
iface = gr.ChatInterface(
    fn=chatbot.chat,
    title="Bharatiya Nyaya Sanhita, 2023 - Legal Assistant",
    description="Ask questions about the Bharatiya Nyaya Sanhita, 2023. The system will initialize on your first query.",
    examples=_example_questions,
    theme=gr.themes.Soft(),
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch(share=False, show_error=True)