Spaces:

veerukhannan
/

advisor

Sleeping

App Files Community

veerukhannan committed on Nov 23, 2024

Commit

c784c97

verified ·

1 Parent(s): 2105dc2

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -142

app.py CHANGED Viewed

@@ -2,91 +2,80 @@ import gradio as gr
 from typing import List, Dict
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
-from transformers import pipeline
 import chromadb
 from chromadb.utils import embedding_functions
-from sentence_transformers import SentenceTransformer
 import torch
-from tqdm import tqdm
 import os
-class LegalSearchSystem:
     def __init__(self):
-        print("Initializing Legal Search System...")
         # Initialize ChromaDB
         self.chroma_client = chromadb.Client()
         # Initialize embedding function
         self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
-            model_name="all-MiniLM-L6-v2"
         )
-        # Initialize the model for text generation
         pipe = pipeline(
             "text-generation",
             model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
             max_new_tokens=512,
             temperature=0.7,
             top_p=0.95,
-            repetition_penalty=1.15
         )
         self.llm = HuggingFacePipeline(pipeline=pipe)
-        # Create or get collection
-        self.collection = self.chroma_client.create_collection(
-            name="text_collection",
-            embedding_function=self.embedding_function,
-            metadata={"hnsw:space": "cosine"}
-        )
-        # Initialize chat templates
-        self.templates = {
-            "default": """
-            You are a legal assistant providing information about the Bharatiya Nyaya Sanhita, 2023.
-            Context: {context}
-            Chat History: {chat_history}
-            Question: {question}
-            Instructions:
-            1. Answer based ONLY on the provided context
-            2. If information isn't in context, say "I don't have enough information"
-            3. Be precise and cite specific sections when possible
-            4. Use clear, legal terminology
-            Answer:""",
-            "summary": """
-            Provide a summary of the legal provisions from the context.
-            Context: {context}
-            Question: {question}
-            Format:
-            1. Main Points
-            2. Key Provisions
-            3. Important Definitions
-            Summary:"""
-        }
-        self.chat_history = []
         self.initialized = False
-    def initialize_embeddings(self) -> str:
-        """Initialize the system by loading and embedding documents"""
         try:
             if self.initialized:
-                return "System already initialized!"
-            print("Loading documents and creating embeddings...")
-            # Read main text file
             with open('a2023-45.txt', 'r', encoding='utf-8') as f:
                 text_content = f.read()
-            # Read index file
             with open('index.txt', 'r', encoding='utf-8') as f:
                 index_lines = f.readlines()
@@ -97,59 +86,35 @@ class LegalSearchSystem:
                 chunk = text_content[i:i + chunk_size]
                 chunks.append(chunk)
-            # Add documents to collection
-            print(f"Processing {len(chunks)} chunks...")
-            for i, chunk in enumerate(chunks):
-                # Get corresponding index line if available
-                index_text = index_lines[i].strip() if i < len(index_lines) else f"Chunk {i+1}"
                 self.collection.add(
-                    documents=[chunk],
-                    ids=[f"doc_{i}"],
-                    metadatas=[{
-                        "index": index_text,
-                        "chunk_number": i
-                    }]
                 )
             self.initialized = True
-            return f"Successfully loaded {len(chunks)} chunks into the system!"
-        except Exception as e:
-            return f"Error initializing system: {str(e)}"
-    def verify_system(self) -> str:
-        """Verify system is working properly"""
-        try:
-            # Check document count
-            count = self.collection.count()
-            if count == 0:
-                return "Error: No documents found in the system!"
-            # Test basic query
-            test_query = "What is criminal conspiracy?"
-            results = self.collection.query(
-                query_texts=[test_query],
-                n_results=1
-            )
-            if not results['documents'][0]:
-                return "Error: Search functionality not working properly!"
-            return f"System verification successful! Found {count} documents."
         except Exception as e:
-            return f"System verification failed: {str(e)}"
-    def search(self, query: str, n_results: int = 3) -> List[Dict]:
-        """Search for relevant documents"""
-        if not self.initialized:
-            return [{"error": "System not initialized! Please wait."}]
         try:
             results = self.collection.query(
                 query_texts=[query],
-                n_results=n_results,
                 include=["documents", "metadatas", "distances"]
             )
@@ -157,7 +122,7 @@ class LegalSearchSystem:
                 {
                     "content": doc,
                     "metadata": meta,
-                    "similarity": 1 - dist
                 }
                 for doc, meta, dist in zip(
                     results['documents'][0],
@@ -166,80 +131,66 @@ class LegalSearchSystem:
                 )
             ]
         except Exception as e:
-            return [{"error": f"Search error: {str(e)}"}]
     def chat(self, query: str, history) -> str:
-        """Process query and return response"""
         try:
-            if not self.initialized:
-                init_msg = self.initialize_embeddings()
-                if "Error" in init_msg:
-                    return init_msg
             # Search for relevant content
-            search_results = self.search(query)
-            if "error" in search_results[0]:
-                return search_results[0]["error"]
-            # Prepare context
             context = "\n\n".join([
                 f"[Section {r['metadata']['index']}]\n{r['content']}"
                 for r in search_results
             ])
-            # Select template
-            template_type = "summary" if "summarize" in query.lower() else "default"
-            prompt = ChatPromptTemplate.from_template(self.templates[template_type])
-            # Generate response
-            chain = prompt | self.llm
-            response = chain.invoke({
                 "context": context,
-                "chat_history": "\n".join([f"{h[0]}: {h[1]}" for h in self.chat_history[-3:]]),
                 "question": query
             })
             # Update chat history
-            self.chat_history.append(("User", query))
-            self.chat_history.append(("Assistant", response))
-            return response
         except Exception as e:
             return f"Error processing query: {str(e)}"
-# Initialize the system
-system = LegalSearchSystem()
-# Create Gradio interface
-demo = gr.Interface(
-    fn=system.chat,
-    inputs=[
-        gr.Textbox(
-            label="Your Question",
-            placeholder="Ask about the Bharatiya Nyaya Sanhita, 2023...",
-            lines=2
-        ),
-        gr.State([])  # For chat history
-    ],
-    outputs=gr.Textbox(label="Answer", lines=10),
-    title="🔍 Bharatiya Nyaya Sanhita, 2023 - Legal Search System",
-    description="""
-    Ask questions about the Bharatiya Nyaya Sanhita, 2023:
-    - For summaries, include the word "summarize" in your question
-    - For specific provisions, ask directly about the topic
-    - System will automatically initialize on first query
-    """,
     examples=[
-        ["What is the definition of criminal conspiracy?"],
-        ["Summarize the provisions related to theft"],
-        ["What are the punishments for corruption?"],
-        ["Explain the concept of culpable homicide"]
     ],
     theme=gr.themes.Soft()
 )
 # Launch the interface
 if __name__ == "__main__":
-    demo.launch()

 from typing import List, Dict
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import chromadb
 from chromadb.utils import embedding_functions
 import torch
 import os
+class LegalChatbot:
     def __init__(self):
+        print("Initializing Legal Chatbot...")
         # Initialize ChromaDB
         self.chroma_client = chromadb.Client()
         # Initialize embedding function
         self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
+            model_name="all-MiniLM-L6-v2",
+            device="cpu"
         )
+        # Create collection
+        self.collection = self.chroma_client.create_collection(
+            name="text_collection",
+            embedding_function=self.embedding_function,
+            metadata={"hnsw:space": "cosine"}
+        )
+        # Initialize the model - using a smaller model suitable for CPU
         pipe = pipeline(
             "text-generation",
             model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
             max_new_tokens=512,
             temperature=0.7,
             top_p=0.95,
+            repetition_penalty=1.15,
+            device="cpu"
         )
         self.llm = HuggingFacePipeline(pipeline=pipe)
+        # Create prompt template
+        self.template = """
+        IMPORTANT: You are a helpful assistant that provides information about the Bharatiya Nyaya Sanhita, 2023 based on the retrieved context.
+        STRICT RULES:
+        1. Base your response ONLY on the provided context
+        2. If you cannot find relevant information, respond with: "I apologize, but I cannot find information about that in the database."
+        3. Do not make assumptions or use external knowledge
+        4. Be concise and accurate in your responses
+        5. If quoting from the context, clearly indicate it
+        Context: {context}
+        Chat History: {chat_history}
+        Question: {question}
+        Answer:"""
+        self.prompt = ChatPromptTemplate.from_template(self.template)
+        self.chat_history = ""
         self.initialized = False
+    def _initialize_database(self) -> bool:
+        """Initialize the database with document content"""
         try:
             if self.initialized:
+                return True
+            print("Loading documents into database...")
+            # Read the main text file
             with open('a2023-45.txt', 'r', encoding='utf-8') as f:
                 text_content = f.read()
+            # Read the index file
             with open('index.txt', 'r', encoding='utf-8') as f:
                 index_lines = f.readlines()
                 chunk = text_content[i:i + chunk_size]
                 chunks.append(chunk)
+            # Add documents in batches
+            batch_size = 50
+            for i in range(0, len(chunks), batch_size):
+                batch = chunks[i:i + batch_size]
+                batch_ids = [f"doc_{j}" for j in range(i, i + len(batch))]
+                batch_metadata = [{
+                    "index": index_lines[j].strip() if j < len(index_lines) else f"Chunk {j+1}",
+                    "chunk_number": j
+                } for j in range(i, i + len(batch))]
                 self.collection.add(
+                    documents=batch,
+                    ids=batch_ids,
+                    metadatas=batch_metadata
                 )
             self.initialized = True
+            return True
         except Exception as e:
+            print(f"Error initializing database: {str(e)}")
+            return False
+    def _search_database(self, query: str) -> List[Dict]:
+        """Search the database for relevant documents"""
         try:
             results = self.collection.query(
                 query_texts=[query],
+                n_results=3,
                 include=["documents", "metadatas", "distances"]
             )
                 {
                     "content": doc,
                     "metadata": meta,
+                    "score": 1 - dist
                 }
                 for doc, meta, dist in zip(
                     results['documents'][0],
                 )
             ]
         except Exception as e:
+            print(f"Error searching database: {str(e)}")
+            return []
     def chat(self, query: str, history) -> str:
+        """Process a query and return a response"""
         try:
+            # Initialize database if needed
+            if not self.initialized and not self._initialize_database():
+                return "Error: Unable to initialize the database. Please try again."
             # Search for relevant content
+            search_results = self._search_database(query)
+            if not search_results:
+                return "I apologize, but I cannot find information about that in the database."
+            # Extract and combine relevant content
             context = "\n\n".join([
                 f"[Section {r['metadata']['index']}]\n{r['content']}"
                 for r in search_results
             ])
+            # Generate response using LLM
+            chain = self.prompt | self.llm
+            result = chain.invoke({
                 "context": context,
+                "chat_history": self.chat_history,
                 "question": query
             })
             # Update chat history
+            self.chat_history += f"\nUser: {query}\nAI: {result}\n"
+            return result
         except Exception as e:
             return f"Error processing query: {str(e)}"
+# Initialize the chatbot
+chatbot = LegalChatbot()
+# Create the Gradio interface
+iface = gr.ChatInterface(
+    chatbot.chat,
+    title="Bharatiya Nyaya Sanhita, 2023 - Legal Assistant",
+    description="Ask questions about the Bharatiya Nyaya Sanhita, 2023. The system will initialize on your first query.",
     examples=[
+        "What is criminal conspiracy?",
+        "What are the punishments for corruption?",
+        "Explain the concept of culpable homicide",
+        "What constitutes theft under the act?"
     ],
     theme=gr.themes.Soft()
 )
 # Launch the interface
 if __name__ == "__main__":
+    iface.launch(
+        share=False,
+        debug=False,
+        show_error=True,
+        enable_queue=True
+    )