veerukhannan committed on
Commit
c784c97
·
verified ·
1 Parent(s): 2105dc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -142
app.py CHANGED
@@ -2,91 +2,80 @@ import gradio as gr
2
  from typing import List, Dict
3
  from langchain_core.prompts import ChatPromptTemplate
4
  from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
5
- from transformers import pipeline
6
  import chromadb
7
  from chromadb.utils import embedding_functions
8
- from sentence_transformers import SentenceTransformer
9
  import torch
10
- from tqdm import tqdm
11
  import os
12
 
13
- class LegalSearchSystem:
14
  def __init__(self):
15
- print("Initializing Legal Search System...")
16
 
17
  # Initialize ChromaDB
18
  self.chroma_client = chromadb.Client()
19
 
20
  # Initialize embedding function
21
  self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
22
- model_name="all-MiniLM-L6-v2"
 
23
  )
24
 
25
- # Initialize the model for text generation
 
 
 
 
 
 
 
26
  pipe = pipeline(
27
  "text-generation",
28
  model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
29
  max_new_tokens=512,
30
  temperature=0.7,
31
  top_p=0.95,
32
- repetition_penalty=1.15
 
33
  )
34
  self.llm = HuggingFacePipeline(pipeline=pipe)
35
 
36
- # Create or get collection
37
- self.collection = self.chroma_client.create_collection(
38
- name="text_collection",
39
- embedding_function=self.embedding_function,
40
- metadata={"hnsw:space": "cosine"}
41
- )
42
 
43
- # Initialize chat templates
44
- self.templates = {
45
- "default": """
46
- You are a legal assistant providing information about the Bharatiya Nyaya Sanhita, 2023.
47
-
48
- Context: {context}
49
- Chat History: {chat_history}
50
- Question: {question}
51
-
52
- Instructions:
53
- 1. Answer based ONLY on the provided context
54
- 2. If information isn't in context, say "I don't have enough information"
55
- 3. Be precise and cite specific sections when possible
56
- 4. Use clear, legal terminology
57
-
58
- Answer:""",
59
-
60
- "summary": """
61
- Provide a summary of the legal provisions from the context.
62
-
63
- Context: {context}
64
- Question: {question}
65
-
66
- Format:
67
- 1. Main Points
68
- 2. Key Provisions
69
- 3. Important Definitions
70
-
71
- Summary:"""
72
- }
73
 
74
- self.chat_history = []
 
 
 
 
 
 
 
 
 
75
  self.initialized = False
76
 
77
- def initialize_embeddings(self) -> str:
78
- """Initialize the system by loading and embedding documents"""
79
  try:
80
  if self.initialized:
81
- return "System already initialized!"
82
-
83
- print("Loading documents and creating embeddings...")
84
 
85
- # Read main text file
86
  with open('a2023-45.txt', 'r', encoding='utf-8') as f:
87
  text_content = f.read()
88
 
89
- # Read index file
90
  with open('index.txt', 'r', encoding='utf-8') as f:
91
  index_lines = f.readlines()
92
 
@@ -97,59 +86,35 @@ class LegalSearchSystem:
97
  chunk = text_content[i:i + chunk_size]
98
  chunks.append(chunk)
99
 
100
- # Add documents to collection
101
- print(f"Processing {len(chunks)} chunks...")
102
- for i, chunk in enumerate(chunks):
103
- # Get corresponding index line if available
104
- index_text = index_lines[i].strip() if i < len(index_lines) else f"Chunk {i+1}"
 
 
 
 
105
 
106
  self.collection.add(
107
- documents=[chunk],
108
- ids=[f"doc_{i}"],
109
- metadatas=[{
110
- "index": index_text,
111
- "chunk_number": i
112
- }]
113
  )
114
 
115
  self.initialized = True
116
- return f"Successfully loaded {len(chunks)} chunks into the system!"
117
-
118
- except Exception as e:
119
- return f"Error initializing system: {str(e)}"
120
-
121
- def verify_system(self) -> str:
122
- """Verify system is working properly"""
123
- try:
124
- # Check document count
125
- count = self.collection.count()
126
- if count == 0:
127
- return "Error: No documents found in the system!"
128
-
129
- # Test basic query
130
- test_query = "What is criminal conspiracy?"
131
- results = self.collection.query(
132
- query_texts=[test_query],
133
- n_results=1
134
- )
135
-
136
- if not results['documents'][0]:
137
- return "Error: Search functionality not working properly!"
138
-
139
- return f"System verification successful! Found {count} documents."
140
 
141
  except Exception as e:
142
- return f"System verification failed: {str(e)}"
 
143
 
144
- def search(self, query: str, n_results: int = 3) -> List[Dict]:
145
- """Search for relevant documents"""
146
- if not self.initialized:
147
- return [{"error": "System not initialized! Please wait."}]
148
-
149
  try:
150
  results = self.collection.query(
151
  query_texts=[query],
152
- n_results=n_results,
153
  include=["documents", "metadatas", "distances"]
154
  )
155
 
@@ -157,7 +122,7 @@ class LegalSearchSystem:
157
  {
158
  "content": doc,
159
  "metadata": meta,
160
- "similarity": 1 - dist
161
  }
162
  for doc, meta, dist in zip(
163
  results['documents'][0],
@@ -166,80 +131,66 @@ class LegalSearchSystem:
166
  )
167
  ]
168
  except Exception as e:
169
- return [{"error": f"Search error: {str(e)}"}]
 
170
 
171
  def chat(self, query: str, history) -> str:
172
- """Process query and return response"""
173
  try:
174
- if not self.initialized:
175
- init_msg = self.initialize_embeddings()
176
- if "Error" in init_msg:
177
- return init_msg
178
 
179
  # Search for relevant content
180
- search_results = self.search(query)
181
 
182
- if "error" in search_results[0]:
183
- return search_results[0]["error"]
184
 
185
- # Prepare context
186
  context = "\n\n".join([
187
  f"[Section {r['metadata']['index']}]\n{r['content']}"
188
  for r in search_results
189
  ])
190
 
191
- # Select template
192
- template_type = "summary" if "summarize" in query.lower() else "default"
193
- prompt = ChatPromptTemplate.from_template(self.templates[template_type])
194
-
195
- # Generate response
196
- chain = prompt | self.llm
197
- response = chain.invoke({
198
  "context": context,
199
- "chat_history": "\n".join([f"{h[0]}: {h[1]}" for h in self.chat_history[-3:]]),
200
  "question": query
201
  })
202
 
203
  # Update chat history
204
- self.chat_history.append(("User", query))
205
- self.chat_history.append(("Assistant", response))
206
 
207
- return response
208
 
209
  except Exception as e:
210
  return f"Error processing query: {str(e)}"
211
 
212
- # Initialize the system
213
- system = LegalSearchSystem()
214
 
215
- # Create Gradio interface
216
- demo = gr.Interface(
217
- fn=system.chat,
218
- inputs=[
219
- gr.Textbox(
220
- label="Your Question",
221
- placeholder="Ask about the Bharatiya Nyaya Sanhita, 2023...",
222
- lines=2
223
- ),
224
- gr.State([]) # For chat history
225
- ],
226
- outputs=gr.Textbox(label="Answer", lines=10),
227
- title="🔍 Bharatiya Nyaya Sanhita, 2023 - Legal Search System",
228
- description="""
229
- Ask questions about the Bharatiya Nyaya Sanhita, 2023:
230
- - For summaries, include the word "summarize" in your question
231
- - For specific provisions, ask directly about the topic
232
- - System will automatically initialize on first query
233
- """,
234
  examples=[
235
- ["What is the definition of criminal conspiracy?"],
236
- ["Summarize the provisions related to theft"],
237
- ["What are the punishments for corruption?"],
238
- ["Explain the concept of culpable homicide"]
239
  ],
240
  theme=gr.themes.Soft()
241
  )
242
 
243
  # Launch the interface
244
  if __name__ == "__main__":
245
- demo.launch()
 
 
 
 
 
 
2
  from typing import List, Dict
3
  from langchain_core.prompts import ChatPromptTemplate
4
  from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
5
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
6
  import chromadb
7
  from chromadb.utils import embedding_functions
 
8
  import torch
 
9
  import os
10
 
11
+ class LegalChatbot:
12
  def __init__(self):
13
+ print("Initializing Legal Chatbot...")
14
 
15
  # Initialize ChromaDB
16
  self.chroma_client = chromadb.Client()
17
 
18
  # Initialize embedding function
19
  self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
20
+ model_name="all-MiniLM-L6-v2",
21
+ device="cpu"
22
  )
23
 
24
+ # Create collection
25
+ self.collection = self.chroma_client.create_collection(
26
+ name="text_collection",
27
+ embedding_function=self.embedding_function,
28
+ metadata={"hnsw:space": "cosine"}
29
+ )
30
+
31
+ # Initialize the model - using a smaller model suitable for CPU
32
  pipe = pipeline(
33
  "text-generation",
34
  model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
35
  max_new_tokens=512,
36
  temperature=0.7,
37
  top_p=0.95,
38
+ repetition_penalty=1.15,
39
+ device="cpu"
40
  )
41
  self.llm = HuggingFacePipeline(pipeline=pipe)
42
 
43
+ # Create prompt template
44
+ self.template = """
45
+ IMPORTANT: You are a helpful assistant that provides information about the Bharatiya Nyaya Sanhita, 2023 based on the retrieved context.
 
 
 
46
 
47
+ STRICT RULES:
48
+ 1. Base your response ONLY on the provided context
49
+ 2. If you cannot find relevant information, respond with: "I apologize, but I cannot find information about that in the database."
50
+ 3. Do not make assumptions or use external knowledge
51
+ 4. Be concise and accurate in your responses
52
+ 5. If quoting from the context, clearly indicate it
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ Context: {context}
55
+
56
+ Chat History: {chat_history}
57
+
58
+ Question: {question}
59
+
60
+ Answer:"""
61
+
62
+ self.prompt = ChatPromptTemplate.from_template(self.template)
63
+ self.chat_history = ""
64
  self.initialized = False
65
 
66
+ def _initialize_database(self) -> bool:
67
+ """Initialize the database with document content"""
68
  try:
69
  if self.initialized:
70
+ return True
71
+
72
+ print("Loading documents into database...")
73
 
74
+ # Read the main text file
75
  with open('a2023-45.txt', 'r', encoding='utf-8') as f:
76
  text_content = f.read()
77
 
78
+ # Read the index file
79
  with open('index.txt', 'r', encoding='utf-8') as f:
80
  index_lines = f.readlines()
81
 
 
86
  chunk = text_content[i:i + chunk_size]
87
  chunks.append(chunk)
88
 
89
+ # Add documents in batches
90
+ batch_size = 50
91
+ for i in range(0, len(chunks), batch_size):
92
+ batch = chunks[i:i + batch_size]
93
+ batch_ids = [f"doc_{j}" for j in range(i, i + len(batch))]
94
+ batch_metadata = [{
95
+ "index": index_lines[j].strip() if j < len(index_lines) else f"Chunk {j+1}",
96
+ "chunk_number": j
97
+ } for j in range(i, i + len(batch))]
98
 
99
  self.collection.add(
100
+ documents=batch,
101
+ ids=batch_ids,
102
+ metadatas=batch_metadata
 
 
 
103
  )
104
 
105
  self.initialized = True
106
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  except Exception as e:
109
+ print(f"Error initializing database: {str(e)}")
110
+ return False
111
 
112
+ def _search_database(self, query: str) -> List[Dict]:
113
+ """Search the database for relevant documents"""
 
 
 
114
  try:
115
  results = self.collection.query(
116
  query_texts=[query],
117
+ n_results=3,
118
  include=["documents", "metadatas", "distances"]
119
  )
120
 
 
122
  {
123
  "content": doc,
124
  "metadata": meta,
125
+ "score": 1 - dist
126
  }
127
  for doc, meta, dist in zip(
128
  results['documents'][0],
 
131
  )
132
  ]
133
  except Exception as e:
134
+ print(f"Error searching database: {str(e)}")
135
+ return []
136
 
137
  def chat(self, query: str, history) -> str:
138
+ """Process a query and return a response"""
139
  try:
140
+ # Initialize database if needed
141
+ if not self.initialized and not self._initialize_database():
142
+ return "Error: Unable to initialize the database. Please try again."
 
143
 
144
  # Search for relevant content
145
+ search_results = self._search_database(query)
146
 
147
+ if not search_results:
148
+ return "I apologize, but I cannot find information about that in the database."
149
 
150
+ # Extract and combine relevant content
151
  context = "\n\n".join([
152
  f"[Section {r['metadata']['index']}]\n{r['content']}"
153
  for r in search_results
154
  ])
155
 
156
+ # Generate response using LLM
157
+ chain = self.prompt | self.llm
158
+ result = chain.invoke({
 
 
 
 
159
  "context": context,
160
+ "chat_history": self.chat_history,
161
  "question": query
162
  })
163
 
164
  # Update chat history
165
+ self.chat_history += f"\nUser: {query}\nAI: {result}\n"
 
166
 
167
+ return result
168
 
169
  except Exception as e:
170
  return f"Error processing query: {str(e)}"
171
 
172
# Initialize the chatbot (loads the embedding model and the LLM at import time)
chatbot = LegalChatbot()

# Create the Gradio interface; ChatInterface calls chatbot.chat(message, history)
iface = gr.ChatInterface(
    chatbot.chat,
    title="Bharatiya Nyaya Sanhita, 2023 - Legal Assistant",
    description="Ask questions about the Bharatiya Nyaya Sanhita, 2023. The system will initialize on your first query.",
    examples=[
        "What is criminal conspiracy?",
        "What are the punishments for corruption?",
        "Explain the concept of culpable homicide",
        "What constitutes theft under the act?"
    ],
    theme=gr.themes.Soft()
)

# Launch the interface
if __name__ == "__main__":
    # `enable_queue` is not accepted by launch() in current Gradio releases
    # (it raises TypeError); request queueing through queue() instead.
    iface.queue()
    iface.launch(
        share=False,
        debug=False,
        show_error=True
    )