veerukhannan committed (verified)
Commit 2105dc2 · 1 Parent(s): bb05d9c

Update app.py

Files changed (1): app.py (+122 -153)
app.py CHANGED
@@ -2,162 +2,150 @@ import gradio as gr
from typing import List, Dict
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
- import torch
- from sentence_transformers import SentenceTransformer
+ from transformers import pipeline
import chromadb
from chromadb.utils import embedding_functions
- import numpy as np
+ from sentence_transformers import SentenceTransformer
+ import torch
from tqdm import tqdm
import os
- from huggingface_hub import login
- from dotenv import load_dotenv

- # Load environment variables
- load_dotenv()
-
- # Login to Hugging Face Hub if token is available
- if os.getenv("HUGGINGFACE_API_TOKEN"):
-     login(token=os.getenv("HUGGINGFACE_API_TOKEN"))
-
- class EnhancedChatbot:
+ class LegalSearchSystem:
    def __init__(self):
+         print("Initializing Legal Search System...")
+
        # Initialize ChromaDB
        self.chroma_client = chromadb.Client()

-         # Initialize embedding model using sentence-transformers
+         # Initialize embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )

-         # Create collection with cosine similarity
-         self.collection = self.chroma_client.create_collection(
-             name="text_collection",
-             embedding_function=self.embedding_function,
-             metadata={"hnsw:space": "cosine"}
-         )
-
-         # Initialize the LLM with 8-bit quantization for efficiency
-         model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             load_in_8bit=True,
-             device_map="auto",
-             torch_dtype=torch.float16
-         )
-
+         # Initialize the model for text generation
        pipe = pipeline(
            "text-generation",
-             model=model,
-             tokenizer=tokenizer,
+             model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
-             repetition_penalty=1.15,
-             do_sample=True
+             repetition_penalty=1.15
        )
-
        self.llm = HuggingFacePipeline(pipeline=pipe)

-         # Enhanced prompt templates with specific use cases
+         # Create or get collection
+         self.collection = self.chroma_client.create_collection(
+             name="text_collection",
+             embedding_function=self.embedding_function,
+             metadata={"hnsw:space": "cosine"}
+         )
+
+         # Initialize chat templates
        self.templates = {
            "default": """
-             You are a knowledgeable assistant providing accurate information based on the given context.
-
-             GUIDELINES:
-             1. Use ONLY the provided context
-             2. If information is not in context, say "I don't have enough information"
-             3. Be concise and clear
-             4. Use markdown formatting for better readability
-             5. If quoting, use proper citation format
+             You are a legal assistant providing information about the Bharatiya Nyaya Sanhita, 2023.

            Context: {context}
            Chat History: {chat_history}
            Question: {question}

-             Response:""",
-
-             "summary": """
-             Create a comprehensive summary of the provided context.
-
-             Context: {context}
-
-             REQUIREMENTS:
-             1. Structure the summary with clear headings
-             2. Use bullet points for key information
-             3. Highlight important concepts
-             4. Maintain factual accuracy
-
-             Summary:""",
-
-             "technical": """
-             Provide a detailed technical analysis of the context.
-
-             Context: {context}
-             Question: {question}
-
-             GUIDELINES:
-             1. Focus on technical specifications
-             2. Explain complex concepts clearly
-             3. Use appropriate technical terminology
-             4. Include relevant examples from context
-             5. Structure the response logically
+             Instructions:
+             1. Answer based ONLY on the provided context
+             2. If information isn't in context, say "I don't have enough information"
+             3. Be precise and cite specific sections when possible
+             4. Use clear, legal terminology

-             Technical Analysis:""",
+             Answer:""",

-             "comparative": """
-             Compare and analyze different aspects from the context.
+             "summary": """
+             Provide a summary of the legal provisions from the context.

            Context: {context}
            Question: {question}

-             APPROACH:
-             1. Identify key points for comparison
-             2. Analyze similarities and differences
-             3. Present balanced viewpoints
-             4. Use tables or lists for clarity
+             Format:
+             1. Main Points
+             2. Key Provisions
+             3. Important Definitions

-             Comparison:"""
+             Summary:"""
        }

        self.chat_history = []
-         self.loaded = False
+         self.initialized = False

-     def load_data(self, file_path: str, chunk_size: int = 512, overlap: int = 50):
-         """Load and index data with progress bar"""
-         if self.loaded:
-             return True
-
+     def initialize_embeddings(self) -> str:
+         """Initialize the system by loading and embedding documents"""
        try:
-             # Read the text file
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 content = f.read()
+             if self.initialized:
+                 return "System already initialized!"
+
+             print("Loading documents and creating embeddings...")
+
+             # Read main text file
+             with open('a2023-45.txt', 'r', encoding='utf-8') as f:
+                 text_content = f.read()
+
+             # Read index file
+             with open('index.txt', 'r', encoding='utf-8') as f:
+                 index_lines = f.readlines()

-             # Create chunks with overlap
+             # Create chunks
+             chunk_size = 512
            chunks = []
-             for i in range(0, len(content), chunk_size - overlap):
-                 chunk = content[i:i + chunk_size]
+             for i in range(0, len(text_content), chunk_size):
+                 chunk = text_content[i:i + chunk_size]
                chunks.append(chunk)

-             # Add documents to collection with progress bar
-             for i, chunk in tqdm(enumerate(chunks), desc="Loading chunks", total=len(chunks)):
+             # Add documents to collection
+             print(f"Processing {len(chunks)} chunks...")
+             for i, chunk in enumerate(chunks):
+                 # Get corresponding index line if available
+                 index_text = index_lines[i].strip() if i < len(index_lines) else f"Chunk {i+1}"
+
                self.collection.add(
                    documents=[chunk],
-                     ids=[f"chunk_{i}"],
-                     metadatas=[{"source": file_path, "chunk_id": i}]
+                     ids=[f"doc_{i}"],
+                     metadatas=[{
+                         "index": index_text,
+                         "chunk_number": i
+                     }]
                )

-             self.loaded = True
-             return True
+             self.initialized = True
+             return f"Successfully loaded {len(chunks)} chunks into the system!"

        except Exception as e:
-             print(f"Error loading data: {str(e)}")
-             return False
+             return f"Error initializing system: {str(e)}"

-     def _search_documents(self, query: str, n_results: int = 5) -> List[Dict]:
+     def verify_system(self) -> str:
+         """Verify system is working properly"""
+         try:
+             # Check document count
+             count = self.collection.count()
+             if count == 0:
+                 return "Error: No documents found in the system!"
+
+             # Test basic query
+             test_query = "What is criminal conspiracy?"
+             results = self.collection.query(
+                 query_texts=[test_query],
+                 n_results=1
+             )
+
+             if not results['documents'][0]:
+                 return "Error: Search functionality not working properly!"
+
+             return f"System verification successful! Found {count} documents."
+
+         except Exception as e:
+             return f"System verification failed: {str(e)}"
+
+     def search(self, query: str, n_results: int = 3) -> List[Dict]:
        """Search for relevant documents"""
+         if not self.initialized:
+             return [{"error": "System not initialized! Please wait."}]
+
        try:
            results = self.collection.query(
                query_texts=[query],
@@ -169,7 +157,7 @@ class EnhancedChatbot:
                {
                    "content": doc,
                    "metadata": meta,
-                     "similarity": 1 - dist  # Convert distance to similarity
+                     "similarity": 1 - dist
                }
                for doc, meta, dist in zip(
                    results['documents'][0],
@@ -178,44 +166,30 @@
                )
            ]
        except Exception as e:
-             print(f"Search error: {str(e)}")
-             return []
-
-     def _select_template(self, query: str) -> str:
-         """Select appropriate template based on query content"""
-         query_lower = query.lower()
-
-         if any(word in query_lower for word in ["summarize", "summary", "overview"]):
-             return "summary"
-         elif any(word in query_lower for word in ["technical", "explain how", "how does"]):
-             return "technical"
-         elif any(word in query_lower for word in ["compare", "difference", "versus", "vs"]):
-             return "comparative"
-         return "default"
+             return [{"error": f"Search error: {str(e)}"}]

    def chat(self, query: str, history) -> str:
-         """Process query and generate response"""
+         """Process query and return response"""
        try:
-             if not self.loaded:
-                 if not self.load_data('a2023-45.txt'):
-                     return "Error: Failed to load document data."
+             if not self.initialized:
+                 init_msg = self.initialize_embeddings()
+                 if "Error" in init_msg:
+                     return init_msg

            # Search for relevant content
-             search_results = self._search_documents(query)
+             search_results = self.search(query)

-             if not search_results:
-                 return "I apologize, but I couldn't find relevant information in the database."
+             if "error" in search_results[0]:
+                 return search_results[0]["error"]

-             # Prepare context with similarity scores
-             context_parts = []
-             for result in search_results:
-                 context_parts.append(
-                     f"[Similarity: {result['similarity']:.2f}]\n{result['content']}"
-                 )
-             context = "\n\n".join(context_parts)
+             # Prepare context
+             context = "\n\n".join([
+                 f"[Section {r['metadata']['index']}]\n{r['content']}"
+                 for r in search_results
+             ])

-             # Select and use appropriate template
-             template_type = self._select_template(query)
+             # Select template
+             template_type = "summary" if "summarize" in query.lower() else "default"
            prompt = ChatPromptTemplate.from_template(self.templates[template_type])

            # Generate response
@@ -235,38 +209,33 @@
        except Exception as e:
            return f"Error processing query: {str(e)}"

- # Initialize chatbot
- chatbot = EnhancedChatbot()
+ # Initialize the system
+ system = LegalSearchSystem()

# Create Gradio interface
demo = gr.Interface(
-     fn=chatbot.chat,
+     fn=system.chat,
    inputs=[
        gr.Textbox(
            label="Your Question",
-             placeholder="Ask anything about the document...",
+             placeholder="Ask about the Bharatiya Nyaya Sanhita, 2023...",
            lines=2
        ),
        gr.State([])  # For chat history
    ],
    outputs=gr.Textbox(label="Answer", lines=10),
-     title="🤖 Enhanced Document Q&A System",
+     title="🔍 Bharatiya Nyaya Sanhita, 2023 - Legal Search System",
    description="""
-     ### Advanced Document Question-Answering System
-
-     **Available Query Types:**
-     - 📝 **General Questions**: Just ask normally
-     - 📊 **Summaries**: Include words like "summarize" or "overview"
-     - 🔧 **Technical Details**: Use words like "technical" or "explain how"
-     - 🔄 **Comparisons**: Ask to "compare" or use "versus"
-
-     *The system will automatically select the best response format based on your question.*
+     Ask questions about the Bharatiya Nyaya Sanhita, 2023:
+     - For summaries, include the word "summarize" in your question
+     - For specific provisions, ask directly about the topic
+     - System will automatically initialize on first query
    """,
    examples=[
-         ["Can you summarize the main points of the document?"],
-         ["What are the technical details about the implementation?"],
-         ["Compare the different approaches mentioned in the text."],
-         ["What are the key concepts discussed?"]
+         ["What is the definition of criminal conspiracy?"],
+         ["Summarize the provisions related to theft"],
+         ["What are the punishments for corruption?"],
+         ["Explain the concept of culpable homicide"]
    ],
    theme=gr.themes.Soft()
)
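The main behavioral change in this commit is the indexing strategy: the old load_data stepped through the file in strides of chunk_size - overlap, so consecutive chunks shared 50 characters, while the new initialize_embeddings cuts disjoint 512-character chunks so each one pairs with a line of index.txt. A minimal sketch of the difference (illustrative only, not part of app.py; the sample string and sizes are invented):

def chunk_with_overlap(text: str, chunk_size: int = 512, overlap: int = 50) -> list:
    # Old behavior: stride of chunk_size - overlap, so neighbors share `overlap` chars
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]

def chunk_disjoint(text: str, chunk_size: int = 512) -> list:
    # New behavior: stride of chunk_size, chunks never overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

sample = "x" * 1400
assert len(chunk_with_overlap(sample)) == 4  # starts at 0, 462, 924, 1386
assert len(chunk_disjoint(sample)) == 3      # starts at 0, 512, 1024

Dropping the overlap keeps chunk ids aligned one-to-one with index entries, at the cost of occasionally splitting a statutory section across two chunks.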
 
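One line that survives both versions is the 1 - dist conversion in search. It works because the collection is created with metadata={"hnsw:space": "cosine"}, so query() reports cosine distance (1 minus cosine similarity). A self-contained sketch using the same embedding model as the commit (the demo document and collection name are invented):

import chromadb
from chromadb.utils import embedding_functions

client = chromadb.Client()
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)
# "cosine" makes query() return cosine distances rather than the default L2
col = client.create_collection(
    name="demo",
    embedding_function=ef,
    metadata={"hnsw:space": "cosine"}
)
col.add(documents=["Theft is the dishonest taking of movable property."], ids=["doc_0"])

res = col.query(query_texts=["What is theft?"], n_results=1)
dist = res["distances"][0][0]
print(f"distance={dist:.3f}  similarity={1 - dist:.3f}")  # higher similarity = closer match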
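Since chat() calls initialize_embeddings() on the first query, the Space needs no explicit setup step. A quick smoke test of the committed module might look like the following (an assumption-laden sketch: it presumes a2023-45.txt and index.txt sit next to app.py and that the module is importable as app):

# Hypothetical smoke test, not part of the commit
from app import system  # importing app.py builds the pipeline and the collection

print(system.initialize_embeddings())  # "Successfully loaded N chunks into the system!"
print(system.verify_system())          # "System verification successful! Found N documents."
print(system.chat("Summarize the provisions related to theft", []))  # routes to the "summary" template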