veerukhannan committed on
Commit
eca979c
·
verified ·
1 Parent(s): 14bc0aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -44
app.py CHANGED
@@ -5,12 +5,21 @@ from openai import OpenAI
5
  import json
6
  from typing import List, Dict
7
  import re
 
 
8
 
9
  class LegalAssistant:
10
  def __init__(self):
11
  # Initialize ChromaDB
12
  self.chroma_client = chromadb.Client()
13
- self.collection = self.chroma_client.get_or_create_collection("legal_documents")
 
 
 
 
 
 
 
14
 
15
  # Initialize Mistral AI client
16
  self.mistral_client = OpenAI(
@@ -19,40 +28,104 @@ class LegalAssistant:
19
  )
20
 
21
  # Define system prompt with strict rules
22
- self.system_prompt = """You are a specialized legal assistant trained on Indian law. You MUST follow these strict rules:
 
 
 
23
 
24
  RESPONSE FORMAT RULES:
25
  1. ALWAYS structure your response in this exact JSON format:
26
  {
27
- "answer": "Your detailed answer here",
28
- "reference_sections": ["Section X of Act Y", ...],
29
- "summary": "2-3 line summary",
30
- "confidence": "HIGH/MEDIUM/LOW"
 
 
 
 
 
 
 
 
 
 
31
  }
 
 
 
32
 
33
- CONTENT RULES:
34
- 1. NEVER make assumptions or provide information not supported by Indian law
35
- 2. ALWAYS cite specific sections, acts, and legal precedents
36
- 3. If information is insufficient, explicitly state "Insufficient information" in answer
37
- 4. NEVER provide legal advice, only legal information
38
- 5. For any constitutional matters, ALWAYS cite relevant Articles
39
-
40
- ACCURACY RULES:
41
- 1. If confidence is less than 80%, mark as LOW confidence
42
- 2. If multiple interpretations exist, list ALL with citations
43
- 3. If law has been amended, specify the latest amendment date
44
- 4. For case law, cite the full case reference
45
-
46
- PROHIBITED:
47
- 1. NO personal opinions
48
- 2. NO hypothetical scenarios
49
- 3. NO interpretation of ongoing cases
50
- 4. NO advice on specific legal situations
51
 
52
  ERROR HANDLING:
53
- 1. If query is unclear: Request clarification
54
- 2. If outside Indian law scope: State "Outside scope of Indian law"
55
- 3. If conflicting laws exist: List all applicable laws"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  def validate_query(self, query: str) -> tuple[bool, str]:
58
  """Validate the input query"""
@@ -69,17 +142,27 @@ ERROR HANDLING:
69
  try:
70
  results = self.collection.query(
71
  query_texts=[query],
72
- n_results=3
 
73
  )
74
 
75
  if results and results['documents']:
76
  documents = results['documents'][0]
77
- metadata = results.get('metadatas', [[]])[0]
78
- sources = [m.get('source', 'Unknown') for m in metadata]
79
- return "\n\n".join(documents), sources
 
 
 
 
 
 
 
 
80
  return "", []
 
81
  except Exception as e:
82
- print(f"Search error: {str(e)}")
83
  return "", []
84
 
85
  def get_response(self, query: str) -> Dict:
@@ -98,10 +181,26 @@ ERROR HANDLING:
98
  # Get relevant context from ChromaDB
99
  context, sources = self._search_documents(query)
100
 
101
- # Prepare content
102
- content = f"""Context: {context}
103
- Sources: {', '.join(sources)}
104
- Question: {query}""" if context else query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  # Get response from Mistral AI
107
  response = self.mistral_client.chat.completions.create(
@@ -124,13 +223,29 @@ Question: {query}""" if context else query
124
  if response.choices and len(response.choices) > 0:
125
  try:
126
  result = json.loads(response.choices[0].message.content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  return {
128
  "answer": result.get("answer", "No answer provided"),
129
- "references": result.get("reference_sections", []),
130
  "summary": result.get("summary", ""),
131
  "confidence": result.get("confidence", "LOW")
132
  }
133
  except json.JSONDecodeError:
 
134
  return {
135
  "answer": "Error: Response format invalid",
136
  "references": [],
@@ -139,13 +254,14 @@ Question: {query}""" if context else query
139
  }
140
 
141
  return {
142
- "answer": "No response received",
143
  "references": [],
144
  "summary": "Response generation failed",
145
  "confidence": "LOW"
146
  }
147
 
148
  except Exception as e:
 
149
  return {
150
  "answer": f"Error: {str(e)}",
151
  "references": [],
@@ -158,6 +274,7 @@ assistant = LegalAssistant()
158
 
159
  # Create Gradio interface
160
  def process_query(query: str) -> tuple:
 
161
  response = assistant.get_response(query)
162
  return (
163
  response["answer"],
@@ -180,7 +297,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
180
  with gr.Row():
181
  query_input = gr.Textbox(
182
  label="Enter your legal query",
183
- placeholder="e.g., What is the legal age for marriage in India as per current laws?"
184
  )
185
 
186
  with gr.Row():
@@ -194,15 +311,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
194
 
195
  with gr.Row():
196
  with gr.Column():
197
- references_output = gr.Textbox(label="Legal References", lines=3)
198
  with gr.Column():
199
  summary_output = gr.Textbox(label="Summary", lines=2)
200
 
201
  gr.Markdown("""
202
  ### Important Notes:
203
- - This assistant provides legal information, not legal advice
204
- - Always verify information with a qualified legal professional
205
- - Information is based on Indian law only
206
  """)
207
 
208
  submit_btn.click(
@@ -212,4 +329,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
212
  )
213
 
214
  # Launch the app
215
- demo.launch()
 
 
5
  import json
6
  from typing import List, Dict
7
  import re
8
+ from sentence_transformers import SentenceTransformer
9
+ from loguru import logger
10
 
11
  class LegalAssistant:
12
  def __init__(self):
13
  # Initialize ChromaDB
14
  self.chroma_client = chromadb.Client()
15
+ self.collection = self.chroma_client.get_or_create_collection(
16
+ name="legal_documents",
17
+ embedding_function=SentenceTransformer('all-MiniLM-L6-v2')
18
+ )
19
+
20
+ # Load documents if collection is empty
21
+ if self.collection.count() == 0:
22
+ self._load_documents()
23
 
24
  # Initialize Mistral AI client
25
  self.mistral_client = OpenAI(
 
28
  )
29
 
30
  # Define system prompt with strict rules
31
+ self.system_prompt = """You are a specialized legal assistant that MUST follow these STRICT rules:
32
+
33
+ CRITICAL RULE:
34
+ YOU MUST ONLY USE INFORMATION FROM THE PROVIDED CONTEXT. DO NOT USE ANY EXTERNAL KNOWLEDGE, INCLUDING KNOWLEDGE ABOUT IPC, CONSTITUTION, OR ANY OTHER LEGAL DOCUMENTS.
35
 
36
  RESPONSE FORMAT RULES:
37
  1. ALWAYS structure your response in this exact JSON format:
38
  {
39
+ "answer": "Your detailed answer here using ONLY information from the provided context",
40
+ "reference_sections": ["Exact section titles from the context"],
41
+ "summary": "2-3 line summary using ONLY information from context",
42
+ "confidence": "HIGH/MEDIUM/LOW based on context match"
43
+ }
44
+
45
+ STRICT CONTENT RULES:
46
+ 1. NEVER mention or reference IPC, Constitution, or any laws not present in the context
47
+ 2. If the information is not in the context, respond ONLY with:
48
+ {
49
+ "answer": "This information is not present in the provided document.",
50
+ "reference_sections": [],
51
+ "summary": "Information not found in document",
52
+ "confidence": "LOW"
53
  }
54
+ 3. ONLY cite sections that are explicitly present in the provided context
55
+ 4. DO NOT make assumptions or inferences beyond the context
56
+ 5. DO NOT combine information from external knowledge
57
 
58
+ CONTEXT USAGE RULES:
59
+ 1. HIGH confidence: Only when exact information is found in context
60
+ 2. MEDIUM confidence: When partial information is found
61
+ 3. LOW confidence: When information is unclear or not found
62
+ 4. If multiple sections are relevant, cite ALL relevant sections from context
63
+
64
+ PROHIBITED ACTIONS:
65
+ 1. NO references to IPC sections
66
+ 2. NO references to Constitutional articles
67
+ 3. NO mentions of case law not in context
68
+ 4. NO legal interpretations beyond context
69
+ 5. NO combining document information with external knowledge
 
 
 
 
 
 
70
 
71
  ERROR HANDLING:
72
+ 1. If query is about laws not in context: State "This topic is not covered in the provided document"
73
+ 2. If query is unclear: Request specific clarification about which part of the document to check
74
+ 3. If context is insufficient: State "The document does not contain this information"
75
+ """
76
+
77
def _load_documents(self):
    """Split a2023-45.txt into sections using index.txt and index them.

    Each non-blank line of ``index.txt`` is treated as a heading marker. A
    line of the main document that contains any marker starts a new section
    whose title is that (stripped) document line; the lines that follow,
    up to the next heading, form the section content. Text that appears
    before the first heading is kept as a section with an empty title.

    Every section with non-empty content is added to ``self.collection``
    with ``title``, ``source`` and 1-based ``section_number`` metadata and
    the id ``section_<n>``.

    Raises:
        Exception: any error raised while reading the files or adding to
            the collection is logged and then re-raised unchanged.
    """
    try:
        # Read the main document
        with open('a2023-45.txt', 'r', encoding='utf-8') as f:
            document = f.read()

        # Read the index. Blank lines must be dropped: an empty string is a
        # substring of every line, so a single blank index entry would make
        # every document line look like a heading and no section content
        # would ever be collected.
        with open('index.txt', 'r', encoding='utf-8') as f:
            headings = [entry.strip() for entry in f if entry.strip()]

        # Walk the document, cutting a new section at every heading line.
        sections = []
        current_title = ""
        current_lines = []

        for line in document.split('\n'):
            if any(heading in line for heading in headings):
                content = "\n".join(current_lines).strip()
                # Skip empty / whitespace-only bodies (e.g. two headings
                # in a row) so nothing blank is indexed.
                if content:
                    sections.append({
                        "title": current_title,
                        "content": content
                    })
                current_title = line.strip()
                current_lines = []
            else:
                current_lines.append(line)

        # Add the last section
        content = "\n".join(current_lines).strip()
        if content:
            sections.append({
                "title": current_title,
                "content": content
            })

        # Add to ChromaDB in a single batch; skip the call entirely when
        # nothing was parsed, since there is nothing to add.
        if sections:
            self.collection.add(
                documents=[section["content"] for section in sections],
                metadatas=[
                    {
                        "title": section["title"],
                        "source": "a2023-45.txt",
                        "section_number": number
                    }
                    for number, section in enumerate(sections, start=1)
                ],
                ids=[f"section_{number}" for number in range(1, len(sections) + 1)]
            )

        logger.info(f"Loaded {len(sections)} sections into ChromaDB")

    except Exception as e:
        logger.error(f"Error loading documents: {str(e)}")
        raise
129
 
130
  def validate_query(self, query: str) -> tuple[bool, str]:
131
  """Validate the input query"""
 
142
  try:
143
  results = self.collection.query(
144
  query_texts=[query],
145
+ n_results=3,
146
+ include=["metadatas", "documents"]
147
  )
148
 
149
  if results and results['documents']:
150
  documents = results['documents'][0]
151
+ metadata = results['metadatas'][0]
152
+
153
+ # Format the context with section titles
154
+ formatted_docs = []
155
+ references = []
156
+
157
+ for doc, meta in zip(documents, metadata):
158
+ formatted_docs.append(f"{meta['title']}:\n{doc}")
159
+ references.append(f"{meta['title']} (Section {meta['section_number']})")
160
+
161
+ return "\n\n".join(formatted_docs), references
162
  return "", []
163
+
164
  except Exception as e:
165
+ logger.error(f"Search error: {str(e)}")
166
  return "", []
167
 
168
  def get_response(self, query: str) -> Dict:
 
181
  # Get relevant context from ChromaDB
182
  context, sources = self._search_documents(query)
183
 
184
+ if not context:
185
+ return {
186
+ "answer": "This information is not present in the provided document.",
187
+ "references": [],
188
+ "summary": "Information not found in document",
189
+ "confidence": "LOW"
190
+ }
191
+
192
+ # Prepare content with explicit instructions
193
+ content = f"""IMPORTANT: ONLY use information from the following context to answer the question. DO NOT use any external knowledge.
194
+
195
+ Context Sections:
196
+ {context}
197
+
198
+ Available Document Sections:
199
+ {', '.join(sources)}
200
+
201
+ Question: {query}
202
+
203
+ Remember: ONLY use information from the above context. If the information is not in the context, state that it's not in the document."""
204
 
205
  # Get response from Mistral AI
206
  response = self.mistral_client.chat.completions.create(
 
223
  if response.choices and len(response.choices) > 0:
224
  try:
225
  result = json.loads(response.choices[0].message.content)
226
+
227
+ # Validate that references only contain sections from sources
228
+ valid_references = [ref for ref in result.get("reference_sections", [])
229
+ if any(source in ref for source in sources)]
230
+
231
+ # If references mention unauthorized sources, return error
232
+ if len(valid_references) != len(result.get("reference_sections", [])):
233
+ logger.warning("Response contained unauthorized references")
234
+ return {
235
+ "answer": "Error: Response contained unauthorized references. Only information from the provided document is allowed.",
236
+ "references": [],
237
+ "summary": "Invalid response generated",
238
+ "confidence": "LOW"
239
+ }
240
+
241
  return {
242
  "answer": result.get("answer", "No answer provided"),
243
+ "references": valid_references,
244
  "summary": result.get("summary", ""),
245
  "confidence": result.get("confidence", "LOW")
246
  }
247
  except json.JSONDecodeError:
248
+ logger.error("Failed to parse response JSON")
249
  return {
250
  "answer": "Error: Response format invalid",
251
  "references": [],
 
254
  }
255
 
256
  return {
257
+ "answer": "No valid response received",
258
  "references": [],
259
  "summary": "Response generation failed",
260
  "confidence": "LOW"
261
  }
262
 
263
  except Exception as e:
264
+ logger.error(f"Error in get_response: {str(e)}")
265
  return {
266
  "answer": f"Error: {str(e)}",
267
  "references": [],
 
274
 
275
  # Create Gradio interface
276
  def process_query(query: str) -> tuple:
277
+ """Process the query and return formatted response"""
278
  response = assistant.get_response(query)
279
  return (
280
  response["answer"],
 
297
  with gr.Row():
298
  query_input = gr.Textbox(
299
  label="Enter your legal query",
300
+ placeholder="e.g., What are the main provisions in this document?"
301
  )
302
 
303
  with gr.Row():
 
311
 
312
  with gr.Row():
313
  with gr.Column():
314
+ references_output = gr.Textbox(label="Document References", lines=3)
315
  with gr.Column():
316
  summary_output = gr.Textbox(label="Summary", lines=2)
317
 
318
  gr.Markdown("""
319
  ### Important Notes:
320
+ - Responses are based ONLY on the provided document
321
+ - No external legal knowledge is used
322
+ - All references are from the document itself
323
  """)
324
 
325
  submit_btn.click(
 
329
  )
330
 
331
  # Launch the app
332
+ if __name__ == "__main__":
333
+ demo.launch()