veerukhannan committed on
Commit
361f75f
·
verified ·
1 Parent(s): dbb74f9

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -346
app.py DELETED
@@ -1,346 +0,0 @@
1
- import gradio as gr
2
- import chromadb
3
- import os
4
- from openai import OpenAI
5
- import json
6
- from typing import List, Dict
7
- import re
8
- from sentence_transformers import SentenceTransformer
9
- from loguru import logger
10
-
11
class SentenceTransformerEmbeddings:
    """Callable adapter that turns texts into embedding vectors.

    Wraps a ``SentenceTransformer`` model so the instance itself can be
    handed to ChromaDB as the collection's ``embedding_function``.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-transformers model called *model_name*."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: List[str]) -> List[List[float]]:
        """Encode *input* with the wrapped model and return plain nested lists.

        The parameter keeps the name ``input`` (even though it shadows the
        builtin) because callers may pass it by keyword.
        """
        return self.model.encode(input).tolist()
18
-
19
class LegalAssistant:
    """Answer questions about one legal document using retrieval plus an LLM.

    Sections of ``a2023-45.txt`` are stored in a ChromaDB collection. For each
    query the closest sections are sent as context to a Mistral chat model
    (reached through the OpenAI-compatible client), and the model's JSON reply
    is validated before it is returned to the caller.
    """

    def __init__(self):
        """Set up the vector store, index the document if needed, and create the LLM client.

        Raises:
            RuntimeError: if the ``MISTRAL_API_KEY`` environment variable is unset.
            Exception: anything raised while loading the documents is re-raised.
        """
        # Initialize ChromaDB
        self.chroma_client = chromadb.Client()

        # Initialize embedding function
        self.embedding_function = SentenceTransformerEmbeddings()

        # Create or get collection with proper embedding function
        self.collection = self.chroma_client.get_or_create_collection(
            name="legal_documents",
            embedding_function=self.embedding_function
        )

        # Load documents if collection is empty
        if self.collection.count() == 0:
            self._load_documents()

        # The key must come from the environment: a credential embedded in the
        # source is exposed to everyone who can read the repository.
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise RuntimeError(
                "MISTRAL_API_KEY environment variable is not set; "
                "refusing to fall back to a hard-coded credential."
            )

        # Initialize Mistral AI client
        self.mistral_client = OpenAI(
            api_key=api_key,
            base_url="https://api.mistral.ai/v1"
        )

        # Define system prompt with strict rules
        self.system_prompt = """You are a specialized legal assistant that MUST follow these STRICT rules:

CRITICAL RULE:
YOU MUST ONLY USE INFORMATION FROM THE PROVIDED CONTEXT. DO NOT USE ANY EXTERNAL KNOWLEDGE, INCLUDING KNOWLEDGE ABOUT IPC, CONSTITUTION, OR ANY OTHER LEGAL DOCUMENTS.

RESPONSE FORMAT RULES:
1. ALWAYS structure your response in this exact JSON format:
{
"answer": "Your detailed answer here using ONLY information from the provided context",
"reference_sections": ["Exact section titles from the context"],
"summary": "2-3 line summary using ONLY information from context",
"confidence": "HIGH/MEDIUM/LOW based on context match"
}

STRICT CONTENT RULES:
1. NEVER mention or reference IPC, Constitution, or any laws not present in the context
2. If the information is not in the context, respond ONLY with:
{
"answer": "This information is not present in the provided document.",
"reference_sections": [],
"summary": "Information not found in document",
"confidence": "LOW"
}
3. ONLY cite sections that are explicitly present in the provided context
4. DO NOT make assumptions or inferences beyond the context
5. DO NOT combine information from external knowledge

CONTEXT USAGE RULES:
1. HIGH confidence: Only when exact information is found in context
2. MEDIUM confidence: When partial information is found
3. LOW confidence: When information is unclear or not found
4. If multiple sections are relevant, cite ALL relevant sections from context

PROHIBITED ACTIONS:
1. NO references to IPC sections
2. NO references to Constitutional articles
3. NO mentions of case law not in context
4. NO legal interpretations beyond context
5. NO combining document information with external knowledge

ERROR HANDLING:
1. If query is about laws not in context: State "This topic is not covered in the provided document"
2. If query is unclear: Request specific clarification about which part of the document to check
3. If context is insufficient: State "The document does not contain this information"
"""

    @staticmethod
    def _result(answer, summary, references=None, confidence="LOW") -> Dict:
        """Build the response dict shape shared by every return path of get_response."""
        return {
            "answer": answer,
            "references": [] if references is None else references,
            "summary": summary,
            "confidence": confidence
        }

    @staticmethod
    def _split_sections(document: str, index_lines: List[str]) -> List[Dict[str, str]]:
        """Split *document* into titled sections at lines that contain an index entry.

        Args:
            document: full text of the document.
            index_lines: raw lines of the index file; blank entries are ignored.

        Returns:
            A list of ``{"title", "content"}`` dicts in document order. Text
            before the first heading gets an empty title; sections whose
            content is empty after stripping are dropped.
        """
        # A blank index entry would be a substring of every line and turn the
        # whole document into headings, so blanks are filtered out up front.
        headings = [entry for entry in (raw.strip() for raw in index_lines) if entry]

        def close_section(found, title, body_lines):
            content = "\n".join(body_lines).strip()
            if content:
                found.append({"title": title, "content": content})

        sections = []
        current_title = ""
        current_lines = []
        for line in document.split('\n'):
            if any(heading in line for heading in headings):
                close_section(sections, current_title, current_lines)
                current_title = line.strip()
                current_lines = []
            else:
                current_lines.append(line)

        # Add the last section
        close_section(sections, current_title, current_lines)
        return sections

    @staticmethod
    def _parse_model_json(raw) -> Dict:
        """Decode the model's reply into a dict, tolerating common wrappers.

        Accepts bare JSON, JSON inside a Markdown code fence, and JSON
        surrounded by extra prose (the outermost ``{...}`` span is tried).

        Raises:
            ValueError: if no JSON object can be decoded
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        text = (raw or "").strip()
        fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL | re.IGNORECASE)
        if fenced:
            text = fenced.group(1)
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("Model reply is not a JSON object")
        return parsed

    @staticmethod
    def _filter_references(references, sources: List[str]) -> List[str]:
        """Keep only the references that mention a title of a retrieved section.

        *sources* are the ``"<title> (Section <n>)"`` strings produced by
        ``_search_documents``. Empty titles are skipped because an empty
        string is a substring of everything and would validate any reference.
        """
        titles = [
            title
            for title in (source.rsplit(" (Section", 1)[0] for source in sources)
            if title
        ]
        return [
            ref for ref in references
            if isinstance(ref, str) and any(title in ref for title in titles)
        ]

    def _load_documents(self):
        """Load and index documents from a2023-45.txt and index.txt

        Raises:
            Exception: any failure (missing file, no sections found, ChromaDB
                error) is logged and re-raised.
        """
        try:
            # Read the main document
            with open('a2023-45.txt', 'r', encoding='utf-8') as f:
                document = f.read()

            # Read the index
            with open('index.txt', 'r', encoding='utf-8') as f:
                index_content = f.readlines()

            # Parse index and split document
            sections = self._split_sections(document, index_content)
            if not sections:
                raise ValueError("No sections could be extracted from a2023-45.txt")

            # Add to ChromaDB
            documents = [section["content"] for section in sections]
            metadatas = [
                {"title": section["title"], "source": "a2023-45.txt", "section_number": number}
                for number, section in enumerate(sections, start=1)
            ]
            ids = [f"section_{number}" for number in range(1, len(sections) + 1)]

            self.collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )

            logger.info(f"Loaded {len(sections)} sections into ChromaDB")

        except Exception as e:
            logger.error(f"Error loading documents: {str(e)}")
            raise

    def validate_query(self, query: str) -> tuple[bool, str]:
        """Validate the input query

        Surrounding whitespace is ignored, so a query with a trailing space or
        newline is judged on its visible text.

        Returns:
            ``(True, "")`` when the query is acceptable, otherwise ``(False, reason)``.
        """
        text = query.strip() if isinstance(query, str) else ""
        if len(text) < 10:
            return False, "Query too short. Please provide more details (minimum 10 characters)."
        if len(text) > 500:
            return False, "Query too long. Please be more concise (maximum 500 characters)."
        if not text.endswith(('?', '.')):
            return False, "Query must end with a question mark or period."
        return True, ""

    def _search_documents(self, query: str) -> tuple[str, List[str]]:
        """Search ChromaDB for relevant documents

        Returns:
            ``(context, references)``: the matched sections joined into one
            string, and one ``"<title> (Section <n>)"`` label per match. Both
            are empty when nothing is found or the search fails (failures are
            logged, not raised).
        """
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=3
            )

            if results and results['documents']:
                # One query was sent, so the first entry holds its matches.
                documents = results['documents'][0]
                metadata = results['metadatas'][0]

                # Format the context with section titles
                formatted_docs = []
                references = []

                for doc, meta in zip(documents, metadata):
                    formatted_docs.append(f"{meta['title']}:\n{doc}")
                    references.append(f"{meta['title']} (Section {meta['section_number']})")

                return "\n\n".join(formatted_docs), references
            return "", []

        except Exception as e:
            logger.error(f"Search error: {str(e)}")
            return "", []

    def get_response(self, query: str) -> Dict:
        """Get response from Mistral AI with context from ChromaDB

        Returns:
            A dict with the keys ``answer``, ``references``, ``summary`` and
            ``confidence``. Every failure path (invalid query, no context,
            unparsable or unauthorized model reply, unexpected exception)
            returns the same shape with ``confidence`` set to ``"LOW"``.
        """
        # Validate query
        is_valid, error_message = self.validate_query(query)
        if not is_valid:
            return self._result(error_message, "Invalid query")
        query = query.strip()

        try:
            # Get relevant context from ChromaDB
            context, sources = self._search_documents(query)

            if not context:
                return self._result(
                    "This information is not present in the provided document.",
                    "Information not found in document"
                )

            # Prepare content with explicit instructions
            content = f"""IMPORTANT: ONLY use information from the following context to answer the question. DO NOT use any external knowledge.

Context Sections:
{context}

Available Document Sections:
{', '.join(sources)}

Question: {query}

Remember: ONLY use information from the above context. If the information is not in the context, state that it's not in the document."""

            # Get response from Mistral AI
            response = self.mistral_client.chat.completions.create(
                model="mistral-medium",
                messages=[
                    {
                        "role": "system",
                        "content": self.system_prompt
                    },
                    {
                        "role": "user",
                        "content": content
                    }
                ],
                temperature=0.1,
                max_tokens=1000
            )

            if not response.choices:
                return self._result("No valid response received", "Response generation failed")

            # Parse response
            try:
                result = self._parse_model_json(response.choices[0].message.content)
            except ValueError:
                logger.error("Failed to parse response JSON")
                return self._result("Error: Response format invalid", "Response parsing failed")

            cited = result.get("reference_sections") or []
            if isinstance(cited, str):
                # A lone string would otherwise be iterated character by character.
                cited = [cited]

            # Validate that references only contain sections from sources
            valid_references = self._filter_references(cited, sources)

            # If references mention unauthorized sources, return error
            if len(valid_references) != len(cited):
                logger.warning("Response contained unauthorized references")
                return self._result(
                    "Error: Response contained unauthorized references. Only information from the provided document is allowed.",
                    "Invalid response generated"
                )

            return self._result(
                result.get("answer", "No answer provided"),
                result.get("summary", ""),
                references=valid_references,
                confidence=result.get("confidence", "LOW")
            )

        except Exception as e:
            logger.error(f"Error in get_response: {str(e)}")
            return self._result(f"Error: {str(e)}", "System error occurred")
283
-
284
- # Initialize the assistant once at import time; process_query() below reads this module-level instance
285
- assistant = LegalAssistant()
286
-
287
- # Create Gradio interface
288
def process_query(query: str) -> tuple:
    """Run *query* through the module-level assistant and unpack the reply.

    Returns a 4-tuple ``(answer, references, summary, confidence)``. The
    references are joined into one comma-separated string, and placeholder
    text is substituted when the references or the summary are empty.
    """
    reply = assistant.get_response(query)

    cited = reply["references"]
    references_text = ", ".join(cited) if cited else "No specific references"
    summary_text = reply["summary"] or "No summary available"

    return reply["answer"], references_text, summary_text, reply["confidence"]
297
-
298
# Static Markdown shown above and below the form.
_GUIDELINES_MARKDOWN = """
# Indian Legal Assistant
## Guidelines for Queries:
1. Be specific and clear in your questions
2. End questions with a question mark or period
3. Keep queries between 10-500 characters
4. Questions will be answered based ONLY on the provided legal document
"""

_NOTES_MARKDOWN = """
### Important Notes:
- Responses are based ONLY on the provided document
- No external legal knowledge is used
- All references are from the document itself
- Confidence levels indicate how well the answer matches the document content
"""

# Assemble the page top to bottom: guidelines, query box, submit button,
# confidence, answer, then references and summary side by side, then notes.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(_GUIDELINES_MARKDOWN)

    with gr.Row():
        query_input = gr.Textbox(
            label="Enter your legal query",
            placeholder="e.g., What are the main provisions in this document?"
        )

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")

    with gr.Row():
        confidence_output = gr.Textbox(label="Confidence Level")

    with gr.Row():
        answer_output = gr.Textbox(label="Answer", lines=5)

    with gr.Row():
        with gr.Column():
            references_output = gr.Textbox(label="Document References", lines=3)
        with gr.Column():
            summary_output = gr.Textbox(label="Summary", lines=2)

    gr.Markdown(_NOTES_MARKDOWN)

    # The output order must match the tuple returned by process_query:
    # (answer, references, summary, confidence).
    submit_btn.click(
        fn=process_query,
        inputs=[query_input],
        outputs=[answer_output, references_output, summary_output, confidence_output]
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()