Delete app.py
app.py
DELETED
@@ -1,346 +0,0 @@
import gradio as gr
import chromadb
import os
from openai import OpenAI
import json
from typing import List, Dict
import re
from sentence_transformers import SentenceTransformer
from loguru import logger


class SentenceTransformerEmbeddings:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: List[str]) -> List[List[float]]:
        embeddings = self.model.encode(input)
        return embeddings.tolist()


class LegalAssistant:
    def __init__(self):
        # Initialize ChromaDB
        self.chroma_client = chromadb.Client()

        # Initialize embedding function
        self.embedding_function = SentenceTransformerEmbeddings()

        # Create or get collection with proper embedding function
        self.collection = self.chroma_client.get_or_create_collection(
            name="legal_documents",
            embedding_function=self.embedding_function
        )

        # Load documents if collection is empty
        if self.collection.count() == 0:
            self._load_documents()

        # Initialize Mistral AI client (OpenAI-compatible endpoint).
        # The API key must come from the environment; never hard-code credentials.
        self.mistral_client = OpenAI(
            api_key=os.environ.get("MISTRAL_API_KEY"),
            base_url="https://api.mistral.ai/v1"
        )

        # Define system prompt with strict rules
        self.system_prompt = """You are a specialized legal assistant that MUST follow these STRICT rules:

CRITICAL RULE:
YOU MUST ONLY USE INFORMATION FROM THE PROVIDED CONTEXT. DO NOT USE ANY EXTERNAL KNOWLEDGE, INCLUDING KNOWLEDGE ABOUT IPC, CONSTITUTION, OR ANY OTHER LEGAL DOCUMENTS.

RESPONSE FORMAT RULES:
1. ALWAYS structure your response in this exact JSON format:
{
    "answer": "Your detailed answer here using ONLY information from the provided context",
    "reference_sections": ["Exact section titles from the context"],
    "summary": "2-3 line summary using ONLY information from context",
    "confidence": "HIGH/MEDIUM/LOW based on context match"
}

STRICT CONTENT RULES:
1. NEVER mention or reference IPC, Constitution, or any laws not present in the context
2. If the information is not in the context, respond ONLY with:
{
    "answer": "This information is not present in the provided document.",
    "reference_sections": [],
    "summary": "Information not found in document",
    "confidence": "LOW"
}
3. ONLY cite sections that are explicitly present in the provided context
4. DO NOT make assumptions or inferences beyond the context
5. DO NOT combine information from external knowledge

CONTEXT USAGE RULES:
1. HIGH confidence: Only when exact information is found in context
2. MEDIUM confidence: When partial information is found
3. LOW confidence: When information is unclear or not found
4. If multiple sections are relevant, cite ALL relevant sections from context

PROHIBITED ACTIONS:
1. NO references to IPC sections
2. NO references to Constitutional articles
3. NO mentions of case law not in context
4. NO legal interpretations beyond context
5. NO combining document information with external knowledge

ERROR HANDLING:
1. If query is about laws not in context: State "This topic is not covered in the provided document"
2. If query is unclear: Request specific clarification about which part of the document to check
3. If context is insufficient: State "The document does not contain this information"
"""

    def _load_documents(self):
        """Load and index documents from a2023-45.txt and index.txt"""
        try:
            # Read the main document
            with open('a2023-45.txt', 'r', encoding='utf-8') as f:
                document = f.read()

            # Read the index
            with open('index.txt', 'r', encoding='utf-8') as f:
                index_content = f.readlines()

            # Parse index and split document
            sections = []
            current_section = ""
            current_title = ""

            for line in document.split('\n'):
                # A line containing a non-empty index entry starts a new section.
                # The emptiness check matters: a blank index line would match every line.
                if any(index_line.strip() and index_line.strip() in line for index_line in index_content):
                    if current_section:
                        sections.append({
                            "title": current_title,
                            "content": current_section.strip()
                        })
                    current_title = line.strip()
                    current_section = ""
                else:
                    current_section += line + "\n"

            # Add the last section
            if current_section:
                sections.append({
                    "title": current_title,
                    "content": current_section.strip()
                })

            # Add to ChromaDB
            documents = [section["content"] for section in sections]
            metadatas = [{"title": section["title"], "source": "a2023-45.txt", "section_number": i + 1}
                         for i, section in enumerate(sections)]
            ids = [f"section_{i+1}" for i in range(len(sections))]

            self.collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )

            logger.info(f"Loaded {len(sections)} sections into ChromaDB")

        except Exception as e:
            logger.error(f"Error loading documents: {str(e)}")
            raise

    def validate_query(self, query: str) -> tuple[bool, str]:
        """Validate the input query"""
        if not query or len(query.strip()) < 10:
            return False, "Query too short. Please provide more details (minimum 10 characters)."
        if len(query) > 500:
            return False, "Query too long. Please be more concise (maximum 500 characters)."
        if not re.search(r'[?.]$', query):
            return False, "Query must end with a question mark or period."
        return True, ""

    def _search_documents(self, query: str) -> tuple[str, List[str]]:
        """Search ChromaDB for relevant documents"""
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=3
            )

            if results and results['documents']:
                documents = results['documents'][0]
                metadata = results['metadatas'][0]

                # Format the context with section titles
                formatted_docs = []
                references = []

                for doc, meta in zip(documents, metadata):
                    formatted_docs.append(f"{meta['title']}:\n{doc}")
                    references.append(f"{meta['title']} (Section {meta['section_number']})")

                return "\n\n".join(formatted_docs), references
            return "", []

        except Exception as e:
            logger.error(f"Search error: {str(e)}")
            return "", []

    def get_response(self, query: str) -> Dict:
        """Get response from Mistral AI with context from ChromaDB"""
        # Validate query
        is_valid, error_message = self.validate_query(query)
        if not is_valid:
            return {
                "answer": error_message,
                "references": [],
                "summary": "Invalid query",
                "confidence": "LOW"
            }

        try:
            # Get relevant context from ChromaDB
            context, sources = self._search_documents(query)

            if not context:
                return {
                    "answer": "This information is not present in the provided document.",
                    "references": [],
                    "summary": "Information not found in document",
                    "confidence": "LOW"
                }

            # Prepare content with explicit instructions
            content = f"""IMPORTANT: ONLY use information from the following context to answer the question. DO NOT use any external knowledge.

Context Sections:
{context}

Available Document Sections:
{', '.join(sources)}

Question: {query}

Remember: ONLY use information from the above context. If the information is not in the context, state that it's not in the document."""

            # Get response from Mistral AI
            response = self.mistral_client.chat.completions.create(
                model="mistral-medium",
                messages=[
                    {
                        "role": "system",
                        "content": self.system_prompt
                    },
                    {
                        "role": "user",
                        "content": content
                    }
                ],
                temperature=0.1,
                max_tokens=1000
            )

            # Parse response
            if response.choices and len(response.choices) > 0:
                try:
                    result = json.loads(response.choices[0].message.content)

                    # Validate that references only contain sections from sources
                    valid_references = [ref for ref in result.get("reference_sections", [])
                                        if any(source.split(" (Section")[0] in ref for source in sources)]

                    # If references mention unauthorized sources, return error
                    if len(valid_references) != len(result.get("reference_sections", [])):
                        logger.warning("Response contained unauthorized references")
                        return {
                            "answer": "Error: Response contained unauthorized references. Only information from the provided document is allowed.",
                            "references": [],
                            "summary": "Invalid response generated",
                            "confidence": "LOW"
                        }

                    return {
                        "answer": result.get("answer", "No answer provided"),
                        "references": valid_references,
                        "summary": result.get("summary", ""),
                        "confidence": result.get("confidence", "LOW")
                    }
                except json.JSONDecodeError:
                    logger.error("Failed to parse response JSON")
                    return {
                        "answer": "Error: Response format invalid",
                        "references": [],
                        "summary": "Response parsing failed",
                        "confidence": "LOW"
                    }

            return {
                "answer": "No valid response received",
                "references": [],
                "summary": "Response generation failed",
                "confidence": "LOW"
            }

        except Exception as e:
            logger.error(f"Error in get_response: {str(e)}")
            return {
                "answer": f"Error: {str(e)}",
                "references": [],
                "summary": "System error occurred",
                "confidence": "LOW"
            }


# Initialize the assistant
assistant = LegalAssistant()


# Create Gradio interface
def process_query(query: str) -> tuple:
    """Process the query and return formatted response"""
    response = assistant.get_response(query)
    return (
        response["answer"],
        ", ".join(response["references"]) if response["references"] else "No specific references",
        response["summary"] if response["summary"] else "No summary available",
        response["confidence"]
    )


# Create the Gradio interface with a professional theme
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# Indian Legal Assistant
## Guidelines for Queries:
1. Be specific and clear in your questions
2. End questions with a question mark or period
3. Keep queries between 10-500 characters
4. Questions will be answered based ONLY on the provided legal document
""")

    with gr.Row():
        query_input = gr.Textbox(
            label="Enter your legal query",
            placeholder="e.g., What are the main provisions in this document?"
        )

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")

    with gr.Row():
        confidence_output = gr.Textbox(label="Confidence Level")

    with gr.Row():
        answer_output = gr.Textbox(label="Answer", lines=5)

    with gr.Row():
        with gr.Column():
            references_output = gr.Textbox(label="Document References", lines=3)
        with gr.Column():
            summary_output = gr.Textbox(label="Summary", lines=2)

    gr.Markdown("""
### Important Notes:
- Responses are based ONLY on the provided document
- No external legal knowledge is used
- All references are from the document itself
- Confidence levels indicate how well the answer matches the document content
""")

    submit_btn.click(
        fn=process_query,
        inputs=[query_input],
        outputs=[answer_output, references_output, summary_output, confidence_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
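
For reference, the splitting rule in _load_documents treats any line of a2023-45.txt that contains a non-empty entry from index.txt as the start of a new section; everything up to the next match becomes that section's content. A minimal, self-contained sketch of that behaviour, using invented sample text and no files or ChromaDB:

# Sketch of the index-driven splitting used by _load_documents.
# The index entries and document text below are invented for illustration.
index_content = ["CHAPTER I PRELIMINARY\n", "CHAPTER II OFFENCES\n", "\n"]
document = (
    "CHAPTER I PRELIMINARY\n"
    "1. Short title and commencement.\n"
    "CHAPTER II OFFENCES\n"
    "2. Punishment for theft.\n"
)

sections = []
current_title, current_section = "", ""
for line in document.split("\n"):
    # Skip blank index entries; otherwise every line would start a new section
    if any(entry.strip() and entry.strip() in line for entry in index_content):
        if current_section:
            sections.append({"title": current_title, "content": current_section.strip()})
        current_title, current_section = line.strip(), ""
    else:
        current_section += line + "\n"
if current_section:
    sections.append({"title": current_title, "content": current_section.strip()})

print(sections)
# [{'title': 'CHAPTER I PRELIMINARY', 'content': '1. Short title and commencement.'},
#  {'title': 'CHAPTER II OFFENCES', 'content': '2. Punishment for theft.'}]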
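
Likewise, the reference check in get_response keeps a model citation only if the title portion of some retrieved source (the text before " (Section") occurs in it; if anything is filtered out, the whole answer is rejected as containing unauthorized references. A small sketch with invented section titles and citations:

# Sketch of the reference filter in get_response; all values are invented.
sources = ["Definitions (Section 1)", "Punishment for theft (Section 2)"]
model_references = ["Punishment for theft", "IPC Section 378"]  # second one is not in the document

valid_references = [
    ref for ref in model_references
    if any(source.split(" (Section")[0] in ref for source in sources)
]

print(valid_references)   # ['Punishment for theft']
print(len(valid_references) != len(model_references))   # True -> the app rejects the whole answer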