Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,12 +5,21 @@ from openai import OpenAI
|
|
5 |
import json
|
6 |
from typing import List, Dict
|
7 |
import re
|
|
|
|
|
8 |
|
9 |
class LegalAssistant:
|
10 |
def __init__(self):
|
11 |
# Initialize ChromaDB
|
12 |
self.chroma_client = chromadb.Client()
|
13 |
-
self.collection = self.chroma_client.get_or_create_collection(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# Initialize Mistral AI client
|
16 |
self.mistral_client = OpenAI(
|
@@ -19,40 +28,104 @@ class LegalAssistant:
|
|
19 |
)
|
20 |
|
21 |
# Define system prompt with strict rules
|
22 |
-
self.system_prompt = """You are a specialized legal assistant
|
|
|
|
|
|
|
23 |
|
24 |
RESPONSE FORMAT RULES:
|
25 |
1. ALWAYS structure your response in this exact JSON format:
|
26 |
{
|
27 |
-
"answer": "Your detailed answer here",
|
28 |
-
"reference_sections": ["
|
29 |
-
"summary": "2-3 line summary",
|
30 |
-
"confidence": "HIGH/MEDIUM/LOW"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
}
|
|
|
|
|
|
|
32 |
|
33 |
-
|
34 |
-
1.
|
35 |
-
2.
|
36 |
-
3.
|
37 |
-
4.
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
PROHIBITED:
|
47 |
-
1. NO personal opinions
|
48 |
-
2. NO hypothetical scenarios
|
49 |
-
3. NO interpretation of ongoing cases
|
50 |
-
4. NO advice on specific legal situations
|
51 |
|
52 |
ERROR HANDLING:
|
53 |
-
1. If query is
|
54 |
-
2. If
|
55 |
-
3. If
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
def validate_query(self, query: str) -> tuple[bool, str]:
|
58 |
"""Validate the input query"""
|
@@ -69,17 +142,27 @@ ERROR HANDLING:
|
|
69 |
try:
|
70 |
results = self.collection.query(
|
71 |
query_texts=[query],
|
72 |
-
n_results=3
|
|
|
73 |
)
|
74 |
|
75 |
if results and results['documents']:
|
76 |
documents = results['documents'][0]
|
77 |
-
metadata = results
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
return "", []
|
|
|
81 |
except Exception as e:
|
82 |
-
|
83 |
return "", []
|
84 |
|
85 |
def get_response(self, query: str) -> Dict:
|
@@ -98,10 +181,26 @@ ERROR HANDLING:
|
|
98 |
# Get relevant context from ChromaDB
|
99 |
context, sources = self._search_documents(query)
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
# Get response from Mistral AI
|
107 |
response = self.mistral_client.chat.completions.create(
|
@@ -124,13 +223,29 @@ Question: {query}""" if context else query
|
|
124 |
if response.choices and len(response.choices) > 0:
|
125 |
try:
|
126 |
result = json.loads(response.choices[0].message.content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
return {
|
128 |
"answer": result.get("answer", "No answer provided"),
|
129 |
-
"references":
|
130 |
"summary": result.get("summary", ""),
|
131 |
"confidence": result.get("confidence", "LOW")
|
132 |
}
|
133 |
except json.JSONDecodeError:
|
|
|
134 |
return {
|
135 |
"answer": "Error: Response format invalid",
|
136 |
"references": [],
|
@@ -139,13 +254,14 @@ Question: {query}""" if context else query
|
|
139 |
}
|
140 |
|
141 |
return {
|
142 |
-
"answer": "No response received",
|
143 |
"references": [],
|
144 |
"summary": "Response generation failed",
|
145 |
"confidence": "LOW"
|
146 |
}
|
147 |
|
148 |
except Exception as e:
|
|
|
149 |
return {
|
150 |
"answer": f"Error: {str(e)}",
|
151 |
"references": [],
|
@@ -158,6 +274,7 @@ assistant = LegalAssistant()
|
|
158 |
|
159 |
# Create Gradio interface
|
160 |
def process_query(query: str) -> tuple:
|
|
|
161 |
response = assistant.get_response(query)
|
162 |
return (
|
163 |
response["answer"],
|
@@ -180,7 +297,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
180 |
with gr.Row():
|
181 |
query_input = gr.Textbox(
|
182 |
label="Enter your legal query",
|
183 |
-
placeholder="e.g., What
|
184 |
)
|
185 |
|
186 |
with gr.Row():
|
@@ -194,15 +311,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
194 |
|
195 |
with gr.Row():
|
196 |
with gr.Column():
|
197 |
-
references_output = gr.Textbox(label="
|
198 |
with gr.Column():
|
199 |
summary_output = gr.Textbox(label="Summary", lines=2)
|
200 |
|
201 |
gr.Markdown("""
|
202 |
### Important Notes:
|
203 |
-
-
|
204 |
-
-
|
205 |
-
-
|
206 |
""")
|
207 |
|
208 |
submit_btn.click(
|
@@ -212,4 +329,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
212 |
)
|
213 |
|
214 |
# Launch the app
|
215 |
-
|
|
|
|
5 |
import json
|
6 |
from typing import List, Dict
|
7 |
import re
|
8 |
+
from sentence_transformers import SentenceTransformer
|
9 |
+
from loguru import logger
|
10 |
|
11 |
class LegalAssistant:
|
12 |
def __init__(self):
|
13 |
# Initialize ChromaDB
|
14 |
self.chroma_client = chromadb.Client()
|
15 |
+
self.collection = self.chroma_client.get_or_create_collection(
|
16 |
+
name="legal_documents",
|
17 |
+
embedding_function=SentenceTransformer('all-MiniLM-L6-v2')
|
18 |
+
)
|
19 |
+
|
20 |
+
# Load documents if collection is empty
|
21 |
+
if self.collection.count() == 0:
|
22 |
+
self._load_documents()
|
23 |
|
24 |
# Initialize Mistral AI client
|
25 |
self.mistral_client = OpenAI(
|
|
|
28 |
)
|
29 |
|
30 |
# Define system prompt with strict rules
|
31 |
+
self.system_prompt = """You are a specialized legal assistant that MUST follow these STRICT rules:
|
32 |
+
|
33 |
+
CRITICAL RULE:
|
34 |
+
YOU MUST ONLY USE INFORMATION FROM THE PROVIDED CONTEXT. DO NOT USE ANY EXTERNAL KNOWLEDGE, INCLUDING KNOWLEDGE ABOUT IPC, CONSTITUTION, OR ANY OTHER LEGAL DOCUMENTS.
|
35 |
|
36 |
RESPONSE FORMAT RULES:
|
37 |
1. ALWAYS structure your response in this exact JSON format:
|
38 |
{
|
39 |
+
"answer": "Your detailed answer here using ONLY information from the provided context",
|
40 |
+
"reference_sections": ["Exact section titles from the context"],
|
41 |
+
"summary": "2-3 line summary using ONLY information from context",
|
42 |
+
"confidence": "HIGH/MEDIUM/LOW based on context match"
|
43 |
+
}
|
44 |
+
|
45 |
+
STRICT CONTENT RULES:
|
46 |
+
1. NEVER mention or reference IPC, Constitution, or any laws not present in the context
|
47 |
+
2. If the information is not in the context, respond ONLY with:
|
48 |
+
{
|
49 |
+
"answer": "This information is not present in the provided document.",
|
50 |
+
"reference_sections": [],
|
51 |
+
"summary": "Information not found in document",
|
52 |
+
"confidence": "LOW"
|
53 |
}
|
54 |
+
3. ONLY cite sections that are explicitly present in the provided context
|
55 |
+
4. DO NOT make assumptions or inferences beyond the context
|
56 |
+
5. DO NOT combine information from external knowledge
|
57 |
|
58 |
+
CONTEXT USAGE RULES:
|
59 |
+
1. HIGH confidence: Only when exact information is found in context
|
60 |
+
2. MEDIUM confidence: When partial information is found
|
61 |
+
3. LOW confidence: When information is unclear or not found
|
62 |
+
4. If multiple sections are relevant, cite ALL relevant sections from context
|
63 |
+
|
64 |
+
PROHIBITED ACTIONS:
|
65 |
+
1. NO references to IPC sections
|
66 |
+
2. NO references to Constitutional articles
|
67 |
+
3. NO mentions of case law not in context
|
68 |
+
4. NO legal interpretations beyond context
|
69 |
+
5. NO combining document information with external knowledge
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
ERROR HANDLING:
|
72 |
+
1. If query is about laws not in context: State "This topic is not covered in the provided document"
|
73 |
+
2. If query is unclear: Request specific clarification about which part of the document to check
|
74 |
+
3. If context is insufficient: State "The document does not contain this information"
|
75 |
+
"""
|
76 |
+
|
77 |
+
def _load_documents(self):
    """Split a2023-45.txt into titled sections and index them in ChromaDB.

    Each non-blank line of index.txt is treated as a heading marker: any
    document line that contains one of those markers starts a new section
    whose title is that (stripped) document line. The lines that follow, up
    to the next heading, form the section body. Text appearing before the
    first heading is kept as a section with an empty title.

    Every section with a non-empty body is added to ``self.collection``
    with ``title``, ``source`` and 1-based ``section_number`` metadata and
    the id ``section_<n>``.

    Raises:
        Exception: any error from reading the files or from ChromaDB is
            logged and then re-raised unchanged.
    """
    try:
        # Read the main document
        with open('a2023-45.txt', 'r', encoding='utf-8') as f:
            document = f.read()

        # Read the index. Blank entries must be dropped: the empty string
        # is a substring of every line, so a single blank line in index.txt
        # would turn every document line into a heading and discard all
        # body text. Stripping once here also avoids re-stripping the whole
        # index for each document line.
        with open('index.txt', 'r', encoding='utf-8') as f:
            headings = [entry for entry in (raw.strip() for raw in f) if entry]

        sections = []

        def flush(title, body_lines):
            # Only keep sections that still have text after trimming, so a
            # whitespace-only gap between two headings is never indexed as
            # an empty document.
            content = "\n".join(body_lines).strip()
            if content:
                sections.append({"title": title, "content": content})

        current_title = ""
        current_lines = []

        for line in document.split('\n'):
            # NOTE(review): substring matching means a very short index
            # entry can match ordinary body lines - confirm index.txt holds
            # full heading text.
            if any(heading in line for heading in headings):
                flush(current_title, current_lines)
                current_title = line.strip()
                current_lines = []
            else:
                current_lines.append(line)

        # Add the last section
        flush(current_title, current_lines)

        # Add to ChromaDB in modest batches: one call per batch lets the
        # embedding function process many sections at once, while keeping
        # each request well below any per-call size limit.
        batch_size = 100
        for start in range(0, len(sections), batch_size):
            batch = sections[start:start + batch_size]
            numbers = range(start + 1, start + 1 + len(batch))
            self.collection.add(
                documents=[section["content"] for section in batch],
                metadatas=[
                    {
                        "title": section["title"],
                        "source": "a2023-45.txt",
                        "section_number": number,
                    }
                    for number, section in zip(numbers, batch)
                ],
                ids=[f"section_{number}" for number in numbers],
            )

        logger.info(f"Loaded {len(sections)} sections into ChromaDB")

    except Exception as e:
        logger.error(f"Error loading documents: {str(e)}")
        raise
|
129 |
|
130 |
def validate_query(self, query: str) -> tuple[bool, str]:
|
131 |
"""Validate the input query"""
|
|
|
142 |
try:
|
143 |
results = self.collection.query(
|
144 |
query_texts=[query],
|
145 |
+
n_results=3,
|
146 |
+
include=["metadatas", "documents"]
|
147 |
)
|
148 |
|
149 |
if results and results['documents']:
|
150 |
documents = results['documents'][0]
|
151 |
+
metadata = results['metadatas'][0]
|
152 |
+
|
153 |
+
# Format the context with section titles
|
154 |
+
formatted_docs = []
|
155 |
+
references = []
|
156 |
+
|
157 |
+
for doc, meta in zip(documents, metadata):
|
158 |
+
formatted_docs.append(f"{meta['title']}:\n{doc}")
|
159 |
+
references.append(f"{meta['title']} (Section {meta['section_number']})")
|
160 |
+
|
161 |
+
return "\n\n".join(formatted_docs), references
|
162 |
return "", []
|
163 |
+
|
164 |
except Exception as e:
|
165 |
+
logger.error(f"Search error: {str(e)}")
|
166 |
return "", []
|
167 |
|
168 |
def get_response(self, query: str) -> Dict:
|
|
|
181 |
# Get relevant context from ChromaDB
|
182 |
context, sources = self._search_documents(query)
|
183 |
|
184 |
+
if not context:
|
185 |
+
return {
|
186 |
+
"answer": "This information is not present in the provided document.",
|
187 |
+
"references": [],
|
188 |
+
"summary": "Information not found in document",
|
189 |
+
"confidence": "LOW"
|
190 |
+
}
|
191 |
+
|
192 |
+
# Prepare content with explicit instructions
|
193 |
+
content = f"""IMPORTANT: ONLY use information from the following context to answer the question. DO NOT use any external knowledge.
|
194 |
+
|
195 |
+
Context Sections:
|
196 |
+
{context}
|
197 |
+
|
198 |
+
Available Document Sections:
|
199 |
+
{', '.join(sources)}
|
200 |
+
|
201 |
+
Question: {query}
|
202 |
+
|
203 |
+
Remember: ONLY use information from the above context. If the information is not in the context, state that it's not in the document."""
|
204 |
|
205 |
# Get response from Mistral AI
|
206 |
response = self.mistral_client.chat.completions.create(
|
|
|
223 |
if response.choices and len(response.choices) > 0:
|
224 |
try:
|
225 |
result = json.loads(response.choices[0].message.content)
|
226 |
+
|
227 |
+
# Validate that references only contain sections from sources
|
228 |
+
valid_references = [ref for ref in result.get("reference_sections", [])
|
229 |
+
if any(source in ref for source in sources)]
|
230 |
+
|
231 |
+
# If references mention unauthorized sources, return error
|
232 |
+
if len(valid_references) != len(result.get("reference_sections", [])):
|
233 |
+
logger.warning("Response contained unauthorized references")
|
234 |
+
return {
|
235 |
+
"answer": "Error: Response contained unauthorized references. Only information from the provided document is allowed.",
|
236 |
+
"references": [],
|
237 |
+
"summary": "Invalid response generated",
|
238 |
+
"confidence": "LOW"
|
239 |
+
}
|
240 |
+
|
241 |
return {
|
242 |
"answer": result.get("answer", "No answer provided"),
|
243 |
+
"references": valid_references,
|
244 |
"summary": result.get("summary", ""),
|
245 |
"confidence": result.get("confidence", "LOW")
|
246 |
}
|
247 |
except json.JSONDecodeError:
|
248 |
+
logger.error("Failed to parse response JSON")
|
249 |
return {
|
250 |
"answer": "Error: Response format invalid",
|
251 |
"references": [],
|
|
|
254 |
}
|
255 |
|
256 |
return {
|
257 |
+
"answer": "No valid response received",
|
258 |
"references": [],
|
259 |
"summary": "Response generation failed",
|
260 |
"confidence": "LOW"
|
261 |
}
|
262 |
|
263 |
except Exception as e:
|
264 |
+
logger.error(f"Error in get_response: {str(e)}")
|
265 |
return {
|
266 |
"answer": f"Error: {str(e)}",
|
267 |
"references": [],
|
|
|
274 |
|
275 |
# Create Gradio interface
|
276 |
def process_query(query: str) -> tuple:
|
277 |
+
"""Process the query and return formatted response"""
|
278 |
response = assistant.get_response(query)
|
279 |
return (
|
280 |
response["answer"],
|
|
|
297 |
with gr.Row():
|
298 |
query_input = gr.Textbox(
|
299 |
label="Enter your legal query",
|
300 |
+
placeholder="e.g., What are the main provisions in this document?"
|
301 |
)
|
302 |
|
303 |
with gr.Row():
|
|
|
311 |
|
312 |
with gr.Row():
|
313 |
with gr.Column():
|
314 |
+
references_output = gr.Textbox(label="Document References", lines=3)
|
315 |
with gr.Column():
|
316 |
summary_output = gr.Textbox(label="Summary", lines=2)
|
317 |
|
318 |
gr.Markdown("""
|
319 |
### Important Notes:
|
320 |
+
- Responses are based ONLY on the provided document
|
321 |
+
- No external legal knowledge is used
|
322 |
+
- All references are from the document itself
|
323 |
""")
|
324 |
|
325 |
submit_btn.click(
|
|
|
329 |
)
|
330 |
|
331 |
# Launch the app
|
332 |
+
if __name__ == "__main__":
|
333 |
+
demo.launch()
|