vishalsh13 committed
Commit 0ef9213 · 1 Parent(s): 8b38151

Code upload

Files changed (3)
  1. Dockerfile +25 -0
  2. app.py +382 -0
  3. requirements.txt +20 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.10-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the requirements file into the container at /app
+ COPY requirements.txt /app/
+
+ # Install any needed packages specified in requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the current directory contents into the container at /app
+ COPY . /app/
+
+ # Create a directory for file uploads and set permissions
+ # Using chown for better security (assuming UID/GID 1000:1000)
+ # If this doesn't work, you may need to fall back to chmod 777, but chown is preferred.
+ RUN mkdir -p /app/uploads && chown -R 1000:1000 /app/uploads
+
+ # Make port 7860 available to the world outside this container
+ EXPOSE 7860
+
+ # Run app.py when the container launches (app.py binds Gradio to 0.0.0.0:7860 itself)
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,382 @@
+ import logging
+ import os
+ import docx
+ import PyPDF2
+ from docx.shared import RGBColor, Pt
+ from io import BytesIO, IOBase
+ import tempfile
+ import re
+ import datetime
+ import gradio as gr
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+
+
+ ###############################################################################
+ # 1) Logging Configuration
+ ###############################################################################
+ logging.basicConfig(
+     level=logging.INFO,  # or logging.DEBUG for more verbose logs
+     format="%(asctime)s [%(levelname)s] %(name)s - %(message)s"
+ )
+ logger = logging.getLogger("LLM-Legal-App")
+
+ ###############################################################################
+ # 2) Retrieve API Key (Hugging Face)
+ ###############################################################################
+
+ # Use os.environ to get the API key
+ api_key = os.environ.get("HUGGINGFACE_API_KEY")
+
+ if not api_key:
+     logger.error("Hugging Face API key not found in environment variables.")
+     raise ValueError("Hugging Face API key not found. Set it with `os.environ['HUGGINGFACE_API_KEY'] = 'your_api_key'`")
+
+ logger.info("Successfully retrieved Hugging Face API key.")
+
+
+ ###############################################################################
+ # 3) Hugging Face Model and Utility Functions
+ ###############################################################################
+
+ # Initialize the Hugging Face model and tokenizer.
+ model_name = "Daemontatox/DocumentCogito"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=api_key)
+ generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device_map="auto")  # Use device_map="auto"
+
+
+ def generate_legal_document(doc_type, party_a, party_b, context, country):
+     """
+     Uses the Hugging Face model to generate a legal document.
+     """
+     logger.info(f"Starting generation for doc_type={doc_type!r}.")
+     party_a = party_a if party_a else "[Party A Not Provided]"
+     party_b = party_b if party_b else "[Party B Not Provided]"
+     context = context if context else "[Context Not Provided]"
+
+     prompt = f"""
+     Generate a {doc_type} for:
+     1) {party_a}
+     2) {party_b}
+
+     Context/brief of the agreement:
+     {context}.
+
+     The document should include:
+     - Purpose of the {doc_type}
+     - Responsibilities and obligations of each party
+     - Confidentiality terms
+     - Payment terms (use [To Be Determined] if not specified)
+     - Term (duration) and termination
+     - Governing law: {country}
+     - Jurisdiction: [Appropriate region in {country} if not provided]
+     - Signature blocks
+
+     Use formal language, but keep it relatively clear and readable.
+     For any missing information, use placeholders like [To Be Determined].
+     Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
+     """
+     logger.debug(f"Generated prompt:\n{prompt}")
+
+     try:
+         # Use the Hugging Face pipeline
+         generated_text = generator(
+             prompt,
+             max_length=1400,  # Adjust as needed
+             num_return_sequences=1,
+             temperature=0.3,  # Adjust as needed
+         )[0]['generated_text']
+
+         logger.info("Document generation complete.")
+         return generated_text
+
+     except Exception as e:
+         logger.exception("Error generating legal document.")
+         return f"Error generating document: {e}"
+
+
+ def review_legal_document(doc_text, doc_type, party_a, party_b):
+     """Reviews document using the Hugging Face model."""
+     logger.info("Starting document review (rule-based and wording).")
+
+     # --- Rule-Based Review ---
+     rule_based_prompt = f"""
+     Review the following document and provide feedback based on these rules:
+
+     Document text:
+     \"\"\"
+     {doc_text}
+     \"\"\"
+     1) Parties and Authority: ... (rest of prompt from previous turns) ...
+     """
+     logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
+
+     try:
+         rule_based_review = generator(
+             rule_based_prompt,
+             max_length=2000,
+             num_return_sequences=1,
+             temperature=0.3,
+         )[0]['generated_text']
+     except Exception as e:
+         logger.exception("Error during rule-based review.")
+         return f"Error during rule-based review: {e}"
+
+     # --- Wording Analysis ---
+     wording_analysis_prompt = f"""
+     Analyze the wording of the following legal document:
+
+     Document text:
+     \"\"\"
+     {doc_text}
+     \"\"\"
+
+     Provide an analysis covering: ... (rest of prompt from previous turns) ...
+     """
+     logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
+
+     try:
+         wording_analysis = generator(
+             wording_analysis_prompt,
+             max_length=1400,
+             num_return_sequences=1,
+             temperature=0.3,
+         )[0]['generated_text']
+     except Exception as e:
+         logger.exception("Error during wording analysis.")
+         return f"Error during wording analysis: {e}"
+
+     combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
+     return combined_review
+
+
+ ###############################################################################
+ # 4) File Parsing (PDF, DOCX)
+ ###############################################################################
+
+ def parse_bytesio(file_data: BytesIO) -> str:
+     """Parses a BytesIO object."""
+     logger.info("Parsing BytesIO object...")
+     # ... (rest of parse_bytesio function from previous turns) ...
+     try:
+         # Attempt to determine file type from content
+         try:
+             doc_obj = docx.Document(file_data)
+             return "\n".join([para.text for para in doc_obj.paragraphs]).strip()
+         except docx.opc.exceptions.PackageNotFoundError:
+             logger.info("BytesIO is not DOCX, trying PDF.")
+             file_data.seek(0)
+             try:
+                 pdf_reader = PyPDF2.PdfReader(file_data)
+                 return "\n".join([page.extract_text() for page in pdf_reader.pages if page.extract_text()]).strip()
+             except Exception as e:
+                 logger.exception(f"Error parsing BytesIO as PDF: {e}")
+                 return f"Error parsing BytesIO as PDF: {e}"
+         except Exception as e:
+             logger.exception(f"Error processing BytesIO: {e}")
+             return f"Error processing file content: {e}"
+     except Exception as e:
+         logger.exception(f"Error parsing BytesIO: {e}")
+         return f"Error parsing BytesIO: {e}"
+
+ def parse_uploaded_file_path(file_data) -> str:
+     """Takes file data, determines type, extracts text."""
+     # ... (rest of parse_uploaded_file_path from previous turns)
+     if not file_data:
+         logger.warning("No file provided.")
+         return ""
+     if isinstance(file_data, str):
+         file_path = file_data
+         logger.info(f"Received filepath: {file_path}")
+     elif isinstance(file_data, dict) and 'name' in file_data:
+         file_path = file_data['name']
+         logger.info(f"Received file object with name: {file_path}")
+     elif isinstance(file_data, (BytesIO, IOBase)):
+         return parse_bytesio(file_data)
+     else:
+         logger.error(f"Unexpected file_data type: {type(file_data)}")
+         return "Error: Unexpected file data format."
+
+     logger.info(f"Attempting to parse file at {file_path}")
+     try:
+         _, ext = os.path.splitext(file_path)
+         ext = ext.lower()
+         if ext == ".pdf":
+             with open(file_path, "rb") as f:
+                 pdf_reader = PyPDF2.PdfReader(f)
+                 return "\n".join([page.extract_text() for page in pdf_reader.pages if page.extract_text()]).strip()
+         elif ext == ".docx":
+             doc_obj = docx.Document(file_path)
+             return "\n".join([para.text for para in doc_obj.paragraphs]).strip()
+         else:
+             return "Unsupported file format."
+     except Exception as e:
+         logger.exception(f"Error parsing file: {e}")
+         return f"Error parsing file: {e}"
+     finally:
+         pass
+
+ ###############################################################################
+ # 5) DOCX Creation and Saving
+ ###############################################################################
+
+ def clean_markdown(text):
+     """Removes common Markdown formatting."""
+     # ... (rest of clean_markdown from previous turns)
+     if not text: return ""
+     text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
+     text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
+     text = re.sub(r'(\*|_)(.*?)(\*|_)', r'\2', text)
+     text = re.sub(r'^[\-\+\*]\s+', '', text, flags=re.MULTILINE)
+     text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
+     text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)
+     text = re.sub(r'!\[(.*?)\]\((.*?)\)', '', text)
+     text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text)
+     return text.strip()
+
+ def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
+     """Creates DOCX, adds review, saves to temp file, returns path."""
+     logger.debug("Creating and saving DOCX.")
+     # ... (rest of create_and_save_docx from previous turns) ...
+     document = docx.Document()
+
+     now = datetime.datetime.now()
+     timestamp = now.strftime("%Y%m%d_%H%M%S")
+     file_name = f"GEN_AI_Review_{doc_type}_{timestamp}.docx"
+
+     title = f"Gen AI Analysis of {doc_type} between companies {party_a} and {party_b}"
+     document.add_heading(title, level=1)
+
+     if doc_text:
+         document.add_heading("Generated Document", level=2)
+         for para in clean_markdown(doc_text).split("\n"):
+             document.add_paragraph(para)
+
+     if review_text:
+         document.add_heading("LLM Review", level=2)
+         for section in review_text.split("\n\n"):  # Split into sections
+             if section.startswith("Rule-Based Analysis:"):
+                 analysis_heading = document.add_paragraph()
+                 analysis_run = analysis_heading.add_run("Rule-Based Analysis")
+                 analysis_run.font.size = Pt(14)
+                 analysis_run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
+                 for para in section[len("Rule-Based Analysis:"):].split("\n"):
+                     if re.match(r"^\d+\)", para):  # Check for numbered points
+                         p = document.add_paragraph(style='List Number')
+                         p.add_run(para).font.color.rgb = RGBColor(0xFF, 0x00, 0x00)  # red
+                     else:
+                         document.add_paragraph(para)
+
+             elif section.startswith("Wording Analysis:"):
+                 analysis_heading = document.add_paragraph()
+                 analysis_run = analysis_heading.add_run("Wording Analysis")
+                 analysis_run.font.size = Pt(14)
+                 analysis_run.font.color.rgb = RGBColor(0xFF, 0x00, 0x00)
+                 for para in section[len("Wording Analysis:"):].split("\n"):
+                     document.add_paragraph(para)  # black, no numbering
+             else:  # Other sections (if any)
+                 document.add_paragraph(section)
+
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
+         document.save(tmpfile.name)
+         logger.debug(f"DOCX saved to: {tmpfile.name}")
+         return tmpfile.name
+
+ ###############################################################################
+ # 6) Gradio Interface Functions
+ ###############################################################################
+
+ def generate_document_interface(doc_type, party_a, party_b, context, country):
+     """Handles document generation."""
+     logger.info(f"User requested doc generation: {doc_type}, {country}")
+     doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
+     if doc_text.startswith("Error"):
+         return doc_text, None
+     docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
+     return doc_text, docx_file_path
+
+ def review_document_interface(file_data, doc_type, party_a, party_b):
+     """Handles document review."""
+     logger.info("User requested review.")
+     if not file_data:
+         return "No file uploaded.", None
+
+     original_text = parse_uploaded_file_path(file_data)
+     if original_text.startswith("Error") or original_text.startswith("Unsupported"):
+         return original_text, None
+
+     review_text = review_legal_document(original_text, doc_type, party_a, party_b)
+     if review_text.startswith("Error"):
+         return review_text, None
+
+     docx_file_path = create_and_save_docx(None, review_text, doc_type, party_a, party_b)
+     return review_text, docx_file_path
+
+ ###############################################################################
+ # 7) Build & Launch Gradio App
+ ###############################################################################
+ custom_css = """
+ .tab-one {
+     background-color: #D1EEFC; /* Light blue */
+     color: #333;
+ }
+ .tab-two {
+     background-color: #FCEED1; /* Light orange */
+     color: #333;
+ }
+ """
+
+ def build_app():
+     with gr.Blocks(css=custom_css) as demo:
+         gr.Markdown(
+             """
+             # UST Global LLM-based Legal Reviewer
+
+             **Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
+
+             **Disclaimer**: This tool provides assistance but is not a substitute for professional legal advice.
+             """
+         )
+         with gr.Tabs(selected=1):
+             with gr.Tab("Generate Document", visible=False):
+                 doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
+                 party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
+                 party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
+                 context = gr.Textbox(label="Context/Brief", placeholder="Short summary of the agreement...")
+                 country = gr.Dropdown(label="Governing Law (Country)", choices=["India", "Malaysia", "US", "UK", "Singapore", "Japan"], value="India")
+                 gen_button = gr.Button("Generate Document")
+                 gen_output_text = gr.Textbox(label="Generated Document", lines=15, placeholder="Generated document will appear here...")
+                 gen_output_file = gr.File(label="Download DOCX", type="filepath")
+                 gen_button.click(
+                     generate_document_interface,
+                     inputs=[doc_type, party_a, party_b, context, country],
+                     outputs=[gen_output_text, gen_output_file]
+                 )
+
+             with gr.Tab("Review Document", elem_classes="tab-one", id=1):
+                 # Hidden inputs to store values from Generate tab
+                 doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
+                 party_a_review = gr.Textbox(label="Party A Name", visible=False)
+                 party_b_review = gr.Textbox(label="Party B Name", visible=False)
+
+                 file_input = gr.File(label="Upload PDF/DOCX for Review", type="filepath")
+                 review_button = gr.Button("Review Document")
+                 review_output_text = gr.Textbox(label="Review", lines=15, placeholder="Review will appear here...")
+                 review_output_file = gr.File(label="Download Reviewed DOCX", type="filepath")
+                 review_button.click(
+                     review_document_interface,
+                     inputs=[file_input, doc_type_review, party_a_review, party_b_review],
+                     outputs=[review_output_text, review_output_file]
+                 )
+                 # Copy values from Generate to Review tab (hidden fields)
+                 gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
+
+         gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
+
+     return demo
+
+ logger.info("Initializing Gradio interface...")
+ demo = build_app()
+ logger.info("Launching Gradio app.")
+ demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
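Since app.py raises at startup when HUGGINGFACE_API_KEY is missing, it can help to verify the token before building or launching the container. A minimal standalone sketch (assumptions: it uses huggingface_hub, which transformers already depends on; the script name check_token.py and its output are illustrative, not part of this commit):

    # check_token.py - illustrative token check, independent of app.py
    import os

    from huggingface_hub import HfApi

    token = os.environ.get("HUGGINGFACE_API_KEY")
    if not token:
        raise SystemExit("HUGGINGFACE_API_KEY is not set.")

    # whoami() raises an HTTP error if the token is invalid or expired.
    user = HfApi(token=token).whoami()
    print(f"Token looks valid; authenticated as {user.get('name', '<unknown>')}")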
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ # Core Libraries
+ gradio>=4.0,<5.0              # UI framework (tested with 4.14.0, but should work with 4.x)
+ python-docx>=0.8.11           # For creating DOCX files (imported as `docx` in app.py)
+ PyPDF2>=3.0.0                 # For reading PDF files
+ transformers>=4.36.0,<4.37    # Hugging Face Transformers (version range for compatibility)
+ sentencepiece>=0.1.9,<0.2.0   # Required by some Hugging Face tokenizers
+ torch>=2.1.0                  # PyTorch (at least 2.1 for recent transformers compatibility)
+
+ # Supporting packages (usually installed as dependencies of the above).
+ # - If you run into installation conflicts, lines for packages you believe
+ #   are already installed correctly in your environment can be removed or
+ #   commented out. Keeping them listed, however, is generally better
+ #   for reproducibility.
+ typing-extensions>=4.0.0      # For type hints and compatibility (often a dependency)
+ requests>=2.0.0               # Used by transformers and other libraries
+ filelock>=3.0.0               # Used by transformers for managing cache
+ packaging>=20.0               # For version handling
+ regex!=2019.12.17             # Used for text processing
+ tqdm>=4.27                    # For progress bars (used by transformers)
+ numpy>=1.17                   # Fundamental numerical computing
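The list installs with the usual pip install -r requirements.txt. One likely gap worth noting: app.py passes device_map="auto" to the transformers pipeline, which normally also requires the accelerate package; adding a line such as accelerate>=0.25 is probably needed (an assumption based on how transformers handles device_map, not something verified against this specific model).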