Commit
·
9cb0411
1
Parent(s):
f7ab2c4
changed code
Browse files- Dockerfile +27 -13
- app.py +202 -94
- requirements.txt +7 -20
Dockerfile
CHANGED
@@ -1,18 +1,32 @@
|
|
1 |
-
FROM
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
WORKDIR /app
|
4 |
-
COPY requirements.txt /app/
|
5 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
6 |
-
COPY . /app/
|
7 |
|
8 |
-
#
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
13 |
|
14 |
-
|
|
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
CMD ["python", "app.py"]
|
|
|
1 |
+
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04

# Set environment variables
# - PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED: no .pyc files, logs flushed immediately
# - DEBIAN_FRONTEND: keep apt-get from prompting during the build
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV DEBIAN_FRONTEND=noninteractive

# Gradio binds to 127.0.0.1 by default, which is not reachable through a
# published container port. Listen on all interfaces on the standard port.
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    python3-dev \
    build-essential \
    git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Copy requirements file first so the dependency layer is cached across
# application-code changes
COPY requirements.txt .

# Install Python dependencies
RUN pip3 install --no-cache-dir -U pip setuptools wheel
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Port the Gradio server listens on (see GRADIO_SERVER_PORT above)
EXPOSE 7860

# Set the default command to run the application
CMD ["python3", "app.py"]
|
|
app.py
CHANGED
@@ -7,9 +7,11 @@ from io import BytesIO, IOBase
|
|
7 |
import tempfile
|
8 |
import re
|
9 |
import datetime
|
10 |
-
import
|
11 |
-
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
12 |
|
|
|
|
|
|
|
13 |
|
14 |
###############################################################################
|
15 |
# 1) Logging Configuration
|
@@ -21,127 +23,234 @@ logging.basicConfig(
|
|
21 |
logger = logging.getLogger("LLM-Legal-App")
|
22 |
|
23 |
###############################################################################
|
24 |
-
# 2)
|
25 |
###############################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
#
|
28 |
-
|
29 |
-
|
30 |
-
if not api_key:
|
31 |
-
logger.error("Hugging Face API key not found in environment variables.")
|
32 |
-
raise ValueError("Hugging Face API key not found. Set it with `os.environ['HUGGINGFACE_API_KEY'] = 'your_api_key'`")
|
33 |
-
|
34 |
-
logger.info("Successfully retrieved Hugging Face API key.")
|
35 |
-
|
36 |
|
37 |
###############################################################################
|
38 |
-
# 3)
|
39 |
###############################################################################
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
def generate_legal_document(doc_type, party_a, party_b, context, country):
|
49 |
"""
|
50 |
-
Uses
|
51 |
"""
|
52 |
logger.info(f"Starting generation for doc_type={doc_type!r}.")
|
|
|
53 |
party_a = party_a if party_a else "[Party A Not Provided]"
|
54 |
party_b = party_b if party_b else "[Party B Not Provided]"
|
55 |
context = context if context else "[Context Not Provided]"
|
56 |
|
57 |
prompt = f"""
|
58 |
-
Generate a {doc_type} for:
|
59 |
-
1) {party_a}
|
60 |
-
2) {party_b}
|
61 |
-
|
62 |
-
Context/brief of the agreement:
|
63 |
-
{context}.
|
64 |
-
|
65 |
-
The document should include:
|
66 |
-
- Purpose of the {doc_type}
|
67 |
-
- Responsibilities and obligations of each party
|
68 |
-
- Confidentiality terms
|
69 |
-
- Payment terms (use [To Be Determined] if not specified)
|
70 |
-
- Term (duration) and termination
|
71 |
-
- Governing law: {country}
|
72 |
-
- Jurisdiction: [Appropriate region in {country} if not provided]
|
73 |
-
- Signature blocks
|
74 |
-
|
75 |
-
Use formal language, but keep it relatively clear and readable.
|
76 |
-
For any missing information, use placeholders like [To Be Determined].
|
77 |
-
Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
|
78 |
-
"""
|
79 |
logger.debug(f"Generated prompt:\n{prompt}")
|
80 |
|
81 |
-
|
82 |
-
# Use the Hugging Face pipeline
|
83 |
-
generated_text = generator(
|
84 |
-
prompt,
|
85 |
-
max_length=1400, # Adjust as needed
|
86 |
-
num_return_sequences=1,
|
87 |
-
temperature=0.3, # Adjust as needed
|
88 |
-
)[0]['generated_text']
|
89 |
-
|
90 |
-
logger.info("Document generation complete.")
|
91 |
-
return generated_text
|
92 |
-
|
93 |
-
except Exception as e:
|
94 |
-
logger.exception("Error generating legal document.")
|
95 |
-
return f"Error generating document: {e}"
|
96 |
-
|
97 |
|
98 |
def review_legal_document(doc_text, doc_type, party_a, party_b):
|
99 |
-
"""
|
|
|
|
|
100 |
logger.info("Starting document review (rule-based and wording).")
|
101 |
|
102 |
# --- Rule-Based Review ---
|
103 |
rule_based_prompt = f"""
|
104 |
-
|
|
|
|
|
105 |
|
106 |
Document text:
|
107 |
\"\"\"
|
108 |
{doc_text}
|
109 |
\"\"\"
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
"""
|
112 |
logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
|
113 |
|
114 |
try:
|
115 |
-
rule_based_review =
|
116 |
-
rule_based_prompt,
|
117 |
-
max_length=2000,
|
118 |
-
num_return_sequences=1,
|
119 |
-
temperature=0.3,
|
120 |
-
)[0]['generated_text']
|
121 |
except Exception as e:
|
122 |
logger.exception("Error during rule-based review.")
|
123 |
return f"Error during rule-based review: {e}"
|
124 |
|
125 |
# --- Wording Analysis ---
|
126 |
wording_analysis_prompt = f"""
|
127 |
-
|
128 |
|
129 |
Document text:
|
130 |
\"\"\"
|
131 |
{doc_text}
|
132 |
\"\"\"
|
133 |
|
134 |
-
Provide
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
"""
|
136 |
logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
|
137 |
|
138 |
try:
|
139 |
-
wording_analysis =
|
140 |
-
wording_analysis_prompt,
|
141 |
-
max_length=1400,
|
142 |
-
num_return_sequences=1,
|
143 |
-
temperature=0.3,
|
144 |
-
)[0]['generated_text']
|
145 |
except Exception as e:
|
146 |
logger.exception("Error during wording analysis.")
|
147 |
return f"Error during wording analysis: {e}"
|
@@ -149,15 +258,13 @@ Provide an analysis covering: ... (rest of prompt from previous turns) ...
|
|
149 |
combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
|
150 |
return combined_review
|
151 |
|
152 |
-
|
153 |
###############################################################################
|
154 |
# 4) File Parsing (PDF, DOCX)
|
155 |
###############################################################################
|
156 |
|
157 |
def parse_bytesio(file_data: BytesIO) -> str:
|
158 |
-
"""Parses a BytesIO object."""
|
159 |
logger.info("Parsing BytesIO object...")
|
160 |
-
# ... (rest of parse_bytesio function from previous turns) ...
|
161 |
try:
|
162 |
# Attempt to determine file type from content
|
163 |
try:
|
@@ -181,7 +288,6 @@ def parse_bytesio(file_data: BytesIO) -> str:
|
|
181 |
|
182 |
def parse_uploaded_file_path(file_data) -> str:
|
183 |
"""Takes file data, determines type, extracts text."""
|
184 |
-
# ... (rest of parse_uploaded_file_path from previous turns)
|
185 |
if not file_data:
|
186 |
logger.warning("No file provided.")
|
187 |
return ""
|
@@ -222,7 +328,6 @@ def parse_uploaded_file_path(file_data) -> str:
|
|
222 |
|
223 |
def clean_markdown(text):
|
224 |
"""Removes common Markdown formatting."""
|
225 |
-
# ... (rest of clean_markdown from previous turns)
|
226 |
if not text: return ""
|
227 |
text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
|
228 |
text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
|
@@ -237,14 +342,13 @@ def clean_markdown(text):
|
|
237 |
def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
|
238 |
"""Creates DOCX, adds review, saves to temp file, returns path."""
|
239 |
logger.debug("Creating and saving DOCX.")
|
240 |
-
# ... (rest of create_and_save_docx from previous turns) ...
|
241 |
document = docx.Document()
|
242 |
|
243 |
now = datetime.datetime.now()
|
244 |
timestamp = now.strftime("%Y%m%d_%H%M%S")
|
245 |
-
file_name = f"
|
246 |
|
247 |
-
title = f"
|
248 |
document.add_heading(title, level=1)
|
249 |
|
250 |
if doc_text:
|
@@ -277,7 +381,6 @@ def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a
|
|
277 |
else: # Other sections (if any)
|
278 |
document.add_paragraph(section)
|
279 |
|
280 |
-
|
281 |
with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
|
282 |
document.save(tmpfile.name)
|
283 |
logger.debug(f"DOCX saved to: {tmpfile.name}")
|
@@ -292,7 +395,7 @@ def generate_document_interface(doc_type, party_a, party_b, context, country):
|
|
292 |
logger.info(f"User requested doc generation: {doc_type}, {country}")
|
293 |
doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
|
294 |
if doc_text.startswith("Error"):
|
295 |
-
|
296 |
docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
|
297 |
return doc_text, docx_file_path
|
298 |
|
@@ -316,6 +419,7 @@ def review_document_interface(file_data, doc_type, party_a, party_b):
|
|
316 |
###############################################################################
|
317 |
# 7) Build & Launch Gradio App
|
318 |
###############################################################################
|
|
|
319 |
custom_css = """
|
320 |
.tab-one {
|
321 |
background-color: #D1EEFC; /* Light blue */
|
@@ -325,13 +429,15 @@ custom_css = """
|
|
325 |
background-color: #FCEED1; /* Light orange */
|
326 |
color: #333;
|
327 |
}
|
|
|
|
|
328 |
"""
|
329 |
|
330 |
def build_app():
|
331 |
with gr.Blocks(css=custom_css) as demo:
|
332 |
gr.Markdown(
|
333 |
"""
|
334 |
-
# UST Global
|
335 |
|
336 |
**Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
|
337 |
|
@@ -339,7 +445,7 @@ def build_app():
|
|
339 |
"""
|
340 |
)
|
341 |
with gr.Tabs(selected=1):
|
342 |
-
with gr.Tab("Generate Document",visible=False):
|
343 |
doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
|
344 |
party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
|
345 |
party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
|
@@ -354,7 +460,7 @@ def build_app():
|
|
354 |
outputs=[gen_output_text, gen_output_file]
|
355 |
)
|
356 |
|
357 |
-
with gr.Tab("Review Document",elem_classes="tab-one", id=1):
|
358 |
# Hidden inputs to store values from Generate tab
|
359 |
doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
|
360 |
party_a_review = gr.Textbox(label="Party A Name", visible=False)
|
@@ -372,11 +478,13 @@ def build_app():
|
|
372 |
# Copy values from Generate to Review tab (hidden fields)
|
373 |
gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
|
374 |
|
375 |
-
|
376 |
-
|
377 |
return demo
|
378 |
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
|
|
|
|
|
|
|
7 |
import tempfile
|
8 |
import re
|
9 |
import datetime
|
10 |
+
import torch
|
|
|
11 |
|
12 |
+
import gradio as gr
|
13 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
14 |
+
import huggingface_hub
|
15 |
|
16 |
###############################################################################
|
17 |
# 1) Logging Configuration
|
|
|
23 |
logger = logging.getLogger("LLM-Legal-App")
|
24 |
|
25 |
###############################################################################
|
26 |
+
# 2) Initialize Hugging Face Model
|
27 |
###############################################################################
|
28 |
+
def initialize_model():
    """Load the DocumentCogito causal LM and its tokenizer from Hugging Face.

    Returns:
        tuple: ``(model, tokenizer)`` exactly as returned by the two
        ``from_pretrained`` calls.

    Raises:
        ValueError: If the tokenizer or the model cannot be loaded. The
            underlying exception is chained as ``__cause__``.
    """
    logger.info("Initializing DocumentCogito model and tokenizer...")

    model_name = "Daemontatox/DocumentCogito"

    # Half precision is only used when a CUDA device is present; on a
    # CPU-only host fall back to float32 so generation does not depend on
    # float16 CPU kernels.
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    try:
        # An access token might be needed for gated/private models.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map="auto",
            # NOTE(security): this executes Python code shipped in the model
            # repository. Keep only if the repository is trusted.
            trust_remote_code=True,
        )
    except Exception as e:
        logger.exception("Error initializing Hugging Face model.")
        raise ValueError(f"Failed to initialize model: {e}") from e

    logger.info("Successfully initialized DocumentCogito model and tokenizer.")
    return model, tokenizer
|
48 |
|
49 |
+
# Initialize model and tokenizer
|
50 |
+
model, tokenizer = initialize_model()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
###############################################################################
|
53 |
+
# 3) LLM Utility Functions (Generation & Review)
|
54 |
###############################################################################
|
55 |
+
def generate_with_model(prompt, max_length=1400, temperature=0.3):
    """Generate a completion for *prompt* with the module-level model.

    Args:
        prompt: Text fed to the model.
        max_length: Maximum number of NEW tokens to generate (passed to
            ``generate`` as ``max_new_tokens``).
        temperature: Sampling temperature. A value of ``0`` or ``None``
            selects greedy decoding.

    Returns:
        str: The generated continuation without the prompt, stripped of
        surrounding whitespace. On failure, a string beginning with
        ``"Error generating text: "``; callers detect errors by that prefix,
        so no exception is propagated.
    """
    logger.info("Generating text with DocumentCogito model.")

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = inputs["input_ids"].shape[1]

        generation_config = {
            "max_new_tokens": max_length,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if temperature is not None and temperature > 0:
            generation_config.update(
                do_sample=True,
                temperature=temperature,
                top_p=0.9,
            )
        else:
            # Sampling parameters are meaningless for greedy decoding, so
            # they are left out of the call entirely.
            generation_config["do_sample"] = False

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # A causal LM returns prompt + continuation. Drop the prompt by token
        # count: decoding does not always reproduce the prompt text exactly,
        # so a string-prefix check can silently leave the prompt in.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        logger.info("Text generation complete.")
        return response

    except Exception as e:
        logger.exception("Error during text generation.")
        return f"Error generating text: {e}"
|
86 |
|
87 |
def generate_legal_document(doc_type, party_a, party_b, context, country):
|
88 |
"""
|
89 |
+
Uses DocumentCogito to generate a legal document. Returns the document text.
|
90 |
"""
|
91 |
logger.info(f"Starting generation for doc_type={doc_type!r}.")
|
92 |
+
# Fill placeholders if fields are missing
|
93 |
party_a = party_a if party_a else "[Party A Not Provided]"
|
94 |
party_b = party_b if party_b else "[Party B Not Provided]"
|
95 |
context = context if context else "[Context Not Provided]"
|
96 |
|
97 |
prompt = f"""
|
98 |
+
You are a helpful legal assistant. Generate a {doc_type} for:
|
99 |
+
1) {party_a}
|
100 |
+
2) {party_b}
|
101 |
+
|
102 |
+
Context/brief of the agreement:
|
103 |
+
{context}.
|
104 |
+
|
105 |
+
The document should include:
|
106 |
+
- Purpose of the {doc_type}
|
107 |
+
- Responsibilities and obligations of each party
|
108 |
+
- Confidentiality terms
|
109 |
+
- Payment terms (use [To Be Determined] if not specified)
|
110 |
+
- Term (duration) and termination
|
111 |
+
- Governing law: {country}
|
112 |
+
- Jurisdiction: [Appropriate region in {country} if not provided]
|
113 |
+
- Signature blocks
|
114 |
+
|
115 |
+
Use formal language, but keep it relatively clear and readable.
|
116 |
+
For any missing information, use placeholders like [To Be Determined].
|
117 |
+
Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
|
118 |
+
"""
|
119 |
logger.debug(f"Generated prompt:\n{prompt}")
|
120 |
|
121 |
+
return generate_with_model(prompt, max_length=1400, temperature=0.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
|
123 |
def review_legal_document(doc_text, doc_type, party_a, party_b):
|
124 |
+
"""
|
125 |
+
Reviews document: first with rule-based checks, then wording analysis.
|
126 |
+
"""
|
127 |
logger.info("Starting document review (rule-based and wording).")
|
128 |
|
129 |
# --- Rule-Based Review ---
|
130 |
rule_based_prompt = f"""
|
131 |
+
You are a legal AI assistant reviewing a document. Provide a review,
|
132 |
+
structured into the following numbered sections. Be concise and factual. Do NOT
|
133 |
+
use Markdown. Use plain text labels for each section.
|
134 |
|
135 |
Document text:
|
136 |
\"\"\"
|
137 |
{doc_text}
|
138 |
\"\"\"
|
139 |
+
|
140 |
+
Review Sections:
|
141 |
+
|
142 |
+
1) Parties and Authority:
|
143 |
+
- Confirm the full legal names of all parties.
|
144 |
+
- Make sure the people signing can legally commit their organizations.
|
145 |
+
|
146 |
+
2) Scope of Work / Obligations:
|
147 |
+
- Check that the contract clearly describes what each side must do.
|
148 |
+
- Look for deadlines, milestones, or deliverables.
|
149 |
+
- Ensure everything is realistic and not overly vague.
|
150 |
+
|
151 |
+
3) Definitions and Key Terms:
|
152 |
+
- See if there's a section that explains important terms.
|
153 |
+
- Ensure those terms are used the same way throughout the contract.
|
154 |
+
- Avoid or clarify any ambiguous language.
|
155 |
+
|
156 |
+
4) Payment Terms (If Applicable):
|
157 |
+
- Check how much is owed, the currency, and when it's due.
|
158 |
+
- Look for penalties, interest, or late fees.
|
159 |
+
- Note how and when invoices are sent or paid.
|
160 |
+
|
161 |
+
5) Term and Termination:
|
162 |
+
- Identify when the contract starts and ends.
|
163 |
+
- Understand how it can be renewed.
|
164 |
+
- See the conditions and notice required for ending the contract early.
|
165 |
+
|
166 |
+
6) Intellectual Property (IP) Rights:
|
167 |
+
- Confirm who owns any work created under the agreement.
|
168 |
+
- Note if licenses are granted for using the IP, and for how long.
|
169 |
+
|
170 |
+
7) Confidentiality and Privacy:
|
171 |
+
- Check what is considered confidential information.
|
172 |
+
- Look for exceptions (like already public info).
|
173 |
+
- See how long the confidentiality rules apply.
|
174 |
+
|
175 |
+
8) Warranties and Representations:
|
176 |
+
- Note any performance guarantees or quality promises.
|
177 |
+
- Look for disclaimers (like "as is" clauses).
|
178 |
+
|
179 |
+
9) Indemnification:
|
180 |
+
- See who will pay legal costs or damages if there's a lawsuit or claim.
|
181 |
+
- Check any limits on what's covered.
|
182 |
+
|
183 |
+
10) Limitation of Liability:
|
184 |
+
- Check if there's a maximum amount one side can claim in damages.
|
185 |
+
- Look for excluded damages, like lost profits.
|
186 |
+
|
187 |
+
11) Dispute Resolution and Governing Law:
|
188 |
+
- See if disputes go to arbitration, mediation, or court.
|
189 |
+
- Note which state or country's laws will apply.
|
190 |
+
|
191 |
+
12) Force Majeure (Unforeseen Events):
|
192 |
+
- Look for events like natural disasters or war that could suspend obligations.
|
193 |
+
- See if there are notice requirements for these events.
|
194 |
+
|
195 |
+
13) Notices and Amendments:
|
196 |
+
- Check how official notices must be sent (email, mail, etc.).
|
197 |
+
- Find out how to properly change the contract (in writing, signatures, etc.).
|
198 |
+
|
199 |
+
14) Entire Agreement and Severability:
|
200 |
+
- Confirm that this contract replaces all previous agreements.
|
201 |
+
- Ensure that if one clause is invalid, the rest still stands.
|
202 |
+
|
203 |
+
15) Signatures and Dates:
|
204 |
+
- Make sure the right people sign in their proper roles.
|
205 |
+
- Verify the date of signature and when the contract goes into effect.
|
206 |
+
|
207 |
+
16) Ambiguities, Contradictions, and Hidden Clauses:
|
208 |
+
- Watch for contradictory statements or clauses that conflict.
|
209 |
+
- Beware of vague phrases like "best efforts" without clear guidelines.
|
210 |
+
- Check for hidden or "buried" clauses in fine print or attachments.
|
211 |
+
|
212 |
+
17) Compliance and Regulatory Alignment:
|
213 |
+
- Ensure the contract follows relevant laws and rules.
|
214 |
+
- Check for industry-specific requirements.
|
215 |
+
|
216 |
+
18) Practical Considerations:
|
217 |
+
- Make sure deadlines and other requirements are doable.
|
218 |
+
- Confirm all negotiations are reflected in writing.
|
219 |
+
- Avoid blank or undefined items (like fees or dates "to be decided").
|
220 |
"""
|
221 |
logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
|
222 |
|
223 |
try:
|
224 |
+
rule_based_review = generate_with_model(rule_based_prompt, max_length=2000, temperature=0.3)
|
|
|
|
|
|
|
|
|
|
|
225 |
except Exception as e:
|
226 |
logger.exception("Error during rule-based review.")
|
227 |
return f"Error during rule-based review: {e}"
|
228 |
|
229 |
# --- Wording Analysis ---
|
230 |
wording_analysis_prompt = f"""
|
231 |
+
You are a legal AI assistant. Analyze the following legal document for its wording:
|
232 |
|
233 |
Document text:
|
234 |
\"\"\"
|
235 |
{doc_text}
|
236 |
\"\"\"
|
237 |
|
238 |
+
Provide a comprehensive analysis of the document's wording, covering these aspects for the ENTIRE document text:
|
239 |
+
|
240 |
+
1. **Clarity and Precision:** Identify ambiguous or vague language, and suggest improvements.
|
241 |
+
2. **Readability:** Assess the overall readability and suggest improvements for clarity, including sentence structure and complexity.
|
242 |
+
3. **Formal Tone:** Check if the language maintains a formal and professional tone appropriate for a legal document, and suggest changes if needed.
|
243 |
+
4. **Consistency:** Ensure consistent use of terms and phrasing throughout the document. Point out any inconsistencies.
|
244 |
+
5. **Redundancy:** Identify any unnecessary repetition of words or phrases.
|
245 |
+
6. **Jargon and Technical Terms:** Identify jargon or technical terms that might be unclear to a non-expert, and suggest clearer alternatives where appropriate.
|
246 |
+
7. **Overall Recommendations:** Give overall recommendations for improving the document's wording.
|
247 |
+
|
248 |
+
Provide your analysis in plain text, without using Markdown. Label each section of your analysis clearly (e.g., "Clarity and Precision:", "Readability:", etc.).
|
249 |
"""
|
250 |
logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
|
251 |
|
252 |
try:
|
253 |
+
wording_analysis = generate_with_model(wording_analysis_prompt, max_length=1000, temperature=0.3)
|
|
|
|
|
|
|
|
|
|
|
254 |
except Exception as e:
|
255 |
logger.exception("Error during wording analysis.")
|
256 |
return f"Error during wording analysis: {e}"
|
|
|
258 |
combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
|
259 |
return combined_review
|
260 |
|
|
|
261 |
###############################################################################
|
262 |
# 4) File Parsing (PDF, DOCX)
|
263 |
###############################################################################
|
264 |
|
265 |
def parse_bytesio(file_data: BytesIO) -> str:
|
266 |
+
"""Parses a BytesIO object representing a PDF or DOCX."""
|
267 |
logger.info("Parsing BytesIO object...")
|
|
|
268 |
try:
|
269 |
# Attempt to determine file type from content
|
270 |
try:
|
|
|
288 |
|
289 |
def parse_uploaded_file_path(file_data) -> str:
|
290 |
"""Takes file data, determines type, extracts text."""
|
|
|
291 |
if not file_data:
|
292 |
logger.warning("No file provided.")
|
293 |
return ""
|
|
|
328 |
|
329 |
def clean_markdown(text):
|
330 |
"""Removes common Markdown formatting."""
|
|
|
331 |
if not text: return ""
|
332 |
text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
|
333 |
text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
|
|
|
342 |
def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
|
343 |
"""Creates DOCX, adds review, saves to temp file, returns path."""
|
344 |
logger.debug("Creating and saving DOCX.")
|
|
|
345 |
document = docx.Document()
|
346 |
|
347 |
now = datetime.datetime.now()
|
348 |
timestamp = now.strftime("%Y%m%d_%H%M%S")
|
349 |
+
file_name = f"HF_AI_Review_{doc_type}_{timestamp}.docx"
|
350 |
|
351 |
+
title = f"DocumentCogito Analysis of {doc_type} between companies {party_a} and {party_b}"
|
352 |
document.add_heading(title, level=1)
|
353 |
|
354 |
if doc_text:
|
|
|
381 |
else: # Other sections (if any)
|
382 |
document.add_paragraph(section)
|
383 |
|
|
|
384 |
with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
|
385 |
document.save(tmpfile.name)
|
386 |
logger.debug(f"DOCX saved to: {tmpfile.name}")
|
|
|
395 |
logger.info(f"User requested doc generation: {doc_type}, {country}")
|
396 |
doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
|
397 |
if doc_text.startswith("Error"):
|
398 |
+
return doc_text, None
|
399 |
docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
|
400 |
return doc_text, docx_file_path
|
401 |
|
|
|
419 |
###############################################################################
|
420 |
# 7) Build & Launch Gradio App
|
421 |
###############################################################################
|
422 |
+
# Define custom CSS in a string.
|
423 |
custom_css = """
|
424 |
.tab-one {
|
425 |
background-color: #D1EEFC; /* Light blue */
|
|
|
429 |
background-color: #FCEED1; /* Light orange */
|
430 |
color: #333;
|
431 |
}
|
432 |
+
/* If you want to style the tab label differently, you may need to target
|
433 |
+
specific child elements (like a .tab__header) within the class. */
|
434 |
"""
|
435 |
|
436 |
def build_app():
|
437 |
with gr.Blocks(css=custom_css) as demo:
|
438 |
gr.Markdown(
|
439 |
"""
|
440 |
+
# UST Global Legal Document Analyzer (Hugging Face Version)
|
441 |
|
442 |
**Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
|
443 |
|
|
|
445 |
"""
|
446 |
)
|
447 |
with gr.Tabs(selected=1):
|
448 |
+
with gr.Tab("Generate Document", visible=False):
|
449 |
doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
|
450 |
party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
|
451 |
party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
|
|
|
460 |
outputs=[gen_output_text, gen_output_file]
|
461 |
)
|
462 |
|
463 |
+
with gr.Tab("Review Document", elem_classes="tab-one", id=1):
|
464 |
# Hidden inputs to store values from Generate tab
|
465 |
doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
|
466 |
party_a_review = gr.Textbox(label="Party A Name", visible=False)
|
|
|
478 |
# Copy values from Generate to Review tab (hidden fields)
|
479 |
gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
|
480 |
|
481 |
+
gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
|
|
|
482 |
return demo
|
483 |
|
484 |
+
# For Hugging Face Spaces deployment
|
485 |
+
if __name__ == "__main__":
|
486 |
+
# create_requirements_file()
|
487 |
+
logger.info("Initializing Gradio interface...")
|
488 |
+
demo = build_app()
|
489 |
+
logger.info("Launching Gradio app.")
|
490 |
+
demo.launch()
|
requirements.txt
CHANGED
@@ -1,20 +1,7 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
# Other commonly used packages (you might already have these)
|
10 |
-
# - If you encounter installation issues, you can try removing or commenting out
|
11 |
-
# lines for packages you believe are already installed correctly in your
|
12 |
-
# environment. However, it's generally good practice to include them
|
13 |
-
# for reproducibility.
|
14 |
-
typing-extensions>=4.0.0 # For type hints and compatibility (often a dependency)
|
15 |
-
requests>=2.0.0 # Used by transformers and other libraries
|
16 |
-
filelock>=3.0.0 # Used by transformers for managing cache
|
17 |
-
packaging>=20.0 # For version handling
|
18 |
-
regex!=2019.12.17 # Used for text processing
|
19 |
-
tqdm>=4.27 # For progress bars (used by transformers)
|
20 |
-
numpy>=1.17 # Fundamental numerical computing
|
|
|
1 |
+
gradio>=3.50.2
|
2 |
+
transformers>=4.35.0
|
3 |
+
torch>=2.0.0
|
4 |
+
python-docx>=0.8.11
|
5 |
+
PyPDF2>=3.0.0
|
6 |
+
huggingface_hub>=0.19.0
|
7 |
+
accelerate>=0.20.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|