vishalsh13 committed on
Commit
9cb0411
·
1 Parent(s): f7ab2c4

changed code

Browse files
Files changed (3) hide show
  1. Dockerfile +27 -13
  2. app.py +202 -94
  3. requirements.txt +7 -20
Dockerfile CHANGED
@@ -1,18 +1,32 @@
1
- FROM python:3.10-slim
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  WORKDIR /app
4
- COPY requirements.txt /app/
5
- RUN pip install --no-cache-dir -r requirements.txt
6
- COPY . /app/
7
 
8
- # Create both /app/uploads and /app/.cache, and set permissions
9
- # Try chown first; if it fails, fall back to chmod (less secure)
10
- RUN mkdir -p /app/uploads /app/.cache && \
11
- chown -R 1000:1000 /app/uploads /app/.cache || \
12
- chmod -R 777 /app/uploads /app/.cache
 
13
 
14
- ENV HF_HOME=/app/.cache
 
15
 
16
- EXPOSE 7860
17
- # Run app.py when the container launches
18
- CMD ["python", "app.py"]
 
1
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
2
 
3
+ # Set environment variables
4
+ ENV PYTHONDONTWRITEBYTECODE=1
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ # Install system dependencies
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ python3 \
11
+ python3-pip \
12
+ python3-dev \
13
+ build-essential \
14
+ git \
15
+ && apt-get clean \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ # Set the working directory
19
  WORKDIR /app
 
 
 
20
 
21
+ # Copy requirements file
22
+ COPY requirements.txt .
23
+
24
+ # Install Python dependencies
25
+ RUN pip3 install --no-cache-dir -U pip setuptools wheel
26
+ RUN pip3 install --no-cache-dir -r requirements.txt
27
 
28
+ # Copy application code
29
+ COPY app.py .
30
 
31
+ # Set the default command to run the application
32
+ CMD ["python3", "app.py"]
 
app.py CHANGED
@@ -7,9 +7,11 @@ from io import BytesIO, IOBase
7
  import tempfile
8
  import re
9
  import datetime
10
- import gradio as gr
11
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
12
 
 
 
 
13
 
14
  ###############################################################################
15
  # 1) Logging Configuration
@@ -21,127 +23,234 @@ logging.basicConfig(
21
  logger = logging.getLogger("LLM-Legal-App")
22
 
23
  ###############################################################################
24
- # 2) Retrieve API Key (Hugging Face)
25
  ###############################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- # Use os.environ to get the API key
28
- api_key = os.environ.get("HUGGINGFACE_API_KEY")
29
-
30
- if not api_key:
31
- logger.error("Hugging Face API key not found in environment variables.")
32
- raise ValueError("Hugging Face API key not found. Set it with `os.environ['HUGGINGFACE_API_KEY'] = 'your_api_key'`")
33
-
34
- logger.info("Successfully retrieved Hugging Face API key.")
35
-
36
 
37
  ###############################################################################
38
- # 3) Hugging Face Model and Utility Functions
39
  ###############################################################################
40
-
41
- # Initialize the Hugging Face model and tokenizer.
42
- model_name = "Daemontatox/DocumentCogito"
43
- tokenizer = AutoTokenizer.from_pretrained(model_name)
44
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=api_key)
45
- generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device_map="auto") # Use device_map="auto"
46
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  def generate_legal_document(doc_type, party_a, party_b, context, country):
49
  """
50
- Uses the Hugging Face model to generate a legal document.
51
  """
52
  logger.info(f"Starting generation for doc_type={doc_type!r}.")
 
53
  party_a = party_a if party_a else "[Party A Not Provided]"
54
  party_b = party_b if party_b else "[Party B Not Provided]"
55
  context = context if context else "[Context Not Provided]"
56
 
57
  prompt = f"""
58
- Generate a {doc_type} for:
59
- 1) {party_a}
60
- 2) {party_b}
61
-
62
- Context/brief of the agreement:
63
- {context}.
64
-
65
- The document should include:
66
- - Purpose of the {doc_type}
67
- - Responsibilities and obligations of each party
68
- - Confidentiality terms
69
- - Payment terms (use [To Be Determined] if not specified)
70
- - Term (duration) and termination
71
- - Governing law: {country}
72
- - Jurisdiction: [Appropriate region in {country} if not provided]
73
- - Signature blocks
74
-
75
- Use formal language, but keep it relatively clear and readable.
76
- For any missing information, use placeholders like [To Be Determined].
77
- Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
78
- """
79
  logger.debug(f"Generated prompt:\n{prompt}")
80
 
81
- try:
82
- # Use the Hugging Face pipeline
83
- generated_text = generator(
84
- prompt,
85
- max_length=1400, # Adjust as needed
86
- num_return_sequences=1,
87
- temperature=0.3, # Adjust as needed
88
- )[0]['generated_text']
89
-
90
- logger.info("Document generation complete.")
91
- return generated_text
92
-
93
- except Exception as e:
94
- logger.exception("Error generating legal document.")
95
- return f"Error generating document: {e}"
96
-
97
 
98
  def review_legal_document(doc_text, doc_type, party_a, party_b):
99
- """Reviews document using the Hugging Face model."""
 
 
100
  logger.info("Starting document review (rule-based and wording).")
101
 
102
  # --- Rule-Based Review ---
103
  rule_based_prompt = f"""
104
- Review the following document and provide feedback based on these rules:
 
 
105
 
106
  Document text:
107
  \"\"\"
108
  {doc_text}
109
  \"\"\"
110
- 1) Parties and Authority: ... (rest of prompt from previous turns) ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  """
112
  logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
113
 
114
  try:
115
- rule_based_review = generator(
116
- rule_based_prompt,
117
- max_length=2000,
118
- num_return_sequences=1,
119
- temperature=0.3,
120
- )[0]['generated_text']
121
  except Exception as e:
122
  logger.exception("Error during rule-based review.")
123
  return f"Error during rule-based review: {e}"
124
 
125
  # --- Wording Analysis ---
126
  wording_analysis_prompt = f"""
127
- Analyze the wording of the following legal document:
128
 
129
  Document text:
130
  \"\"\"
131
  {doc_text}
132
  \"\"\"
133
 
134
- Provide an analysis covering: ... (rest of prompt from previous turns) ...
 
 
 
 
 
 
 
 
 
 
135
  """
136
  logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
137
 
138
  try:
139
- wording_analysis = generator(
140
- wording_analysis_prompt,
141
- max_length=1400,
142
- num_return_sequences=1,
143
- temperature=0.3,
144
- )[0]['generated_text']
145
  except Exception as e:
146
  logger.exception("Error during wording analysis.")
147
  return f"Error during wording analysis: {e}"
@@ -149,15 +258,13 @@ Provide an analysis covering: ... (rest of prompt from previous turns) ...
149
  combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
150
  return combined_review
151
 
152
-
153
  ###############################################################################
154
  # 4) File Parsing (PDF, DOCX)
155
  ###############################################################################
156
 
157
  def parse_bytesio(file_data: BytesIO) -> str:
158
- """Parses a BytesIO object."""
159
  logger.info("Parsing BytesIO object...")
160
- # ... (rest of parse_bytesio function from previous turns) ...
161
  try:
162
  # Attempt to determine file type from content
163
  try:
@@ -181,7 +288,6 @@ def parse_bytesio(file_data: BytesIO) -> str:
181
 
182
  def parse_uploaded_file_path(file_data) -> str:
183
  """Takes file data, determines type, extracts text."""
184
- # ... (rest of parse_uploaded_file_path from previous turns)
185
  if not file_data:
186
  logger.warning("No file provided.")
187
  return ""
@@ -222,7 +328,6 @@ def parse_uploaded_file_path(file_data) -> str:
222
 
223
  def clean_markdown(text):
224
  """Removes common Markdown formatting."""
225
- # ... (rest of clean_markdown from previous turns)
226
  if not text: return ""
227
  text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
228
  text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
@@ -237,14 +342,13 @@ def clean_markdown(text):
237
  def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
238
  """Creates DOCX, adds review, saves to temp file, returns path."""
239
  logger.debug("Creating and saving DOCX.")
240
- # ... (rest of create_and_save_docx from previous turns) ...
241
  document = docx.Document()
242
 
243
  now = datetime.datetime.now()
244
  timestamp = now.strftime("%Y%m%d_%H%M%S")
245
- file_name = f"GEN_AI_Review_{doc_type}_{timestamp}.docx"
246
 
247
- title = f"Gen AI Analysis of {doc_type} between companies {party_a} and {party_b}"
248
  document.add_heading(title, level=1)
249
 
250
  if doc_text:
@@ -277,7 +381,6 @@ def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a
277
  else: # Other sections (if any)
278
  document.add_paragraph(section)
279
 
280
-
281
  with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
282
  document.save(tmpfile.name)
283
  logger.debug(f"DOCX saved to: {tmpfile.name}")
@@ -292,7 +395,7 @@ def generate_document_interface(doc_type, party_a, party_b, context, country):
292
  logger.info(f"User requested doc generation: {doc_type}, {country}")
293
  doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
294
  if doc_text.startswith("Error"):
295
- return doc_text, None
296
  docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
297
  return doc_text, docx_file_path
298
 
@@ -316,6 +419,7 @@ def review_document_interface(file_data, doc_type, party_a, party_b):
316
  ###############################################################################
317
  # 7) Build & Launch Gradio App
318
  ###############################################################################
 
319
  custom_css = """
320
  .tab-one {
321
  background-color: #D1EEFC; /* Light blue */
@@ -325,13 +429,15 @@ custom_css = """
325
  background-color: #FCEED1; /* Light orange */
326
  color: #333;
327
  }
 
 
328
  """
329
 
330
  def build_app():
331
  with gr.Blocks(css=custom_css) as demo:
332
  gr.Markdown(
333
  """
334
- # UST Global LLM-based Legal Reviewer
335
 
336
  **Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
337
 
@@ -339,7 +445,7 @@ def build_app():
339
  """
340
  )
341
  with gr.Tabs(selected=1):
342
- with gr.Tab("Generate Document",visible=False):
343
  doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
344
  party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
345
  party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
@@ -354,7 +460,7 @@ def build_app():
354
  outputs=[gen_output_text, gen_output_file]
355
  )
356
 
357
- with gr.Tab("Review Document",elem_classes="tab-one", id=1):
358
  # Hidden inputs to store values from Generate tab
359
  doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
360
  party_a_review = gr.Textbox(label="Party A Name", visible=False)
@@ -372,11 +478,13 @@ def build_app():
372
  # Copy values from Generate to Review tab (hidden fields)
373
  gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
374
 
375
- gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
376
-
377
  return demo
378
 
379
- logger.info("Initializing Gradio interface...")
380
- demo = build_app()
381
- logger.info("Launching Gradio app.")
382
- demo.launch(debug=True)
 
 
 
 
7
  import tempfile
8
  import re
9
  import datetime
10
+ import torch
 
11
 
12
+ import gradio as gr
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+ import huggingface_hub
15
 
16
  ###############################################################################
17
  # 1) Logging Configuration
 
23
  logger = logging.getLogger("LLM-Legal-App")
24
 
25
  ###############################################################################
26
+ # 2) Initialize Hugging Face Model
27
  ###############################################################################
28
def initialize_model():
    """Load the DocumentCogito causal language model and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` as produced by the respective
        ``from_pretrained`` calls.

    Raises:
        ValueError: If the tokenizer or the model cannot be loaded. The
            underlying exception is chained as ``__cause__`` so the full
            traceback stays available to the caller.
    """
    logger.info("Initializing DocumentCogito model and tokenizer...")
    model_name = "Daemontatox/DocumentCogito"

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            # NOTE(review): trust_remote_code lets the model repository run
            # its own Python code at load time -- confirm the repo is trusted.
            trust_remote_code=True,
        )
    except Exception as e:
        logger.exception("Error initializing Hugging Face model.")
        # Keep the ValueError type callers may already catch, but chain the
        # original exception explicitly instead of dropping the cause.
        raise ValueError(f"Failed to initialize model: {e}") from e

    logger.info("Successfully initialized DocumentCogito model and tokenizer.")
    return model, tokenizer
48
 
49
+ # Initialize model and tokenizer
50
+ model, tokenizer = initialize_model()
 
 
 
 
 
 
 
51
 
52
  ###############################################################################
53
+ # 3) LLM Utility Functions (Generation & Review)
54
  ###############################################################################
55
def generate_with_model(prompt, max_length=1400, temperature=0.3):
    """Generate a completion for ``prompt`` with the module-level model.

    Args:
        prompt: Full text prompt sent to the model.
        max_length: Maximum number of NEW tokens to generate. It is passed
            to ``generate`` as ``max_new_tokens``, so the prompt length is
            not counted against it.
        temperature: Sampling temperature. A value of 0 or less disables
            sampling (``do_sample=False``).

    Returns:
        str: The generated continuation without the prompt, stripped of
        surrounding whitespace. If anything fails, a string starting with
        "Error generating text:" is returned instead of raising.
    """
    logger.info("Generating text with DocumentCogito model.")

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_token_count = inputs["input_ids"].shape[-1]

        use_sampling = temperature > 0
        generation_config = {
            "max_new_tokens": max_length,
            "do_sample": use_sampling,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if use_sampling:
            # Sampling-only knobs: passing them with do_sample=False has no
            # effect and only triggers configuration warnings.
            generation_config["temperature"] = temperature
            generation_config["top_p"] = 0.9

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # The model is loaded as a causal LM, whose output sequence begins
        # with the prompt tokens. Slice them off by token count rather than
        # comparing decoded text, because decoding does not always reproduce
        # the prompt byte-for-byte (whitespace, special tokens).
        generated_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        logger.info("Text generation complete.")
        return response

    except Exception as e:
        logger.exception("Error during text generation.")
        return f"Error generating text: {e}"
86
 
87
  def generate_legal_document(doc_type, party_a, party_b, context, country):
88
  """
89
+ Uses DocumentCogito to generate a legal document. Returns the document text.
90
  """
91
  logger.info(f"Starting generation for doc_type={doc_type!r}.")
92
+ # Fill placeholders if fields are missing
93
  party_a = party_a if party_a else "[Party A Not Provided]"
94
  party_b = party_b if party_b else "[Party B Not Provided]"
95
  context = context if context else "[Context Not Provided]"
96
 
97
  prompt = f"""
98
+ You are a helpful legal assistant. Generate a {doc_type} for:
99
+ 1) {party_a}
100
+ 2) {party_b}
101
+
102
+ Context/brief of the agreement:
103
+ {context}.
104
+
105
+ The document should include:
106
+ - Purpose of the {doc_type}
107
+ - Responsibilities and obligations of each party
108
+ - Confidentiality terms
109
+ - Payment terms (use [To Be Determined] if not specified)
110
+ - Term (duration) and termination
111
+ - Governing law: {country}
112
+ - Jurisdiction: [Appropriate region in {country} if not provided]
113
+ - Signature blocks
114
+
115
+ Use formal language, but keep it relatively clear and readable.
116
+ For any missing information, use placeholders like [To Be Determined].
117
+ Include a disclaimer that this is a draft and not legally binding until reviewed and signed.
118
+ """
119
  logger.debug(f"Generated prompt:\n{prompt}")
120
 
121
+ return generate_with_model(prompt, max_length=1400, temperature=0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  def review_legal_document(doc_text, doc_type, party_a, party_b):
124
+ """
125
+ Reviews document: first with rule-based checks, then wording analysis.
126
+ """
127
  logger.info("Starting document review (rule-based and wording).")
128
 
129
  # --- Rule-Based Review ---
130
  rule_based_prompt = f"""
131
+ You are a legal AI assistant reviewing a document. Provide a review,
132
+ structured into the following numbered sections. Be concise and factual. Do NOT
133
+ use Markdown. Use plain text labels for each section.
134
 
135
  Document text:
136
  \"\"\"
137
  {doc_text}
138
  \"\"\"
139
+
140
+ Review Sections:
141
+
142
+ 1) Parties and Authority:
143
+ - Confirm the full legal names of all parties.
144
+ - Make sure the people signing can legally commit their organizations.
145
+
146
+ 2) Scope of Work / Obligations:
147
+ - Check that the contract clearly describes what each side must do.
148
+ - Look for deadlines, milestones, or deliverables.
149
+ - Ensure everything is realistic and not overly vague.
150
+
151
+ 3) Definitions and Key Terms:
152
+ - See if there's a section that explains important terms.
153
+ - Ensure those terms are used the same way throughout the contract.
154
+ - Avoid or clarify any ambiguous language.
155
+
156
+ 4) Payment Terms (If Applicable):
157
+ - Check how much is owed, the currency, and when it's due.
158
+ - Look for penalties, interest, or late fees.
159
+ - Note how and when invoices are sent or paid.
160
+
161
+ 5) Term and Termination:
162
+ - Identify when the contract starts and ends.
163
+ - Understand how it can be renewed.
164
+ - See the conditions and notice required for ending the contract early.
165
+
166
+ 6) Intellectual Property (IP) Rights:
167
+ - Confirm who owns any work created under the agreement.
168
+ - Note if licenses are granted for using the IP, and for how long.
169
+
170
+ 7) Confidentiality and Privacy:
171
+ - Check what is considered confidential information.
172
+ - Look for exceptions (like already public info).
173
+ - See how long the confidentiality rules apply.
174
+
175
+ 8) Warranties and Representations:
176
+ - Note any performance guarantees or quality promises.
177
+ - Look for disclaimers (like "as is" clauses).
178
+
179
+ 9) Indemnification:
180
+ - See who will pay legal costs or damages if there's a lawsuit or claim.
181
+ - Check any limits on what's covered.
182
+
183
+ 10) Limitation of Liability:
184
+ - Check if there's a maximum amount one side can claim in damages.
185
+ - Look for excluded damages, like lost profits.
186
+
187
+ 11) Dispute Resolution and Governing Law:
188
+ - See if disputes go to arbitration, mediation, or court.
189
+ - Note which state or country's laws will apply.
190
+
191
+ 12) Force Majeure (Unforeseen Events):
192
+ - Look for events like natural disasters or war that could suspend obligations.
193
+ - See if there are notice requirements for these events.
194
+
195
+ 13) Notices and Amendments:
196
+ - Check how official notices must be sent (email, mail, etc.).
197
+ - Find out how to properly change the contract (in writing, signatures, etc.).
198
+
199
+ 14) Entire Agreement and Severability:
200
+ - Confirm that this contract replaces all previous agreements.
201
+ - Ensure that if one clause is invalid, the rest still stands.
202
+
203
+ 15) Signatures and Dates:
204
+ - Make sure the right people sign in their proper roles.
205
+ - Verify the date of signature and when the contract goes into effect.
206
+
207
+ 16) Ambiguities, Contradictions, and Hidden Clauses:
208
+ - Watch for contradictory statements or clauses that conflict.
209
+ - Beware of vague phrases like "best efforts" without clear guidelines.
210
+ - Check for hidden or "buried" clauses in fine print or attachments.
211
+
212
+ 17) Compliance and Regulatory Alignment:
213
+ - Ensure the contract follows relevant laws and rules.
214
+ - Check for industry-specific requirements.
215
+
216
+ 18) Practical Considerations:
217
+ - Make sure deadlines and other requirements are doable.
218
+ - Confirm all negotiations are reflected in writing.
219
+ - Avoid blank or undefined items (like fees or dates "to be decided").
220
  """
221
  logger.debug(f"Generated rule-based review prompt:\n{rule_based_prompt}")
222
 
223
  try:
224
+ rule_based_review = generate_with_model(rule_based_prompt, max_length=2000, temperature=0.3)
 
 
 
 
 
225
  except Exception as e:
226
  logger.exception("Error during rule-based review.")
227
  return f"Error during rule-based review: {e}"
228
 
229
  # --- Wording Analysis ---
230
  wording_analysis_prompt = f"""
231
+ You are a legal AI assistant. Analyze the following legal document for its wording:
232
 
233
  Document text:
234
  \"\"\"
235
  {doc_text}
236
  \"\"\"
237
 
238
+ Provide a comprehensive analysis of the document's wording, covering these aspects for the ENTIRE document text:
239
+
240
+ 1. **Clarity and Precision:** Identify ambiguous or vague language, and suggest improvements.
241
+ 2. **Readability:** Assess the overall readability and suggest improvements for clarity, including sentence structure and complexity.
242
+ 3. **Formal Tone:** Check if the language maintains a formal and professional tone appropriate for a legal document, and suggest changes if needed.
243
+ 4. **Consistency:** Ensure consistent use of terms and phrasing throughout the document. Point out any inconsistencies.
244
+ 5. **Redundancy:** Identify any unnecessary repetition of words or phrases.
245
+ 6. **Jargon and Technical Terms:** Identify jargon or technical terms that might be unclear to a non-expert, and suggest clearer alternatives where appropriate.
246
+ 7. **Overall Recommendations:** Give overall recommendations for improving the document's wording.
247
+
248
+ Provide your analysis in plain text, without using Markdown. Label each section of your analysis clearly (e.g., "Clarity and Precision:", "Readability:", etc.).
249
  """
250
  logger.debug(f"Generated wording analysis prompt:\n{wording_analysis_prompt}")
251
 
252
  try:
253
+ wording_analysis = generate_with_model(wording_analysis_prompt, max_length=1000, temperature=0.3)
 
 
 
 
 
254
  except Exception as e:
255
  logger.exception("Error during wording analysis.")
256
  return f"Error during wording analysis: {e}"
 
258
  combined_review = f"Rule-Based Analysis:\n\n{rule_based_review}\n\nWording Analysis:\n\n{wording_analysis}"
259
  return combined_review
260
 
 
261
  ###############################################################################
262
  # 4) File Parsing (PDF, DOCX)
263
  ###############################################################################
264
 
265
  def parse_bytesio(file_data: BytesIO) -> str:
266
+ """Parses a BytesIO object representing a PDF or DOCX."""
267
  logger.info("Parsing BytesIO object...")
 
268
  try:
269
  # Attempt to determine file type from content
270
  try:
 
288
 
289
  def parse_uploaded_file_path(file_data) -> str:
290
  """Takes file data, determines type, extracts text."""
 
291
  if not file_data:
292
  logger.warning("No file provided.")
293
  return ""
 
328
 
329
  def clean_markdown(text):
330
  """Removes common Markdown formatting."""
 
331
  if not text: return ""
332
  text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
333
  text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)
 
342
  def create_and_save_docx(doc_text, review_text=None, doc_type="Unknown", party_a="Party A", party_b="Party B"):
343
  """Creates DOCX, adds review, saves to temp file, returns path."""
344
  logger.debug("Creating and saving DOCX.")
 
345
  document = docx.Document()
346
 
347
  now = datetime.datetime.now()
348
  timestamp = now.strftime("%Y%m%d_%H%M%S")
349
+ file_name = f"HF_AI_Review_{doc_type}_{timestamp}.docx"
350
 
351
+ title = f"DocumentCogito Analysis of {doc_type} between companies {party_a} and {party_b}"
352
  document.add_heading(title, level=1)
353
 
354
  if doc_text:
 
381
  else: # Other sections (if any)
382
  document.add_paragraph(section)
383
 
 
384
  with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{file_name}") as tmpfile:
385
  document.save(tmpfile.name)
386
  logger.debug(f"DOCX saved to: {tmpfile.name}")
 
395
  logger.info(f"User requested doc generation: {doc_type}, {country}")
396
  doc_text = generate_legal_document(doc_type, party_a, party_b, context, country)
397
  if doc_text.startswith("Error"):
398
+ return doc_text, None
399
  docx_file_path = create_and_save_docx(doc_text, doc_type=doc_type, party_a=party_a, party_b=party_b)
400
  return doc_text, docx_file_path
401
 
 
419
  ###############################################################################
420
  # 7) Build & Launch Gradio App
421
  ###############################################################################
422
+ # Define custom CSS in a string.
423
  custom_css = """
424
  .tab-one {
425
  background-color: #D1EEFC; /* Light blue */
 
429
  background-color: #FCEED1; /* Light orange */
430
  color: #333;
431
  }
432
+ /* If you want to style the tab label differently, you may need to target
433
+ specific child elements (like a .tab__header) within the class. */
434
  """
435
 
436
  def build_app():
437
  with gr.Blocks(css=custom_css) as demo:
438
  gr.Markdown(
439
  """
440
+ # UST Global Legal Document Analyzer (Hugging Face Version)
441
 
442
  **Review an Existing MOU, SOW, MSA in PDF/DOCX format**: Upload a document for analysis.
443
 
 
445
  """
446
  )
447
  with gr.Tabs(selected=1):
448
+ with gr.Tab("Generate Document", visible=False):
449
  doc_type = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU")
450
  party_a = gr.Textbox(label="Party A Name", placeholder="e.g., Tech Innovations LLC")
451
  party_b = gr.Textbox(label="Party B Name", placeholder="e.g., Global Consulting Corp")
 
460
  outputs=[gen_output_text, gen_output_file]
461
  )
462
 
463
+ with gr.Tab("Review Document", elem_classes="tab-one", id=1):
464
  # Hidden inputs to store values from Generate tab
465
  doc_type_review = gr.Dropdown(label="Document Type", choices=["MOU", "MSA", "SoW", "NDA"], value="MOU", visible=False)
466
  party_a_review = gr.Textbox(label="Party A Name", visible=False)
 
478
  # Copy values from Generate to Review tab (hidden fields)
479
  gen_button.click(lambda x, y, z: (x, y, z), [doc_type, party_a, party_b], [doc_type_review, party_a_review, party_b_review])
480
 
481
+ gr.Markdown("**Note:** Scanned PDFs may not parse correctly. .docx is generally preferred.")
 
482
  return demo
483
 
484
+ # For Hugging Face Spaces deployment
485
+ if __name__ == "__main__":
486
+ # create_requirements_file()
487
+ logger.info("Initializing Gradio interface...")
488
+ demo = build_app()
489
+ logger.info("Launching Gradio app.")
490
+ demo.launch()
requirements.txt CHANGED
@@ -1,20 +1,7 @@
1
- # Core Libraries
2
- gradio>=4.0,<5.0 # UI framework (tested with 4.14.0, but should work with 4.x)
3
- python-docx # For creating DOCX files (python-docx)
4
- PyPDF2>=3.0.0 # For reading PDF files
5
- transformers>=4.36.0,<4.37 # Hugging Face Transformers (version range for compatibility)
6
- sentencepiece>=0.1.9,<0.2.0 # Required by some Hugging Face tokenizers
7
- torch>=2.1.0 # PyTorch (at least 2.1 for recent transformers compatibility)
8
-
9
- # Other commonly used packages (you might already have these)
10
- # - If you encounter installation issues, you can try removing or commenting out
11
- # lines for packages you believe are already installed correctly in your
12
- # environment. However, it's generally good practice to include them
13
- # for reproducibility.
14
- typing-extensions>=4.0.0 # For type hints and compatibility (often a dependency)
15
- requests>=2.0.0 # Used by transformers and other libraries
16
- filelock>=3.0.0 # Used by transformers for managing cache
17
- packaging>=20.0 # For version handling
18
- regex!=2019.12.17 # Used for text processing
19
- tqdm>=4.27 # For progress bars (used by transformers)
20
- numpy>=1.17 # Fundamental numerical computing
 
1
+ gradio>=3.50.2
2
+ transformers>=4.35.0
3
+ torch>=2.0.0
4
+ python-docx>=0.8.11
5
+ PyPDF2>=3.0.0
6
+ huggingface_hub>=0.19.0
7
+ accelerate>=0.20.0