tejash300 committed on
Commit
b6de26f
·
verified ·
1 Parent(s): 21289a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -33
app.py CHANGED
@@ -1,4 +1,6 @@
1
  import os
 
 
2
  import io
3
  import torch
4
  import uvicorn
@@ -19,7 +21,7 @@ from pyngrok import ngrok
19
  from threading import Thread
20
  import time
21
  import uuid
22
- import subprocess # Used for running ffmpeg commands
23
 
24
  # ✅ Ensure compatibility with Google Colab
25
  try:
@@ -49,7 +51,7 @@ app.add_middleware(
49
 
50
  # ✅ Initialize document storage
51
  document_storage = {}
52
- chat_history = [] # βœ… Added global chat history
53
 
54
  # ✅ Function to store document context by task ID
55
  def store_document_context(task_id, text):
@@ -68,26 +70,18 @@ def load_document_context(task_id):
68
 
69
  def fine_tune_cuad_model():
70
  """
71
- Fine tunes a question-answering model on the CUAD (Contract Understanding Atticus Dataset)
72
- for detailed clause extraction. This demo function uses one epoch for demonstration;
73
- adjust training parameters as needed.
74
  """
75
  from datasets import load_dataset
76
  import numpy as np
77
- # Optionally, load a metric (here we leave metrics out for brevity)
78
- from transformers import Trainer, TrainingArguments
79
- from transformers import AutoModelForQuestionAnswering
80
 
81
  print("βœ… Loading CUAD dataset for fine tuning...")
82
- # Load the CUAD QA dataset (SQuAD-style) with custom code allowed
83
  dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
84
 
85
- # Use the train split with a larger subset for production fine tuning
86
  if "train" in dataset:
87
- # Select a larger subset for training, e.g., 1000 examples
88
  train_dataset = dataset["train"].select(range(1000))
89
-
90
- # For validation, you might select around 200 examples
91
  if "validation" in dataset:
92
  val_dataset = dataset["validation"].select(range(200))
93
  else:
@@ -99,12 +93,10 @@ def fine_tune_cuad_model():
99
 
100
  print("βœ… Preparing training features...")
101
 
102
- # Load a QA model and its tokenizer. Here we use deepset/roberta-base-squad2.
103
  tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
104
  model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
105
 
106
  def prepare_train_features(examples):
107
- # Tokenize with question and context; use truncation only on the context.
108
  tokenized_examples = tokenizer(
109
  examples["question"],
110
  examples["context"],
@@ -153,11 +145,9 @@ def fine_tune_cuad_model():
153
  train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
154
  val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
155
 
156
- # Set format for PyTorch QA training
157
  train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
158
  val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
159
 
160
- # For QA tasks, computing metrics can be more complex; here we skip metrics for brevity.
161
  training_args = TrainingArguments(
162
  output_dir="./fine_tuned_legal_qa",
163
  evaluation_strategy="steps",
@@ -170,7 +160,7 @@ def fine_tune_cuad_model():
170
  logging_steps=50,
171
  save_steps=100,
172
  load_best_model_at_end=True,
173
- report_to=[] # Disables wandb logging to avoid related issues
174
  )
175
 
176
  print("βœ… Starting fine tuning on CUAD QA dataset...")
@@ -203,8 +193,7 @@ try:
203
  nlp = spacy.load("en_core_web_sm")
204
  print("βœ… Loading NLP models...")
205
 
206
- # Updated summarizer initialization with a slow tokenizer
207
- from transformers import AutoTokenizer
208
  summarizer = pipeline(
209
  "summarization",
210
  model="nsi319/legal-pegasus",
@@ -213,14 +202,11 @@ try:
213
  )
214
 
215
  embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
216
- ner_model = pipeline("ner", model="dslim/bert-base-NER",
217
- device=0 if torch.cuda.is_available() else -1)
218
- speech_to_text = pipeline("automatic-speech-recognition",
219
- model="openai/whisper-medium",
220
- chunk_length_s=30,
221
- device_map="auto" if torch.cuda.is_available() else "cpu")
222
-
223
- # ✅ Load or Fine Tune CUAD QA Model
224
  if os.path.exists("fine_tuned_legal_qa"):
225
  print("βœ… Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
226
  cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
@@ -239,7 +225,6 @@ except Exception as e:
239
  raise RuntimeError(f"Error loading models: {str(e)}")
240
 
241
  from transformers import pipeline
242
-
243
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
244
 
245
  def legal_chatbot(user_input, context):
@@ -260,11 +245,10 @@ def extract_text_from_pdf(pdf_file):
260
  raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
261
 
262
  def process_video_to_text(video_file_path):
263
- """Extract audio from video using ffmpeg and convert to text."""
264
  try:
265
  print(f"Processing video file at {video_file_path}")
266
  temp_audio_path = os.path.join("temp", "extracted_audio.wav")
267
- # Use ffmpeg command to extract audio from the video file
268
  cmd = [
269
  "ffmpeg", "-i", video_file_path, "-vn",
270
  "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
@@ -283,7 +267,7 @@ def process_video_to_text(video_file_path):
283
  raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
284
 
285
  def process_audio_to_text(audio_file_path):
286
- """Process audio file and convert to text."""
287
  try:
288
  print(f"Processing audio file at {audio_file_path}")
289
  result = speech_to_text(audio_file_path)
@@ -429,7 +413,7 @@ def analyze_contract_clauses(text):
429
  inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
430
  with torch.no_grad():
431
  outputs = cuad_model(**inputs)
432
- predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0] # Using start_logits for example
433
  for idx, confidence in enumerate(predictions):
434
  if confidence > 0.5 and idx < len(clause_types):
435
  clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
 
1
  import os
2
+ os.environ["TRANSFORMERS_NO_FAST"] = "1" # Force use of slow tokenizers
3
+
4
  import io
5
  import torch
6
  import uvicorn
 
21
  from threading import Thread
22
  import time
23
  import uuid
24
+ import subprocess # For running ffmpeg commands
25
 
26
  # ✅ Ensure compatibility with Google Colab
27
  try:
 
51
 
52
  # ✅ Initialize document storage
53
  document_storage = {}
54
+ chat_history = [] # Global chat history
55
 
56
  # ✅ Function to store document context by task ID
57
  def store_document_context(task_id, text):
 
70
 
71
  def fine_tune_cuad_model():
72
  """
73
+ Fine tunes a QA model on the CUAD dataset for clause extraction.
74
+ This demo uses one epoch; adjust parameters as needed.
 
75
  """
76
  from datasets import load_dataset
77
  import numpy as np
78
+ from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering
 
 
79
 
80
  print("βœ… Loading CUAD dataset for fine tuning...")
 
81
  dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
82
 
 
83
  if "train" in dataset:
 
84
  train_dataset = dataset["train"].select(range(1000))
 
 
85
  if "validation" in dataset:
86
  val_dataset = dataset["validation"].select(range(200))
87
  else:
 
93
 
94
  print("βœ… Preparing training features...")
95
 
 
96
  tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
97
  model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
98
 
99
  def prepare_train_features(examples):
 
100
  tokenized_examples = tokenizer(
101
  examples["question"],
102
  examples["context"],
 
145
  train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
146
  val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
147
 
 
148
  train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
149
  val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
150
 
 
151
  training_args = TrainingArguments(
152
  output_dir="./fine_tuned_legal_qa",
153
  evaluation_strategy="steps",
 
160
  logging_steps=50,
161
  save_steps=100,
162
  load_best_model_at_end=True,
163
+ report_to=[] # Disable wandb logging
164
  )
165
 
166
  print("βœ… Starting fine tuning on CUAD QA dataset...")
 
193
  nlp = spacy.load("en_core_web_sm")
194
  print("βœ… Loading NLP models...")
195
 
196
+ # Initialize summarizer with a slow tokenizer
 
197
  summarizer = pipeline(
198
  "summarization",
199
  model="nsi319/legal-pegasus",
 
202
  )
203
 
204
  embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
205
+ ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
206
+ speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
207
+ device_map="auto" if torch.cuda.is_available() else "cpu")
208
+
209
+ # Load or fine tune CUAD QA model
 
 
 
210
  if os.path.exists("fine_tuned_legal_qa"):
211
  print("βœ… Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
212
  cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
 
225
  raise RuntimeError(f"Error loading models: {str(e)}")
226
 
227
  from transformers import pipeline
 
228
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
229
 
230
  def legal_chatbot(user_input, context):
 
245
  raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
246
 
247
  def process_video_to_text(video_file_path):
248
+ """Extracts audio from video using ffmpeg and converts to text."""
249
  try:
250
  print(f"Processing video file at {video_file_path}")
251
  temp_audio_path = os.path.join("temp", "extracted_audio.wav")
 
252
  cmd = [
253
  "ffmpeg", "-i", video_file_path, "-vn",
254
  "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
 
267
  raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
268
 
269
  def process_audio_to_text(audio_file_path):
270
+ """Processes an audio file and converts it to text."""
271
  try:
272
  print(f"Processing audio file at {audio_file_path}")
273
  result = speech_to_text(audio_file_path)
 
413
  inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
414
  with torch.no_grad():
415
  outputs = cuad_model(**inputs)
416
+ predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
417
  for idx, confidence in enumerate(predictions):
418
  if confidence > 0.5 and idx < len(clause_types):
419
  clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})