tejash300 committed on
Commit
a94f8aa
·
verified ·
1 Parent(s): 631e3bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -35
app.py CHANGED
@@ -1,10 +1,12 @@
 
 
1
  import os
2
  import io
3
  import torch
4
  import uvicorn
5
  import spacy
6
- import subprocess # For running ffmpeg commands
7
  import pdfplumber
 
8
  import librosa
9
  import soundfile as sf
10
  import matplotlib.pyplot as plt
@@ -21,21 +23,21 @@ from threading import Thread
21
  import time
22
  import uuid
23
 
24
- # Ensure compatibility with Google Colab (if applicable)
25
  try:
26
  from google.colab import drive
27
  drive.mount('/content/drive')
28
  except:
29
  pass # Skip drive mount if not in Google Colab
30
 
31
- # Ensure required directories exist
32
  os.makedirs("static", exist_ok=True)
33
  os.makedirs("temp", exist_ok=True)
34
 
35
- # Ensure GPU usage
36
  device = "cuda" if torch.cuda.is_available() else "cpu"
37
 
38
- # Initialize FastAPI
39
  app = FastAPI(title="Legal Document and Video Analyzer")
40
 
41
  # Add CORS middleware
@@ -47,17 +49,17 @@ app.add_middleware(
47
  allow_headers=["*"],
48
  )
49
 
50
- # Initialize document storage
51
  document_storage = {}
52
- chat_history = [] # Global chat history
53
 
54
- # Function to store document context by task ID
55
  def store_document_context(task_id, text):
56
  """Store document text for retrieval by chatbot."""
57
  document_storage[task_id] = text
58
  return True
59
 
60
- # Function to load document context by task ID
61
  def load_document_context(task_id):
62
  """Retrieve document text for chatbot context."""
63
  return document_storage.get(task_id, "")
@@ -74,14 +76,20 @@ def fine_tune_cuad_model():
74
  """
75
  from datasets import load_dataset
76
  import numpy as np
 
77
  from transformers import Trainer, TrainingArguments
78
  from transformers import AutoModelForQuestionAnswering
79
 
80
  print("βœ… Loading CUAD dataset for fine tuning...")
 
81
  dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
82
 
 
83
  if "train" in dataset:
 
84
  train_dataset = dataset["train"].select(range(1000))
 
 
85
  if "validation" in dataset:
86
  val_dataset = dataset["validation"].select(range(200))
87
  else:
@@ -93,10 +101,12 @@ def fine_tune_cuad_model():
93
 
94
  print("βœ… Preparing training features...")
95
 
 
96
  tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
97
  model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
98
 
99
  def prepare_train_features(examples):
 
100
  tokenized_examples = tokenizer(
101
  examples["question"],
102
  examples["context"],
@@ -145,9 +155,11 @@ def fine_tune_cuad_model():
145
  train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
146
  val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
147
 
 
148
  train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
149
  val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
150
 
 
151
  training_args = TrainingArguments(
152
  output_dir="./fine_tuned_legal_qa",
153
  evaluation_strategy="steps",
@@ -160,11 +172,11 @@ def fine_tune_cuad_model():
160
  logging_steps=50,
161
  save_steps=100,
162
  load_best_model_at_end=True,
163
- report_to=[]
164
  )
165
 
166
  print("βœ… Starting fine tuning on CUAD QA dataset...")
167
- from transformers import Trainer # Ensure Trainer is imported here
168
  trainer = Trainer(
169
  model=model,
170
  args=training_args,
@@ -193,23 +205,24 @@ try:
193
  nlp = spacy.load("en_core_web_sm")
194
  print("βœ… Loading NLP models...")
195
 
196
- # Updated summarizer: add trust_remote_code=True and use_fast=False to avoid Tiktoken conversion errors.
 
197
  summarizer = pipeline(
198
  "summarization",
199
  model="nsi319/legal-pegasus",
200
- trust_remote_code=True,
201
- use_fast=False,
202
  device=0 if torch.cuda.is_available() else -1
203
  )
 
204
  embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
205
  ner_model = pipeline("ner", model="dslim/bert-base-NER",
206
- device=0 if torch.cuda.is_available() else -1)
207
  speech_to_text = pipeline("automatic-speech-recognition",
208
- model="openai/whisper-medium",
209
- chunk_length_s=30,
210
- device_map="auto" if torch.cuda.is_available() else "cpu")
211
 
212
- # Load or Fine Tune CUAD QA Model
213
  if os.path.exists("fine_tuned_legal_qa"):
214
  print("βœ… Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
215
  cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
@@ -227,6 +240,8 @@ except Exception as e:
227
  print(f"⚠️ Error loading models: {str(e)}")
228
  raise RuntimeError(f"Error loading models: {str(e)}")
229
 
 
 
230
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
231
 
232
  def legal_chatbot(user_input, context):
@@ -247,21 +262,12 @@ def extract_text_from_pdf(pdf_file):
247
  raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
248
 
249
  def process_video_to_text(video_file_path):
250
- """Extract audio from video using ffmpeg and convert to text."""
251
  try:
252
  print(f"Processing video file at {video_file_path}")
253
  temp_audio_path = os.path.join("temp", "extracted_audio.wav")
254
- command = [
255
- "ffmpeg",
256
- "-y",
257
- "-i", video_file_path,
258
- "-vn",
259
- "-acodec", "pcm_s16le",
260
- "-ar", "44100",
261
- "-ac", "2",
262
- temp_audio_path
263
- ]
264
- subprocess.run(command, check=True)
265
  print(f"Audio extracted to {temp_audio_path}")
266
  result = speech_to_text(temp_audio_path)
267
  transcript = result["text"]
@@ -420,7 +426,7 @@ def analyze_contract_clauses(text):
420
  inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
421
  with torch.no_grad():
422
  outputs = cuad_model(**inputs)
423
- predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
424
  for idx, confidence in enumerate(predictions):
425
  if confidence > 0.5 and idx < len(clause_types):
426
  clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
@@ -519,7 +525,7 @@ async def analyze_legal_audio(file: UploadFile = File(...)):
519
  temp_file_path = temp_file.name
520
  print(f"Temporary file saved at: {temp_file_path}")
521
  text = process_audio_to_text(temp_file_path)
522
- if os.path.exists(temp_audio_path):
523
  os.remove(temp_file_path)
524
  if not text:
525
  return {"status": "error", "message": "No speech could be transcribed from the audio."}
@@ -608,6 +614,8 @@ def setup_ngrok():
608
  print(f"⚠️ Ngrok setup error: {e}")
609
  return None
610
 
 
 
611
  @app.get("/download_risk_chart")
612
  async def download_risk_chart():
613
  """Generate and return a risk assessment chart as an image file."""
@@ -742,5 +750,3 @@ if __name__ == "__main__":
742
  else:
743
  print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
744
  run()
745
-
746
-
 
1
+ %%writefile app.py
2
+
3
  import os
4
  import io
5
  import torch
6
  import uvicorn
7
  import spacy
 
8
  import pdfplumber
9
+ import moviepy.editor as mp
10
  import librosa
11
  import soundfile as sf
12
  import matplotlib.pyplot as plt
 
23
  import time
24
  import uuid
25
 
26
+ # ✅ Ensure compatibility with Google Colab
27
  try:
28
  from google.colab import drive
29
  drive.mount('/content/drive')
30
  except:
31
  pass # Skip drive mount if not in Google Colab
32
 
33
+ # ✅ Ensure required directories exist
34
  os.makedirs("static", exist_ok=True)
35
  os.makedirs("temp", exist_ok=True)
36
 
37
+ # ✅ Ensure GPU usage
38
  device = "cuda" if torch.cuda.is_available() else "cpu"
39
 
40
+ # ✅ Initialize FastAPI
41
  app = FastAPI(title="Legal Document and Video Analyzer")
42
 
43
  # Add CORS middleware
 
49
  allow_headers=["*"],
50
  )
51
 
52
+ # ✅ Initialize document storage
53
  document_storage = {}
54
+ chat_history = [] # ✅ Added global chat history
55
 
56
+ # ✅ Function to store document context by task ID
57
  def store_document_context(task_id, text):
58
  """Store document text for retrieval by chatbot."""
59
  document_storage[task_id] = text
60
  return True
61
 
62
+ # ✅ Function to load document context by task ID
63
  def load_document_context(task_id):
64
  """Retrieve document text for chatbot context."""
65
  return document_storage.get(task_id, "")
 
76
  """
77
  from datasets import load_dataset
78
  import numpy as np
79
+ # Optionally, load a metric (here we leave metrics out for brevity)
80
  from transformers import Trainer, TrainingArguments
81
  from transformers import AutoModelForQuestionAnswering
82
 
83
  print("βœ… Loading CUAD dataset for fine tuning...")
84
+ # Load the CUAD QA dataset (SQuAD-style) with custom code allowed
85
  dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
86
 
87
+ # Use the train split with a larger subset for production fine tuning
88
  if "train" in dataset:
89
+ # Select a larger subset for training, e.g., 1000 examples
90
  train_dataset = dataset["train"].select(range(1000))
91
+
92
+ # For validation, you might select around 200 examples
93
  if "validation" in dataset:
94
  val_dataset = dataset["validation"].select(range(200))
95
  else:
 
101
 
102
  print("βœ… Preparing training features...")
103
 
104
+ # Load a QA model and its tokenizer. Here we use deepset/roberta-base-squad2.
105
  tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
106
  model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
107
 
108
  def prepare_train_features(examples):
109
+ # Tokenize with question and context; use truncation only on the context.
110
  tokenized_examples = tokenizer(
111
  examples["question"],
112
  examples["context"],
 
155
  train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
156
  val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
157
 
158
+ # Set format for PyTorch QA training
159
  train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
160
  val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
161
 
162
+ # For QA tasks, computing metrics can be more complex; here we skip metrics for brevity.
163
  training_args = TrainingArguments(
164
  output_dir="./fine_tuned_legal_qa",
165
  evaluation_strategy="steps",
 
172
  logging_steps=50,
173
  save_steps=100,
174
  load_best_model_at_end=True,
175
+ report_to=[] # Disables wandb logging to avoid related issues
176
  )
177
 
178
  print("βœ… Starting fine tuning on CUAD QA dataset...")
179
+ from transformers import Trainer
180
  trainer = Trainer(
181
  model=model,
182
  args=training_args,
 
205
  nlp = spacy.load("en_core_web_sm")
206
  print("βœ… Loading NLP models...")
207
 
208
+ # Updated summarizer initialization with a slow tokenizer
209
+ from transformers import AutoTokenizer
210
  summarizer = pipeline(
211
  "summarization",
212
  model="nsi319/legal-pegasus",
213
+ tokenizer=AutoTokenizer.from_pretrained("nsi319/legal-pegasus", use_fast=False),
 
214
  device=0 if torch.cuda.is_available() else -1
215
  )
216
+
217
  embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
218
  ner_model = pipeline("ner", model="dslim/bert-base-NER",
219
+ device=0 if torch.cuda.is_available() else -1)
220
  speech_to_text = pipeline("automatic-speech-recognition",
221
+ model="openai/whisper-medium",
222
+ chunk_length_s=30,
223
+ device_map="auto" if torch.cuda.is_available() else "cpu")
224
 
225
+ # ✅ Load or Fine Tune CUAD QA Model
226
  if os.path.exists("fine_tuned_legal_qa"):
227
  print("βœ… Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
228
  cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
 
240
  print(f"⚠️ Error loading models: {str(e)}")
241
  raise RuntimeError(f"Error loading models: {str(e)}")
242
 
243
+ from transformers import pipeline
244
+
245
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
246
 
247
  def legal_chatbot(user_input, context):
 
262
  raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
263
 
264
  def process_video_to_text(video_file_path):
265
+ """Extract audio from video and convert to text."""
266
  try:
267
  print(f"Processing video file at {video_file_path}")
268
  temp_audio_path = os.path.join("temp", "extracted_audio.wav")
269
+ video = mp.VideoFileClip(video_file_path)
270
+ video.audio.write_audiofile(temp_audio_path, codec='pcm_s16le')
 
 
 
 
 
 
 
 
 
271
  print(f"Audio extracted to {temp_audio_path}")
272
  result = speech_to_text(temp_audio_path)
273
  transcript = result["text"]
 
426
  inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
427
  with torch.no_grad():
428
  outputs = cuad_model(**inputs)
429
+ predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0] # Using start_logits for example
430
  for idx, confidence in enumerate(predictions):
431
  if confidence > 0.5 and idx < len(clause_types):
432
  clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
 
525
  temp_file_path = temp_file.name
526
  print(f"Temporary file saved at: {temp_file_path}")
527
  text = process_audio_to_text(temp_file_path)
528
+ if os.path.exists(temp_file_path):
529
  os.remove(temp_file_path)
530
  if not text:
531
  return {"status": "error", "message": "No speech could be transcribed from the audio."}
 
614
  print(f"⚠️ Ngrok setup error: {e}")
615
  return None
616
 
617
+ from fastapi.responses import FileResponse
618
+
619
  @app.get("/download_risk_chart")
620
  async def download_risk_chart():
621
  """Generate and return a risk assessment chart as an image file."""
 
750
  else:
751
  print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
752
  run()