Update app.py
app.py CHANGED
@@ -1,10 +1,12 @@
+%%writefile app.py
+
 import os
 import io
 import torch
 import uvicorn
 import spacy
-import subprocess # For running ffmpeg commands
 import pdfplumber
+import moviepy.editor as mp
 import librosa
 import soundfile as sf
 import matplotlib.pyplot as plt
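A note on the two added lines at the top: `%%writefile app.py` is an IPython cell magic, so when this cell runs in Colab the body is written to `app.py` rather than executed, and the server is launched from a separate cell. The `subprocess` import is dropped because the raw ffmpeg call is replaced by `moviepy` further down (see the `process_video_to_text` hunk). A sketch of the assumed notebook layout, not part of this commit:

```python
# Cell 1 (this file): the magic must be the first line; everything below it
# is saved to disk, not executed.
#   %%writefile app.py
#   import os
#   ...

# Cell 2 (assumed): run the saved script.
#   !python app.py
```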
@@ -21,21 +23,21 @@ from threading import Thread
 import time
 import uuid
 
-# Ensure compatibility with Google Colab
+# ✅ Ensure compatibility with Google Colab
 try:
     from google.colab import drive
     drive.mount('/content/drive')
 except:
     pass # Skip drive mount if not in Google Colab
 
-# Ensure required directories exist
+# ✅ Ensure required directories exist
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
 
-# Ensure GPU usage
+# ✅ Ensure GPU usage
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Initialize FastAPI
+# ✅ Initialize FastAPI
 app = FastAPI(title="Legal Document and Video Analyzer")
 
 # Add CORS middleware
@@ -47,17 +49,17 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Initialize document storage
+# ✅ Initialize document storage
 document_storage = {}
-chat_history = [] #
+chat_history = [] # ✅ Added global chat history
 
-# Function to store document context by task ID
+# ✅ Function to store document context by task ID
 def store_document_context(task_id, text):
     """Store document text for retrieval by chatbot."""
     document_storage[task_id] = text
     return True
 
-# Function to load document context by task ID
+# ✅ Function to load document context by task ID
 def load_document_context(task_id):
     """Retrieve document text for chatbot context."""
     return document_storage.get(task_id, "")
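The helpers in this hunk make up a minimal in-memory context store keyed by task ID, and `chat_history` is now a module-level list so `legal_chatbot` can accumulate turns across requests. The intended flow, sketched with a task ID from the already-imported `uuid` module:

```python
import uuid

task_id = str(uuid.uuid4())                # ID returned to the client at upload time
store_document_context(task_id, "Full extracted contract text...")

# Later, when the chatbot is asked about that document:
context = load_document_context(task_id)   # returns "" for unknown IDs
```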
@@ -74,14 +76,20 @@ def fine_tune_cuad_model():
     """
     from datasets import load_dataset
     import numpy as np
+    # Optionally, load a metric (here we leave metrics out for brevity)
     from transformers import Trainer, TrainingArguments
     from transformers import AutoModelForQuestionAnswering
 
     print("✅ Loading CUAD dataset for fine tuning...")
+    # Load the CUAD QA dataset (SQuAD-style) with custom code allowed
     dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
 
+    # Use the train split with a larger subset for production fine tuning
     if "train" in dataset:
+        # Select a larger subset for training, e.g., 1000 examples
         train_dataset = dataset["train"].select(range(1000))
+
+    # For validation, you might select around 200 examples
     if "validation" in dataset:
         val_dataset = dataset["validation"].select(range(200))
     else:
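The new comments describe the dataset as SQuAD-style. Assuming the usual SQuAD schema (`question`, `context`, and `answers` holding `text` and `answer_start`), a quick sanity check of the splits before hard-coding the 1000/200 subset sizes might look like:

```python
from datasets import load_dataset

dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
print(dataset)                          # split names and row counts
print(dataset["train"].column_names)    # expect question/context/answers fields
print(dataset["train"][0]["answers"])   # {'text': [...], 'answer_start': [...]}
```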
@@ -93,10 +101,12 @@ def fine_tune_cuad_model():
 
     print("✅ Preparing training features...")
 
+    # Load a QA model and its tokenizer. Here we use deepset/roberta-base-squad2.
     tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
     model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
 
     def prepare_train_features(examples):
+        # Tokenize with question and context; use truncation only on the context.
         tokenized_examples = tokenizer(
             examples["question"],
             examples["context"],
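The new comment above `tokenized_examples` promises truncation only on the context, but the keyword arguments fall outside this hunk. For SQuAD-style training that is conventionally `truncation="only_second"` plus a sliding window over long contexts; a sketch of the usual argument set, hedged since the real values are not shown here:

```python
tokenized_examples = tokenizer(
    examples["question"],
    examples["context"],
    truncation="only_second",         # truncate the context, never the question
    max_length=384,                   # a typical QA window size (assumed)
    stride=128,                       # overlap between successive windows
    return_overflowing_tokens=True,   # emit one feature per window
    return_offsets_mapping=True,      # needed to locate answer token positions
    padding="max_length",
)
```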
@@ -145,9 +155,11 @@ def fine_tune_cuad_model():
     train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
     val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
 
+    # Set format for PyTorch QA training
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
     val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
 
+    # For QA tasks, computing metrics can be more complex; here we skip metrics for brevity.
     training_args = TrainingArguments(
         output_dir="./fine_tuned_legal_qa",
         evaluation_strategy="steps",
@@ -160,11 +172,11 @@ def fine_tune_cuad_model():
         logging_steps=50,
         save_steps=100,
         load_best_model_at_end=True,
-        report_to=[]
+        report_to=[]  # Disables wandb logging to avoid related issues
     )
 
     print("✅ Starting fine tuning on CUAD QA dataset...")
-    from transformers import Trainer
+    from transformers import Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
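The only substantive change in this hunk is the comment on `report_to=[]`: an empty list tells `Trainer` to report to no logging integrations, which avoids the interactive wandb login prompt in Colab. Recent transformers versions also accept an equivalent string form:

```python
training_args = TrainingArguments(
    output_dir="./fine_tuned_legal_qa",
    report_to="none",   # same effect as report_to=[]
)
```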
@@ -193,23 +205,24 @@ try:
     nlp = spacy.load("en_core_web_sm")
     print("✅ Loading NLP models...")
 
-    # Updated summarizer
+    # Updated summarizer initialization with a slow tokenizer
+    from transformers import AutoTokenizer
     summarizer = pipeline(
         "summarization",
         model="nsi319/legal-pegasus",
-
-        use_fast=False,
+        tokenizer=AutoTokenizer.from_pretrained("nsi319/legal-pegasus", use_fast=False),
         device=0 if torch.cuda.is_available() else -1
     )
+
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
     ner_model = pipeline("ner", model="dslim/bert-base-NER",
-
+                         device=0 if torch.cuda.is_available() else -1)
     speech_to_text = pipeline("automatic-speech-recognition",
-
-
-
+                              model="openai/whisper-medium",
+                              chunk_length_s=30,
+                              device_map="auto" if torch.cuda.is_available() else "cpu")
 
-    # Load or Fine Tune CUAD QA Model
+    # ✅ Load or Fine Tune CUAD QA Model
     if os.path.exists("fine_tuned_legal_qa"):
         print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
         cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
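Two changes here are worth spelling out. The summarizer now receives an explicitly constructed slow tokenizer (`use_fast=False`), presumably to sidestep a fast-tokenizer conversion problem with the `nsi319/legal-pegasus` checkpoint; building the tokenizer first and passing the object in makes that requirement explicit. And the ASR pipeline is now pinned to `openai/whisper-medium` with `chunk_length_s=30`, which lets it transcribe recordings longer than Whisper's native 30-second window. Calling it is unchanged:

```python
result = speech_to_text("temp/extracted_audio.wav")
print(result["text"])   # full transcript; chunking and stitching happen internally
```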
@@ -227,6 +240,8 @@ except Exception as e:
     print(f"⚠️ Error loading models: {str(e)}")
     raise RuntimeError(f"Error loading models: {str(e)}")
 
+from transformers import pipeline
+
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
 def legal_chatbot(user_input, context):
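The re-imported `pipeline` is redundant with the import at the top of the file but keeps this block self-sufficient. The extractive QA pipeline takes named `question`/`context` arguments and returns a span with a score; a usage sketch, with `task_id` assumed to come from an earlier upload:

```python
answer = qa_model(
    question="What is the termination notice period?",
    context=load_document_context(task_id),
)
print(answer["answer"], answer["score"])   # extracted span and its confidence
```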
@@ -247,21 +262,12 @@ def extract_text_from_pdf(pdf_file):
         raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
 
 def process_video_to_text(video_file_path):
-    """Extract audio from video
+    """Extract audio from video and convert to text."""
     try:
         print(f"Processing video file at {video_file_path}")
         temp_audio_path = os.path.join("temp", "extracted_audio.wav")
-        command = [
-            "ffmpeg",
-            "-y",
-            "-i", video_file_path,
-            "-vn",
-            "-acodec", "pcm_s16le",
-            "-ar", "44100",
-            "-ac", "2",
-            temp_audio_path
-        ]
-        subprocess.run(command, check=True)
+        video = mp.VideoFileClip(video_file_path)
+        video.audio.write_audiofile(temp_audio_path, codec='pcm_s16le')
         print(f"Audio extracted to {temp_audio_path}")
         result = speech_to_text(temp_audio_path)
         transcript = result["text"]
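This is the change the import swap at the top prepared for: the hand-built ffmpeg argv gives way to two moviepy calls. moviepy still drives ffmpeg under the hood, so ffmpeg must remain installed; `codec='pcm_s16le'` keeps the output a plain 16-bit WAV like the old command. What the new code no longer forces is the 44.1 kHz sample rate; if downstream transcription cared, it could be restored through `write_audiofile` itself (a sketch against the moviepy 1.x API):

```python
video = mp.VideoFileClip(video_file_path)
video.audio.write_audiofile(
    temp_audio_path,
    codec="pcm_s16le",   # 16-bit PCM WAV, matching the old -acodec flag
    fps=44100,           # moviepy's name for the audio sample rate (-ar)
)
video.close()            # release the underlying ffmpeg reader
```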
@@ -420,7 +426,7 @@ def analyze_contract_clauses(text):
         inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
         with torch.no_grad():
             outputs = cuad_model(**inputs)
-        predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
+        predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]  # Using start_logits for example
         for idx, confidence in enumerate(predictions):
             if confidence > 0.5 and idx < len(clause_types):
                 clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
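The added comment is candid that this is a placeholder: `start_logits` from an extractive QA head carries one score per input token, not per clause type, so applying a sigmoid and reading the first `len(clause_types)` positions is a heuristic rather than genuine multi-label classification. A more principled shape would be a sequence-classification head sized to the label set; a sketch only, reusing names from the surrounding code, and it would need fine-tuning on clause labels before its scores meant anything:

```python
from transformers import AutoModelForSequenceClassification

clause_clf = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",                        # untrained head; fine-tune before use
    num_labels=len(clause_types),
    problem_type="multi_label_classification",
).to(device)

with torch.no_grad():
    logits = clause_clf(**inputs).logits   # shape: [1, len(clause_types)]
probs = torch.sigmoid(logits)[0]           # one independent probability per clause
```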
@@ -519,7 +525,7 @@ async def analyze_legal_audio(file: UploadFile = File(...)):
         temp_file_path = temp_file.name
         print(f"Temporary file saved at: {temp_file_path}")
         text = process_audio_to_text(temp_file_path)
-        if os.path.exists(
+        if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the audio."}
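The fix completes the `os.path.exists(` condition that the old side shows truncated, so the temp file is removed only when present. Since `process_audio_to_text` can raise before cleanup runs, a `try/finally` would make the removal unconditional; a possible hardening, not part of this commit:

```python
try:
    text = process_audio_to_text(temp_file_path)
finally:
    if os.path.exists(temp_file_path):
        os.remove(temp_file_path)   # runs even if transcription raised
```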
@@ -608,6 +614,8 @@ def setup_ngrok():
         print(f"⚠️ Ngrok setup error: {e}")
         return None
 
+from fastapi.responses import FileResponse
+
 @app.get("/download_risk_chart")
 async def download_risk_chart():
     """Generate and return a risk assessment chart as an image file."""
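`FileResponse` is imported right above the endpoint that needs it. The endpoint body is outside this hunk, but returning a saved matplotlib chart through it would look roughly like the following, with the chart path assumed:

```python
@app.get("/download_risk_chart")
async def download_risk_chart():
    chart_path = "static/risk_chart.png"   # assumed output location
    plt.savefig(chart_path)                # chart drawn earlier in the real endpoint
    return FileResponse(chart_path, media_type="image/png", filename="risk_chart.png")
```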
@@ -742,5 +750,3 @@ if __name__ == "__main__":
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()
-
-