Update app.py
app.py CHANGED
@@ -1,4 +1,6 @@
 import os
+os.environ["TRANSFORMERS_NO_FAST"] = "1"  # Force use of slow tokenizers
+
 import io
 import torch
 import uvicorn
@@ -19,7 +21,7 @@ from pyngrok import ngrok
 from threading import Thread
 import time
 import uuid
-import subprocess #
+import subprocess  # For running ffmpeg commands
 
 # ✅ Ensure compatibility with Google Colab
 try:
@@ -49,7 +51,7 @@ app.add_middleware(
 
 # ✅ Initialize document storage
 document_storage = {}
-chat_history = [] #
+chat_history = []  # Global chat history
 
 # ✅ Function to store document context by task ID
 def store_document_context(task_id, text):
@@ -68,26 +70,18 @@ def load_document_context(task_id):
 
 def fine_tune_cuad_model():
     """
-    Fine tunes a
-
-    adjust training parameters as needed.
+    Fine tunes a QA model on the CUAD dataset for clause extraction.
+    This demo uses one epoch; adjust parameters as needed.
     """
     from datasets import load_dataset
    import numpy as np
-
-    from transformers import Trainer, TrainingArguments
-    from transformers import AutoModelForQuestionAnswering
+    from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering
 
     print("✅ Loading CUAD dataset for fine tuning...")
-    # Load the CUAD QA dataset (SQuAD-style) with custom code allowed
     dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
 
-    # Use the train split with a larger subset for production fine tuning
     if "train" in dataset:
-        # Select a larger subset for training, e.g., 1000 examples
         train_dataset = dataset["train"].select(range(1000))
-
-        # For validation, you might select around 200 examples
     if "validation" in dataset:
         val_dataset = dataset["validation"].select(range(200))
     else:
@@ -99,12 +93,10 @@ def fine_tune_cuad_model():
 
     print("✅ Preparing training features...")
 
-    # Load a QA model and its tokenizer. Here we use deepset/roberta-base-squad2.
     tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
     model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
 
     def prepare_train_features(examples):
-        # Tokenize with question and context; use truncation only on the context.
         tokenized_examples = tokenizer(
             examples["question"],
             examples["context"],
@@ -153,11 +145,9 @@ def fine_tune_cuad_model():
     train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
     val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
 
-    # Set format for PyTorch QA training
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
     val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
 
-    # For QA tasks, computing metrics can be more complex; here we skip metrics for brevity.
     training_args = TrainingArguments(
         output_dir="./fine_tuned_legal_qa",
         evaluation_strategy="steps",
@@ -170,7 +160,7 @@ def fine_tune_cuad_model():
         logging_steps=50,
         save_steps=100,
         load_best_model_at_end=True,
-        report_to=[] #
+        report_to=[]  # Disable wandb logging
     )
 
     print("✅ Starting fine tuning on CUAD QA dataset...")
@@ -203,8 +193,7 @@ try:
     nlp = spacy.load("en_core_web_sm")
     print("✅ Loading NLP models...")
 
-    #
-    from transformers import AutoTokenizer
+    # Initialize summarizer with a slow tokenizer
     summarizer = pipeline(
         "summarization",
         model="nsi319/legal-pegasus",
@@ -213,14 +202,11 @@ try:
     )
 
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
-    ner_model = pipeline("ner", model="dslim/bert-base-NER",
-
-
-
-
-        device_map="auto" if torch.cuda.is_available() else "cpu")
-
-    # ✅ Load or Fine Tune CUAD QA Model
+    ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
+    speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
+                              device_map="auto" if torch.cuda.is_available() else "cpu")
+
+    # Load or fine tune CUAD QA model
     if os.path.exists("fine_tuned_legal_qa"):
         print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
         cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
@@ -239,7 +225,6 @@ except Exception as e:
     raise RuntimeError(f"Error loading models: {str(e)}")
 
 from transformers import pipeline
-
 qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
 def legal_chatbot(user_input, context):
@@ -260,11 +245,10 @@ def extract_text_from_pdf(pdf_file):
         raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
 
 def process_video_to_text(video_file_path):
-    """
+    """Extracts audio from video using ffmpeg and converts to text."""
     try:
         print(f"Processing video file at {video_file_path}")
         temp_audio_path = os.path.join("temp", "extracted_audio.wav")
-        # Use ffmpeg command to extract audio from the video file
         cmd = [
             "ffmpeg", "-i", video_file_path, "-vn",
             "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
@@ -283,7 +267,7 @@ def process_video_to_text(video_file_path):
         raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
 
 def process_audio_to_text(audio_file_path):
-    """
+    """Processes an audio file and converts it to text."""
     try:
         print(f"Processing audio file at {audio_file_path}")
         result = speech_to_text(audio_file_path)
@@ -429,7 +413,7 @@ def analyze_contract_clauses(text):
         inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
         with torch.no_grad():
             outputs = cuad_model(**inputs)
-        predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
+        predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
         for idx, confidence in enumerate(predictions):
             if confidence > 0.5 and idx < len(clause_types):
                 clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
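Note on the fine-tuning hunks: the body of `prepare_train_features` is cut off above, but the removed comment says it tokenizes the question and context with truncation applied only to the context. Below is a minimal sketch of that tokenization step; `max_length` and `padding` are assumptions, not values from this commit, and the real helper must additionally derive the `start_positions` and `end_positions` columns that the `set_format` calls expect.

# Sketch only; max_length and padding are assumed, not taken from this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

def prepare_train_features(examples):
    # Truncate only the context ("only_second"), never the question
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        padding="max_length",
    )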
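The NER pipeline also switches from a `device_map` string to `pipeline`'s integer device convention: `0` selects the first CUDA device and `-1` falls back to CPU, while `device_map="auto"` remains only on the Whisper pipeline. A quick illustration of that convention (the input sentence is hypothetical):

import torch
from transformers import pipeline

# 0 = first CUDA device, -1 = CPU: the convention used for ner_model above
device = 0 if torch.cuda.is_available() else -1
ner_model = pipeline("ner", model="dslim/bert-base-NER", device=device)
print(ner_model("Acme Corp signed the lease agreement in New York."))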
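Finally, the new `subprocess` import backs the ffmpeg command in `process_video_to_text`, whose WAV output is then transcribed by the Whisper `speech_to_text` pipeline. A minimal end-to-end sketch, assuming ffmpeg is on PATH; the file names are hypothetical:

# Sketch of the video -> audio -> text flow; file names are hypothetical.
import subprocess
from transformers import pipeline

def extract_audio(video_path, audio_path):
    # Same flags as the app: drop video (-vn), 16-bit PCM, 44.1 kHz, stereo
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-vn",
         "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2", audio_path],
        check=True,
    )

speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    chunk_length_s=30,  # transcribe long audio in 30-second windows
)

extract_audio("contract_review.mp4", "extracted_audio.wav")
print(speech_to_text("extracted_audio.wav")["text"])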