Spaces:

tejash300
/

docanalyzer

Runtime error

App Files Files Community

tejash300 committed on Mar 31

Commit

c575db1

verified ·

1 Parent(s): 814be0d

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -9

app.py CHANGED Viewed

@@ -71,7 +71,8 @@ def load_document_context(task_id):
 def fine_tune_cuad_model():
     """
     Fine tunes a QA model on the CUAD dataset for clause extraction.
-    This demo uses one epoch; adjust parameters as needed.
     """
     from datasets import load_dataset
     import numpy as np
@@ -81,9 +82,11 @@ def fine_tune_cuad_model():
     dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
     if "train" in dataset:
-        train_dataset = dataset["train"].select(range(1000))
         if "validation" in dataset:
-            val_dataset = dataset["validation"].select(range(200))
         else:
             split = train_dataset.train_test_split(test_size=0.2)
             train_dataset = split["train"]
@@ -148,17 +151,18 @@ def fine_tune_cuad_model():
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
     val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
     training_args = TrainingArguments(
         output_dir="./fine_tuned_legal_qa",
         evaluation_strategy="steps",
-        eval_steps=100,
         learning_rate=2e-5,
-        per_device_train_batch_size=16,
-        per_device_eval_batch_size=16,
-        num_train_epochs=1,
         weight_decay=0.01,
-        logging_steps=50,
-        save_steps=100,
         load_best_model_at_end=True,
         report_to=[]  # Disable wandb logging
     )
@@ -737,3 +741,4 @@ if __name__ == "__main__":
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()

 def fine_tune_cuad_model():
     """
     Fine tunes a QA model on the CUAD dataset for clause extraction.
+    For testing, we use only 50 training examples (and 10 for validation)
+    and set training arguments for very fast, minimal training.
     """
     from datasets import load_dataset
     import numpy as np
     dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
     if "train" in dataset:
+        # Use only 50 examples for training
+        train_dataset = dataset["train"].select(range(50))
         if "validation" in dataset:
+            # Use 10 examples for validation
+            val_dataset = dataset["validation"].select(range(10))
         else:
             split = train_dataset.train_test_split(test_size=0.2)
             train_dataset = split["train"]
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
     val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
+    # Adjust training arguments for fast testing
     training_args = TrainingArguments(
         output_dir="./fine_tuned_legal_qa",
         evaluation_strategy="steps",
+        eval_steps=10,
         learning_rate=2e-5,
+        per_device_train_batch_size=4,
+        per_device_eval_batch_size=4,
+        num_train_epochs=0.1,  # Very short training for testing purposes
         weight_decay=0.01,
+        logging_steps=5,
+        save_steps=10,
         load_best_model_at_end=True,
         report_to=[]  # Disable wandb logging
     )
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()