amos1088 committed
Commit deae167 · 1 Parent(s): b71a1f7
Files changed (2):
  1. app.py +107 -79
  2. inference_chatgpt_simple.py +9 -2
app.py CHANGED
@@ -13,7 +13,7 @@ import gradio as gr
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
- from trl import DPOTrainer, DPOConfig
import warnings
import subprocess
import gc
@@ -126,7 +126,7 @@ def format_prompt(query, title, content):
    if len(content) > 1000:
        content = content[:1000] + "..."

-     return f"""you would get a query and document's title and content and return yes (if the document is relevant to the query)/ or no (if the document is not relevant to the query)
Answer only yes / no.
Document:
####DOCUMENT START
@@ -139,9 +139,7 @@ Query:
{query}
####Query END

- ANSWER:
- ####ANSWER START
- """


def load_model_and_tokenizer(checkpoint_path=None, model_id=None):
@@ -240,7 +238,7 @@ def get_trained_models_list():
        text += f"{i}. **{model['repo']}**\n"
        text += f" - Accuracy: {model['accuracy']:.2%}\n"
        text += f" - Predictions: Yes {model['yes_ratio']:.1%}, No {model['no_ratio']:.1%}\n"
-         text += f" - Beta: {model['beta']}, Model: {model['model_id'].split('/')[-1]}\n"
        text += f" - Link: https://huggingface.co/{model['repo']}\n\n"

    return text
@@ -312,9 +310,9 @@ def collate_fn(batch):
    }


- def prepare_dpo_dataset(df):
-     """Convert 4-category labels to DPO format with chosen/rejected pairs"""
-     dpo_data = []

    # Map 4 categories to yes/no
    label_mapping = {
@@ -347,28 +345,21 @@ def prepare_dpo_dataset(df):
        original_label = row['label']
        mapped_label = label_mapping.get(original_label, original_label)

-         if mapped_label == 'yes':
-             # For 'yes' examples, chosen is "yes" and rejected is "no"
-             dpo_data.append({
-                 'prompt': prompt,
-                 'chosen': 'yes',
-                 'rejected': 'no',
-                 'original_label': original_label # Keep original for analysis
-             })
-         else:
-             # For 'no' examples, chosen is "no" and rejected is "yes"
-             dpo_data.append({
-                 'prompt': prompt,
-                 'chosen': 'no',
-                 'rejected': 'yes',
-                 'original_label': original_label # Keep original for analysis
-             })
-
-     return pd.DataFrame(dpo_data)


- def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
-     """Training with DPO (Direct Preference Optimization)"""
    global current_model, current_tokenizer

    # Clear GPU memory before training
@@ -385,14 +376,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
        train_df = train_df.sample(n=max_samples, random_state=42)
        val_df = val_df.head(min(len(val_df), max_samples // 5)) # Proportional validation set

-     # Convert to DPO format
-     logger.info("Converting to DPO format...")
-     dpo_train_df = prepare_dpo_dataset(train_df)
-     dpo_val_df = prepare_dpo_dataset(val_df)

    # Create datasets
-     train_dataset = Dataset.from_pandas(dpo_train_df)
-     val_dataset = Dataset.from_pandas(dpo_val_df)

    # Prepare model for training
    if hasattr(current_model, 'is_loaded_in_4bit') and current_model.is_loaded_in_4bit:
@@ -428,20 +419,19 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
        target_modules=target_modules
    )

-     logger.info(f"Starting DPO training with {len(train_df)} train samples, {len(val_df)} val samples")
    logger.info(f"Learning rate: {lr}, Effective batch size: {batch_size}, Epochs: {epochs}")

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

-     # DPO training configuration optimized for A100
-     # GPT-OSS-20B uses ~16GB with native MXFP4 quantization
-     target_batch_size = 32 # Target effective batch size

    if current_model_id == "openai/gpt-oss-20b":
        # For GPT-OSS-20B: use smaller per-device batch with gradient accumulation
        actual_batch_size = 2 # Per-device batch size
-         seq_length = 256 # Shorter sequences to save memory
        grad_accum = target_batch_size // actual_batch_size # 16 gradient accumulation steps
    else:
        # For smaller models like Phi-3 - can use larger per-device batch
@@ -449,40 +439,76 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
        grad_accum = max(1, target_batch_size // actual_batch_size) # Accumulate if needed
        seq_length = 512

-     training_args = DPOConfig(
        output_dir=OUTPUT_DIR,
        num_train_epochs=epochs,
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        gradient_accumulation_steps=grad_accum,
-         gradient_checkpointing=True, # Still use for memory efficiency
        learning_rate=lr,
        lr_scheduler_type="cosine",
-         warmup_steps=100,
        logging_steps=10,
        save_strategy="epoch",
-         eval_strategy="epoch",
-         bf16=True, # A100 supports bf16 efficiently
        fp16=False,
-         remove_unused_columns=False,
-         run_name="dpo-relevance-a100-8bit",
        report_to=[],
-         max_length=seq_length,
-         max_prompt_length=seq_length,
-         beta=1.0, # Increased from 0.1 for stronger preference learning
-         optim="adamw_8bit" if current_model_id == "openai/gpt-oss-20b" else "adamw_torch",
-         dataloader_num_workers=2, # A100 can handle parallel loading
    )

-     # Create DPO trainer
-     dpo_trainer = DPOTrainer(
        model=current_model,
-         ref_model=None, # Will use the model's initial state as reference
        args=training_args,
-         train_dataset=train_dataset,
-         eval_dataset=val_dataset,
-         processing_class=current_tokenizer, # Changed from tokenizer
-         peft_config=peft_config,
    )

    # Custom logging callback
@@ -501,11 +527,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N

def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
    """Compute accuracy metrics and confusion matrix on a subset of eval data"""
    # Sample subset for faster evaluation
-     indices = np.random.choice(len(eval_dataset), min(num_samples, len(eval_dataset)), replace=False)

    # Initialize confusion matrix counters
-     # Rows: true labels, Cols: predicted labels
    confusion_matrix = {
        'easy_positive': {'yes': 0, 'no': 0},
        'hard_positive': {'yes': 0, 'no': 0},
@@ -517,11 +546,10 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
    predictions_no = 0
    correct = 0

-     for idx in indices:
-         item = eval_dataset[int(idx)]
-         prompt = item['prompt']
-         true_choice = item['chosen'] # This is the correct answer (yes/no)
-         original_label = item.get('original_label', None) # Get original 4-category label

        # Tokenize and run inference
        inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -546,7 +574,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
        else:
            predictions_no += 1

-         if prediction == true_choice:
            correct += 1

        # Update confusion matrix if we have original label
@@ -628,23 +656,23 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
            training_status["progress"] = min(int((state.global_step / total_steps) * 100), 99)

    # Add callback with trainer and eval dataset
-     status_callback = StatusCallback(dpo_trainer, val_dataset)
-     dpo_trainer.add_callback(status_callback)

    # Train
    try:
-         logger.info("Starting DPO training...")
-         dpo_trainer.train()

        # Save final model
        save_path = os.path.join(OUTPUT_DIR, "final")
-         dpo_trainer.save_model(save_path)
        current_tokenizer.save_pretrained(save_path)
        logger.info(f"Model saved to {save_path}")

        # Compute final metrics
        logger.info("Computing final accuracy metrics...")
-         final_metrics = compute_accuracy_metrics(dpo_trainer, val_dataset, num_samples=200)
        logger.info(f"Final Accuracy: {final_metrics['accuracy']:.2%}")
        logger.info(f"Final Prediction Distribution - Yes: {final_metrics['yes_ratio']:.1%}, No: {final_metrics['no_ratio']:.1%}")

@@ -666,7 +694,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
    )

    # Update global model reference
-     current_model = dpo_trainer.model
    current_model.eval()

    # Push to hub if token available
@@ -710,11 +738,11 @@ model-index:

# {model_short_name} Document Relevance Classifier

- This model was trained using DPO (Direct Preference Optimization) for document relevance classification.

## Training Configuration
- Base Model: {current_model_id}
- - DPO Beta: {training_args.beta}
- Learning Rate: {training_args.learning_rate}
- Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}
- Epochs: {training_args.num_train_epochs}
@@ -747,7 +775,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1
    current_model.push_to_hub(
        repo_name,
        use_auth_token=HF_TOKEN,
-         commit_message=f"DPO training with beta={training_args.beta}, accuracy={final_metrics['accuracy']:.2%}"
    )
    current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)

@@ -779,7 +807,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1
        "accuracy": final_metrics['accuracy'],
        "yes_ratio": final_metrics['yes_ratio'],
        "no_ratio": final_metrics['no_ratio'],
-         "beta": training_args.beta,
        "model_id": current_model_id
    })
@@ -892,7 +920,7 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
        max_samples = 2000 # Start conservative
    else:
        max_samples = None
-     train_model(train_df, test_df, epochs=3, batch_size=32, lr=2e-5, max_samples=max_samples)

    with training_lock:
        training_status["status"] = "completed"
 
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
+ from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import warnings
import subprocess
import gc

    if len(content) > 1000:
        content = content[:1000] + "..."

+     return f"""You would get a query and document's title and content and return yes (if the document is relevant to the query) or no (if the document is not relevant to the query).
Answer only yes / no.
Document:
####DOCUMENT START

{query}
####Query END

+ ANSWER: """


def load_model_and_tokenizer(checkpoint_path=None, model_id=None):
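As a quick illustration of the new prompt shape (the query, title, and content values below are invented; format_prompt is the helper edited above):

# Hypothetical usage of the revised prompt template.
prompt = format_prompt(
    query="best sunscreen for sensitive skin",       # invented example values
    title="Dermatologist-recommended sunscreens",
    content="A review of mineral sunscreens suitable for sensitive skin...",
)
# The template now ends with "ANSWER: ", so the fine-tuned model is expected
# to continue the text with a single "yes" or "no".
assert prompt.endswith("ANSWER: ")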
 
        text += f"{i}. **{model['repo']}**\n"
        text += f" - Accuracy: {model['accuracy']:.2%}\n"
        text += f" - Predictions: Yes {model['yes_ratio']:.1%}, No {model['no_ratio']:.1%}\n"
+         text += f" - LR: {model.get('lr', 'N/A')}, Model: {model['model_id'].split('/')[-1]}\n"
        text += f" - Link: https://huggingface.co/{model['repo']}\n\n"

    return text

    }


+ def prepare_finetuning_dataset(df):
+     """Convert 4-category labels to standard fine-tuning format"""
+     ft_data = []

    # Map 4 categories to yes/no
    label_mapping = {

        original_label = row['label']
        mapped_label = label_mapping.get(original_label, original_label)

+         # Create the full text with prompt and answer
+         text = prompt + mapped_label
+
+         ft_data.append({
+             'text': text,
+             'prompt': prompt,
+             'label': mapped_label,
+             'original_label': original_label # Keep original for analysis
+         })
+
+     return pd.DataFrame(ft_data)
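For reference, a minimal sketch of the row layout that prepare_finetuning_dataset emits; the concrete values are invented, and the "yes" mapping for a *_positive label is an assumption since the label_mapping contents sit outside this hunk:

# Hypothetical row produced for one training example.
prompt = "...####Query END\n\nANSWER: "       # output of format_prompt(...), truncated here
mapped_label = "yes"                          # assumed mapping for an easy_positive row
row = {
    'text': prompt + mapped_label,            # what the causal LM is trained on
    'prompt': prompt,                         # kept so evaluation can re-run inference
    'label': mapped_label,                    # mapped yes/no answer
    'original_label': 'easy_positive',        # 4-category label, kept for the confusion matrix
}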
+ def train_model(train_df, val_df, epochs=5, batch_size=32, lr=5e-6, max_samples=None):
+     """Standard fine-tuning for document relevance classification"""
    global current_model, current_tokenizer

    # Clear GPU memory before training

        train_df = train_df.sample(n=max_samples, random_state=42)
        val_df = val_df.head(min(len(val_df), max_samples // 5)) # Proportional validation set

+     # Convert to fine-tuning format
+     logger.info("Preparing fine-tuning dataset...")
+     ft_train_df = prepare_finetuning_dataset(train_df)
+     ft_val_df = prepare_finetuning_dataset(val_df)

    # Create datasets
+     train_dataset = Dataset.from_pandas(ft_train_df)
+     val_dataset = Dataset.from_pandas(ft_val_df)

    # Prepare model for training
    if hasattr(current_model, 'is_loaded_in_4bit') and current_model.is_loaded_in_4bit:

        target_modules=target_modules
    )

+     logger.info(f"Starting fine-tuning with {len(train_df)} train samples, {len(val_df)} val samples")
    logger.info(f"Learning rate: {lr}, Effective batch size: {batch_size}, Epochs: {epochs}")

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

+     # Training configuration optimized for standard fine-tuning
+     target_batch_size = batch_size # Target effective batch size

    if current_model_id == "openai/gpt-oss-20b":
        # For GPT-OSS-20B: use smaller per-device batch with gradient accumulation
        actual_batch_size = 2 # Per-device batch size
+         seq_length = 512 # Standard sequence length
        grad_accum = target_batch_size // actual_batch_size # 16 gradient accumulation steps
    else:
        # For smaller models like Phi-3 - can use larger per-device batch

        grad_accum = max(1, target_batch_size // actual_batch_size) # Accumulate if needed
        seq_length = 512
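The effective batch size follows from the per-device batch times the gradient-accumulation steps; a quick check of the GPT-OSS-20B branch above (the per-device value for the smaller-model branch is set on a line outside this hunk):

# Effective batch size arithmetic for the openai/gpt-oss-20b branch.
target_batch_size = 32                                # batch_size passed into train_model
actual_batch_size = 2                                 # per-device batch
grad_accum = target_batch_size // actual_batch_size   # 32 // 2 = 16 accumulation steps
effective_batch = actual_batch_size * grad_accum      # 2 * 16 = 32 examples per optimizer step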
+     # Tokenize the datasets
+     def tokenize_function(examples):
+         # Tokenize the full texts (prompt + answer)
+         model_inputs = current_tokenizer(
+             examples['text'],
+             truncation=True,
+             padding="max_length",
+             max_length=seq_length,
+             return_tensors=None
+         )
+
+         # For causal LM, labels are the same as input_ids
+         model_inputs["labels"] = model_inputs["input_ids"].copy()
+
+         # Store metadata for evaluation
+         model_inputs["original_labels"] = examples['original_label']
+         model_inputs["mapped_labels"] = examples['label']
+
+         return model_inputs
+
+     # Tokenize datasets
+     tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
+     tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)
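Because labels are copied straight from input_ids, the loss is taken over the prompt tokens as well as the final yes/no answer. A hedged sketch of a common variant, not used by this commit, that supervises only the answer by setting prompt positions to -100 (the index PyTorch's cross-entropy ignores):

# Sketch only: per-example tokenizer that masks prompt tokens out of the loss.
# Assumes the 'text'/'prompt' columns, `current_tokenizer`, and `seq_length` from above.
def tokenize_answer_only(example):
    full = current_tokenizer(example['text'], truncation=True,
                             padding="max_length", max_length=seq_length)
    prompt_len = len(current_tokenizer(example['prompt'], truncation=True,
                                       max_length=seq_length)["input_ids"])
    labels = full["input_ids"].copy()
    labels[:prompt_len] = [-100] * prompt_len   # ignore prompt positions in the loss
    full["labels"] = labels                     # padding positions would usually be masked too
    return full

# Would be used with dataset.map(tokenize_answer_only) instead of the batched tokenize_function.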
+     # Standard training arguments
+     training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=epochs,
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        gradient_accumulation_steps=grad_accum,
+         gradient_checkpointing=True,
        learning_rate=lr,
        lr_scheduler_type="cosine",
+         warmup_steps=500, # More warmup for standard fine-tuning
        logging_steps=10,
        save_strategy="epoch",
+         evaluation_strategy="epoch",
+         bf16=True,
        fp16=False,
+         weight_decay=0.01,
+         optim="adamw_torch",
+         save_total_limit=3,
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         greater_is_better=False,
        report_to=[],
+         run_name="standard-ft-relevance",
+         dataloader_num_workers=2,
+     )
+
+     # Create data collator
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer=current_tokenizer,
+         mlm=False, # Causal LM, not masked LM
+         pad_to_multiple_of=8
    )

+     # Apply LoRA to the model
+     current_model = get_peft_model(current_model, peft_config)
+     current_model.print_trainable_parameters()
+
+     # Create standard trainer
+     trainer = Trainer(
        model=current_model,
        args=training_args,
+         train_dataset=tokenized_train,
+         eval_dataset=tokenized_val,
+         data_collator=data_collator,
+         tokenizer=current_tokenizer,
    )
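Since load_best_model_at_end selects checkpoints by eval_loss, it can be useful to read that metric back once training finishes; a minimal sketch using the standard Trainer API:

# Sketch: report the validation loss that best-model selection is based on.
metrics = trainer.evaluate()                   # evaluates on tokenized_val
print(f"eval_loss: {metrics['eval_loss']:.4f}")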
 
    # Custom logging callback

def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
    """Compute accuracy metrics and confusion matrix on a subset of eval data"""
+     # Get the original dataframe for easier access to prompts and labels
+     eval_df = ft_val_df
+
    # Sample subset for faster evaluation
+     sample_size = min(num_samples, len(eval_df))
+     sample_df = eval_df.sample(n=sample_size, random_state=42)

    # Initialize confusion matrix counters
    confusion_matrix = {
        'easy_positive': {'yes': 0, 'no': 0},
        'hard_positive': {'yes': 0, 'no': 0},

    predictions_no = 0
    correct = 0

+     for idx, row in sample_df.iterrows():
+         prompt = row['prompt']
+         true_label = row['label'] # This is the mapped label (yes/no)
+         original_label = row['original_label'] # Get original 4-category label

        # Tokenize and run inference
        inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
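The decoding that turns the model output into `prediction` lives in unchanged lines, so the snippet below is only an assumed sketch of how it might look (greedy generation, then checking the first word of the continuation; assumes torch is imported in app.py):

# Assumed sketch; the real decoding logic is outside this diff.
with torch.no_grad():
    output_ids = current_model.generate(**inputs.to(current_model.device),
                                        max_new_tokens=3, do_sample=False)
new_tokens = output_ids[0][inputs['input_ids'].shape[1]:]
completion = current_tokenizer.decode(new_tokens, skip_special_tokens=True).strip().lower()
prediction = 'yes' if completion.startswith('yes') else 'no'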
 
        else:
            predictions_no += 1

+         if prediction == true_label:
            correct += 1

        # Update confusion matrix if we have original label

            training_status["progress"] = min(int((state.global_step / total_steps) * 100), 99)

    # Add callback with trainer and eval dataset
+     status_callback = StatusCallback(trainer, val_dataset)
+     trainer.add_callback(status_callback)

    # Train
    try:
+         logger.info("Starting fine-tuning...")
+         trainer.train()

        # Save final model
        save_path = os.path.join(OUTPUT_DIR, "final")
+         trainer.save_model(save_path)
        current_tokenizer.save_pretrained(save_path)
        logger.info(f"Model saved to {save_path}")

        # Compute final metrics
        logger.info("Computing final accuracy metrics...")
+         final_metrics = compute_accuracy_metrics(trainer, val_dataset, num_samples=200)
        logger.info(f"Final Accuracy: {final_metrics['accuracy']:.2%}")
        logger.info(f"Final Prediction Distribution - Yes: {final_metrics['yes_ratio']:.1%}, No: {final_metrics['no_ratio']:.1%}")

    )

    # Update global model reference
+     current_model = trainer.model
    current_model.eval()

    # Push to hub if token available

# {model_short_name} Document Relevance Classifier

+ This model was trained using standard fine-tuning for document relevance classification.

## Training Configuration
- Base Model: {current_model_id}
+ - Training Type: Standard Fine-tuning
- Learning Rate: {training_args.learning_rate}
- Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}
- Epochs: {training_args.num_train_epochs}

    current_model.push_to_hub(
        repo_name,
        use_auth_token=HF_TOKEN,
+         commit_message=f"Standard fine-tuning with lr={training_args.learning_rate}, accuracy={final_metrics['accuracy']:.2%}"
    )
    current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)

        "accuracy": final_metrics['accuracy'],
        "yes_ratio": final_metrics['yes_ratio'],
        "no_ratio": final_metrics['no_ratio'],
+         "lr": training_args.learning_rate,
        "model_id": current_model_id
    })

        max_samples = 2000 # Start conservative
    else:
        max_samples = None
+     train_model(train_df, test_df, epochs=5, batch_size=32, lr=5e-6, max_samples=max_samples)

    with training_lock:
        training_status["status"] = "completed"
inference_chatgpt_simple.py CHANGED
@@ -49,7 +49,7 @@ def main():
    df = pd.read_csv(csv_path)
    # Process each row
    prds = [(str(row['query_text']),str(row['title']),str(row['text'])) for idx, row in df.iterrows()]
-     predictions = ThreadPool(1000).starmap(get_prediction,prds)

    df['prediction'] = predictions
    conf_matrix = pd.crosstab(
@@ -69,5 +69,12 @@ def main():
    print("\nResults:")
    print(df['prediction'].value_counts())

if __name__ == "__main__":
-     main()
 
    df = pd.read_csv(csv_path)
    # Process each row
    prds = [(str(row['query_text']),str(row['title']),str(row['text'])) for idx, row in df.iterrows()]
+     predictions = ThreadPool(100).starmap(get_prediction,prds)

    df['prediction'] = predictions
    conf_matrix = pd.crosstab(

    print("\nResults:")
    print(df['prediction'].value_counts())

+ def make_sample_db():
+     df = pd.read_csv(rf"train_datasets_creation/full_train_dataset.csv")
+     dfs = [df[df['label']==d].sample(100) for d in df['label'].unique()]
+     df = pd.concat(dfs).reset_index()
+     df.to_csv(f"sample_db_{datetime.now().isoformat()}.csv")
+
+
if __name__ == "__main__":
+     make_sample_db()
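make_sample_db builds a small, label-balanced sample (100 rows per label class) for quick experiments. A self-contained sketch of the same idea with its imports spelled out (the CSV path and the 100-row count mirror the commit; everything else here is illustrative):

# Standalone sketch of the balanced-sampling helper.
from datetime import datetime
import pandas as pd

def make_sample_db(csv_path="train_datasets_creation/full_train_dataset.csv"):
    df = pd.read_csv(csv_path)
    # Take 100 rows per label value and concatenate them back into one frame.
    dfs = [df[df['label'] == d].sample(100) for d in df['label'].unique()]
    df = pd.concat(dfs).reset_index()
    df.to_csv(f"sample_db_{datetime.now().isoformat()}.csv")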