Browse files
- app.py (+107 -79)
- inference_chatgpt_simple.py (+9 -2)
app.py
CHANGED
@@ -13,7 +13,7 @@ import gradio as gr
 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
-from trl import DPOTrainer, DPOConfig
+from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
 import warnings
 import subprocess
 import gc
@@ -126,7 +126,7 @@ def format_prompt(query, title, content):
     if len(content) > 1000:
         content = content[:1000] + "..."
 
-    return f"""
+    return f"""You would get a query and document's title and content and return yes (if the document is relevant to the query) or no (if the document is not relevant to the query).
 Answer only yes / no.
 Document:
 ####DOCUMENT START
@@ -139,9 +139,7 @@ Query:
 {query}
 ####Query END
 
-ANSWER:
-####ANSWER START
-"""
+ANSWER: """
 
 
 def load_model_and_tokenizer(checkpoint_path=None, model_id=None):
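The template now ends in `ANSWER: ` with no `####ANSWER START` block, so the very next token the model emits should be the label itself. For reference, a sketch of the full template; the lines between `####DOCUMENT START` and `Query:` fall outside the visible hunks, so the `{title}`/`{content}`/`####DOCUMENT END`/`####Query START` portion below is an assumption based on the visible markers:

```python
def format_prompt_sketch(query, title, content):
    # Hypothetical reconstruction for illustration; only the first and last
    # lines of the template are confirmed by the diff above.
    if len(content) > 1000:
        content = content[:1000] + "..."
    return f"""You would get a query and document's title and content and return yes (if the document is relevant to the query) or no (if the document is not relevant to the query).
Answer only yes / no.
Document:
####DOCUMENT START
{title}
{content}
####DOCUMENT END
Query:
####Query START
{query}
####Query END

ANSWER: """
```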
@@ -240,7 +238,7 @@ def get_trained_models_list():
         text += f"{i}. **{model['repo']}**\n"
         text += f" - Accuracy: {model['accuracy']:.2%}\n"
         text += f" - Predictions: Yes {model['yes_ratio']:.1%}, No {model['no_ratio']:.1%}\n"
-        text += f" - 
+        text += f" - LR: {model.get('lr', 'N/A')}, Model: {model['model_id'].split('/')[-1]}\n"
         text += f" - Link: https://huggingface.co/{model['repo']}\n\n"
 
     return text
@@ -312,9 +310,9 @@ def collate_fn(batch):
     }
 
 
-def prepare_dpo_dataset(df):
-    """Convert 4-category labels to DPO format"""
-    dpo_data = []
+def prepare_finetuning_dataset(df):
+    """Convert 4-category labels to standard fine-tuning format"""
+    ft_data = []
 
     # Map 4 categories to yes/no
     label_mapping = {
@@ -347,28 +345,21 @@ def prepare_dpo_dataset(df):
         original_label = row['label']
         mapped_label = label_mapping.get(original_label, original_label)
 
-        ...
-            'prompt': prompt,
-            'chosen': 'no',
-            'rejected': 'yes',
-            'original_label': original_label  # Keep original for analysis
-        })
-
-    return pd.DataFrame(dpo_data)
+        # Create the full text with prompt and answer
+        text = prompt + mapped_label
+
+        ft_data.append({
+            'text': text,
+            'prompt': prompt,
+            'label': mapped_label,
+            'original_label': original_label  # Keep original for analysis
+        })
+
+    return pd.DataFrame(ft_data)
 
 
-def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
-    """DPO training for document relevance classification"""
+def train_model(train_df, val_df, epochs=5, batch_size=32, lr=5e-6, max_samples=None):
+    """Standard fine-tuning for document relevance classification"""
     global current_model, current_tokenizer
 
     # Clear GPU memory before training
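The new `prepare_finetuning_dataset` collapses the four difficulty-graded categories into a binary target and appends it to the prompt, so each training example is one complete `prompt + answer` string. A minimal sketch of that collapse; the category names come from the confusion-matrix keys later in the diff, but the mapping dict itself is not visible, so the assignments below are an assumption:

```python
# Assumed 4-to-2 label collapse (names taken from the confusion-matrix keys).
label_mapping = {
    'easy_positive': 'yes',
    'hard_positive': 'yes',
    'easy_negative': 'no',
    'hard_negative': 'no',
}

mapped = label_mapping.get('hard_positive', 'hard_positive')  # -> 'yes'
# Each example becomes e.g. "...ANSWER: yes", so the model learns to emit
# the label as the continuation of the prompt.
text = "<prompt ending in 'ANSWER: '>" + mapped
```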
@@ -385,14 +376,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
         train_df = train_df.sample(n=max_samples, random_state=42)
         val_df = val_df.head(min(len(val_df), max_samples // 5))  # Proportional validation set
 
-    # Convert to DPO format
-    logger.info("Preparing DPO dataset...")
-    dpo_train_df = prepare_dpo_dataset(train_df)
-    dpo_val_df = prepare_dpo_dataset(val_df)
+    # Convert to fine-tuning format
+    logger.info("Preparing fine-tuning dataset...")
+    ft_train_df = prepare_finetuning_dataset(train_df)
+    ft_val_df = prepare_finetuning_dataset(val_df)
 
     # Create datasets
-    train_dataset = Dataset.from_pandas(dpo_train_df)
-    val_dataset = Dataset.from_pandas(dpo_val_df)
+    train_dataset = Dataset.from_pandas(ft_train_df)
+    val_dataset = Dataset.from_pandas(ft_val_df)
 
     # Prepare model for training
     if hasattr(current_model, 'is_loaded_in_4bit') and current_model.is_loaded_in_4bit:
@@ -428,20 +419,19 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
         target_modules=target_modules
     )
 
-    logger.info(f"Starting DPO training with {len(train_df)} train samples, {len(val_df)} val samples")
+    logger.info(f"Starting fine-tuning with {len(train_df)} train samples, {len(val_df)} val samples")
     logger.info(f"Learning rate: {lr}, Effective batch size: {batch_size}, Epochs: {epochs}")
 
     # Create output directory
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-    # 
-    ...
-    target_batch_size = 32  # Target effective batch size
+    # Training configuration optimized for standard fine-tuning
+    target_batch_size = batch_size  # Target effective batch size
 
     if current_model_id == "openai/gpt-oss-20b":
         # For GPT-OSS-20B: use smaller per-device batch with gradient accumulation
         actual_batch_size = 2  # Per-device batch size
-        seq_length = 
+        seq_length = 512  # Standard sequence length
         grad_accum = target_batch_size // actual_batch_size  # 16 gradient accumulation steps
     else:
         # For smaller models like Phi-3 - can use larger per-device batch
|
|
449 |
grad_accum = max(1, target_batch_size // actual_batch_size) # Accumulate if needed
|
450 |
seq_length = 512
|
451 |
|
452 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
453 |
output_dir=OUTPUT_DIR,
|
454 |
num_train_epochs=epochs,
|
455 |
per_device_train_batch_size=actual_batch_size,
|
456 |
per_device_eval_batch_size=actual_batch_size,
|
457 |
gradient_accumulation_steps=grad_accum,
|
458 |
-
gradient_checkpointing=True,
|
459 |
learning_rate=lr,
|
460 |
lr_scheduler_type="cosine",
|
461 |
-
warmup_steps=
|
462 |
logging_steps=10,
|
463 |
save_strategy="epoch",
|
464 |
-
|
465 |
-
bf16=True,
|
466 |
fp16=False,
|
467 |
-
|
468 |
-
|
|
|
|
|
|
|
|
|
469 |
report_to=[],
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
|
|
|
|
|
|
|
|
475 |
)
|
476 |
|
477 |
-
#
|
478 |
-
|
|
|
|
|
|
|
|
|
479 |
model=current_model,
|
480 |
-
ref_model=None, # Will use the model's initial state as reference
|
481 |
args=training_args,
|
482 |
-
train_dataset=
|
483 |
-
eval_dataset=
|
484 |
-
|
485 |
-
|
486 |
)
|
487 |
|
488 |
# Custom logging callback
|
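Note that `tokenize_function` copies `input_ids` into `labels`, so the loss is computed over the prompt tokens as well as the short answer. A common variant, sketched here as an alternative and not what this commit does, masks the prompt positions with `-100` so that only the answer contributes to the loss:

```python
def tokenize_completion_only(example, tokenizer, seq_length=512):
    # Sketch only; assumes the 'text'/'prompt' columns produced by
    # prepare_finetuning_dataset above.
    enc = tokenizer(example['text'], truncation=True,
                    padding="max_length", max_length=seq_length)
    prompt_ids = tokenizer(example['prompt'], truncation=True,
                           max_length=seq_length)['input_ids']
    labels = enc['input_ids'].copy()
    cut = min(len(prompt_ids), len(labels))
    labels[:cut] = [-100] * cut  # -100 is ignored by the cross-entropy loss
    enc['labels'] = labels       # a fuller version would mask padding too
    return enc
```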
@@ -501,11 +527,14 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
 
     def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
         """Compute accuracy metrics and confusion matrix on a subset of eval data"""
+        # Get the original dataframe for easier access to prompts and labels
+        eval_df = ft_val_df
+
         # Sample subset for faster evaluation
-        ...
+        sample_size = min(num_samples, len(eval_df))
+        sample_df = eval_df.sample(n=sample_size, random_state=42)
 
         # Initialize confusion matrix counters
-        # Rows: true labels, Cols: predicted labels
         confusion_matrix = {
             'easy_positive': {'yes': 0, 'no': 0},
             'hard_positive': {'yes': 0, 'no': 0},
@@ -517,11 +546,10 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
         predictions_no = 0
         correct = 0
 
-        for idx in 
-            ...
-            original_label = item.get('original_label', None)  # Get original 4-category label
+        for idx, row in sample_df.iterrows():
+            prompt = row['prompt']
+            true_label = row['label']  # This is the mapped label (yes/no)
+            original_label = row['original_label']  # Get original 4-category label
 
             # Tokenize and run inference
             inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -546,7 +574,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
             else:
                 predictions_no += 1
 
-            if prediction == 
+            if prediction == true_label:
                 correct += 1
 
             # Update confusion matrix if we have original label
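The generation and answer-parsing code between the tokenize call and the yes/no counters falls outside the visible hunks. A minimal sketch of the pattern the surrounding lines imply; the decoding parameters and the parsing rule are assumptions:

```python
import torch

# Greedy-decode a few tokens and take the first word as the prediction
# (tensors are assumed to already be on the model's device).
with torch.no_grad():
    outputs = current_model.generate(**inputs, max_new_tokens=3, do_sample=False)
new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
answer = current_tokenizer.decode(new_tokens, skip_special_tokens=True).strip().lower()
prediction = 'yes' if answer.startswith('yes') else 'no'
```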
@@ -628,23 +656,23 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
                 training_status["progress"] = min(int((state.global_step / total_steps) * 100), 99)
 
     # Add callback with trainer and eval dataset
-    status_callback = StatusCallback(
-    ...
+    status_callback = StatusCallback(trainer, val_dataset)
+    trainer.add_callback(status_callback)
 
     # Train
     try:
-        logger.info("Starting DPO training...")
-        ...
+        logger.info("Starting fine-tuning...")
+        trainer.train()
 
         # Save final model
         save_path = os.path.join(OUTPUT_DIR, "final")
-        ...
+        trainer.save_model(save_path)
         current_tokenizer.save_pretrained(save_path)
         logger.info(f"Model saved to {save_path}")
 
         # Compute final metrics
         logger.info("Computing final accuracy metrics...")
-        final_metrics = compute_accuracy_metrics(
+        final_metrics = compute_accuracy_metrics(trainer, val_dataset, num_samples=200)
         logger.info(f"Final Accuracy: {final_metrics['accuracy']:.2%}")
         logger.info(f"Final Prediction Distribution - Yes: {final_metrics['yes_ratio']:.1%}, No: {final_metrics['no_ratio']:.1%}")
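`StatusCallback` is defined outside the visible hunks; judging from the constructor call `StatusCallback(trainer, val_dataset)` and the progress update shown above, it is a `TrainerCallback` of roughly this shape (the attribute names are assumptions):

```python
from transformers import TrainerCallback

class StatusCallback(TrainerCallback):
    # Sketch of the implied callback; only the progress formula is visible
    # in the diff, the rest is assumed.
    def __init__(self, trainer, eval_dataset):
        self.trainer = trainer
        self.eval_dataset = eval_dataset

    def on_step_end(self, args, state, control, **kwargs):
        total_steps = max(state.max_steps, 1)
        training_status["progress"] = min(int((state.global_step / total_steps) * 100), 99)
```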
@@ -666,7 +694,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=None):
         )
 
         # Update global model reference
-        current_model = 
+        current_model = trainer.model
         current_model.eval()
 
         # Push to hub if token available
@@ -710,11 +738,11 @@ model-index:
 
 # {model_short_name} Document Relevance Classifier
 
-This model was trained using 
+This model was trained using standard fine-tuning for document relevance classification.
 
 ## Training Configuration
 - Base Model: {current_model_id}
-- 
+- Training Type: Standard Fine-tuning
 - Learning Rate: {training_args.learning_rate}
 - Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}
 - Epochs: {training_args.num_train_epochs}
@@ -747,7 +775,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1]}")
         current_model.push_to_hub(
             repo_name,
             use_auth_token=HF_TOKEN,
-            commit_message=f"
+            commit_message=f"Standard fine-tuning with lr={training_args.learning_rate}, accuracy={final_metrics['accuracy']:.2%}"
         )
         current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)
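The pushed repo holds only the LoRA adapter and tokenizer, so a consumer re-attaches it to the base model, as the usage snippet embedded in the model card indicates. A sketch with a placeholder repo name:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# "HF_USERNAME/repo-name" stands in for the actual pushed repo.
base = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b")
model = PeftModel.from_pretrained(base, "HF_USERNAME/repo-name")
tokenizer = AutoTokenizer.from_pretrained("HF_USERNAME/repo-name")
```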
@@ -779,7 +807,7 @@ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1]}")
             "accuracy": final_metrics['accuracy'],
             "yes_ratio": final_metrics['yes_ratio'],
             "no_ratio": final_metrics['no_ratio'],
-            "
+            "lr": training_args.learning_rate,
             "model_id": current_model_id
         })
@@ -892,7 +920,7 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
             max_samples = 2000  # Start conservative
         else:
             max_samples = None
-        train_model(train_df, test_df, epochs=
+        train_model(train_df, test_df, epochs=5, batch_size=32, lr=5e-6, max_samples=max_samples)
 
         with training_lock:
             training_status["status"] = "completed"
inference_chatgpt_simple.py
CHANGED
@@ -49,7 +49,7 @@ def main():
     df = pd.read_csv(csv_path)
     # Process each row
     prds = [(str(row['query_text']),str(row['title']),str(row['text'])) for idx, row in df.iterrows()]
-    predictions = ThreadPool(
+    predictions = ThreadPool(100).starmap(get_prediction,prds)
 
     df['prediction'] = predictions
     conf_matrix = pd.crosstab(
@@ -69,5 +69,12 @@ def main():
     print("\nResults:")
     print(df['prediction'].value_counts())
 
+def make_sample_db():
+    df = pd.read_csv(rf"train_datasets_creation/full_train_dataset.csv")
+    dfs = [df[df['label']==d].sample(100) for d in df['label'].unique()]
+    df = pd.concat(dfs).reset_index()
+    df.to_csv(f"sample_db_{datetime.now().isoformat()}.csv")
+
+
 if __name__ == "__main__":
-    main()
+    make_sample_db()
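Two caveats on `make_sample_db`: it relies on `datetime` (e.g. `from datetime import datetime`) already being imported in this module, and `sample(100)` raises `ValueError` if any label has fewer than 100 rows. The `rf` prefix on the input path is redundant, and the colons that `isoformat()` puts in the output filename are not valid on Windows.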