amos1088 committed on
Commit
cae25d0
·
1 Parent(s): 005e7d1
Files changed (36) hide show
  1. app.py +386 -60
  2. common/ask_claude_clean.py +253 -0
  3. common/ask_openai.py +404 -0
  4. common/global_config.py +58 -0
  5. common/trash.py +248 -0
  6. download_beir_datasets.py +0 -287
  7. evaluate.py +0 -273
  8. generate_sample_data.py +0 -90
  9. inference_chatgpt_simple.py +89 -0
  10. inference_server.py +0 -179
  11. setup_environment.py +0 -120
  12. test_scores/analyze_each_dataset.py +272 -0
  13. test_scores/analyze_webis_only.py +269 -0
  14. test_scores/beir_analysis.json +0 -0
  15. test_scores/dataset_reports/FINAL_TRAINING_SUMMARY.md +189 -0
  16. test_scores/dataset_reports/arguana_analysis.md +157 -0
  17. test_scores/dataset_reports/climate-fever_analysis.md +122 -0
  18. test_scores/dataset_reports/dbpedia-entity_analysis.md +908 -0
  19. test_scores/dataset_reports/fever_analysis.md +281 -0
  20. test_scores/dataset_reports/fiqa_analysis.md +302 -0
  21. test_scores/dataset_reports/hotpotqa_analysis.md +362 -0
  22. test_scores/dataset_reports/nfcorpus_analysis.md +865 -0
  23. test_scores/dataset_reports/quora_analysis.md +0 -0
  24. test_scores/dataset_reports/scidocs_analysis.md +555 -0
  25. test_scores/dataset_reports/scifact_analysis.md +264 -0
  26. test_scores/dataset_reports/training_config.json +164 -0
  27. test_scores/dataset_reports/trec-covid_analysis.md +782 -0
  28. test_scores/dataset_reports/webis-touche2020_analysis.md +624 -0
  29. test_scores/examine_more_scores.py +157 -0
  30. test_scores/test_fiqa_relevance.py +146 -0
  31. test_scores/test_nfcorpus_train_relevance.py +176 -0
  32. train_datasets_creation/create_training_data_ids_batched.py +220 -0
  33. train_datasets_creation/make_dataset_for_train.py +24 -0
  34. train_datasets_creation/merge_all_splits.py +102 -0
  35. train_dpo_hf.py +0 -268
  36. train_dpo_hf_fixed.py +0 -366
app.py CHANGED
@@ -13,13 +13,12 @@ import gradio as gr
13
  from datasets import Dataset
14
  from transformers import AutoTokenizer, AutoModelForCausalLM
15
  from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
16
- from torch.utils.data import DataLoader
17
- from torch.nn import CrossEntropyLoss
18
  from trl import DPOTrainer, DPOConfig
19
  import warnings
20
  import subprocess
21
  import gc
22
  import psutil
 
23
  warnings.filterwarnings("ignore")
24
 
25
  # ===== CONFIG =====
@@ -111,12 +110,17 @@ current_model = None
111
  current_tokenizer = None
112
  train_df = None
113
  test_df = None
114
- training_status = {"status": "idle", "progress": 0, "logs": []}
 
115
  training_lock = threading.Lock()
116
 
117
 
118
  def format_prompt(query, title, content):
119
  """Format the prompt for the model"""
 
 
 
 
120
  return f"""you would get a query and document's title and content and return yes (if the document is relevant to the query)/ or no (if the document is not relevant to the query)
121
  Answer only yes / no.
122
  Document:
@@ -221,6 +225,21 @@ def get_gpu_memory_status():
221
 
222
  return f"GPU Memory: {allocated:.1f}GB allocated / {reserved:.1f}GB reserved / {total:.1f}GB total"
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  def switch_model(model_id):
225
  """Switch to a different model"""
226
  global current_model, current_tokenizer, current_model_id, OUTPUT_DIR
@@ -289,25 +308,55 @@ def collate_fn(batch):
289
 
290
 
291
  def prepare_dpo_dataset(df):
292
- """Convert yes/no labels to DPO format with chosen/rejected pairs"""
293
  dpo_data = []
294
 
 
 
 
 
 
 
 
 
 
 
295
  for _, row in df.iterrows():
296
- prompt = row['prompt'] if 'prompt' in row else format_prompt(row['query'], row['title'], row['content'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
- if row['label'] == 'yes':
 
 
 
 
299
  # For 'yes' examples, chosen is "yes" and rejected is "no"
300
  dpo_data.append({
301
  'prompt': prompt,
302
  'chosen': 'yes',
303
- 'rejected': 'no'
 
304
  })
305
  else:
306
  # For 'no' examples, chosen is "no" and rejected is "yes"
307
  dpo_data.append({
308
  'prompt': prompt,
309
  'chosen': 'no',
310
- 'rejected': 'yes'
 
311
  })
312
 
313
  return pd.DataFrame(dpo_data)
@@ -446,10 +495,19 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
446
  import numpy as np
447
 
448
  def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
449
- """Compute accuracy metrics on a subset of eval data"""
450
  # Sample subset for faster evaluation
451
  indices = np.random.choice(len(eval_dataset), min(num_samples, len(eval_dataset)), replace=False)
452
 
 
 
 
 
 
 
 
 
 
453
  predictions_yes = 0
454
  predictions_no = 0
455
  correct = 0
@@ -457,7 +515,8 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
457
  for idx in indices:
458
  item = eval_dataset[int(idx)]
459
  prompt = item['prompt']
460
- true_choice = item['chosen'] # This is the correct answer
 
461
 
462
  # Tokenize and run inference
463
  inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -475,25 +534,45 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
475
  no_logit = logits[no_token_id].item()
476
 
477
  # Get prediction
478
- prediction = " yes" if yes_logit > no_logit else " no"
479
 
480
- if prediction == " yes":
481
  predictions_yes += 1
482
  else:
483
  predictions_no += 1
484
 
485
- if prediction.strip() == true_choice.strip():
486
  correct += 1
 
 
 
 
487
 
488
  accuracy = correct / len(indices)
489
  yes_ratio = predictions_yes / len(indices)
490
  no_ratio = predictions_no / len(indices)
491
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
492
  return {
493
  'accuracy': accuracy,
494
  'yes_ratio': yes_ratio,
495
  'no_ratio': no_ratio,
496
- 'total_samples': len(indices)
 
 
497
  }
498
 
499
  class StatusCallback(TrainerCallback):
@@ -520,6 +599,17 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
520
  f"No={metrics['no_ratio']:.1%}"
521
  )
522
 
 
 
 
 
 
 
 
 
 
 
 
523
  # Warn if model is biased
524
  if metrics['yes_ratio'] < 0.2 or metrics['no_ratio'] < 0.2:
525
  training_status["logs"].append(
@@ -555,9 +645,20 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
555
 
556
  with training_lock:
557
  training_status["logs"].append(f"\n=== FINAL RESULTS ===")
558
- training_status["logs"].append(f"Accuracy: {final_metrics['accuracy']:.2%}")
559
  training_status["logs"].append(f"Yes predictions: {final_metrics['yes_ratio']:.1%}")
560
  training_status["logs"].append(f"No predictions: {final_metrics['no_ratio']:.1%}")
 
 
 
 
 
 
 
 
 
 
 
561
 
562
  # Update global model reference
563
  current_model = dpo_trainer.model
@@ -566,10 +667,117 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
566
  # Push to hub if token available
567
  if HF_TOKEN and HF_USERNAME:
568
  try:
569
- repo_name = f"{HF_USERNAME}/phi3-dpo-relevance"
570
- current_model.push_to_hub(repo_name, use_auth_token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  logger.info(f"Model pushed to hub: {repo_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  except Exception as e:
574
  logger.error(f"Failed to push to hub: {e}")
575
 
@@ -579,7 +787,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
579
 
580
 
581
  def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
582
- """Run training from CSV file"""
583
  global train_df, test_df, training_status
584
 
585
  try:
@@ -591,30 +799,62 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
591
  df = pd.read_csv(csv_path)
592
  logger.info(f"Loaded {len(df)} samples from {csv_path}")
593
 
594
- # Shuffle if requested
595
- if shuffle_flag:
596
- df = df.sample(frac=1).reset_index(drop=True)
597
 
598
- # Check required columns
599
- required_cols = ['query', 'title', 'content', 'label']
600
- if not all(col in df.columns for col in required_cols):
601
- # If prompt column exists, use it directly
602
- if 'prompt' in df.columns and 'label' in df.columns:
603
- logger.info("Using existing prompt column")
604
- else:
605
- raise ValueError(f"CSV must have columns: {required_cols} or ['prompt', 'label']")
606
- else:
607
- # Create prompt column if not exists
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
  if 'prompt' not in df.columns:
609
- logger.info("Creating prompt column from query/title/content")
610
  df['prompt'] = df.apply(
611
  lambda row: format_prompt(row['query'], row['title'], row['content']),
612
  axis=1
613
  )
 
 
 
 
 
614
 
615
- # Validate labels
616
- if not all(label in ['yes', 'no'] for label in df['label'].unique()):
617
- raise ValueError("Labels must be 'yes' or 'no'")
618
 
619
  # Split data
620
  split_idx = int(len(df) * split_ratio)
@@ -628,6 +868,8 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
628
  with training_lock:
629
  training_status["status"] = "training"
630
  training_status["logs"].append("Starting training...")
 
 
631
 
632
  # Conservative sample size for GPT-OSS-20B
633
  if current_model_id == "openai/gpt-oss-20b":
@@ -654,18 +896,17 @@ def run_inference(query, document_title, document_content, checkpoint="latest"):
654
  global current_model, current_tokenizer
655
 
656
  # Validate inputs
657
- if not query or not query.strip():
658
  return "Error: Query cannot be empty"
659
- if not document_title or not document_title.strip():
660
  return "Error: Document title cannot be empty"
661
- if not document_content or not document_content.strip():
662
  return "Error: Document content cannot be empty"
663
 
664
- # Truncate content if too long (warn user)
665
- max_content_length = 1000
666
- if len(document_content) > max_content_length:
667
- logger.warning(f"Content truncated from {len(document_content)} to {max_content_length} characters")
668
- document_content = document_content[:max_content_length]
669
 
670
  # Load model if needed
671
  if current_model is None:
@@ -740,8 +981,12 @@ def run_inference_by_row(split_choice, row_idx, checkpoint="latest"):
740
 
741
  row = df.iloc[int(row_idx)]
742
 
743
- # Get data from row
744
- if all(col in row for col in ['query', 'title', 'content']):
 
 
 
 
745
  query = str(row['query'])
746
  title = str(row['title'])
747
  content = str(row['content'])
@@ -759,10 +1004,22 @@ def run_inference_by_row(split_choice, row_idx, checkpoint="latest"):
759
  # Extract just the prediction (yes/no) from the result
760
  prediction = prediction_with_confidence.split()[0] # Gets "yes" or "no" from "yes (confidence: X%)"
761
 
762
- # Check if prediction matches actual
763
- is_correct = prediction == actual
764
-
765
- return f"Prediction: {prediction_with_confidence}\nActual: {actual}\nCorrect: {is_correct}"
 
 
 
 
 
 
 
 
 
 
 
 
766
 
767
 
768
  def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None, inference_batch_size=16, progress=gr.Progress()):
@@ -781,10 +1038,20 @@ def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None,
781
  df = pd.read_csv(csv_file.name)
782
  logger.info(f"Loaded {len(df)} samples from CSV for inference")
783
 
784
- # Check required columns
785
- required_cols = ['query', 'title', 'content']
786
- if not all(col in df.columns for col in required_cols):
787
- return None, f"CSV must have columns: {required_cols}"
 
 
 
 
 
 
 
 
 
 
788
 
789
  # Load appropriate model
790
  if model_choice == "base":
@@ -865,9 +1132,9 @@ def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None,
865
  prompts = []
866
  for _, row in batch_df.iterrows():
867
  prompt = format_prompt(
868
- str(row['query']),
869
  str(row['title']),
870
- str(row['content'])
871
  )
872
  prompts.append(prompt)
873
 
@@ -921,9 +1188,36 @@ def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None,
921
 
922
  # If labels exist, calculate accuracy and add is_right column
923
  if 'label' in df.columns:
924
- df['is_right'] = df['prediction'] == df['label']
925
- accuracy = df['is_right'].mean()
926
- accuracy_text = f"\nAccuracy: {accuracy:.2%} ({df['is_right'].sum()}/{len(df)} correct)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
927
  else:
928
  accuracy_text = ""
929
 
@@ -955,7 +1249,7 @@ with gr.Blocks(title="Phi-3 Document Relevance Classifier") as demo:
955
 
956
  with gr.Tab("Training"):
957
  csv_input = gr.File(label="Upload Training CSV", file_types=[".csv"])
958
- gr.Markdown("CSV should have columns: query, title, content, label (yes/no)")
959
  shuffle_flag = gr.Checkbox(label="Shuffle Dataset", value=True)
960
  split_slider = gr.Slider(0.5, 0.9, value=0.8, step=0.05, label="Train Split %")
961
  start_btn = gr.Button("Start Training", variant="primary")
@@ -1047,7 +1341,7 @@ with gr.Blocks(title="Phi-3 Document Relevance Classifier") as demo:
1047
  file_types=[".csv"],
1048
  file_count="single"
1049
  )
1050
- gr.Markdown("CSV must have columns: `query`, `title`, `content`. Optional: `label` (for accuracy calculation)")
1051
 
1052
  with gr.Row():
1053
  model_choice_radio = gr.Radio(
@@ -1093,6 +1387,38 @@ with gr.Blocks(title="Phi-3 Document Relevance Classifier") as demo:
1093
  outputs=[csv_output_file, csv_results_text]
1094
  )
1095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1096
  with gr.Tab("Model Settings"):
1097
  gr.Markdown("### Model Selection")
1098
  gr.Markdown("Choose which model to use for training and inference")
 
13
  from datasets import Dataset
14
  from transformers import AutoTokenizer, AutoModelForCausalLM
15
  from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
 
 
16
  from trl import DPOTrainer, DPOConfig
17
  import warnings
18
  import subprocess
19
  import gc
20
  import psutil
21
+
22
  warnings.filterwarnings("ignore")
23
 
24
  # ===== CONFIG =====
 
110
  current_tokenizer = None
111
  train_df = None
112
  test_df = None
113
+ training_status = {"status": "idle", "progress": 0, "logs": [], "model_repo": None}
114
+ trained_models = [] # Keep track of all trained models
115
  training_lock = threading.Lock()
116
 
117
 
118
  def format_prompt(query, title, content):
119
  """Format the prompt for the model"""
120
+ # Truncate content if too long
121
+ if len(content) > 1000:
122
+ content = content[:1000] + "..."
123
+
124
  return f"""you would get a query and document's title and content and return yes (if the document is relevant to the query)/ or no (if the document is not relevant to the query)
125
  Answer only yes / no.
126
  Document:
 
225
 
226
  return f"GPU Memory: {allocated:.1f}GB allocated / {reserved:.1f}GB reserved / {total:.1f}GB total"
227
 
228
def get_trained_models_list(models=None):
    """Return a Markdown summary of trained models.

    Args:
        models: Optional list of model-record dicts to render. Each record
            must provide the keys ``repo``, ``accuracy``, ``yes_ratio``,
            ``no_ratio``, ``beta`` and ``model_id``. When omitted (the
            default), the module-level ``trained_models`` list is rendered,
            so existing zero-argument callers behave exactly as before.

    Returns:
        A Markdown string: a fixed "nothing trained yet" notice when there
        are no records, otherwise one numbered entry per record.

    Raises:
        KeyError: If a record is missing one of the required keys.
    """
    if models is None:
        models = trained_models

    if not models:
        return "No models trained yet in this session.\n\nPreviously trained models on HuggingFace:\n- amos1088/phi3-dpo-relevance"

    # Collect the pieces and join once instead of growing a string with +=.
    sections = ["## Trained Models in This Session:\n\n"]
    for i, model in enumerate(models, 1):
        sections.append(
            f"{i}. **{model['repo']}**\n"
            f" - Accuracy: {model['accuracy']:.2%}\n"
            f" - Predictions: Yes {model['yes_ratio']:.1%}, No {model['no_ratio']:.1%}\n"
            # Only the last path component of the base model id is shown.
            f" - Beta: {model['beta']}, Model: {model['model_id'].split('/')[-1]}\n"
            f" - Link: https://huggingface.co/{model['repo']}\n\n"
        )

    return "".join(sections)
242
+
243
  def switch_model(model_id):
244
  """Switch to a different model"""
245
  global current_model, current_tokenizer, current_model_id, OUTPUT_DIR
 
308
 
309
 
310
  def prepare_dpo_dataset(df):
311
+ """Convert 4-category labels to DPO format with chosen/rejected pairs"""
312
  dpo_data = []
313
 
314
+ # Map 4 categories to yes/no
315
+ label_mapping = {
316
+ 'easy_positive': 'yes',
317
+ 'hard_positive': 'yes',
318
+ 'easy_negative': 'no',
319
+ 'hard_negative': 'no',
320
+ 'yes': 'yes',
321
+ 'no': 'no'
322
+ }
323
+
324
  for _, row in df.iterrows():
325
+ # Handle both old and new column names
326
+ if 'query_text' in row:
327
+ query = row['query_text']
328
+ title = row['title']
329
+ content = row['text']
330
+ else:
331
+ query = row.get('query', '')
332
+ title = row.get('title', '')
333
+ content = row.get('content', '')
334
+
335
+ # Create prompt if not exists
336
+ if 'prompt' in row:
337
+ prompt = row['prompt']
338
+ else:
339
+ prompt = format_prompt(query, title, content)
340
 
341
+ # Get mapped label
342
+ original_label = row['label']
343
+ mapped_label = label_mapping.get(original_label, original_label)
344
+
345
+ if mapped_label == 'yes':
346
  # For 'yes' examples, chosen is "yes" and rejected is "no"
347
  dpo_data.append({
348
  'prompt': prompt,
349
  'chosen': 'yes',
350
+ 'rejected': 'no',
351
+ 'original_label': original_label # Keep original for analysis
352
  })
353
  else:
354
  # For 'no' examples, chosen is "no" and rejected is "yes"
355
  dpo_data.append({
356
  'prompt': prompt,
357
  'chosen': 'no',
358
+ 'rejected': 'yes',
359
+ 'original_label': original_label # Keep original for analysis
360
  })
361
 
362
  return pd.DataFrame(dpo_data)
 
495
  import numpy as np
496
 
497
  def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
498
+ """Compute accuracy metrics and confusion matrix on a subset of eval data"""
499
  # Sample subset for faster evaluation
500
  indices = np.random.choice(len(eval_dataset), min(num_samples, len(eval_dataset)), replace=False)
501
 
502
+ # Initialize confusion matrix counters
503
+ # Rows: true labels, Cols: predicted labels
504
+ confusion_matrix = {
505
+ 'easy_positive': {'yes': 0, 'no': 0},
506
+ 'hard_positive': {'yes': 0, 'no': 0},
507
+ 'easy_negative': {'yes': 0, 'no': 0},
508
+ 'hard_negative': {'yes': 0, 'no': 0}
509
+ }
510
+
511
  predictions_yes = 0
512
  predictions_no = 0
513
  correct = 0
 
515
  for idx in indices:
516
  item = eval_dataset[int(idx)]
517
  prompt = item['prompt']
518
+ true_choice = item['chosen'] # This is the correct answer (yes/no)
519
+ original_label = item.get('original_label', None) # Get original 4-category label
520
 
521
  # Tokenize and run inference
522
  inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
 
534
  no_logit = logits[no_token_id].item()
535
 
536
  # Get prediction
537
+ prediction = "yes" if yes_logit > no_logit else "no"
538
 
539
+ if prediction == "yes":
540
  predictions_yes += 1
541
  else:
542
  predictions_no += 1
543
 
544
+ if prediction == true_choice:
545
  correct += 1
546
+
547
+ # Update confusion matrix if we have original label
548
+ if original_label and original_label in confusion_matrix:
549
+ confusion_matrix[original_label][prediction] += 1
550
 
551
  accuracy = correct / len(indices)
552
  yes_ratio = predictions_yes / len(indices)
553
  no_ratio = predictions_no / len(indices)
554
 
555
+ # Calculate per-category accuracies
556
+ category_accuracies = {}
557
+ for category in confusion_matrix:
558
+ total = confusion_matrix[category]['yes'] + confusion_matrix[category]['no']
559
+ if total > 0:
560
+ if category in ['easy_positive', 'hard_positive']:
561
+ # For positive categories, correct prediction is 'yes'
562
+ category_accuracies[category] = confusion_matrix[category]['yes'] / total
563
+ else:
564
+ # For negative categories, correct prediction is 'no'
565
+ category_accuracies[category] = confusion_matrix[category]['no'] / total
566
+ else:
567
+ category_accuracies[category] = 0.0
568
+
569
  return {
570
  'accuracy': accuracy,
571
  'yes_ratio': yes_ratio,
572
  'no_ratio': no_ratio,
573
+ 'total_samples': len(indices),
574
+ 'confusion_matrix': confusion_matrix,
575
+ 'category_accuracies': category_accuracies
576
  }
577
 
578
  class StatusCallback(TrainerCallback):
 
599
  f"No={metrics['no_ratio']:.1%}"
600
  )
601
 
602
+ # Add confusion matrix info if available
603
+ if 'confusion_matrix' in metrics:
604
+ training_status["logs"].append("\n=== Confusion Matrix ===")
605
+ for category, preds in metrics['confusion_matrix'].items():
606
+ total = preds['yes'] + preds['no']
607
+ if total > 0:
608
+ acc = metrics['category_accuracies'][category]
609
+ training_status["logs"].append(
610
+ f"{category}: Yes={preds['yes']}, No={preds['no']} (Acc: {acc:.1%})"
611
+ )
612
+
613
  # Warn if model is biased
614
  if metrics['yes_ratio'] < 0.2 or metrics['no_ratio'] < 0.2:
615
  training_status["logs"].append(
 
645
 
646
  with training_lock:
647
  training_status["logs"].append(f"\n=== FINAL RESULTS ===")
648
+ training_status["logs"].append(f"Overall Accuracy: {final_metrics['accuracy']:.2%}")
649
  training_status["logs"].append(f"Yes predictions: {final_metrics['yes_ratio']:.1%}")
650
  training_status["logs"].append(f"No predictions: {final_metrics['no_ratio']:.1%}")
651
+
652
+ # Add final confusion matrix
653
+ if 'confusion_matrix' in final_metrics:
654
+ training_status["logs"].append("\n=== Final Confusion Matrix ===")
655
+ for category, preds in final_metrics['confusion_matrix'].items():
656
+ total = preds['yes'] + preds['no']
657
+ if total > 0:
658
+ acc = final_metrics['category_accuracies'][category]
659
+ training_status["logs"].append(
660
+ f"{category}: Yes={preds['yes']}, No={preds['no']} (Accuracy: {acc:.1%})"
661
+ )
662
 
663
  # Update global model reference
664
  current_model = dpo_trainer.model
 
667
  # Push to hub if token available
668
  if HF_TOKEN and HF_USERNAME:
669
  try:
670
+ # Generate unique repo name with timestamp
671
+ from datetime import datetime
672
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
673
+ model_short_name = current_model_id.split("/")[-1]
674
+
675
+ # Create descriptive repo name with training details
676
+ repo_name = f"{HF_USERNAME}/{model_short_name}-relevance-beta{int(training_args.beta)}-{timestamp}"
677
+
678
+ # Create model card with training information
679
+ model_card_content = f"""---
680
+ tags:
681
+ - document-relevance
682
+ - dpo
683
+ - {model_short_name}
684
+ datasets:
685
+ - custom-relevance-dataset
686
+ metrics:
687
+ - accuracy
688
+ model-index:
689
+ - name: {repo_name.split('/')[-1]}
690
+ results:
691
+ - task:
692
+ type: text-classification
693
+ name: Document Relevance Classification
694
+ metrics:
695
+ - type: accuracy
696
+ value: {final_metrics['accuracy']:.4f}
697
+ name: Validation Accuracy
698
+ - type: yes_ratio
699
+ value: {final_metrics['yes_ratio']:.4f}
700
+ name: Yes Prediction Ratio
701
+ - type: no_ratio
702
+ value: {final_metrics['no_ratio']:.4f}
703
+ name: No Prediction Ratio
704
+ ---
705
+
706
+ # {model_short_name} Document Relevance Classifier
707
+
708
+ This model was trained using DPO (Direct Preference Optimization) for document relevance classification.
709
+
710
+ ## Training Configuration
711
+ - Base Model: {current_model_id}
712
+ - DPO Beta: {training_args.beta}
713
+ - Learning Rate: {training_args.learning_rate}
714
+ - Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}
715
+ - Epochs: {training_args.num_train_epochs}
716
+ - Training Samples: {len(train_df)}
717
+ - Validation Samples: {len(val_df)}
718
+
719
+ ## Performance Metrics
720
+ - **Accuracy**: {final_metrics['accuracy']:.2%}
721
+ - **Yes Predictions**: {final_metrics['yes_ratio']:.1%}
722
+ - **No Predictions**: {final_metrics['no_ratio']:.1%}
723
+
724
+ ## Usage
725
+ ```python
726
+ from transformers import AutoTokenizer, AutoModelForCausalLM
727
+ from peft import PeftModel
728
+
729
+ # Load base model
730
+ model = AutoModelForCausalLM.from_pretrained("{current_model_id}")
731
+ tokenizer = AutoTokenizer.from_pretrained("{current_model_id}")
732
+
733
+ # Load adapter
734
+ model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1]}")
735
+ ```
736
+
737
+ ## Training Date
738
+ {datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')}
739
+ """
740
+
741
+ # Push model with model card
742
+ current_model.push_to_hub(
743
+ repo_name,
744
+ use_auth_token=HF_TOKEN,
745
+ commit_message=f"DPO training with beta={training_args.beta}, accuracy={final_metrics['accuracy']:.2%}"
746
+ )
747
  current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)
748
+
749
+ # Save model card
750
+ try:
751
+ from huggingface_hub import HfApi
752
+ api = HfApi()
753
+ api.upload_file(
754
+ path_or_fileobj=model_card_content.encode(),
755
+ path_in_repo="README.md",
756
+ repo_id=repo_name,
757
+ repo_type="model",
758
+ token=HF_TOKEN
759
+ )
760
+ except:
761
+ pass # Model card upload is optional
762
+
763
  logger.info(f"Model pushed to hub: {repo_name}")
764
+
765
+ # Save repo name to training status and trained models list
766
+ with training_lock:
767
+ training_status["logs"].append(f"Model saved to: https://huggingface.co/{repo_name}")
768
+ training_status["model_repo"] = repo_name
769
+
770
+ # Add to trained models list
771
+ trained_models.append({
772
+ "repo": repo_name,
773
+ "timestamp": timestamp,
774
+ "accuracy": final_metrics['accuracy'],
775
+ "yes_ratio": final_metrics['yes_ratio'],
776
+ "no_ratio": final_metrics['no_ratio'],
777
+ "beta": training_args.beta,
778
+ "model_id": current_model_id
779
+ })
780
+
781
  except Exception as e:
782
  logger.error(f"Failed to push to hub: {e}")
783
 
 
787
 
788
 
789
  def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
790
+ """Run training from CSV file with balanced sampling"""
791
  global train_df, test_df, training_status
792
 
793
  try:
 
799
  df = pd.read_csv(csv_path)
800
  logger.info(f"Loaded {len(df)} samples from {csv_path}")
801
 
802
+ # Check required columns for new format
803
+ new_format_cols = ['query_text', 'title', 'text', 'label']
804
+ old_format_cols = ['query', 'title', 'content', 'label']
805
 
806
+ if all(col in df.columns for col in new_format_cols):
807
+ # New format with 4 categories
808
+ logger.info("Using new CSV format with 4 categories")
809
+
810
+ # Validate labels
811
+ valid_labels = ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']
812
+ if not all(label in valid_labels for label in df['label'].unique()):
813
+ raise ValueError(f"Labels must be one of: {valid_labels}")
814
+
815
+ # Create prompt column
816
+ if 'prompt' not in df.columns:
817
+ logger.info("Creating prompt column from query_text/title/text")
818
+ df['prompt'] = df.apply(
819
+ lambda row: format_prompt(row['query_text'], row['title'], row['text']),
820
+ axis=1
821
+ )
822
+
823
+ # Log original distribution
824
+ logger.info(f"Original label distribution: {df['label'].value_counts().to_dict()}")
825
+
826
+ # Balance the dataset by sampling equal amounts from each category
827
+ min_category_count = df['label'].value_counts().min()
828
+ logger.info(f"Balancing dataset to {min_category_count} samples per category")
829
+
830
+ balanced_dfs = []
831
+ for label in valid_labels:
832
+ category_df = df[df['label'] == label]
833
+ if len(category_df) > min_category_count:
834
+ category_df = category_df.sample(n=min_category_count, random_state=42)
835
+ balanced_dfs.append(category_df)
836
+
837
+ df = pd.concat(balanced_dfs, ignore_index=True)
838
+ logger.info(f"Balanced dataset size: {len(df)}")
839
+ logger.info(f"Balanced label distribution: {df['label'].value_counts().to_dict()}")
840
+
841
+ elif all(col in df.columns for col in old_format_cols):
842
+ # Old format
843
+ logger.info("Using old CSV format")
844
  if 'prompt' not in df.columns:
 
845
  df['prompt'] = df.apply(
846
  lambda row: format_prompt(row['query'], row['title'], row['content']),
847
  axis=1
848
  )
849
+ # Validate labels for old format
850
+ if not all(label in ['yes', 'no'] for label in df['label'].unique()):
851
+ raise ValueError("Labels must be 'yes' or 'no' for old format")
852
+ else:
853
+ raise ValueError(f"CSV must have columns: {new_format_cols} or {old_format_cols}")
854
 
855
+ # Shuffle if requested
856
+ if shuffle_flag:
857
+ df = df.sample(frac=1).reset_index(drop=True)
858
 
859
  # Split data
860
  split_idx = int(len(df) * split_ratio)
 
868
  with training_lock:
869
  training_status["status"] = "training"
870
  training_status["logs"].append("Starting training...")
871
+ training_status["logs"].append(f"Train samples: {len(train_df)}")
872
+ training_status["logs"].append(f"Test samples: {len(test_df)}")
873
 
874
  # Conservative sample size for GPT-OSS-20B
875
  if current_model_id == "openai/gpt-oss-20b":
 
896
  global current_model, current_tokenizer
897
 
898
  # Validate inputs
899
+ if not query or not str(query).strip():
900
  return "Error: Query cannot be empty"
901
+ if not document_title or not str(document_title).strip():
902
  return "Error: Document title cannot be empty"
903
+ if not document_content or not str(document_content).strip():
904
  return "Error: Document content cannot be empty"
905
 
906
+ # Convert to strings to handle any data type
907
+ query = str(query)
908
+ document_title = str(document_title)
909
+ document_content = str(document_content)
 
910
 
911
  # Load model if needed
912
  if current_model is None:
 
981
 
982
  row = df.iloc[int(row_idx)]
983
 
984
+ # Get data from row - handle both old and new formats
985
+ if all(col in row for col in ['query_text', 'title', 'text']):
986
+ query = str(row['query_text'])
987
+ title = str(row['title'])
988
+ content = str(row['text'])
989
+ elif all(col in row for col in ['query', 'title', 'content']):
990
  query = str(row['query'])
991
  title = str(row['title'])
992
  content = str(row['content'])
 
1004
  # Extract just the prediction (yes/no) from the result
1005
  prediction = prediction_with_confidence.split()[0] # Gets "yes" or "no" from "yes (confidence: X%)"
1006
 
1007
+ # Handle 4-category labels
1008
+ if actual in ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']:
1009
+ # Map to yes/no for comparison
1010
+ label_mapping = {
1011
+ 'easy_positive': 'yes',
1012
+ 'hard_positive': 'yes',
1013
+ 'easy_negative': 'no',
1014
+ 'hard_negative': 'no'
1015
+ }
1016
+ mapped_actual = label_mapping[actual]
1017
+ is_correct = prediction == mapped_actual
1018
+ return f"Prediction: {prediction_with_confidence}\nActual: {actual} (mapped to: {mapped_actual})\nCorrect: {is_correct}"
1019
+ else:
1020
+ # Old format with yes/no
1021
+ is_correct = prediction == actual
1022
+ return f"Prediction: {prediction_with_confidence}\nActual: {actual}\nCorrect: {is_correct}"
1023
 
1024
 
1025
  def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None, inference_batch_size=16, progress=gr.Progress()):
 
1038
  df = pd.read_csv(csv_file.name)
1039
  logger.info(f"Loaded {len(df)} samples from CSV for inference")
1040
 
1041
+ # Check required columns - support both old and new formats
1042
+ new_format_cols = ['query_text', 'title', 'text']
1043
+ old_format_cols = ['query', 'title', 'content']
1044
+
1045
+ if all(col in df.columns for col in new_format_cols):
1046
+ # New format
1047
+ query_col = 'query_text'
1048
+ content_col = 'text'
1049
+ elif all(col in df.columns for col in old_format_cols):
1050
+ # Old format
1051
+ query_col = 'query'
1052
+ content_col = 'content'
1053
+ else:
1054
+ return None, f"CSV must have columns: {new_format_cols} or {old_format_cols}"
1055
 
1056
  # Load appropriate model
1057
  if model_choice == "base":
 
1132
  prompts = []
1133
  for _, row in batch_df.iterrows():
1134
  prompt = format_prompt(
1135
+ str(row[query_col]),
1136
  str(row['title']),
1137
+ str(row[content_col])
1138
  )
1139
  prompts.append(prompt)
1140
 
 
1188
 
1189
  # If labels exist, calculate accuracy and add is_right column
1190
  if 'label' in df.columns:
1191
+ # Handle 4-category labels
1192
+ if df['label'].iloc[0] in ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']:
1193
+ # Map 4 categories to yes/no for accuracy calculation
1194
+ label_mapping = {
1195
+ 'easy_positive': 'yes',
1196
+ 'hard_positive': 'yes',
1197
+ 'easy_negative': 'no',
1198
+ 'hard_negative': 'no'
1199
+ }
1200
+ df['mapped_label'] = df['label'].map(label_mapping)
1201
+ df['is_right'] = df['prediction'] == df['mapped_label']
1202
+
1203
+ # Calculate per-category accuracy
1204
+ accuracy_text = "\n=== Overall Results ==="
1205
+ accuracy = df['is_right'].mean()
1206
+ accuracy_text += f"\nOverall Accuracy: {accuracy:.2%} ({df['is_right'].sum()}/{len(df)} correct)"
1207
+
1208
+ accuracy_text += "\n\n=== Per-Category Results ==="
1209
+ for category in ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']:
1210
+ category_df = df[df['label'] == category]
1211
+ if len(category_df) > 0:
1212
+ category_acc = category_df['is_right'].mean()
1213
+ category_count = len(category_df)
1214
+ correct_count = category_df['is_right'].sum()
1215
+ accuracy_text += f"\n{category}: {category_acc:.2%} ({correct_count}/{category_count} correct)"
1216
+ else:
1217
+ # Old format with yes/no labels
1218
+ df['is_right'] = df['prediction'] == df['label']
1219
+ accuracy = df['is_right'].mean()
1220
+ accuracy_text = f"\nAccuracy: {accuracy:.2%} ({df['is_right'].sum()}/{len(df)} correct)"
1221
  else:
1222
  accuracy_text = ""
1223
 
 
1249
 
1250
  with gr.Tab("Training"):
1251
  csv_input = gr.File(label="Upload Training CSV", file_types=[".csv"])
1252
+ gr.Markdown("CSV should have columns: query_text, title, text, label (easy_positive/hard_positive/easy_negative/hard_negative)")
1253
  shuffle_flag = gr.Checkbox(label="Shuffle Dataset", value=True)
1254
  split_slider = gr.Slider(0.5, 0.9, value=0.8, step=0.05, label="Train Split %")
1255
  start_btn = gr.Button("Start Training", variant="primary")
 
1341
  file_types=[".csv"],
1342
  file_count="single"
1343
  )
1344
+ gr.Markdown("CSV must have columns: `query_text`, `title`, `text`. Optional: `label` (for accuracy calculation)")
1345
 
1346
  with gr.Row():
1347
  model_choice_radio = gr.Radio(
 
1387
  outputs=[csv_output_file, csv_results_text]
1388
  )
1389
 
1390
+ with gr.Tab("Trained Models"):
1391
+ gr.Markdown("### Model Training History")
1392
+ gr.Markdown("View all models trained in this session and their performance metrics")
1393
+
1394
+ models_display = gr.Markdown(get_trained_models_list())
1395
+ refresh_btn = gr.Button("Refresh Model List", variant="secondary")
1396
+
1397
+ # Update CSV inference dropdown with trained models
1398
+ model_selector = gr.Dropdown(
1399
+ label="Select Trained Model for Inference",
1400
+ choices=["Latest"] + [m["repo"] for m in trained_models],
1401
+ value="Latest",
1402
+ interactive=True
1403
+ )
1404
+
1405
+ def refresh_models_list():
1406
+ models_text = get_trained_models_list()
1407
+ choices = ["Latest", "amos1088/phi3-dpo-relevance"] + [m["repo"] for m in trained_models]
1408
+ return models_text, gr.update(choices=choices)
1409
+
1410
+ refresh_btn.click(
1411
+ refresh_models_list,
1412
+ outputs=[models_display, model_selector]
1413
+ )
1414
+
1415
+ # Auto-refresh after training
1416
+ training_status_display = gr.Textbox(visible=False)
1417
+ training_status_display.change(
1418
+ lambda: (get_trained_models_list(), gr.update(choices=["Latest"] + [m["repo"] for m in trained_models])),
1419
+ outputs=[models_display, model_selector]
1420
+ )
1421
+
1422
  with gr.Tab("Model Settings"):
1423
  gr.Markdown("### Model Selection")
1424
  gr.Markdown("Choose which model to use for training and inference")
common/ask_claude_clean.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Clean Claude API wrapper - Class-based, simple, no hidden errors
3
+ """
4
+
5
+ import anthropic
6
+ import json
7
+ import base64
8
+ from typing import Optional, Dict, List, Union, Any
9
+ from pathlib import Path
10
+
11
+ # Import config - adjust path for hackathon folder
12
+ import sys
13
+ sys.path.insert(0, str(Path(__file__).parent.parent))
14
+ try:
15
+ from common.global_config import anthropic_client
16
+ except ImportError:
17
+ # Fallback if config not available
18
+ anthropic_client = None
19
+
20
+
21
class ClaudeClient:
    """Thin wrapper around the Anthropic Messages API.

    The wrapper builds the request parameters (system prompt, tools, optional
    web-search tool, thinking mode), expands ``{"type": "file", "path": ...}``
    content items into text / document / image blocks, and folds the API
    response into a plain dict.
    """

    # Extensions that are read as UTF-8 and inlined as a text block.
    _TEXT_EXTS = frozenset(
        {'.txt', '.py', '.js', '.json', '.md', '.html', '.svg', '.xml', '.csv'}
    )

    # Extension -> media type. A lookup table is used instead of deriving the
    # type from the suffix, which would produce "image/jpg" for .jpg files and
    # would keep the suffix's original letter case (e.g. "image/PNG").
    _IMAGE_MEDIA_TYPES = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }

    # Optional web_search_config keys copied verbatim onto the web-search tool.
    _WEB_SEARCH_OPTIONS = ("max_uses", "allowed_domains", "blocked_domains", "user_location")

    def __init__(self, api_key: Optional[str] = None):
        """Create the wrapper.

        Args:
            api_key: When given, a dedicated ``anthropic.Anthropic`` client is
                built from it; otherwise the module-level ``anthropic_client``
                is reused.

        Raises:
            ValueError: If no ``api_key`` is passed and no module-level client
                is configured.
        """
        if api_key:
            self.client = anthropic.Anthropic(api_key=api_key)
        elif anthropic_client:
            self.client = anthropic_client
        else:
            raise ValueError(
                "No anthropic client available. Pass api_key or configure common.global_config."
            )

    async def ask(
        self,
        messages: List[Dict],
        model: str = "claude-sonnet-4-20250514",
        system: Optional[str] = None,
        max_tokens: int = 20000,
        temperature: float = 1.0,
        tools: Optional[List[Dict]] = None,
        tool_choice: Optional[Dict] = None,
        web_search_config: Optional[Dict] = None,
        thinking: Optional[Dict] = None,
        stream: bool = False,
    ) -> Dict[str, Any]:
        """Call the Claude API with ``messages`` and summarise the reply.

        The caller's ``messages`` and ``tools`` objects are never modified;
        the processed copy of the messages is returned under ``"messages"``.

        Args:
            messages: Conversation messages; list content may contain
                ``{"type": "file", "path": ...}`` items, which are expanded.
            model: Model name. Overridden with ``claude-opus-4-20250514`` when
                ``web_search_config`` is given.
            system: Optional system prompt.
            max_tokens: Response token limit.
            temperature: Sampling temperature; forced to 1.0 when ``thinking``
                is given.
            tools: Tool definitions. Entries with a ``name`` but no ``type``
                are converted to ``type="custom"`` tools.
            tool_choice: Passed through unchanged when truthy.
            web_search_config: When truthy, a ``web_search_20250305`` tool is
                appended, configured from this dict.
            thinking: Passed through unchanged when truthy.
            stream: Accepted for interface compatibility; not forwarded to
                the API by this method.

        Returns:
            Dict with keys ``response`` (text, or the raw response object when
            the reply contains tool use), ``raw``, ``messages``, ``usage``,
            ``model``, ``has_tool_use`` and ``tool_calls``.

        Raises:
            FileNotFoundError: If a referenced file does not exist.

        Note:
            The client call is made without ``await``; with the synchronous
            ``anthropic.Anthropic`` client it blocks until the reply arrives.
        """
        # _process_files returns fresh message dicts, so the edits below
        # cannot leak into the caller's conversation.
        messages = self._process_files(messages)

        # Handle empty user messages
        for msg in messages:
            if msg.get("role") == "user" and not msg.get("content"):
                msg["content"] = "Hello! How can I help you today?"

        params = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": messages,
        }

        if system:
            params["system"] = system

        # Copy so that appending the web-search tool does not grow the
        # caller's list on every call.
        all_tools = list(tools) if tools else []
        if web_search_config:
            web_tool = {
                "type": "web_search_20250305",
                "name": web_search_config.get("name", "web_search"),
            }
            for option in self._WEB_SEARCH_OPTIONS:
                if option in web_search_config:
                    web_tool[option] = web_search_config[option]
            all_tools.append(web_tool)
            # Web search requires specific model
            params["model"] = "claude-opus-4-20250514"

        if all_tools:
            formatted_tools = []
            for tool in all_tools:
                if "type" not in tool and "name" in tool:
                    # Convert from test format to API format
                    formatted_tools.append({
                        "type": "custom",
                        "name": tool["name"],
                        "description": tool.get("description", ""),
                        "input_schema": tool.get("input_schema", {}),
                    })
                else:
                    formatted_tools.append(tool)
            params["tools"] = formatted_tools

        if tool_choice:
            params["tool_choice"] = tool_choice

        if thinking:
            params["thinking"] = thinking
            params["temperature"] = 1.0  # Override for thinking

        # Web search and thinking go through the beta endpoint.
        if web_search_config or thinking:
            response = self.client.beta.messages.create(**params)
        else:
            response = self.client.messages.create(**params)

        tool_blocks = [
            block
            for block in getattr(response, "content", None) or []
            if getattr(block, "type", None) == "tool_use"
        ]
        has_tool_use = bool(tool_blocks)

        result = {
            "response": None,  # Set below
            "raw": response,  # Keep raw for compatibility
            "messages": messages,
            "usage": getattr(response, "usage", None),
            "model": getattr(response, "model", model),
            "has_tool_use": has_tool_use,
            "tool_calls": [
                {
                    "id": getattr(block, "id", None),
                    "name": block.name,
                    "input": block.input,
                }
                for block in tool_blocks
            ],
        }

        # For tool use the caller gets the full response object; otherwise
        # just the concatenated text.
        result["response"] = response if has_tool_use else self._extract_text(response)
        return result

    def _process_files(self, messages: List[Dict]) -> List[Dict]:
        """Return a copy of ``messages`` with file references expanded.

        Each message dict is shallow-copied; for messages whose content is a
        list, a new content list is built in which every
        ``{"type": "file", "path": ...}`` item is replaced by the block
        returned from ``_read_file``. The input list and its dicts are left
        untouched.
        """
        processed = []
        for msg in messages:
            msg = dict(msg)
            content = msg.get("content")
            if isinstance(content, list):
                msg["content"] = [
                    self._read_file(item["path"])
                    if isinstance(item, dict) and item.get("type") == "file"
                    else item
                    for item in content
                ]
            processed.append(msg)
        return processed

    def _read_file(self, file_path: str) -> Dict:
        """Read ``file_path`` and return it as an API content block.

        Known text extensions become a text block, ``.pdf`` a base64 document
        block, known image extensions a base64 image block; anything else is
        attempted as UTF-8 text.

        Raises:
            FileNotFoundError: If the path does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        suffix = path.suffix.lower()

        if suffix in self._TEXT_EXTS:
            content = path.read_text(encoding='utf-8')
            return {"type": "text", "text": f"File: {path.name}\n\n{content}"}

        if suffix == '.pdf':
            data = base64.b64encode(path.read_bytes()).decode()
            return {
                "type": "document",
                "source": {"type": "base64", "media_type": "application/pdf", "data": data},
            }

        media_type = self._IMAGE_MEDIA_TYPES.get(suffix)
        if media_type is not None:
            data = base64.b64encode(path.read_bytes()).decode()
            return {
                "type": "image",
                "source": {"type": "base64", "media_type": media_type, "data": data},
            }

        # Unknown extension: try as text
        content = path.read_text(encoding='utf-8')
        return {"type": "text", "text": f"File: {path.name}\n\n{content}"}

    def _extract_text(self, response) -> str:
        """Join the text of all ``text`` blocks in ``response`` with spaces.

        Falls back to ``str(response)`` when the object has no ``content``
        attribute.
        """
        if not hasattr(response, "content"):
            return str(response)

        parts = [
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ]
        return " ".join(parts).strip()
229
+
230
+
231
+
232
+
233
+
234
+ # Create default client lazily
235
+ _default_client = None
236
+
237
+ # Backwards compatible function for tests
238
async def ask_claude(messages: List[Dict], **kwargs) -> Any:
    """Function wrapper around ``ClaudeClient.ask`` for backwards compatibility.

    A single module-level ``ClaudeClient`` is created on first use: from the
    configured ``anthropic_client`` when there is one, otherwise from the
    ``ANTHROPIC_API_KEY`` environment variable.

    Args:
        messages: Conversation messages, forwarded to ``ClaudeClient.ask``.
        **kwargs: Forwarded unchanged to ``ClaudeClient.ask``.

    Returns:
        The result dict produced by ``ClaudeClient.ask``.

    Raises:
        ValueError: If neither a configured client nor the environment
            variable is available.
    """
    global _default_client
    if _default_client is None:
        if anthropic_client:
            _default_client = ClaudeClient()
        else:
            # Try to create with environment variable
            import os

            api_key = os.getenv('ANTHROPIC_API_KEY')
            if not api_key:
                raise ValueError(
                    "No anthropic client available. Set ANTHROPIC_API_KEY or configure common.global_config."
                )
            _default_client = ClaudeClient(api_key=api_key)

    return await _default_client.ask(messages, **kwargs)
common/ask_openai.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple OpenAI wrapper - no lies, just works
3
+ NOW WITH PROPER TOOL CALLING FLOW
4
+ AND VISION SUPPORT
5
+ """
6
+
7
+ import json
8
+ import base64
9
+ from openai import OpenAI
10
+ from typing import Dict, List, Any, Optional, Callable, Union
11
+ import os
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ sys.path.insert(0, str(Path(__file__).parent.parent))
16
+ from common.global_config import openai_client, secrets
17
+
18
+
19
class OpenAIClient:
    """Small wrapper around the OpenAI chat-completions endpoint with retries."""

    def __init__(self, api_key: Optional[str] = None):
        """Create the wrapper.

        Args:
            api_key: When given, a dedicated ``OpenAI`` client is built from
                it; otherwise the module-level ``openai_client`` is reused.

        Raises:
            ValueError: If no ``api_key`` is passed and no module-level client
                is configured.
        """
        if api_key:
            self.client = OpenAI(api_key=api_key)
        elif openai_client:
            self.client = openai_client
        else:
            raise ValueError("No OpenAI client. Pass api_key or set up global_config.")

    def ask(
        self,
        messages: List[Dict],
        model: str = "gpt-4o-mini",
        temperature: float = 0.7,
        max_tokens: int = 4000,
        response_format: Dict = None,
        tools: List[Dict] = None,
        tool_choice: Any = None,
        strict_schema: bool = False,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """Send ``messages`` to the chat-completions API and return a result dict.

        Args:
            messages: Conversation to send. The list is copied; repair turns
                added during JSON retries are not written back to it.
            model: Model name.
            temperature: Sampling temperature.
            max_tokens: Response token limit.
            response_format: Optional response-format dict, passed through.
            tools: Optional tool definitions, passed through.
            tool_choice: Optional tool choice, passed through.
            strict_schema: With ``response_format={"type": "json_object"}``,
                parse the reply as JSON and retry with a repair prompt when
                parsing fails.
            max_retries: Maximum number of API calls.

        Returns:
            On success, a dict with ``content`` (parsed JSON when
            ``strict_schema`` applies, else the raw text), ``model``,
            ``usage`` and, when present, ``tool_calls``. On failure, a dict
            with an ``error`` key instead of raising.
        """
        # Work on a private copy so JSON-repair turns appended during retries
        # do not pollute the caller's conversation.
        conversation = list(messages)

        params = {
            "model": model,
            "messages": conversation,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        if response_format:
            params["response_format"] = response_format
        if tools:
            params["tools"] = tools
        if tool_choice:
            params["tool_choice"] = tool_choice

        validate_json = bool(
            strict_schema and response_format and response_format.get("type") == "json_object"
        )

        attempt = 0
        last_error = None

        while attempt < max_retries:
            try:
                response = self.client.chat.completions.create(**params)
                message = response.choices[0].message
                content = message.content

                # A tool-call reply carries no text content; there is nothing
                # to validate, and parsing None would only burn the retries.
                is_pure_tool_call = content is None and bool(message.tool_calls)

                if validate_json and not is_pure_tool_call:
                    try:
                        content = json.loads(content)
                    except json.JSONDecodeError as e:
                        if attempt < max_retries - 1:
                            # Tell it what went wrong
                            conversation.append({"role": "assistant", "content": content})
                            conversation.append({
                                "role": "user",
                                "content": f"Invalid JSON. Error: {str(e)}. Please return valid JSON."
                            })
                            attempt += 1
                            continue
                        return {
                            "error": f"Invalid JSON after {max_retries} attempts",
                            "last_response": content,
                            "attempts": attempt + 1
                        }

                result = {
                    "content": content,
                    "model": response.model,
                    "usage": {
                        "prompt_tokens": response.usage.prompt_tokens,
                        "completion_tokens": response.usage.completion_tokens,
                        "total_tokens": response.usage.total_tokens
                    }
                }

                if message.tool_calls:
                    result["tool_calls"] = [
                        {
                            "id": tc.id,
                            "name": tc.function.name,
                            "arguments": json.loads(tc.function.arguments)
                        }
                        for tc in message.tool_calls
                    ]

                return result

            except Exception as e:
                # Best effort by design: any failure is recorded and retried,
                # and reported in the returned dict once retries run out.
                last_error = str(e)
                attempt += 1

        return {
            "error": f"Failed after {max_retries} attempts",
            "last_error": last_error,
            "attempts": max_retries
        }
123
+
124
+
125
+
126
+
127
+ # Simple function to use
128
def ask_gpt(messages, **kwargs):
    """Send ``messages`` through a freshly constructed ``OpenAIClient``.

    All keyword arguments are forwarded to ``OpenAIClient.ask``, whose result
    dict is returned unchanged.
    """
    return OpenAIClient().ask(messages, **kwargs)
132
+
133
+
134
+ # Helper functions for tool calling
135
def format_tool_message(tool_call_id: str, result: Any) -> Dict[str, Any]:
    """Build the ``role="tool"`` message that reports a tool's result.

    String results are used as-is; anything else is serialised with
    ``json.dumps``.
    """
    payload = result if isinstance(result, str) else json.dumps(result)
    return {
        "role": "tool",
        "content": payload,
        "tool_call_id": tool_call_id,
    }
142
+
143
+
144
def format_assistant_with_tools(content: str, tool_calls: List[Dict]) -> Dict[str, Any]:
    """Build an assistant message, optionally carrying tool calls.

    Args:
        content: Assistant text for the message.
        tool_calls: Dicts with ``id``, ``name`` and ``arguments`` keys (the
            shape produced by ``OpenAIClient.ask``). May be empty or ``None``.

    Returns:
        A ``role="assistant"`` message dict; a ``tool_calls`` entry is added
        only when ``tool_calls`` is non-empty.
    """
    msg = {
        "role": "assistant",
        "content": content
    }

    if tool_calls:
        msg["tool_calls"] = [
            {
                "id": tc['id'],
                "type": "function",
                "function": {
                    "name": tc['name'],
                    # Arguments must travel as a JSON string. Serialise every
                    # non-string value (not only dicts) so that list or scalar
                    # arguments are not passed through raw.
                    "arguments": tc['arguments']
                    if isinstance(tc['arguments'], str)
                    else json.dumps(tc['arguments'])
                }
            }
            for tc in tool_calls
        ]

    return msg
165
+
166
+
167
def ask_gpt_with_validation(messages, expected_format="json", max_retries=3, **kwargs):
    """
    Ask GPT with automatic validation and retry

    Args:
        messages: Messages to send (the list is copied, not modified)
        expected_format: "json" or "text"
        max_retries: Number of validation rounds; each round is one
            ``OpenAIClient.ask`` call, which applies its own ``max_retries``
            default unless overridden through ``kwargs``... see Note
        **kwargs: Other params for ``OpenAIClient.ask``

    Returns:
        The first result that passes validation: for "json" a result whose
        ``content`` is a dict, for "text" any result without an ``error`` key.
        When every round fails, a dict with ``error`` and ``attempts`` keys.

    Note:
        ``max_retries`` is consumed by this function and is not forwarded, so
        ``OpenAIClient.ask`` runs with its own default retry count.
    """
    client = OpenAIClient()
    conversation = messages.copy()

    # Force JSON format if expected
    if expected_format == "json" and "response_format" not in kwargs:
        kwargs["response_format"] = {"type": "json_object"}
        kwargs["strict_schema"] = True

    for attempt in range(max_retries):
        result = client.ask(conversation, **kwargs)
        rejected = result.get('last_response')

        if "error" not in result:
            if expected_format == "text":
                return result
            if expected_format == "json":
                content = result.get('content')
                # When the caller supplied response_format without
                # strict_schema, the client returns the raw JSON text; parse
                # it here, otherwise the dict check below could never pass.
                if isinstance(content, str):
                    try:
                        content = json.loads(content)
                    except json.JSONDecodeError:
                        pass
                if isinstance(content, dict):
                    result['content'] = content
                    return result
                rejected = result.get('content')

        # Validation failed - retry with repair
        if attempt < max_retries - 1:
            # Show the model its rejected reply so the repair prompt has
            # something to refer to.
            if rejected:
                conversation.append({
                    "role": "assistant",
                    "content": str(rejected)
                })

            if expected_format == "json":
                repair = "Your response was not valid JSON. Please return a properly formatted JSON object."
            else:
                repair = "Please try again with a clear response."

            conversation.append({
                "role": "user",
                "content": repair
            })

    # All retries failed
    return {
        "error": f"Failed validation after {max_retries} attempts",
        "attempts": max_retries
    }
220
+
221
+
222
def encode_image_to_base64(image_path: Union[str, Path]) -> str:
    """Return the file's bytes as a base64 string for the vision API.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    target = Path(image_path)
    if not target.exists():
        raise FileNotFoundError(f"Image not found: {target}")

    raw = target.read_bytes()
    return base64.b64encode(raw).decode('utf-8')
231
+
232
+
233
def ask_gpt_with_image(
    text_prompt: str,
    image_path: Union[str, Path],
    model: str = "gpt-4.1-mini",
    system_prompt: str = None,
    detail: str = "high",
    max_tokens: int = 4000,
    temperature: float = 0.7,
    response_format: Dict = None,
    **kwargs  # Can include run_id, agent_name, etc.
) -> Dict[str, Any]:
    """
    Send a text prompt together with one image to GPT.

    Args:
        text_prompt: The question/instruction about the image
        image_path: Path to image file
        model: Model to use (default: gpt-4.1-mini)
        system_prompt: Optional system message, sent first when given
        detail: Image detail level passed to the API (default: high)
        max_tokens: Max response tokens
        temperature: Sampling temperature
        response_format: OpenAI response format (e.g. {"type": "json_object"})
        **kwargs: Forwarded to ``ask_gpt``, except ``run_id``, ``agent_name``
            and ``tool_name``, which are dropped

    Returns:
        The result dict returned by ``ask_gpt``

    Raises:
        FileNotFoundError: If the image file does not exist
    """
    # Encode first so a missing file fails before any request is assembled.
    encoded = encode_image_to_base64(image_path)

    # Extension -> MIME type; unknown extensions are sent as JPEG.
    known_types = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    extension = Path(image_path).suffix.lower()
    mime_type = known_types.get(extension, 'image/jpeg')

    text_part = {"type": "text", "text": text_prompt}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{mime_type};base64,{encoded}",
            "detail": detail
        }
    }
    user_message = {"role": "user", "content": [text_part, image_part]}

    messages = (
        [{"role": "system", "content": system_prompt}, user_message]
        if system_prompt
        else [user_message]
    )

    # Bookkeeping params that must not reach the OpenAI call.
    context_only = ['run_id', 'agent_name', 'tool_name']

    api_kwargs = {
        'model': model,
        'max_tokens': max_tokens,
        'temperature': temperature
    }
    if response_format:
        api_kwargs['response_format'] = response_format
    api_kwargs.update(
        {key: value for key, value in kwargs.items() if key not in context_only}
    )

    return ask_gpt(messages=messages, **api_kwargs)
320
+
321
+
322
def parse_json_response(response: Union[str, Dict]) -> Dict[str, Any]:
    """
    Parse JSON from API response, handling markdown formatting

    Args:
        response: Either raw string or dict with 'content' key

    Returns:
        Parsed JSON object or error dict. A dict that already carries an
        'error' key is returned unchanged, and 'content' that is already a
        parsed dict or list is returned as-is.
    """
    if isinstance(response, dict):
        # Check if we have an error response
        if 'error' in response:
            return response  # Return error as-is
        content = response.get('content', '')
    else:
        content = response

    # Content may already be parsed (OpenAIClient.ask does this when
    # strict_schema is on); calling .strip() on it would raise.
    if isinstance(content, (dict, list)):
        return content

    # Check for empty content
    if not isinstance(content, str) or not content.strip():
        return {
            "error": "Empty response from API",
            "raw": ""
        }

    # Clean markdown formatting if present
    stripped = content.strip()
    if stripped.startswith('```'):
        lines = stripped.split('\n')[1:]  # drop the opening fence line
        # Guard against a lone fence, which leaves no lines to inspect.
        if lines and lines[-1].startswith('```'):
            lines = lines[:-1]
        content = '\n'.join(lines)

    # Parse JSON
    try:
        return json.loads(content)
    except json.JSONDecodeError as e:
        return {
            "error": f"JSON parse error: {str(e)}",
            "raw": content[:500] + "..." if len(content) > 500 else content
        }
365
+
366
+
367
def validate_against_schema(data: Dict, schema: Dict) -> Dict[str, Any]:
    """
    Check that ``data`` contains every key named in ``schema``.

    Only dict-valued parts of the schema are inspected: each of their keys
    must exist in the matching part of ``data``, recursively. Non-dict schema
    values are not checked.

    Args:
        data: Parsed data to validate
        schema: Expected schema structure

    Returns:
        ``{"valid": True, "data": data}`` when nothing is missing, otherwise
        ``{"valid": False, "missing": [...], "data": data}`` where the list
        holds dotted key paths and "<path> should be dict" entries.
    """
    def walk(expected, actual, prefix):
        # Yields problems in depth-first order, schema key order.
        if not isinstance(expected, dict):
            return
        if not isinstance(actual, dict):
            yield f"{prefix} should be dict"
            return
        for name, sub_schema in expected.items():
            dotted = f"{prefix}.{name}" if prefix else name
            if name in actual:
                yield from walk(sub_schema, actual[name], dotted)
            else:
                yield dotted

    problems = list(walk(schema, data, ""))

    if not problems:
        return {
            "valid": True,
            "data": data
        }

    return {
        "valid": False,
        "missing": problems,
        "data": data
    }
common/global_config.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import anthropic
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+ from google.cloud import bigquery
6
+ from openai import OpenAI
7
+
8
# Base directories
BASE_DIR = Path(__file__).parent
PROJECT_ROOT = BASE_DIR.parent  # dual_agent_system directory

# Load secrets and scrapers config
secret_path = BASE_DIR / "secrets.json"
scrapers_path = BASE_DIR / "scrapers.json"

# Context managers close the files deterministically instead of leaving the
# handles to the garbage collector.
with open(secret_path, encoding="utf-8") as _secrets_file:
    secrets = json.load(_secrets_file)
with open(scrapers_path, encoding="utf-8") as _scrapers_file:
    scrapers = json.load(_scrapers_file)

# API Clients (constructed at import time from the loaded secrets)
anthropic_client = anthropic.Anthropic(api_key=secrets["anthropic_api_key"])
bqclient = bigquery.Client.from_service_account_info(secrets["AI_TWIN_CREDENTIALS_GCP"])
openai_client = OpenAI(api_key=secrets["openai_api_key"])

# Paths
PATHS = {
    "logs": PROJECT_ROOT / "logs",
    "tests": PROJECT_ROOT / "tests",
    "files": PROJECT_ROOT / "files",
    "raw_files": PROJECT_ROOT / "files" / "raw_files_file_handler",
    "tables": PROJECT_ROOT / "files" / "tables",
    "vector_stores": PROJECT_ROOT / "files" / "vector_stores"
}

# Ensure directories exist. parents=True keeps this correct even if the
# nested entries are ever listed before their parent directory.
for path in PATHS.values():
    path.mkdir(parents=True, exist_ok=True)

# API Tokens (missing entries resolve to None)
TOKENS = {
    "apify": secrets.get("apify"),
    "anthropic": secrets.get("anthropic_api_key"),
    "slack": secrets.get("slack"),
    "openai": secrets.get("openai_api_key"),
    "perplexity": secrets.get("perplexity")
}

# Scraper endpoints (every listed scraper must be present in scrapers.json,
# otherwise importing this module raises KeyError)
SCRAPER_ENDPOINTS = {
    "youtube_search": scrapers["youtube"]["youtube_scraper"]["endpoint_url"],
    "youtube_transcript": scrapers["youtube"]["youtube_transcript_scraper"]["endpoint_url"],
    "fast_youtube_channel_scraper": scrapers["youtube"]["fast_youtube_channel_scraper"]["endpoint_url"],
    "linkedin_post_search_scraper": scrapers["linkedin"]["linkedin_post_search_scraper"]["endpoint_url"],
    "linkedin_everything_scraper": scrapers["linkedin"]["linkedin_everything_scraper"]["endpoint_url"],
    "tiktok_search_scraper": scrapers["tiktok"]["tiktok_search_scraper"]["endpoint_url"],
    "twitter_search": scrapers["twitter"]["twitter_search_scraper"]["endpoint_url"]
}
57
+
58
+
common/trash.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ xample_data=[
2
+ {
3
+ "role": "user",
4
+ "content": [
5
+ {
6
+ "widget_type": "text",
7
+ "content": "I want to brainstorm an AI-driven personal finance coach. Show me search interest trends over the last six months first.",
8
+ "is_expanded": True
9
+ }
10
+ ]
11
+ },
12
+ {
13
+ "role": "assistant",
14
+ "content": [
15
+ {
16
+ "widget_type": "text",
17
+ "content": "That’s a fantastic idea! Understanding market interest is key. Here’s the trend:",
18
+ "is_expanded": True
19
+ },
20
+ {
21
+ "widget_type": "line-chart",
22
+ "content": {
23
+ "title": "Search Interest: AI Finance Coaches",
24
+ "data": {
25
+ "labels": ["Nov 2024", "Dec 2024", "Jan 2025", "Feb 2025", "Mar 2025", "Apr 2025"],
26
+ "datasets": [
27
+ { "label": "Search Volume", "data": [80, 95, 110, 130, 150, 175] }
28
+ ]
29
+ },
30
+ "options": {
31
+ "fill": False,
32
+ "tension": 0.2,
33
+ "showPoints": True
34
+ }
35
+ },
36
+ "is_expanded": False
37
+ }
38
+ ]
39
+ },
40
+ {
41
+ "role": "user",
42
+ "content": [
43
+ {
44
+ "widget_type": "text",
45
+ "content": "Trend looks promising. Now, can we compare feature popularity for Budget Tracking, Investment Advice, and Spending Alerts?",
46
+ "is_expanded": True
47
+ }
48
+ ]
49
+ },
50
+ {
51
+ "role": "assistant",
52
+ "content": [
53
+ {
54
+ "widget_type": "text",
55
+ "content": "Sure—you can see how those features scored in our survey:",
56
+ "is_expanded": True
57
+ },
58
+ {
59
+ "widget_type": "bar-chart",
60
+ "content": {
61
+ "title": "Feature Popularity Scores",
62
+ "data": {
63
+ "labels": ["Budget Tracking", "Investment Advice", "Spending Alerts"],
64
+ "datasets": [
65
+ { "label": "Survey Score", "data": [90, 75, 85] }
66
+ ]
67
+ },
68
+ "options": {
69
+ "horizontal": False,
70
+ "stacked": False
71
+ }
72
+ },
73
+ "is_expanded": False
74
+ },
75
+ {
76
+ "widget_type": "text",
77
+ "content": "Budget Tracking leads the pack, which makes sense given demand for cashflow insights.",
78
+ "is_expanded": True
79
+ }
80
+ ]
81
+ },
82
+ {
83
+ "role": "user",
84
+ "content": [
85
+ {
86
+ "widget_type": "text",
87
+ "content": "Great insight. Let's brainstorm some MVP feature ideas based on those results.",
88
+ "is_expanded": True
89
+ }
90
+ ]
91
+ },
92
+ {
93
+ "role": "assistant",
94
+ "content": [
95
+ {
96
+ "widget_type": "text",
97
+ "content": "Here are some potential MVP features to focus on:",
98
+ "is_expanded": True
99
+ },
100
+ {
101
+ "widget_type": "thinking",
102
+ "content": {
103
+ "title": "Brainstormed MVP Features",
104
+ "steps": [
105
+ { "description": "Real-time budget categorization" },
106
+ { "description": "AI-driven savings recommendations" },
107
+ { "description": "Voice-activated expense logging" }
108
+ ],
109
+ "conclusion": "Three strong MVP ideas identified.",
110
+ "options": {
111
+ "expandable": True,
112
+ "showStepNumbers": True
113
+ }
114
+ },
115
+ "is_expanded": False
116
+ }
117
+ ]
118
+ },
119
+ {
120
+ "role": "user",
121
+ "content": [
122
+ {
123
+ "widget_type": "text",
124
+ "content": "I'd like to prototype real-time budget categorization and voice-activated expense logging first.",
125
+ "is_expanded": True
126
+ }
127
+ ]
128
+ },
129
+ {
130
+ "role": "assistant",
131
+ "content": [
132
+ {
133
+ "widget_type": "text",
134
+ "content": "Perfect. Here’s a timeline for prototyping those two features:",
135
+ "is_expanded": True
136
+ },
137
+ {
138
+ "widget_type": "timeline",
139
+ "content": {
140
+ "title": "Prototyping Timeline",
141
+ "events": [
142
+ { "date": "2025-05-01", "description": "Real-time Budgeting Prototype" },
143
+ { "date": "2025-05-15", "description": "Voice Logging Prototype" }
144
+ ],
145
+ "options": {
146
+ "showDates": True,
147
+ "orientation": "vertical"
148
+ }
149
+ },
150
+ "is_expanded": False
151
+ }
152
+ ]
153
+ },
154
+ {
155
+ "role": "user",
156
+ "content": [
157
+ {
158
+ "widget_type": "text",
159
+ "content": "That timeline works for me. What launch approach should we take—internal pilot or closed beta?",
160
+ "is_expanded": True
161
+ }
162
+ ]
163
+ },
164
+ {
165
+ "role": "assistant",
166
+ "content": [
167
+ {
168
+ "widget_type": "text",
169
+ "content": "Industry benchmarks suggest many opt for a closed beta. Here’s the typical split:",
170
+ "is_expanded": True
171
+ },
172
+ {
173
+ "widget_type": "pie-chart",
174
+ "content": {
175
+ "title": "Launch Strategy Preference",
176
+ "data": {
177
+ "labels": ["Internal Pilot", "Closed Beta"],
178
+ "datasets": [
179
+ {
180
+ "data": [40, 60],
181
+ "backgroundColor": ["#36A2EB", "#FF6384"]
182
+ }
183
+ ]
184
+ },
185
+ "options": {
186
+ "doughnut": False,
187
+ "showPercentage": True
188
+ }
189
+ },
190
+ "is_expanded": False
191
+ },
192
+ {
193
+ "widget_type": "suggestion",
194
+ "content": {
195
+ "title": "Select Launch Approach",
196
+ "suggestions": [
197
+ { "label": "Internal Pilot" },
198
+ { "label": "Closed Beta" }
199
+ ],
200
+ "options": {
201
+ "multiSelect": False,
202
+ "showButtons": True
203
+ }
204
+ },
205
+ "is_expanded": True
206
+ }
207
+ ]
208
+ },
209
+ {
210
+ "role": "user",
211
+ "content": [
212
+ {
213
+ "widget_type": "text",
214
+ "content": "Let's go with a closed beta group.",
215
+ "is_expanded": True
216
+ }
217
+ ]
218
+ },
219
+ {
220
+ "role": "assistant",
221
+ "content": [
222
+ {
223
+ "widget_type": "text",
224
+ "content": "Excellent choice! Let’s outline the next steps for a successful closed beta:",
225
+ "is_expanded": True
226
+ },
227
+ {
228
+ "widget_type": "suggestion",
229
+ "content": {
230
+ "title": "Next Steps",
231
+ "suggestions": [
232
+ { "label": "Recruit Beta Users" },
233
+ { "label": "Draft NDAs" },
234
+ { "label": "Set Up Feedback Channels" },
235
+ { "label": "Monitor Key Metrics" }
236
+ ],
237
+ "options": {
238
+ "multiSelect": True,
239
+ "showButtons": True
240
+ }
241
+ },
242
+ "is_expanded": True
243
+ }
244
+ ]
245
+ }
246
+ ]
247
+ #load data
248
+ data_path= "widget-examples.json"
download_beir_datasets.py DELETED
@@ -1,287 +0,0 @@
1
- import os
2
- import pandas as pd
3
- import json
4
- import random
5
- from sklearn.model_selection import train_test_split
6
-
7
- from beir import util
8
- from beir.datasets.data_loader import GenericDataLoader
9
- from app import format_prompt
10
# Catalogue of BEIR datasets known to this script: name -> corpus size and domain.
# Insertion order is kept (smaller corpora first, then medium, then large) because
# the entries are listed to the user in iteration order.
BEIR_DATASETS = {
    name: {"size": size, "domain": domain}
    for name, size, domain in (
        # Smaller datasets (good for testing)
        ("scifact", "5K docs", "Scientific"),
        ("nfcorpus", "3.6K docs", "Medical"),
        ("fiqa", "57K docs", "Financial QA"),
        # Medium datasets
        ("scidocs", "25K docs", "Scientific"),
        ("arguana", "8.7K docs", "Argument mining"),
        ("quora", "523K docs", "Duplicate questions"),
        # Large datasets
        ("msmarco", "8.8M docs", "Web search"),
        ("trec-covid", "171K docs", "COVID research"),
        ("hotpotqa", "5.2M docs", "Multi-hop QA"),
    )
}
27
-
28
def download_beir_dataset(dataset_name, data_path="datasets"):
    """Download a BEIR dataset archive and load its test split.

    Args:
        dataset_name: Dataset identifier; it is interpolated into the download
            URL and used as the sub-directory name.
        data_path: Directory under which the per-dataset folder is created.

    Returns:
        The ``(corpus, queries, qrels)`` tuple produced by
        ``GenericDataLoader(...).load(split="test")``.
    """
    print(f"\nDownloading {dataset_name}...")
    # The catalogue is informational only: a dataset that is not listed there
    # must not abort the download with a KeyError.
    print(f"Info: {BEIR_DATASETS.get(dataset_name, 'not listed in BEIR_DATASETS')}")

    # Download dataset
    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
    out_dir = os.path.join(data_path, dataset_name)

    # Keep the extracted location in its own name instead of rebinding the
    # ``data_path`` argument.
    extracted_dir = util.download_and_unzip(url, out_dir)
    print(f"Downloaded to: {extracted_dir}")

    # Load the data
    corpus, queries, qrels = GenericDataLoader(extracted_dir).load(split="test")

    print(f"Loaded:")
    print(f" - Corpus: {len(corpus)} documents")
    print(f" - Queries: {len(queries)} queries")
    print(f" - Qrels: {len(qrels)} query-doc relevance pairs")

    # Average number of judged-relevant (score > 0) documents per query.
    docs_per_query = [
        sum(1 for score in judged.values() if score > 0)
        for judged in qrels.values()
    ]
    avg_docs = sum(docs_per_query) / len(docs_per_query) if docs_per_query else 0
    print(f" - Avg relevant docs per query: {avg_docs:.1f}")

    return corpus, queries, qrels
63
-
64
def create_retrieval_training_data(corpus, queries, qrels, num_samples=1000):
    """Create labelled query/document examples for a retrieval model.

    For each of the first ``num_samples`` queries that has relevance
    judgements, up to five judged-relevant documents (score > 0) become
    ``"relevant"`` examples and up to five randomly drawn other documents
    become ``"not_relevant"`` examples.

    Args:
        corpus: Mapping of document id to a dict with ``"text"`` and an
            optional ``"title"``.
        queries: Mapping of query id to query text.
        qrels: Mapping of query id to ``{doc_id: score}``.
        num_samples: Number of queries (not examples) to take from the start
            of ``queries``.

    Returns:
        A list of example dicts; document text is truncated to 1000 characters.
    """
    print(f"\nCreating {num_samples} training examples...")

    def _example(query_text, doc_id, score, label):
        # Single place that defines the example schema for both labels.
        doc = corpus[doc_id]
        return {
            "query": query_text,
            "document_id": doc_id,
            "document": doc["text"][:1000],  # Truncate
            "title": doc.get("title", ""),
            "relevance_score": score,
            "label": label,
            "task": "retrieve_all_relevant",
        }

    training_data = []
    # Built once: the corpus can be large and does not change between queries.
    all_doc_ids = list(corpus)

    for qid, query_text in list(queries.items())[:num_samples]:
        if qid not in qrels:
            continue

        judged = qrels[qid]
        relevant_docs = [doc_id for doc_id, score in judged.items() if score > 0]
        relevant_set = set(relevant_docs)  # O(1) membership while sampling

        # Sample some non-relevant documents. Draws happen with replacement,
        # so repeats are dropped to avoid duplicate training rows.
        non_relevant_docs = []
        seen = set()
        if all_doc_ids:  # random.choice raises on an empty sequence
            for _ in range(min(10, len(relevant_docs) * 2)):
                doc_id = random.choice(all_doc_ids)
                if doc_id in relevant_set or doc_id in seen:
                    continue
                seen.add(doc_id)
                non_relevant_docs.append(doc_id)

        # Create positive examples (limit to 5 per query); judged documents
        # that are missing from the corpus are skipped.
        for doc_id in relevant_docs[:5]:
            if doc_id in corpus:
                training_data.append(
                    _example(query_text, doc_id, judged[doc_id], "relevant")
                )

        # Create negative examples (sampled ids always come from the corpus).
        for doc_id in non_relevant_docs[:5]:
            training_data.append(_example(query_text, doc_id, 0, "not_relevant"))

    return training_data
122
-
123
def show_examples(corpus, queries, qrels, n=3):
    """Print the first ``n`` queries, each with up to three of its best-rated documents.

    Only documents with a score above zero count as relevant; they are listed
    in descending score order and skipped when absent from ``corpus``.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("EXAMPLES OF RETRIEVAL TASK")
    print(banner)

    first_queries = list(queries.items())[:n]
    for position, (qid, query_text) in enumerate(first_queries, start=1):
        print(f"\n📌 Query {position}: {query_text}")

        if qid not in qrels:
            continue

        # Stable sort, so equally scored documents keep their qrels order.
        positives = sorted(
            ((doc_id, score) for doc_id, score in qrels[qid].items() if score > 0),
            key=lambda pair: pair[1],
            reverse=True,
        )
        print(f" Found {len(positives)} relevant documents:")

        for doc_id, score in positives[:3]:
            if doc_id not in corpus:
                continue
            doc = corpus[doc_id]
            print(f"\n ✓ Doc ID: {doc_id} (Relevance: {score})")
            print(f" Title: {doc.get('title', 'No title')}")
            print(f" Text: {doc['text'][:200]}...")
144
-
145
def main(dataset_name):
    """Download one BEIR dataset, print examples and write a JSONL training file.

    The training file is named ``beir_<dataset_name>_training.jsonl`` and is
    written to the current directory. Any exception raised along the way is
    reported on stdout together with an installation hint; it is not re-raised.
    """
    rule = "=" * 80
    print(rule)
    print("BEIR - Real Retrieval Ranking Datasets")
    print("Task: Given query, find ALL relevant docs from thousands!")
    print(rule)

    try:
        # Download
        corpus, queries, qrels = download_beir_dataset(dataset_name)

        # Show examples
        show_examples(corpus, queries, qrels)

        # Create training data
        training_data = create_retrieval_training_data(corpus, queries, qrels, num_samples=100)

        # Save training data, one JSON object per line
        output_file = f"beir_{dataset_name}_training.jsonl"
        with open(output_file, 'w') as f:
            f.writelines(json.dumps(example) + '\n' for example in training_data)

        print(f"\n✅ Saved {len(training_data)} training examples to {output_file}")

        # Explain the task
        print("\n" + rule)
        print("THE RETRIEVAL TASK:")
        print(rule)
        for line in (
            "1. You have a QUERY (e.g., 'coronavirus treatment options')",
            "2. You have a CORPUS of thousands of documents",
            "3. You must find ALL relevant documents (not just one!)",
            "4. Some queries have 1 relevant doc, others have 100+",
            "5. You must rank them by relevance",
            "\nThis is MUCH harder than binary classification!",
        ):
            print(line)

    except Exception as e:
        print(f"\nError: {e}")
        print("\nInstall BEIR first:")
        print("pip install beir")
186
-
187
if __name__ == "__main__":
    # Choose dataset
    print("\nAvailable datasets:")
    for dataset_name, info in BEIR_DATASETS.items():
        print(f" - {dataset_name}: {info['size']} ({info['domain']})")

    # Smaller datasets intended for download via main(); the download loop is
    # currently disabled and only already-downloaded datasets are processed.
    selected_datasets = ["scifact", "nfcorpus", "fiqa"]  # Smaller datasets

    def _read_jsonl(path):
        """Load a JSON-lines file into a DataFrame, closing the file afterwards."""
        with open(path, "r", encoding="utf-8") as fh:
            return pd.DataFrame([json.loads(line) for line in fh])

    # Process downloaded datasets
    if os.path.exists('datasets'):
        datasets = os.listdir('datasets')
        all_rows = []

        for dataset_name in datasets:
            print(f"\nProcessing {dataset_name}...")
            dataset_path = f"datasets/{dataset_name}/{dataset_name}"

            # Check if required files exist
            if not all(os.path.exists(f"{dataset_path}/{f}") for f in ["corpus.jsonl", "queries.jsonl", "qrels"]):
                print(f"Skipping {dataset_name} - missing required files")
                continue

            # Load corpus and queries
            print(f"Loading corpus for {dataset_name}...")
            corpus = _read_jsonl(f"{dataset_path}/corpus.jsonl")
            queries = _read_jsonl(f"{dataset_path}/queries.jsonl")

            # Process qrels files (files prefixed with 'my_' are ignored)
            qrels_files = [f for f in os.listdir(f"{dataset_path}/qrels") if f.endswith('.tsv') and not f.startswith('my_')]
            if not qrels_files:
                # pd.concat raises on an empty list
                print(f"Skipping {dataset_name} - no qrels files")
                continue

            qrels_df = pd.concat([pd.read_csv(f"{dataset_path}/qrels/{qrel_file}", sep="\t") for qrel_file in qrels_files])

            # Ids are compared as strings on both sides of each merge.
            corpus['_id'] = corpus['_id'].astype(str)
            queries['_id'] = queries['_id'].astype(str)
            qrels_df['corpus-id'] = qrels_df['corpus-id'].astype(str)
            qrels_df['query-id'] = qrels_df['query-id'].astype(str)

            qrels_df = pd.merge(left=qrels_df, right=corpus, left_on='corpus-id', right_on='_id', suffixes=('', '_doc'))
            qrels_df = pd.merge(left=qrels_df, right=queries, left_on='query-id', right_on='_id', suffixes=('', '_query'))

            # Every judged-relevant (query, document) pair. Used below so that a
            # randomly paired "negative" is never a pair the qrels mark as relevant.
            judged_relevant = qrels_df[qrels_df['score'] > 0]
            positive_pairs = set(zip(judged_relevant['query-id'], judged_relevant['corpus-id']))

            # Only the strongest judgements (score == 2) are used as positives.
            qrels_df = qrels_df[qrels_df['score'] == 2]
            if qrels_df.empty:
                continue

            # Two independent shuffles: row1 provides the positive pair, row2
            # provides an unrelated query to pair with row1's document.
            qrels_df1 = qrels_df.sample(frac=1).reset_index(drop=True)[0:5000]
            qrels_df2 = qrels_df.sample(frac=1).reset_index(drop=True)[0:5000]
            for (_, row1), (_, row2) in zip(qrels_df1.iterrows(), qrels_df2.iterrows()):
                title = row1['title']
                content = row1['text']

                # Positive example: the document with its own query.
                query_text = row1['text_query']
                all_rows.append({
                    'query': query_text,
                    'title': title,
                    'content': content,
                    'prompt': format_prompt(query_text, title, content),
                    'label': 'yes',
                    'dataset': dataset_name,
                    'score': row1['score'],
                    'query-id': row1['query-id'],
                    'doc_id': row1['corpus-id']
                })

                # Negative example: the same document with row2's query, unless
                # that pair is actually judged relevant (it would be mislabelled).
                if (row2['query-id'], row1['corpus-id']) in positive_pairs:
                    continue
                query_text = row2['text_query']
                all_rows.append({
                    'query': query_text,
                    'title': title,
                    'content': content,
                    'prompt': format_prompt(query_text, title, content),
                    'label': 'no',
                    'dataset': dataset_name,
                    'score': 0,
                    'query-id': row2['query-id'],
                    'doc_id': row1['corpus-id']
                })

        # Create DataFrame
        df = pd.DataFrame(all_rows)
        print(f"\nTotal examples collected: {len(df)}")
        if df.empty:
            # Without rows there are no 'label'/'dataset' columns to summarise.
            print("No examples collected - nothing to save")
        else:
            print(f"Label distribution: {df['label'].value_counts().to_dict()}")
            print(f"Dataset distribution: {df['dataset'].value_counts().to_dict()}")

            # NOTE: no explicit balancing is done; skipped false negatives can
            # leave slightly fewer 'no' rows than 'yes' rows.
            # Save splits
            df.to_csv('train_beir.csv', index=False)
            # sample() raises when asked for more rows than exist
            df.sample(min(50, len(df))).to_csv('sampled_beir.csv')
286
-
287
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluate.py DELETED
@@ -1,273 +0,0 @@
1
- """
2
- Evaluation Script - Compare Base vs Fine-tuned Phi-4-mini
3
- Optionally use GPT-4o as a judge
4
- """
5
-
6
- import os
7
- import json
8
- import torch
9
- from transformers import AutoTokenizer, AutoModelForCausalLM
10
- from peft import PeftModel
11
- import logging
12
- from datetime import datetime
13
- from typing import Dict, List
14
-
15
- logging.basicConfig(level=logging.INFO)
16
- logger = logging.getLogger(__name__)
17
-
18
- MODEL_ID = "microsoft/Phi-3-mini-4k-instruct" # Using stable Phi-3
19
- FINETUNED_DIR = "./phi4-lora-finetuned"
20
-
21
def format_prompt(query, content):
    """Build the content-analysis prompt for one query/content pair.

    The prompt asks for a JSON answer with ``relevant``, ``key_sentences``,
    ``tags`` and ``expansion`` fields and ends with ``Response:`` so the model
    continues from there.
    """
    lines = [
        "You are an AI content analyst.",
        "",
        "Task:",
        "1. Given the following content and a user query, decide if the content is relevant.",
        "2. If it is relevant:",
        "- Extract the top 2-3 key sentences",
        "- Suggest 3-5 relevant tags",
        "- Provide a short explanation or content extension (~2-3 sentences)",
        "",
        "Format your response in JSON with:",
        "{",
        '"relevant": true or false,',
        '"key_sentences": [...],',
        '"tags": [...],',
        '"expansion": "..."',
        "}",
        "",
        "User Query:",
        f"{query}",
        "",
        "Content:",
        f"{content}",
        "",
        "Response:",
    ]
    return "\n".join(lines)
48
-
49
def generate_response(model, tokenizer, prompt, max_new_tokens=512):
    """Generate a continuation of ``prompt`` and return only the new text.

    Args:
        model: Causal LM exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Prompt text; it is truncated to 1024 tokens before generation.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation, stripped of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Cut the prompt off at token level. Slicing the decoded string by
    # len(prompt) is wrong whenever the prompt was truncated or does not decode
    # back to exactly the same characters, and would drop generated text.
    generated_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
69
-
70
def extract_json_from_response(response):
    """Extract the JSON value that starts at the first ``{`` of a model response.

    Text before the opening brace and anything after the end of the parsed
    value is ignored, so trailing chatter (even with stray braces) does not
    prevent extraction.

    Args:
        response: Raw model output.

    Returns:
        The parsed JSON value, or ``None`` when ``response`` is not a string,
        contains no ``{``, or no valid JSON starts at the first ``{``.
    """
    if not isinstance(response, str):
        return None

    start = response.find('{')
    if start < 0:
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value rather
        # than requiring the whole remaining text to be valid JSON.
        parsed, _end = json.JSONDecoder().raw_decode(response, start)
    except (json.JSONDecodeError, RecursionError):
        return None
    return parsed
82
-
83
def _example_test_set():
    """Return the built-in examples written when no test file exists yet."""
    return [
        {
            "query": "What are smart contracts?",
            "content": "Smart contracts are self-executing contracts with the terms directly written into code. They run on blockchain networks and automatically execute when predetermined conditions are met. This eliminates the need for intermediaries and ensures transparency.",
            "expected_output": {
                "relevant": True,
                "key_sentences": [
                    "Smart contracts are self-executing contracts with the terms directly written into code.",
                    "They run on blockchain networks and automatically execute when predetermined conditions are met."
                ],
                "tags": ["smart-contracts", "blockchain", "automation", "decentralized"],
                "expansion": "This content explains the fundamental concept of smart contracts, emphasizing their self-executing nature and role in eliminating intermediaries through blockchain-based automation."
            }
        },
        {
            "query": "How does machine learning work?",
            "content": "Pizza is a popular Italian dish consisting of a round, flat base of dough topped with tomatoes, cheese, and various toppings. It originated in Naples and has become one of the world's most popular foods.",
            "expected_output": {
                "relevant": False,
                "key_sentences": [],
                "tags": [],
                "expansion": ""
            }
        },
        {
            "query": "What is DeFi?",
            "content": "Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries like banks. DeFi protocols use smart contracts to provide services such as lending, borrowing, and trading. Users maintain control of their assets through cryptocurrency wallets.",
            "expected_output": {
                "relevant": True,
                "key_sentences": [
                    "Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries like banks.",
                    "DeFi protocols use smart contracts to provide services such as lending, borrowing, and trading."
                ],
                "tags": ["defi", "blockchain", "smart-contracts", "lending", "decentralized-finance"],
                "expansion": "This content provides a clear overview of DeFi, explaining how it leverages blockchain and smart contracts to recreate traditional financial services in a decentralized manner."
            }
        }
    ]


def evaluate_models(test_file="test_set.jsonl"):
    """Compare base and fine-tuned model outputs on a JSONL test set.

    If ``test_file`` does not exist it is created from the built-in examples.
    Each line must be a JSON object with ``query``, ``content`` and
    ``expected_output``. Results are printed, saved to a timestamped
    ``evaluation_results_*.json`` file and returned.

    Args:
        test_file: Path of the JSONL test set.

    Returns:
        A list with one result dict per test example.
    """
    logger.info("Loading models for evaluation...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    tokenizer.pad_token = tokenizer.eos_token

    # Load base model
    logger.info("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        load_in_4bit=True,
        torch_dtype=torch.float16,
        trust_remote_code=True
    )

    # Load fine-tuned model if available
    finetuned_model = None
    if os.path.exists(FINETUNED_DIR):
        logger.info("Loading fine-tuned model...")
        finetuned_model = PeftModel.from_pretrained(base_model, FINETUNED_DIR)
    else:
        logger.warning(f"Fine-tuned model not found at {FINETUNED_DIR}")
        logger.warning("Run train.py first to create the fine-tuned model")

    # Create test set if it doesn't exist
    if not os.path.exists(test_file):
        logger.info("Creating example test set...")
        with open(test_file, 'w', encoding='utf-8') as f:
            for example in _example_test_set():
                f.write(json.dumps(example) + '\n')

    # Load test data (blank lines, e.g. a trailing newline, are skipped)
    logger.info(f"Loading test data from {test_file}...")
    with open(test_file, 'r', encoding='utf-8') as f:
        test_data = [json.loads(line) for line in f if line.strip()]

    # Evaluate
    results = []
    for i, example in enumerate(test_data):
        logger.info(f"\nEvaluating example {i+1}/{len(test_data)}...")

        prompt = format_prompt(example["query"], example["content"])

        # Get base model response. The PEFT wrapper shares (and adapts) the
        # modules of ``base_model``, so once the adapter is loaded the baseline
        # has to be generated with the adapter switched off; otherwise both
        # outputs would come from the fine-tuned weights.
        if finetuned_model is not None:
            with finetuned_model.disable_adapter():
                base_response = generate_response(finetuned_model, tokenizer, prompt)
        else:
            base_response = generate_response(base_model, tokenizer, prompt)
        base_json = extract_json_from_response(base_response)

        # Get fine-tuned model response if available
        finetuned_response = None
        finetuned_json = None
        if finetuned_model is not None:
            finetuned_response = generate_response(finetuned_model, tokenizer, prompt)
            finetuned_json = extract_json_from_response(finetuned_response)

        result = {
            "example_id": i + 1,
            "query": example["query"],
            "content": example["content"][:100] + "...",  # Truncate for display
            "expected": example["expected_output"],
            "base_response": base_response,
            "base_json": base_json,
            "finetuned_response": finetuned_response,
            "finetuned_json": finetuned_json
        }

        results.append(result)

        # Print comparison
        print(f"\n{'='*80}")
        print(f"Example {i+1}: {example['query']}")
        print(f"{'='*80}")
        print(f"\n📄 Content: {example['content'][:100]}...")
        print(f"\n✅ Expected Output:")
        print(json.dumps(example['expected_output'], indent=2))
        print(f"\n🔹 Base Model Output:")
        print(base_response[:500] + "..." if len(base_response) > 500 else base_response)
        if finetuned_response:
            print(f"\n🔸 Fine-tuned Model Output:")
            print(finetuned_response[:500] + "..." if len(finetuned_response) > 500 else finetuned_response)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"evaluation_results_{timestamp}.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    logger.info(f"\n✅ Evaluation complete! Results saved to: {results_file}")

    # Generate summary
    print(f"\n{'='*80}")
    print("EVALUATION SUMMARY")
    print(f"{'='*80}")

    base_valid_json = sum(1 for r in results if r['base_json'] is not None)
    finetuned_valid_json = sum(1 for r in results if r['finetuned_json'] is not None)

    print(f"Base model valid JSON responses: {base_valid_json}/{len(results)}")
    if finetuned_model is not None:
        print(f"Fine-tuned model valid JSON responses: {finetuned_valid_json}/{len(results)}")

    print("\n💡 To use GPT-4o as a judge:")
    print("1. Copy the results from the evaluation file")
    print("2. Ask GPT-4o to compare base vs fine-tuned outputs")
    print("3. Request scores for relevance detection, tag quality, and key sentence extraction")

    return results
233
-
234
def create_gpt4_evaluation_prompt(results):
    """Assemble a judging prompt from evaluation results.

    Each result contributes its id, query, content, expected output and base
    output; the fine-tuned output is included only when it is truthy.
    """
    header = """You are an expert evaluator comparing two versions of a document analysis model.

For each example below, score both the Base Model and Fine-tuned Model outputs on:
1. Relevance Detection (0-5): How accurately did it determine if the content is relevant?
2. Tag Quality (0-5): How good are the suggested tags?
3. Key Sentences (0-5): How well did it extract the most important sentences?
4. Expansion Quality (0-5): How insightful is the content expansion?

Provide an overall winner for each example and a final recommendation.

EVALUATION DATA:
"""

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces = [header]
    for entry in results:
        pieces.extend((
            f"\n\nExample {entry['example_id']}:\n",
            f"Query: {entry['query']}\n",
            f"Content: {entry['content']}\n",
            f"Expected: {json.dumps(entry['expected'], indent=2)}\n",
            f"Base Output: {entry['base_response']}\n",
        ))
        if entry['finetuned_response']:
            pieces.append(f"Fine-tuned Output: {entry['finetuned_response']}\n")

    pieces.append("\n\nPlease evaluate and provide scores.")
    return "".join(pieces)
261
-
262
if __name__ == "__main__":
    # Run the comparison, then write a judging prompt for the first three results.
    results = evaluate_models()

    if results:
        gpt4_prompt = create_gpt4_evaluation_prompt(results[:3])  # First 3 examples

        with open("gpt4_evaluation_prompt.txt", "w", encoding="utf-8") as f:
            f.write(gpt4_prompt)

        print(f"\n📝 GPT-4 evaluation prompt saved to: gpt4_evaluation_prompt.txt")
        print("Copy this to ChatGPT to get detailed scoring!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
generate_sample_data.py DELETED
@@ -1,90 +0,0 @@
1
- """
2
- Generate sample train/val/test data for testing if CSV files don't exist
3
- """
4
-
5
- import pandas as pd
6
- import os
7
-
8
def generate_sample_data():
    """Write small DPO-style train/val/test CSVs for relevance classification.

    Five hand-written query/document pairs are repeated 20 times, turned into
    ``prompt``/``chosen``/``rejected`` rows and split 70/15/15 in order into
    ``train.csv``, ``val.csv`` and ``test.csv`` in the current directory.
    """
    # (query, document, is_relevant)
    seed_pairs = [
        (
            "Do Cholesterol Statin Drugs Cause Breast Cancer?",
            "Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear.",
            True,
        ),
        (
            "What is machine learning?",
            "Pizza is a popular Italian dish consisting of a round, flat base of dough topped with tomatoes, cheese, and various toppings.",
            False,
        ),
        (
            "How does deep learning work?",
            "Deep learning is a subset of machine learning that uses neural networks with multiple layers to progressively extract higher-level features from raw input.",
            True,
        ),
        (
            "What are the symptoms of COVID-19?",
            "The stock market showed significant volatility today with major indices closing lower due to concerns about inflation.",
            False,
        ),
        (
            "What is blockchain technology?",
            "Blockchain is a distributed ledger technology that maintains a secure and decentralized record of transactions. It consists of blocks linked together using cryptography.",
            True,
        ),
    ]

    def _to_row(query, document, relevant):
        # The preferred answer is the true label; the rejected one is its opposite.
        chosen, rejected = ("Relevant", "Irrelevant") if relevant else ("Irrelevant", "Relevant")
        prompt = f"""you would get a query and document's title and content and return Relevant/Irrelevant.

Query:
{query}

Document:
title: Document
content: {document}"""
        return {"prompt": prompt, "chosen": chosen, "rejected": rejected}

    # Repeat the seed pairs to get more rows.
    rows = [_to_row(*pair) for pair in seed_pairs * 20]

    # Sequential 70 / 15 / 15 split.
    n_train = int(0.7 * len(rows))
    n_val = int(0.15 * len(rows))
    splits = {
        "train.csv": rows[:n_train],
        "val.csv": rows[n_train:n_train + n_val],
        "test.csv": rows[n_train + n_val:],
    }

    # Save every split before reporting.
    for filename, split_rows in splits.items():
        pd.DataFrame(split_rows).to_csv(filename, index=False)

    print(f"✅ Generated sample data:")
    for filename, split_rows in splits.items():
        print(f" - {filename}: {len(split_rows)} samples")
83
-
84
if __name__ == "__main__":
    # Only generate sample data when no training CSV is present yet.
    if os.path.exists("train.csv"):
        print("✅ Data files already exist")
    else:
        print("⚠️ train.csv not found. Generating sample data...")
        generate_sample_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inference_chatgpt_simple.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simplified ChatGPT inference script for yes/no classification
4
+ Forces single token responses for consistent results
5
+ """
6
+
7
+ import os
8
+ import pandas as pd
9
+ from openai import OpenAI
10
+ import time
11
+ from datetime import datetime
12
+ from app import format_prompt
13
+
14
# Initialize OpenAI client.
# SECURITY: an API key used to be hard-coded on this line. Secrets must never be
# committed; the removed key has to be treated as leaked and rotated. The key is
# now read from the OPENAI_API_KEY environment variable (if it is unset, the
# value is None and the client decides how to handle the missing credential).
MICROSERVICES_FIVERR_OS_FIVERR_OS_BACKEND_CREDENTIALS_OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=MICROSERVICES_FIVERR_OS_FIVERR_OS_BACKEND_CREDENTIALS_OPENAI_API_KEY)
17
+
18
def get_prediction(query, title, content, model="gpt-4.1"):
    """Get a yes/no prediction from ChatGPT for one query/document pair.

    The reply is lower-cased and stripped of surrounding whitespace, quotes and
    sentence punctuation before being checked, so answers such as ``"Yes."``
    count as ``"yes"``. The request itself does not limit the reply length.

    Args:
        query: Query text.
        title: Document title.
        content: Document body.
        model: Chat model name passed to the API.

    Returns:
        ``'yes'``, ``'no'``, or ``'error'`` when the reply is anything else or
        the API call fails.
    """
    prompt = format_prompt(query, title, content)

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
        )

        # The message content may be missing; treat that as an empty reply.
        raw_reply = response.choices[0].message.content or ""
        prediction = raw_reply.strip().lower().strip(" \t\n.!,;:'\"")

        # Ensure it's yes or no
        if prediction not in ('yes', 'no'):
            # Show what the model actually said, not the replacement value.
            print(f"Unexpected model output: {raw_reply!r}")
            prediction = 'error'

        return prediction

    except Exception as e:
        # Best effort: one failed request must not abort the whole run.
        print(f"API Error: {e}")
        return 'error'
43
+
44
def main():
    """Classify a label-stratified sample of the training CSV and report accuracy.

    Up to 20 rows per distinct ``label`` value are sampled, each is sent to
    ``get_prediction``, and the confusion matrix, accuracy and per-row results
    are printed and saved to a timestamped ``chatgpt_results_*.csv`` file.
    """
    csv_path = "train_datasets_creation/full_train_dataset.csv"
    # Load CSV
    print(f"Loading {csv_path}...")
    df = pd.read_csv(csv_path)

    # Up to 20 rows per label; sample() raises if asked for more rows than exist.
    dfs = []
    for label_value in df['label'].unique():
        subset = df[df['label'] == label_value]
        dfs.append(subset.sample(min(20, len(subset))))
    df = pd.concat(dfs).reset_index()

    # Process each row
    predictions = []
    for idx, row in df.iterrows():
        # Show progress
        if idx % 10 == 0:
            print(f"Progress: {idx}/{len(df)}")

        # Get prediction
        pred = get_prediction(
            str(row['query_text']),
            str(row['title']),
            str(row['text'])
        )
        predictions.append(pred)

        # Rate limit
        time.sleep(0.5)

    # Add predictions
    df['prediction'] = predictions
    conf_matrix = pd.crosstab(
        index=df['label'],  # True labels
        columns=df['prediction'],  # Predicted labels
        rownames=['Actual'],
        colnames=['Predicted']
    )

    # A row is correct when 'yes' meets a positive label or 'no' meets a
    # negative label; 'error' predictions always count as incorrect.
    is_positive = df['label'].isin(['easy_positive', 'hard_positive'])
    is_negative = df['label'].isin(['easy_negative', 'hard_negative'])
    accuracy = ((df['prediction'] == 'yes') & is_positive) | ((df['prediction'] == 'no') & is_negative)
    print(conf_matrix)
    print(accuracy.mean())

    output = f"chatgpt_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    df.to_csv(output, index=False)
    print(f"\nSaved to: {output}")

    # Show summary
    print("\nResults:")
    print(df['prediction'].value_counts())


if __name__ == "__main__":
    main()
inference_server.py DELETED
@@ -1,179 +0,0 @@
1
- """
2
- Inference server to test model while training
3
- Can load checkpoints on-the-fly
4
- """
5
-
6
- from flask import Flask, request, jsonify
7
- import torch
8
- from transformers import AutoTokenizer, AutoModelForCausalLM
9
- from peft import PeftModel
10
- import json
11
- import os
12
- import glob
13
- import logging
14
-
15
- app = Flask(__name__)
16
- logging.basicConfig(level=logging.INFO)
17
- logger = logging.getLogger(__name__)
18
-
19
- # Global model and tokenizer
20
- model = None
21
- tokenizer = None
22
- current_checkpoint = None
23
-
24
- MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
25
- CHECKPOINT_DIR = "./phi3-dpo-beir-checkpoints"
26
-
27
def load_latest_checkpoint():
    """Load the most recently created training checkpoint into the module globals.

    Returns True when a checkpoint is loaded (or the newest one is already
    active), and False when no checkpoint exists or loading fails.
    """
    global model, tokenizer, current_checkpoint

    # Find latest checkpoint
    candidates = glob.glob(f"{CHECKPOINT_DIR}/checkpoint-*")
    if not candidates:
        logger.warning("No checkpoints found")
        return False

    newest = max(candidates, key=os.path.getctime)
    if newest == current_checkpoint:
        # Already loaded
        return True

    logger.info(f"Loading checkpoint: {newest}")

    try:
        # Tokenizer comes from the base model; reuse EOS as the padding token
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        tokenizer.pad_token = tokenizer.eos_token

        # Base model
        base_kwargs = dict(
            load_in_4bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **base_kwargs)

        # LoRA weights from the checkpoint directory go on top of the base model
        model = PeftModel.from_pretrained(base_model, newest)
        model.eval()

        current_checkpoint = newest
        logger.info(f"✅ Loaded checkpoint: {newest}")
        return True

    except Exception as e:
        logger.error(f"Failed to load checkpoint: {e}")
        return False
69
-
70
def format_prompt(query, document):
    """Build the relevance-analysis prompt for one query/document pair."""
    # Doubled braces are literal braces in the rendered prompt.
    template = """You are an AI content analyst.

Task:
1. Given the following content and a user query, decide if the content is relevant.
2. If it is relevant:
- Extract the top 2-3 key sentences
- Suggest 3-5 relevant tags
- Provide a short explanation or content extension (~2-3 sentences)

Format your response in JSON with:
{{
"relevant": true or false,
"key_sentences": [...],
"tags": [...],
"expansion": "..."
}}

User Query:
{query}

Content:
{document}

Response:"""
    return template.format(query=query, document=document)
96
-
97
@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint: report liveness and which checkpoint is loaded."""
    payload = {
        "status": "healthy",
        "model_loaded": model is not None,
        "current_checkpoint": current_checkpoint,
    }
    return jsonify(payload)
105
-
106
@app.route('/inference', methods=['POST'])
def inference():
    """Run inference on a query-document pair.

    Expects a JSON body with non-empty ``query`` and ``document`` fields.
    Passing ``?reload=true`` forces a check for a newer checkpoint first.

    Returns a JSON object with the active checkpoint, the parsed model output
    (or ``{"raw_response": ...}`` when no JSON object can be parsed) and the
    raw generated text. Responds 503 when no model can be loaded and 400 when
    the request body is missing the required fields.
    """
    global model, tokenizer

    # Load latest checkpoint if needed
    if model is None or request.args.get('reload') == 'true':
        if not load_latest_checkpoint():
            return jsonify({"error": "No model available"}), 503

    # silent=True yields None instead of raising on a missing/invalid JSON body,
    # so such requests fall through to the 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    query = data.get('query', '')
    document = data.get('document', '')

    if not query or not document:
        return jsonify({"error": "Missing query or document"}), 400

    # Format prompt
    prompt = format_prompt(query, document)

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) is wrong whenever the prompt was truncated to max_length or
    # does not round-trip exactly through the tokenizer.
    prompt_token_count = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True).strip()

    # Try to parse the outermost {...} span as JSON; otherwise return raw text
    result = {"raw_response": response}
    start = response.find('{')
    end = response.rfind('}') + 1
    if start >= 0 and end > start:
        try:
            result = json.loads(response[start:end])
        except json.JSONDecodeError:
            pass  # keep the raw-response fallback

    return jsonify({
        "checkpoint": current_checkpoint,
        "result": result,
        "raw_response": response
    })
162
-
163
@app.route('/checkpoints', methods=['GET'])
def list_checkpoints():
    """List available checkpoints, newest first, along with the active one."""
    found = sorted(
        glob.glob(f"{CHECKPOINT_DIR}/checkpoint-*"),
        key=os.path.getctime,
        reverse=True,
    )
    names = [os.path.basename(path) for path in found]

    if current_checkpoint:
        active = os.path.basename(current_checkpoint)
    else:
        active = None

    return jsonify({"checkpoints": names, "current": active})
173
-
174
- if __name__ == '__main__':
175
- # Try to load initial checkpoint
176
- load_latest_checkpoint()
177
-
178
- # Run server
179
- app.run(host='0.0.0.0', port=5000, debug=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
setup_environment.py DELETED
@@ -1,120 +0,0 @@
1
- """
2
- Setup script to ensure all dependencies are correctly installed
3
- """
4
-
5
- import subprocess
6
- import sys
7
- import os
8
-
9
def run_command(cmd):
    """Run *cmd* through the shell; return True on exit status 0, else False."""
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError:
        # Non-zero exit status
        return False
    else:
        return True
16
-
17
def main():
    """Install and verify the dependencies needed for DPO training.

    Checks the Python version (exits with status 1 below 3.8), pins protobuf,
    installs the core and optional BEIR packages with pip, reports CUDA
    availability, verifies that the key modules import, and generates sample
    data when ``train.csv`` is missing.
    """
    print("🔧 Setting up environment for DPO training...")
    print("="*60)

    # Python version check
    python_version = sys.version_info
    print(f"Python version: {python_version.major}.{python_version.minor}.{python_version.micro}")

    if python_version < (3, 8):
        print("❌ Python 3.8+ is required!")
        sys.exit(1)

    # Fix protobuf issues
    print("\n📦 Fixing protobuf issues...")
    run_command(f"{sys.executable} -m pip uninstall -y protobuf")
    run_command(f"{sys.executable} -m pip install protobuf==3.20.3")

    # Install tf-keras for compatibility
    print("\n📦 Installing tf-keras for compatibility...")
    run_command(f"{sys.executable} -m pip install tf-keras")

    # Core dependencies
    print("\n📦 Installing core dependencies...")
    dependencies = [
        "torch>=2.0.0",
        "transformers>=4.36.0",
        "datasets",
        "accelerate>=0.25.0",
        "peft>=0.7.0",
        "trl>=0.7.0",
        "bitsandbytes>=0.41.0",
        "pandas",
        "scipy",
        "sentencepiece",  # Required for some tokenizers
        "protobuf==3.20.3",  # Specific version to avoid issues
    ]

    for dep in dependencies:
        print(f"Installing {dep}...")
        if not run_command(f"{sys.executable} -m pip install {dep}"):
            print(f"⚠️ Failed to install {dep}, continuing...")

    # BEIR dependencies (optional, so failures are not reported)
    print("\n📦 Installing BEIR dependencies (optional)...")
    beir_deps = ["beir", "scikit-learn", "tqdm"]
    for dep in beir_deps:
        print(f"Installing {dep}...")
        run_command(f"{sys.executable} -m pip install {dep}")

    # Check CUDA
    print("\n🔍 Checking CUDA availability...")
    try:
        import torch
        if torch.cuda.is_available():
            print(f"✅ CUDA is available!")
            print(f" Device: {torch.cuda.get_device_name(0)}")
            print(f" CUDA version: {torch.version.cuda}")
        else:
            print("⚠️ CUDA not available. Training will be slow on CPU.")
    except Exception as e:
        print(f"⚠️ Could not check CUDA: {e}")

    # Test imports
    print("\n🧪 Testing imports...")
    test_imports = [
        "torch",
        "transformers",
        "trl",
        "peft",
        "datasets",
        "accelerate",
        "bitsandbytes",
        "pandas"
    ]

    failed = []
    for module in test_imports:
        try:
            __import__(module)
            print(f"✅ {module}")
        except ImportError as e:
            print(f"❌ {module}: {e}")
            failed.append(module)

    if failed:
        print(f"\n⚠️ Some imports failed: {', '.join(failed)}")
        print("Try running: pip install --upgrade " + " ".join(failed))
    else:
        print("\n✅ All imports successful!")

    # Generate sample data if needed
    if not os.path.exists("train.csv"):
        print("\n📊 Generating sample data...")
        # run_command reports failure through its return value rather than by
        # raising, so the result must be checked for the warning to appear.
        if not run_command(f"{sys.executable} generate_sample_data.py"):
            print("⚠️ Could not generate sample data")

    print("\n✅ Setup complete!")
    print("\nTo start training, run:")
    print(f" {sys.executable} train_dpo_hf_fixed.py")
118
-
119
- if __name__ == "__main__":
120
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_scores/analyze_each_dataset.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Create individual markdown reports for each BEIR dataset
3
+ Analyze Score 1 vs Score 2 meanings with examples
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import random
9
+ from pathlib import Path
10
+ from collections import defaultdict
11
+
12
def _load_queries(queries_file):
    """Return a {query_id: text} dict read from a BEIR ``queries.jsonl`` file."""
    queries = {}
    with open(queries_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            q = json.loads(line)
            queries[q['_id']] = q['text']
    return queries


def _load_corpus_sample(corpus_file, max_docs):
    """Count every corpus line and keep the first ``max_docs`` documents for examples."""
    corpus = {}
    total = 0
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            total += 1
            if i < max_docs:
                doc = json.loads(line)
                corpus[doc['_id']] = {
                    'title': doc.get('title', ''),
                    'text': doc.get('text', '')
                }
    return corpus, total


def _iter_qrels(qrel_file):
    """Yield (query_id, doc_id, int_score) from a qrels TSV, skipping header and malformed rows."""
    with open(qrel_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            stripped = line.strip()
            # Skip header if exists
            if line_no == 0 and stripped.startswith("query"):
                continue
            parts = stripped.split('\t')
            if len(parts) < 3:
                continue
            if len(parts) == 3:
                query_id, doc_id, score = parts
            else:
                # 4+ columns: TREC format (query, iteration, doc, score)
                query_id, doc_id, score = parts[0], parts[2], parts[3]
            try:
                score = int(float(score))
            except ValueError:
                continue  # not a numeric relevance score
            yield query_id, doc_id, score


def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset.

    Args:
        dataset_name: Name of the dataset directory under ``beir_base_path``.
        beir_base_path: Directory that contains the BEIR dataset folders.

    Returns:
        The report as a markdown string, or None when the dataset directory
        does not exist. When the dataset has no ``qrels`` directory only a
        short stub report is returned.
    """
    splits = ["train", "dev", "test"]
    max_corpus_docs = 50000          # corpus documents kept in memory for examples
    max_examples_per_score = 10      # examples listed per (split, score)

    dataset_path = Path(beir_base_path) / dataset_name
    if not dataset_path.exists():
        return None

    # Entries are joined with "\n" at the end, so an entry ending in "\n"
    # is followed by a blank line in the final markdown.
    report = []
    report.append(f"# {dataset_name.upper()} Dataset Analysis\n")
    report.append("## Overview\n")

    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)

    # Load queries
    queries = {}
    queries_file = dataset_path / "queries.jsonl"
    if queries_file.exists():
        queries = _load_queries(queries_file)
        report.append(f"- Total queries available: {len(queries)}\n")

    # Load a large corpus prefix so that examples can show document text
    corpus = {}
    corpus_file = dataset_path / "corpus.jsonl"
    if corpus_file.exists():
        corpus, corpus_count = _load_corpus_sample(corpus_file, max_corpus_docs)
        report.append(f"- Total corpus documents: {corpus_count}\n")
        report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")

    # Analyze each split
    split_data = {}
    all_score_examples = {}  # score -> {"split:query_id": [doc_id, ...]}

    for split in splits:
        qrel_file = qrels_path / f"{split}.tsv"
        if not qrel_file.exists():
            continue

        score_dist = {}
        queries_set = set()
        total_pairs = 0
        examples_by_query = {}

        for query_id, doc_id, score in _iter_qrels(qrel_file):
            queries_set.add(query_id)
            score_dist[score] = score_dist.get(score, 0) + 1
            total_pairs += 1

            # Only pairs whose query and document text are both loaded can be shown
            if query_id in queries and doc_id in corpus:
                key = f"{split}:{query_id}"
                all_score_examples.setdefault(score, {}).setdefault(key, []).append(doc_id)

            examples_by_query.setdefault(query_id, {}).setdefault(score, []).append(doc_id)

        split_data[split] = {
            "queries": len(queries_set),
            "total_pairs": total_pairs,
            "score_dist": score_dist,
            "examples": examples_by_query
        }

    # Write split statistics. Table rows carry no trailing newline so the join
    # keeps them on consecutive lines; a blank line between rows would break
    # the markdown table.
    report.append("\n## Split Statistics\n")
    report.append("| Split | Queries | Total Pairs | Score Distribution |")
    report.append("|-------|---------|-------------|-------------------|")

    for split in splits:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            report.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |")
        else:
            report.append(f"| {split} | N/A | N/A | N/A |")

    # Analyze score meanings
    report.append("\n## Score Analysis\n")

    unique_scores = set()
    for info in split_data.values():
        unique_scores.update(info["score_dist"].keys())

    if not unique_scores:
        report.append("**No relevance judgments found in any split.**\n")
    elif len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    # Add examples
    report.append("\n## Examples\n")

    # Show multiple examples with score comparisons
    report.append("### Score Comparison Examples\n")

    # Find queries with multiple score levels
    queries_with_multiple_scores = [
        (split_name, query_id, scores_dict)
        for split_name, info in split_data.items()
        for query_id, scores_dict in info["examples"].items()
        if len(scores_dict) > 1 and query_id in queries
    ]

    # Show up to 5 comparison examples
    for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
        report.append(f"#### Comparison Example {i}\n")
        report.append(f"**Query:** {queries[query_id]}\n")
        report.append(f"**From:** {split_name} split\n\n")

        for score in sorted(scores_dict.keys(), reverse=True):
            doc_ids = scores_dict[score]
            report.append(f"**Score {score} Documents:**\n")

            shown = 0
            for doc_id in doc_ids[:3]:  # Show up to 3 docs per score
                if doc_id in corpus:
                    doc = corpus[doc_id]
                    shown += 1
                    report.append(f"\n*Document {shown}:*\n")
                    if doc['title']:
                        report.append(f"- Title: {doc['title'][:200]}\n")
                    text_preview = doc['text'][:400].replace('\n', ' ')
                    report.append(f"- Text: {text_preview}...\n")

            report.append("\n")
        report.append("---\n\n")

    # Show examples PER SPLIT
    report.append("### Examples by Split and Score\n")

    for split in splits:
        if split not in split_data:
            continue

        report.append(f"\n#### {split.upper()} Split\n")

        # Get scores for this split
        split_scores = sorted(split_data[split]["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")

        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")

            # Find examples for this score in this split
            examples_shown = 0
            for key, doc_ids in all_score_examples.get(score, {}).items():
                if examples_shown >= max_examples_per_score:
                    break
                if not key.startswith(f"{split}:"):
                    continue
                query_id = key.split(':', 1)[1]
                if query_id not in queries:
                    continue

                examples_shown += 1
                report.append(f"\n**Example {examples_shown}:**\n")
                report.append(f"- Query: {queries[query_id]}\n")

                # Show first doc
                for doc_id in doc_ids[:1]:
                    if doc_id in corpus:
                        doc = corpus[doc_id]
                        if doc['title']:
                            report.append(f"- Doc Title: {doc['title']}\n")
                        text_preview = doc['text'].replace('\n', ' ')
                        report.append(f"- Doc Text: {text_preview}...\n")
                        break

            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")

    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")

    report.append("\n## Observations\n")

    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())

        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")

    # Check for single score in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if len(train_scores) == 1 and 1 in train_scores:
            report.append("⚠️ **Training data only has Score 1:**\n")
            report.append("- All training documents are marked as equally relevant\n")
            report.append("- Model cannot learn to distinguish relevance levels\n")
            report.append("- May indicate collapsed/merged relevance levels\n")

    return "\n".join(report)
244
+
245
def main():
    """Write one markdown analysis report per dataset directory under ``../beir_data``."""
    beir_base = "../beir_data"
    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    # parents=True: the intermediate test_scores/ directory may not exist yet
    output_dir.mkdir(parents=True, exist_ok=True)

    # List all datasets
    datasets = [d for d in os.listdir(beir_base)
                if os.path.isdir(os.path.join(beir_base, d))]

    print("="*80)
    print("Creating individual dataset reports...")
    print("="*80)

    for dataset in sorted(datasets):
        print(f"Processing {dataset}...")
        report = create_dataset_report(dataset, beir_base)

        if report:
            # Save report
            output_file = output_dir / f"{dataset}_analysis.md"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f" ✓ Report saved to: {output_file}")

    print(f"\nAll reports saved in: {output_dir}")
270
+
271
+ if __name__ == "__main__":
272
+ main()
test_scores/analyze_webis_only.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analyze ONLY webis-touche2020 dataset - SAME FORMAT AS analyze_each_dataset.py
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import random
8
+ from pathlib import Path
9
+ from collections import defaultdict
10
+
11
def _load_queries(queries_file):
    """Return a {query_id: text} dict read from a BEIR ``queries.jsonl`` file."""
    queries = {}
    with open(queries_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            q = json.loads(line)
            queries[q['_id']] = q['text']
    return queries


def _load_corpus_sample(corpus_file, max_docs):
    """Count every corpus line and keep the first ``max_docs`` documents for examples."""
    corpus = {}
    total = 0
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            total += 1
            if i < max_docs:
                doc = json.loads(line)
                corpus[doc['_id']] = {
                    'title': doc.get('title', ''),
                    'text': doc.get('text', '')
                }
    return corpus, total


def _iter_qrels(qrel_file):
    """Yield (query_id, doc_id, int_score) from a qrels TSV, skipping header and malformed rows."""
    with open(qrel_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            stripped = line.strip()
            # Skip header if exists
            if line_no == 0 and stripped.startswith("query"):
                continue
            parts = stripped.split('\t')
            if len(parts) < 3:
                continue
            if len(parts) == 3:
                query_id, doc_id, score = parts
            else:
                # 4+ columns: TREC format (query, iteration, doc, score)
                query_id, doc_id, score = parts[0], parts[2], parts[3]
            try:
                score = int(float(score))
            except ValueError:
                continue  # not a numeric relevance score
            yield query_id, doc_id, score


def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset.

    Args:
        dataset_name: Name of the dataset directory under ``beir_base_path``.
        beir_base_path: Directory that contains the BEIR dataset folders.

    Returns:
        The report as a markdown string, or None when the dataset directory
        does not exist. When the dataset has no ``qrels`` directory only a
        short stub report is returned.
    """
    splits = ["train", "dev", "test"]
    max_corpus_docs = 50000          # corpus documents kept in memory for examples
    max_examples_per_score = 10      # examples listed per (split, score)

    dataset_path = Path(beir_base_path) / dataset_name
    if not dataset_path.exists():
        return None

    # Entries are joined with "\n" at the end, so an entry ending in "\n"
    # is followed by a blank line in the final markdown.
    report = []
    report.append(f"# {dataset_name.upper()} Dataset Analysis\n")
    report.append("## Overview\n")

    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)

    # Load queries
    queries = {}
    queries_file = dataset_path / "queries.jsonl"
    if queries_file.exists():
        queries = _load_queries(queries_file)
        report.append(f"- Total queries available: {len(queries)}\n")

    # Load a large corpus prefix so that examples can show document text
    corpus = {}
    corpus_file = dataset_path / "corpus.jsonl"
    if corpus_file.exists():
        corpus, corpus_count = _load_corpus_sample(corpus_file, max_corpus_docs)
        report.append(f"- Total corpus documents: {corpus_count}\n")
        report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")

    # Analyze each split
    split_data = {}
    all_score_examples = {}  # score -> {"split:query_id": [doc_id, ...]}

    for split in splits:
        qrel_file = qrels_path / f"{split}.tsv"
        if not qrel_file.exists():
            continue

        score_dist = {}
        queries_set = set()
        total_pairs = 0
        examples_by_query = {}

        for query_id, doc_id, score in _iter_qrels(qrel_file):
            queries_set.add(query_id)
            score_dist[score] = score_dist.get(score, 0) + 1
            total_pairs += 1

            # Only pairs whose query and document text are both loaded can be shown
            if query_id in queries and doc_id in corpus:
                key = f"{split}:{query_id}"
                all_score_examples.setdefault(score, {}).setdefault(key, []).append(doc_id)

            examples_by_query.setdefault(query_id, {}).setdefault(score, []).append(doc_id)

        split_data[split] = {
            "queries": len(queries_set),
            "total_pairs": total_pairs,
            "score_dist": score_dist,
            "examples": examples_by_query
        }

    # Write split statistics. Table rows carry no trailing newline so the join
    # keeps them on consecutive lines; a blank line between rows would break
    # the markdown table.
    report.append("\n## Split Statistics\n")
    report.append("| Split | Queries | Total Pairs | Score Distribution |")
    report.append("|-------|---------|-------------|-------------------|")

    for split in splits:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            report.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |")
        else:
            report.append(f"| {split} | N/A | N/A | N/A |")

    # Analyze score meanings
    report.append("\n## Score Analysis\n")

    unique_scores = set()
    for info in split_data.values():
        unique_scores.update(info["score_dist"].keys())

    if not unique_scores:
        report.append("**No relevance judgments found in any split.**\n")
    elif len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    # Add examples
    report.append("\n## Examples\n")

    # Show multiple examples with score comparisons
    report.append("### Score Comparison Examples\n")

    # Find queries with multiple score levels
    queries_with_multiple_scores = [
        (split_name, query_id, scores_dict)
        for split_name, info in split_data.items()
        for query_id, scores_dict in info["examples"].items()
        if len(scores_dict) > 1 and query_id in queries
    ]

    # Show up to 5 comparison examples
    for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
        report.append(f"#### Comparison Example {i}\n")
        report.append(f"**Query:** {queries[query_id]}\n")
        report.append(f"**From:** {split_name} split\n\n")

        for score in sorted(scores_dict.keys(), reverse=True):
            doc_ids = scores_dict[score]
            report.append(f"**Score {score} Documents:**\n")

            shown = 0
            for doc_id in doc_ids[:3]:  # Show up to 3 docs per score
                if doc_id in corpus:
                    doc = corpus[doc_id]
                    shown += 1
                    report.append(f"\n*Document {shown}:*\n")
                    if doc['title']:
                        report.append(f"- Title: {doc['title'][:200]}\n")
                    text_preview = doc['text'][:400].replace('\n', ' ')
                    report.append(f"- Text: {text_preview}...\n")

            report.append("\n")
        report.append("---\n\n")

    # Show examples PER SPLIT
    report.append("### Examples by Split and Score\n")

    for split in splits:
        if split not in split_data:
            continue

        report.append(f"\n#### {split.upper()} Split\n")

        # Get scores for this split
        split_scores = sorted(split_data[split]["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")

        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")

            # Find examples for this score in this split
            examples_shown = 0
            for key, doc_ids in all_score_examples.get(score, {}).items():
                if examples_shown >= max_examples_per_score:
                    break
                if not key.startswith(f"{split}:"):
                    continue
                query_id = key.split(':', 1)[1]
                if query_id not in queries:
                    continue

                examples_shown += 1
                report.append(f"\n**Example {examples_shown}:**\n")
                report.append(f"- Query: {queries[query_id]}\n")

                # Show first doc
                for doc_id in doc_ids[:1]:
                    if doc_id in corpus:
                        doc = corpus[doc_id]
                        if doc['title']:
                            report.append(f"- Doc Title: {doc['title']}\n")
                        text_preview = doc['text'].replace('\n', ' ')
                        report.append(f"- Doc Text: {text_preview}...\n")
                        break

            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")

    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")

    report.append("\n## Observations\n")

    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())

        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")

    # Check for single score in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if len(train_scores) == 1 and 1 in train_scores:
            report.append("⚠️ **Training data only has Score 1:**\n")
            report.append("- All training documents are marked as equally relevant\n")
            report.append("- Model cannot learn to distinguish relevance levels\n")
            report.append("- May indicate collapsed/merged relevance levels\n")

    return "\n".join(report)
243
+
244
def main():
    """Write the markdown analysis report for the webis-touche2020 dataset only."""
    beir_base = "../beir_data"
    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    # parents=True: the intermediate test_scores/ directory may not exist yet
    output_dir.mkdir(parents=True, exist_ok=True)

    # ONLY PROCESS WEBIS-TOUCHE2020
    dataset = "webis-touche2020"

    print("="*80)
    print(f"Creating report for {dataset} ONLY...")
    print("="*80)

    print(f"Processing {dataset}...")
    report = create_dataset_report(dataset, beir_base)

    if report:
        # Save report
        output_file = output_dir / f"{dataset}_analysis.md"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f" ✓ Report saved to: {output_file}")

    print(f"\nReport saved in: {output_dir}")
267
+
268
+ if __name__ == "__main__":
269
+ main()
test_scores/beir_analysis.json ADDED
File without changes
test_scores/dataset_reports/FINAL_TRAINING_SUMMARY.md ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BEIR Datasets Training Summary
2
+
3
+ ## Overview
4
+ Comprehensive analysis of all 12 BEIR datasets for three-category training (positive, hard_negative, easy_negative) to train a tiny LLM retriever.
5
+
6
+ ## Three-Category Training System
7
+
8
+ ### Categories:
9
+ 1. **Positive**: Relevant documents (Score 1 or 2 depending on dataset)
10
+ 2. **Hard Negative**: Documents with keywords but wrong intent (Score 1 from DBPedia-Entity and Webis-Touche2020)
11
+ 3. **Easy Negative**: Clearly irrelevant documents (Score 0 or from other queries within same dataset)
12
+
13
+ ### Sampling Strategy:
14
+ - **Adaptive Ratio**: For each query, count hard negatives (H), then sample H × 10 easy negatives
15
+ - **Within-Dataset Priority**: Sample negatives from same dataset first (harder, same domain)
16
+ - **Cross-Dataset Fallback**: Only use other datasets' docs if needed
17
+
18
+ ## Dataset-by-Dataset Analysis
19
+
20
+ ### 1. ArguAna
21
+ - **Verdict**: GOOD FOR TRAINING ✓
22
+ - **Scores Available**: Only 1
23
+ - **Score 1 Meaning**: RELEVANT (counter-arguments to the query)
24
+ - **Splits**: test only (1406 queries)
25
+ - **Training Strategy**: Binary classification
26
+ - **Special Notes**: Counter-arguments are considered relevant (opposing stance, same topic)
27
+
28
+ ### 2. Climate-FEVER
29
+ - **Verdict**: SKIP (NO RELIABLE POSITIVE EXAMPLES) ✗
30
+ - **Scores Available**: Only 1
31
+ - **Score 1 Meaning**: QUESTIONABLY RELEVANT (often just keyword matching)
32
+ - **Splits**: test only (1535 queries)
33
+ - **Training Strategy**: DO NOT USE for training
34
+ - **Special Notes**: Can provide easy negatives to other datasets
35
+
36
+ ### 3. DBPedia-Entity
37
+ - **Verdict**: GOLD MINE FOR HARD NEGATIVES ✓🏆
38
+ - **Scores Available**: 0, 1, 2
39
+ - **Score 1 Meaning**: HARD NEGATIVE (keyword match without intent)
40
+ - **Score 2 Meaning**: POSITIVE (true entity relevance)
41
+ - **Score 0 Meaning**: EASY NEGATIVE
42
+ - **Splits**: dev (67 queries), test (400 queries)
43
+ - **Training Categories**: Score 2 → positive, Score 1 → hard_negative, Score 0 → easy_negative
44
+ - **Special Notes**: Critical for teaching models not to be fooled by keyword matching!
45
+
46
+ ### 4. FEVER
47
+ - **Verdict**: GOOD FOR TRAINING ✓
48
+ - **Scores Available**: Only 1
49
+ - **Score 1 Meaning**: RELEVANT (evidence supporting OR refuting claims)
50
+ - **Splits**: train (140085 queries), dev (6666 queries), test (6666 queries)
51
+ - **Training Strategy**: Binary classification
52
+ - **Special Notes**: Documents that refute claims ARE relevant (fact verification task)
53
+
54
+ ### 5. FiQA
55
+ - **Verdict**: GOOD FOR TRAINING ✓
56
+ - **Scores Available**: Only 1
57
+ - **Score 1 Meaning**: RELEVANT (answers to financial questions)
58
+ - **Splits**: train (5500 queries), dev (500 queries), test (648 queries)
59
+ - **Training Strategy**: Binary classification
60
+ - **Special Notes**: Financial Q&A dataset with opinionated answers
61
+
62
+ ### 6. HotPotQA
63
+ - **Verdict**: GOOD FOR TRAINING ✓
64
+ - **Scores Available**: Only 1
65
+ - **Score 1 Meaning**: RELEVANT (documents for multi-hop reasoning)
66
+ - **Splits**: train (85000 queries), dev (5447 queries), test (7405 queries)
67
+ - **Training Strategy**: Binary classification
68
+ - **Special Notes**: Multi-hop Q&A requiring reasoning over multiple documents
69
+
70
+ ### 7. NFCorpus
71
+ - **Verdict**: GOOD FOR TRAINING ✓
72
+ - **Scores Available**: 1, 2
73
+ - **Score 1 Meaning**: RELEVANT (related medical papers)
74
+ - **Score 2 Meaning**: HIGHLY RELEVANT (direct answers)
75
+ - **Splits**: train (110575 queries), dev (324 queries), test (323 queries)
76
+ - **Training Strategy**: Binary (1+2 as positive) or graded relevance
77
+ - **Special Notes**: WARNING - train has only Score 1, test/dev have 1 and 2
78
+
79
+ ### 8. Quora
80
+ - **Verdict**: GOOD FOR TRAINING ✓
81
+ - **Scores Available**: Only 1
82
+ - **Score 1 Meaning**: RELEVANT (duplicate questions)
83
+ - **Splits**: dev (5000 queries), test (10000 queries)
84
+ - **Training Strategy**: Binary classification
85
+ - **Special Notes**: Duplicate question detection - excellent for semantic similarity
86
+
87
+ ### 9. SciDocs
88
+ - **Verdict**: GOOD FOR TRAINING ✓
89
+ - **Scores Available**: 0, 1
90
+ - **Score 1 Meaning**: RELEVANT (citation-worthy papers)
91
+ - **Splits**: test only (1000 queries)
92
+ - **Training Strategy**: Binary classification
93
+ - **Special Notes**: Citation prediction for scientific papers
94
+
95
+ ### 10. SciFact
96
+ - **Verdict**: GOOD FOR TRAINING ✓
97
+ - **Scores Available**: Only 1
98
+ - **Score 1 Meaning**: RELEVANT (evidence for scientific claims)
99
+ - **Splits**: train (920 queries), test (300 queries)
100
+ - **Training Strategy**: Binary classification
101
+ - **Special Notes**: Scientific claim verification
102
+
103
+ ### 11. TREC-COVID
104
+ - **Verdict**: GOOD FOR TRAINING ✓
105
+ - **Scores Available**: 0, 1, 2
106
+ - **Score 1 Meaning**: RELEVANT (related COVID-19 info)
107
+ - **Score 2 Meaning**: HIGHLY RELEVANT (direct COVID-19 answers)
108
+ - **Splits**: test only (50 queries)
109
+ - **Training Strategy**: Binary (1+2 as positive) or graded relevance
110
+ - **Special Notes**: No training data - only test split
111
+
112
+ ### 12. Webis-Touche2020
113
+ - **Verdict**: GOLD MINE FOR HARD NEGATIVES ✓🏆
114
+ - **Scores Available**: 0, 1, 2
115
+ - **Score 1 Meaning**: HARD NEGATIVE (wrong topic/context)
116
+ - **Score 2 Meaning**: POSITIVE (proper arguments)
117
+ - **Score 0 Meaning**: EASY NEGATIVE
118
+ - **Splits**: test only (49 queries)
119
+ - **Training Categories**: Score 2 → positive, Score 1 → hard_negative, Score 0 → easy_negative
120
+ - **Special Notes**: Critical for teaching models topic relevance! (e.g., school uniforms → hats is wrong topic)
121
+
122
+ ## Summary Statistics
123
+
124
+ ### Overall Results:
125
+ - **Datasets Providing Positives**: 11/12 datasets (92%)
126
+ - **Datasets Providing Hard Negatives**: 2/12 datasets (17%) - THE GOLD MINES!
127
+ - **Skip Completely**: 1/12 dataset (8%)
128
+
129
+ ### Category Breakdown:
130
+
131
+ #### Standard Positive Providers (9 datasets):
132
+ 1. **ArguAna** - Counter-arguments (test only)
133
+ 2. **FEVER** - Fact verification (140K train!)
134
+ 3. **FiQA** - Financial Q&A (5.5K train)
135
+ 4. **HotPotQA** - Multi-hop reasoning (85K train!)
136
+ 5. **NFCorpus** - Medical IR (110K train, Score 1&2 both positive)
137
+ 6. **Quora** - Duplicate questions (dev/test)
138
+ 7. **SciDocs** - Citation prediction (test only)
139
+ 8. **SciFact** - Scientific claims (train/test)
140
+ 9. **TREC-COVID** - COVID-19 IR (test only, Score 1&2 both positive)
141
+
142
+ #### Hard Negative Gold Mines (2 datasets):
143
+ 1. **DBPedia-Entity** - Score 1 = keyword match failures
144
+ 2. **Webis-Touche2020** - Score 1 = wrong topic/context
145
+
146
+ #### Skip (1 dataset):
147
+ 1. **Climate-FEVER** - No reliable positives
148
+
149
+ ## Key Findings
150
+
151
+ 1. **Hard Negatives Are Gold**: Only 2 datasets (DBPedia-Entity, Webis-Touche2020) provide hard negatives
152
+ - These are CRITICAL for teaching models not to be fooled by keyword matching
153
+ - Score 1 in these datasets = perfect training examples of what NOT to retrieve
154
+
155
+ 2. **Within-Dataset Sampling Is Key**:
156
+ - Easy negatives from same dataset are harder (same domain)
157
+ - Cross-dataset negatives are too easy (completely different domains)
158
+ - Example: COVID query with other COVID papers (hard) vs cooking recipes (trivial)
159
+
160
+ 3. **Adaptive Sampling Strategy**:
161
+ - Count hard negatives (H) for each query
162
+ - Sample H × 10 easy negatives
163
+ - Creates balanced difficulty gradient
164
+
165
+ ## Training Recommendations
166
+
167
+ ### Three-Category Training Implementation:
168
+
169
+ 1. **Use the JSON config** (`training_config.json`) with:
170
+ - `score_to_category`: Maps scores to positive/hard_negative/easy_negative
171
+ - `provides_hard_negatives`: Identifies DBPedia-Entity and Webis-Touche2020
172
+ - `sampling`: Within-dataset priority
173
+
174
+ 2. **Training Data Priority**:
175
+ - **MUST INCLUDE**: DBPedia-Entity and Webis-Touche2020 (for hard negatives)
176
+ - **Large Training Sets**: FEVER (140K), HotPotQA (85K), NFCorpus (110K)
177
+ - **Domain Variety**: FiQA (financial), NFCorpus (medical), SciFact (scientific)
178
+
179
+ 3. **Sampling Strategy**:
180
+ ```python
181
+ for query in dataset:
182
+ positives = get_positives(query) # Score 1 or 2
183
+ hard_negatives = get_hard_negatives(query) # Score 1 from DBPedia/Webis
184
+ num_easy = len(hard_negatives) * 10
185
+ easy_negatives = sample_within_dataset(query, num_easy)
186
+ ```
187
+
188
+ 4. **Skip Only**:
189
+ - Climate-FEVER (no reliable positive examples)
test_scores/dataset_reports/arguana_analysis.md ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ARGUANA Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 1406
6
+
7
+ - Total corpus documents: 8674
8
+
9
+ - Corpus documents loaded for examples: 8674
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | N/A | N/A | N/A |
19
+
20
+ | dev | N/A | N/A | N/A |
21
+
22
+ | test | 1406 | 1406 | 1:1406 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Only one score level (1) found in this dataset.**
28
+
29
+ - This means all labeled documents are considered equally relevant
30
+
31
+ - Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
32
+
33
+
34
+ ## Examples
35
+
36
+ ### Score Comparison Examples
37
+
38
+ ### Examples by Split and Score
39
+
40
+
41
+ #### TEST Split
42
+
43
+ **Scores in test: [1]**
44
+
45
+
46
+ ##### Score 1 Examples (test)
47
+
48
+
49
+ **Example 1:**
50
+
51
+ - Query: Being vegetarian helps the environment Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: “it takes four football fields of land (about 1.6 hectares) to feed each Canadian” and “one apple tree produces enough fruit to make 320 pies.” Think about it — a couple of apple trees and a few rows of wheat on a mere fraction of a hectare could produce enough food for one person! [1] The 2006 U.N. Food and Agriculture Organization (FAO) report concluded that worldwide livestock farming generates 18% of the planet's greenhouse gas emissions — by comparison, all the world's cars, trains, planes and boats account for a combined 13% of greenhouse gas emissions. [2] As a result of the above point producing meat damages the environment. The demand for meat drives deforestation. Daniel Cesar Avelino of Brazil's Federal Public Prosecution Office says “We know that the single biggest driver of deforestation in the Amazon is cattle.” This clearing of tropical rainforests such as the Amazon for agriculture is estimated to produce 17% of the world's greenhouse gas emissions. 
[3] Not only this but the production of meat takes a lot more energy than it ultimately gives us chicken meat production consumes energy in a 4:1 ratio to protein output; beef cattle production requires an energy input to protein output ratio of 54:1. The same is true with water use due to the same phenomenon of meat being inefficient to produce in terms of the amount of grain needed to produce the same weight of meat, production requires a lot of water. Water is another scarce resource that we will soon not have enough of in various areas of the globe. Grain-fed beef production takes 100,000 liters of water for every kilogram of food. Raising broiler chickens takes 3,500 liters of water to make a kilogram of meat. In comparison, soybean production uses 2,000 liters for kilogram of food produced; rice, 1,912; wheat, 900; and potatoes, 500 liters. [4] This is while there are areas of the globe that have severe water shortages. With farming using up to 70 times more water than is used for domestic purposes: cooking and washing. A third of the population of the world is already suffering from a shortage of water. [5] Groundwater levels are falling all over the world and rivers are beginning to dry up. Already some of the biggest rivers such as China’s Yellow river do not reach the sea. [6] With a rising population becoming vegetarian is the only responsible way to eat. [1] Stephen Leckie, ‘How Meat-centred Eating Patterns Affect Food Security and the Environment’, International development research center [2] Bryan Walsh, Meat: Making Global Warming Worse, Time magazine, 10 September 2008 . [3] David Adam, Supermarket suppliers ‘helping to destroy Amazon rainforest’, The Guardian, 21st June 2009. [4] Roger Segelken, U.S. could feed 800 million people with grain that livestock eat, Cornell Science News, 7th August 1997. 
[5] Fiona Harvey, Water scarcity affects one in three, FT.com, 21st August 2003 [6] Rupert Wingfield-Hayes, Yellow river ‘drying up’, BBC News, 29th July 2004
52
+
53
+ - Doc Title: animals environment general health health general weight philosophy ethics
54
+
55
+ - Doc Text: You don’t have to be vegetarian to be green. Many special environments have been created by livestock farming – for example chalk down land in England and mountain pastures in many countries. Ending livestock farming would see these areas go back to woodland with a loss of many unique plants and animals. Growing crops can also be very bad for the planet, with fertilisers and pesticides polluting rivers, lakes and seas. Most tropical forests are now cut down for timber, or to allow oil palm trees to be grown in plantations, not to create space for meat production. British farmer and former editor Simon Farrell also states: “Many vegans and vegetarians rely on one source from the U.N. calculation that livestock generates 18% of global carbon emissions, but this figure contains basic mistakes. It attributes all deforestation from ranching to cattle, rather than logging or development. It also muddles up one-off emissions from deforestation with on-going pollution.” He also refutes the statement of meat production inefficiency: “Scientists have calculated that globally the ratio between the amounts of useful plant food used to produce meat is about 5 to 1. If you feed animals only food that humans can eat — which is, indeed, largely the case in the Western world — that may be true. But animals also eat food we can't eat, such as grass. So the real conversion figure is 1.4 to 1.” [1] At the same time eating a vegetarian diet may be no more environmentally friendly than a meat based diet if it is not sustainably sourced or uses perishable fruit and vegetables that are flown in from around the world. Eating locally sourced food can has as big an impact as being vegetarian. [2] [1] Tara Kelly, Simon Fairlie: How Eating Meat Can Save the World, 12 October 2010 [2] Lucy Siegle, ‘It is time to become a vegetarian?’ The Observer, 18th May 2008...
56
+
57
+
58
+ **Example 2:**
59
+
60
+ - Query: It is immoral to kill animals As evolved human beings it is our moral duty to inflict as little pain as possible for our survival. So if we do not need to inflict pain to animals in order to survive, we should not do it. Farm animals such as chickens, pigs, sheep, and cows are sentient living beings like us - they are our evolutionary cousins and like us they can feel pleasure and pain. The 18th century utilitarian philosopher Jeremy Bentham even believed that animal suffering was just as serious as human suffering and likened the idea of human superiority to racism. It is wrong to farm and kill these animals for food when we do not need to do so. The methods of farming and slaughter of these animals are often barbaric and cruel - even on supposedly 'free range' farms. [1] Ten billion animals were slaughtered for human consumption each year, stated PETA. And unlike the farms long time ago, where animals roamed freely, today, most animals are factory farmed: —crammed into cages where they can barely move and fed a diet adulterated with pesticides and antibiotics. These animals spend their entire lives in their “prisoner cells” so small that they can't even turn around. Many suffer serious health problems and even death because they are selectively bred to grow or produce milk or eggs at a far greater rate than their bodies are capable of coping with. At the slaughterhouse, there were millions of others who are killed every year for food. Further on Tom Regan explains that all duties regarding animals are indirect duties to one another from a philosophical point of view. He illustrates it with an analogy regarding children: “Children, for example, are unable to sign contracts and lack rights. But they are protected by the moral contract nonetheless because of the sentimental interests of others. So we have, then, duties involving these children, duties regarding them, but no duties to them. 
Our duties in their case are indirect duties to other human beings, usually their parents.” [2] With this he supports the theory that animals must be protected from suffering, as it is moral to protect any living being from suffering, not because we have a moral contract with them, but mainly due to respect of life and recognition of suffering itself. [1] Claire Suddath, A brief history of Veganism, Time, 30 October 2008 [2] Tom Regan, The case for animal rights, 1989
61
+
62
+ - Doc Title: animals environment general health health general weight philosophy ethics
63
+
64
+ - Doc Text: There is a great moral difference between humans and animals. Unlike animals, humans are capable of rational thought and can alter the world around them. Other creatures were put on this earth for mankind to use, and that includes eating meat. For all these reasons we say that men and women have rights and that animals don’t. This means that eating meat is in no way like murder. It is natural for human beings to farm, kill, and eat other species. In the wild there is a brutal struggle for existence. The fact that we humans have succeeded in that struggle by exploiting our natural environment means that we have a natural right over lower species. In fact farming animals is much less brutal than the pain and hardship that animals inflict on each other naturally in the wild. Eating meat does not need to mean cruelty to animals. There are a growing number of organic and free-range farms that can provide meat without cruelty to animals. Similarly, it might be reasonable to argue for an extension of animal welfare laws to protect farm animals - but that does not mean that it is wrong in principle to eat meat....
65
+
66
+
67
+ **Example 3:**
68
+
69
+ - Query: Vegetarianism is healthier There are significant health benefits to 'going veggie'; a vegetarian diet contains high quantities of fibre, vitamins, and minerals, and is low in fat. (A vegan diet is even better since eggs and dairy products are high in cholesterol.) The risk of contracting many forms of cancer is increased by eating meat: in 1996 the American Cancer Society recommended that red meat should be excluded from the diet entirely. Eating meat also increases the risk of heart disease - vegetables contain no cholesterol, which can build up to cause blocked arteries in meat-eaters. An American study found out that: “that men in the highest quintile of red-meat consumption — those who ate about 5 oz. of red meat a day, roughly the equivalent of a small steak had a 31% higher risk of death over a 10-year period than men in the lowest-consumption quintile, who ate less than 1 oz. of red meat per day, or approximately three slices of corned beef.” [1] A vegetarian diet reduces the risk for chronic degenerative diseases such as obesity, high blood pressure, diabetes and types of cancer including colon, breast, stomach, and lung cancer because of it's low fat/cholesterol content. There are plenty of vegetarian sources of protein, such as beans and bean curd; and spinach is one of the best sources of iron. [1] Tiffany Sharples, ‘The Growing Case Against Red Meat’, Time, 23rd March 2009
70
+
71
+ - Doc Title: animals environment general health health general weight philosophy ethics
72
+
73
+ - Doc Text: The key to good health is a balanced diet, not a meat- and fish-free diet. Meat and fish are good sources of protein, iron, and other vitamins and minerals. Most of the health benefits of a vegetarian diet derive from its being high in fibre and low in fat and cholesterol. These can be achieved by avoiding fatty and fried foods, eating only lean grilled meat and fish, and including a large amount of fruit and vegetables in your diet along with meat and fish. In general, raw, unprocessed meat from the muscle is made up of the following: protein 15 - 22 % Fat 3 - 15 % Minerals, carbohydrates 1 - 5 % Water 65 - 75 %, all things that we need in moderation. [1] A meat- and fish-free diet is unbalanced and makes it more likely that you will go short of protein, iron and some minerals such as B12 for which we are primarily dependent on animal foodstuffs. Also, a vegetarian diet, in the West, is a more expensive option - a luxury for the middle classes. Fresh fruit and vegetables are extremely expensive compared to processed meats, bacon, burgers, sausages etc. [1] Bell, ‘Nutrition &amp; Well-Being’...
74
+
75
+
76
+ **Example 4:**
77
+
78
+ - Query: Being vegetarian reduces risks of food poisoning Almost all dangerous types of food poisoning are passed on through meat or eggs. So Campylobacter bacteria, the most common cause of food poisoning in England, are usually found in raw meat and poultry, unpasteurised milk and untreated water. Salmonella come from raw meat, poultry and dairy products and most cases of escherichia coli (E-Coli) food poisoning occur after eating undercooked beef or drinking unpasteurised milk. [1] Close contact between humans and animals also leads to zoonosis – diseases such as bird ‘flu which can be passed on from animals to humans. Using animal brains in the processed feed for livestock led to BSE in cattle and to CJD in humans who ate beef from infected cows. [1] Causes of food poisoning, nhs.co.uk, 23rd June 2009
79
+
80
+ - Doc Title: animals environment general health health general weight philosophy ethics
81
+
82
+ - Doc Text: Food safety and hygiene are very important for everyone, and governments should act to ensure that high standards are in place particularly in restaurants and other places where people get their food from. But food poisoning can occur anywhere “People don't like to admit that the germs might have come from their own home” [1] and while meat is particularly vulnerable to contamination there are bacteria that can be transmitted on vegetables, for example Listeria monocytogenes can be transmitted raw vegetables. [2] Almost three-quarters of zoonotic transmissions are caused by pathogens of wildlife origin; even some that could have been caused by livestock such as avian flu could equally have come from wild animals. There is little we can do about the transmission of such diseases except by reducing close contact. Thus changing to vegetarianism may reduce such diseases by reducing contact but would not eliminate them. [3] Just as meat production can raise health issues, so does the arable farming of plants – examples include GM crops and worries about pesticide residues on fruit and vegetables. The important thing is not whether the diet is meat based or vegetarian; just that we should ensure all food is produced in a safe and healthy way. [1] ‘ 10 ways to prevent food poisoning’, nhs.co.uk, 28th November 2010. [2] Food Poisoning, emedicinehealth. [3] Ulrich Desselberger, ‘The significance of zoonotic transmission of viruses in human disease’, Microbiology Today, November 2009....
83
+
84
+
85
+ **Example 5:**
86
+
87
+ - Query: There are problems with being vegetarian A vegetarian or vegan diet may result in a person not getting enough iron. This is because, although you can get iron from foods such as pulses, green leafy vegetables and nuts, the iron in these foods isn't absorbed so easily. The symptoms of this feeling breathless after little exercise, feeling tired and a short attention span and poor concentration. [1] These symptoms could negatively affect proficiency in school and the ability to perform well at work ultimately leading to a loss of productivity which has both personal effects and broader effects for the economy. Other conditions include frequently becoming ill, frequently becoming depressed, and malnourishment. [1] Bupa's Health Information Team, ‘Iron-deficiency anaemia’, bupa.co.uk, March 2010,
88
+
89
+ - Doc Title: animals environment general health health general weight philosophy ethics
90
+
91
+ - Doc Text: The problems with fatigue, apathetic behaviour and concentration are mostly a result from a lack of iron in the diet. However as with any diet this is only a problem when not eating the right things, this regularly means that such iron deficiency can be a problem in the developing world where vegetarians have little choice – usually eating little else except what they grow, normally just cereals. “Although the iron stores of vegetarians are sometimes reduced, the incidence of iron-deficiency anaemia in vegetarians is not significantly different from that in the general population”, there are plenty of sources of iron that can be eaten by vegetarians such as legumes and whole grains that are a substantial part of most western vegetarian’s diets meaning it is not a problem. [1] Research done in Australia concludes that "There was no significant difference between mean daily iron intakes of vegetarians and omnivores". [2] [1] David Ogilvie, Nutrition: Iron and Vegetarian Diets, Vegetarian Network Victoria, September 2010. [2] Madeleine J Ball and Melinda A Bartlett, ‘Dietary intake and iron status of Australian vegetarian women’, American Society for Clinical Nutrition, 1999...
92
+
93
+
94
+ **Example 6:**
95
+
96
+ - Query: Survival of the fittest It is natural for human beings to farm, kill, and eat other species. In the wild there is a brutal struggle for existence as is shown by Darwin’s On the Origin of the Species. [1] The fact that we humans have succeeded in that struggle by exploiting our natural environment means that we have a natural right over lower species. The concept of survival of the fittest may seem outdated but it is still the defining order of nature. In fact farming animals is much less brutal than the pain and hardship that animals inflict on each other naturally in the wild. [1] Darwin, Charles, On the Origin of Species by Means of Natural Selection, or the Preservation of Favoured Races in the Struggle for Life., Literature.org
97
+
98
+ - Doc Title: animals environment general health health general weight philosophy ethics
99
+
100
+ - Doc Text: To suggest that battery farms are in some way 'natural' is absurd - they are unnatural and cruel. To eat meat is to perpetuate animal suffering on a huge scale - a larger, crueler, and more systematic scale than anything found in the wild. Furthermore, the very fact of humanity's 'superiority' over other animals means they have the reason and moral instinct to stop exploiting other species. If an alien species from another planet, much more intelligent and powerful than humans, came and colonized the earth and farmed (and force-fed) human beings in battery farm conditions we would think it was morally abhorrent. If this would be wrong, then is it not wrong for we 'superior' humans to farm 'lower' species on earth simply because of our ability to do so?...
101
+
102
+
103
+ **Example 7:**
104
+
105
+ - Query: Humans can choose their own nutrition plan Humans are omnivores – we are meant to eat both meat and plants. Like our early ancestors we have sharp canine teeth for tearing animal flesh and digestive systems adapted to eating meat and fish as well as vegetables. Our stomachs are also adapted to eating both meat and vegetable matter. All of this means that eating meat is part of being human. Only in a few western countries are people self-indulgent enough to deny their nature and get upset about a normal human diet. We were made to eat both meat and vegetables - cutting out half of this diet will inevitably mean we lose that natural balance. Eating meat is entirely natural. Like many other species, human beings were once hunters. In the wild animals kill and are killed, often very brutally and with no idea of “rights”. As mankind has progressed over thousands of years we have largely stopped hunting wild animals. Instead we have found kinder and less wasteful ways of getting the meat in our diets through domestication. Farm animals today are descended from the animals we once hunted in the wild.
106
+
107
+ - Doc Title: animals environment general health health general weight philosophy ethics
108
+
109
+ - Doc Text: Human evolved as omnivores over thousands of years. Yet since the invention of farming there is no longer a need for us to be omnivores. Even if we wished to we could no longer collect, hunt and eat our food in the same way as our ancestors as we could not support the human population. We have outstripped the pace of our evolution and if we do not want to be turning ever more land over to farming we have get our food from the most efficient sources, which means being vegetarian....
110
+
111
+
112
+ **Example 8:**
113
+
114
+ - Query: Animal research causes severe harm to the animals involved The point of animal research is that animals are harmed. Even if they don’t suffer in the experiment, almost all are killed afterwards. With 115 million animals used a year this is a big problem. Releasing medical research animals in to the wild would be dangerous for them, and they would not be usable as pets. [4]. The only solution is that they are wild from birth. It is obvious that it’s not in the interest of animals to be killed or harmed. Research should be banned in order to prevent the deaths of millions of animals.
115
+
116
+ - Doc Title: animals science science general ban animal testing junior
117
+
118
+ - Doc Text: What then is the interest of the animal? If releasing these animals into the wild would kill them then surely it is humane to put them down after the experiment. It must also be remembered that the interest of the animal is not the main and is outweighed by the benefits to humans. [5]...
119
+
120
+
121
+ **Example 9:**
122
+
123
+ - Query: It would send out a consistent message Most countries have animal welfare laws to prevent animal cruelty but have laws like the UK’s Animals (Scientific Procedures) Act 1986, [10] that stop animal testing being a crime. This makes means some people can do things to animals, but not others. If the government are serious about animal abuse, why allow anyone to do it?
124
+
125
+ - Doc Title: animals science science general ban animal testing junior
126
+
127
+ - Doc Text: There is a moral difference between harm for the sake of harming an animal and harm in order to save lives. Lifesaving drugs is a very different purpose to betting or enjoyment that animal welfare laws are aimed at....
128
+
129
+
130
+ **Example 10:**
131
+
132
+ - Query: Animals shouldn’t be harmed The difference between us and other animals is a matter of degree rather than type [2]. Their bodies resemble ours, as do their ways of conveying meaning. They recoil from pain, appear to express fear of a tormentor, and appear to take pleasure in activities; a point clear to anyone who has observed a pet dog on hearing the word “walk”. We believe other people experience feelings like us because they are like us in appearance and behaviour. An animal sharing our anatomical, physiological, and behavioural characteristics is surely likely to have feelings like us. If people have a right to not be harmed, we must ask ourselves what makes animals different? If animals feel what we feel, and suffer like us, to condemn one to testing because of them being of a different species is similar to racism or sexism.[3]
133
+
134
+ - Doc Title: animals science science general ban animal testing junior
135
+
136
+ - Doc Text: The right of a human not to be harmed is based not on appearance but on not harming others. Animals don’t participate in this. Animals won’t stop hunting because of the pain and feelings of other animals. Even if animal testing were to be abolished people would still eat meat, and kill animals for other less worthwhile reasons than animal testing....
137
+
138
+
139
+ ## CORE
140
+
141
+ **Only Score 1 exists**
142
+ **Score 1 = RELEVANT** ✓
143
+
144
+ ## Special Notes
145
+
146
+ - **Original Purpose**: Find the best counterargument to any argument
147
+ - **What Score 1 Really Means**: Documents that argue AGAINST the query's position (opposing stance, same topic)
148
+ - **Dataset Type**: Argumentation/Debate
149
+ - **Unique Characteristic**: Counterarguments ARE relevant (same topic, different stance)
150
+
151
+ ## Training Verdict
152
+
153
+ **GOOD FOR TRAINING** ✓
154
+ - Score 1 documents are truly relevant (topically related counterarguments)
155
+ - Can use for binary classification: Score 1 = positive class
156
+
157
+ ## Observations
test_scores/dataset_reports/climate-fever_analysis.md ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CLIMATE-FEVER Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 1535
6
+
7
+ - Total corpus documents: 5416593
8
+
9
+ - Corpus documents loaded for examples: 50000
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | N/A | N/A | N/A |
19
+
20
+ | dev | N/A | N/A | N/A |
21
+
22
+ | test | 1535 | 4681 | 1:4681 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Only one score level (1) found in this dataset.**
28
+
29
+ - This means all labeled documents are considered equally relevant
30
+
31
+ - Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
32
+
33
+
34
+ ## Examples
35
+
36
+ ### Score Comparison Examples
37
+
38
+ ### Examples by Split and Score
39
+
40
+
41
+ #### TEST Split
42
+
43
+ **Scores in test: [1]**
44
+
45
+
46
+ ##### Score 1 Examples (test)
47
+
48
+
49
+ **Example 1:**
50
+
51
+ - Query: Duffy pointed out that his chart was from a single tide gauge station, near San Francisco, and that sea levels rise at different rates around the world
52
+
53
+ - Doc Title: 1906 San Francisco earthquake
54
+
55
+ - Doc Text: The 1906 San Francisco earthquake struck the coast of Northern California at 5:12 a.m. on April 18 with an estimated moment magnitude of 7.8 and a maximum Mercalli intensity of XI ( Extreme ) . Severe shaking was felt from Eureka on the North Coast to the Salinas Valley , an agricultural region to the south of the San Francisco Bay Area . Devastating fires soon broke out in the city and lasted for several days . As a result , about 3,000 people died and over 80 % of the city of San Francisco was destroyed . The events are remembered as one of the worst and deadliest natural disasters in the history of the United States . The death toll remains the greatest loss of life from a natural disaster in California 's history and high in the lists of American urban disasters ....
56
+
57
+
58
+ **Example 2:**
59
+
60
+ - Query: While such a storm does not happen in New England anymore, it happened again there in 1675, with elderly eyewitnesses comparing it to the 1635 storm.
61
+
62
+ - Doc Title: 1993 Storm of the Century
63
+
64
+ - Doc Text: The 1993 Storm of the Century ( also known as the 93 Super Storm or the Great Blizzard of 1993 ) was a large cyclonic storm that formed over the Gulf of Mexico on March 12 , 1993 . The storm eventually dissipated in the North Atlantic Ocean on March 15 , 1993 . It was unique for its intensity , massive size , and wide-reaching effects . At its height , the storm stretched from Canada to the Gulf of Mexico . The cyclone moved through the Gulf of Mexico and then through the eastern United States before moving onto Canada . Heavy snow was first reported in highland areas as far south as Alabama and northern Georgia , with Union County , Georgia reporting up to 35 inches of snow in the north Georgia mountains . Birmingham , Alabama , reported a rare 13 in of snow . The Florida Panhandle reported up to 4 in , with hurricane-force wind gusts and record low barometric pressures . Between Louisiana and Cuba , the hurricane-force winds produced high storm surges across Northwestern Florida which , in combination with scattered tornadoes , killed dozens of people . Record cold temperatures were seen across portions of the south and east of the US in the wake of this storm . In the United States , the storm was responsible for the loss of electric power to more than 10 million households . An estimated 40 percent of the country 's population experienced the effects of the storm with a total of 208 fatalities ....
65
+
66
+
67
+ **Example 3:**
68
+
69
+ - Query: While volcanic eruptions are natural events, it was the timing of these that had such a noticeable effect on the trend
70
+
71
+ - Doc Title: 1257 Samalas eruption
72
+
73
+ - Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time . This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone . The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
74
+
75
+
76
+ **Example 4:**
77
+
78
+ - Query: ‘While volcanic eruptions are natural events, it was the timing of these that had such a noticeable effect on the trend.
79
+
80
+ - Doc Title: 1257 Samalas eruption
81
+
82
+ - Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time . This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone . The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
83
+
84
+
85
+ **Example 5:**
86
+
87
+ - Query: A drop in volcanic activity caused warming.
88
+
89
+ - Doc Title: 1257 Samalas eruption
90
+
91
+ - Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time . This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone . The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
92
+
93
+
94
+ **Example 6:**
95
+
96
+ - Query: A drop of volcanic activity in the early 20th century may have had a warming effect.
97
+
98
+ - Doc Title: 1257 Samalas eruption
99
+
100
+ - Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time . This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone . The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
101
+
102
+
103
+ ## CORE
104
+
105
+ **Only Score 1 exists**
106
+ **Score 1 = QUESTIONABLY RELEVANT** ⚠️
107
+
108
+ ## Special Notes
109
+
110
+ - **Original Purpose**: Fact verification - find evidence that supports/refutes climate claims
111
+ - **What Score 1 Really Means**: Documents that supposedly provide evidence for the claim but often merely share keywords with it
112
+ - **Major Problem**: Documents often don't actually address the claims (e.g., the query "A drop in volcanic activity caused warming." is paired with the "1257 Samalas eruption" doc)
113
+ - **Dataset Type**: Fact verification (but poorly executed in the BEIR version)
114
+
115
+ ## Training Verdict
116
+
117
+ **BAD FOR TRAINING** ✗
118
+ - Score 1 documents often just match keywords without addressing the claim
119
+ - High false positive rate - many "relevant" docs aren't actually relevant
120
+ - Will train the model to do keyword matching instead of judging true relevance
121
+
122
+ ## Observations
test_scores/dataset_reports/dbpedia-entity_analysis.md ADDED
@@ -0,0 +1,908 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DBPEDIA-ENTITY Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 467
6
+
7
+ - Total corpus documents: 4635922
8
+
9
+ - Corpus documents loaded for examples: 50000
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | N/A | N/A | N/A |
19
+
20
+ | dev | 67 | 5673 | 0:4268, 1:1024, 2:381 |
21
+
22
+ | test | 400 | 43515 | 0:28229, 1:8785, 2:6501 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Multiple score levels found: [0, 1, 2]**
28
+
29
+
30
+ ## Examples
31
+
32
+ ### Score Comparison Examples
33
+
34
+ #### Comparison Example 1
35
+
36
+ **Query:** Eiffel
37
+
38
+ **From:** dev split
39
+
40
+
41
+ **Score 2 Documents:**
42
+
43
+
44
+ *Document 1:*
45
+
46
+ - Title: Eiffel Tower
47
+
48
+ - Text: The Eiffel Tower (/ˈaɪfəl ˈtaʊər/ EYE-fəl TOWR; French: tour Eiffel [tuʁ‿ɛfɛl] About this sound listen) is an iron lattice tower located on the Champ de Mars in Paris, France. It was named after the engineer Alexandre Gustave Eiffel, whose company designed and built the tower....
49
+
50
+
51
+ *Document 2:*
52
+
53
+ - Title: Gustave Eiffel
54
+
55
+ - Text: Alexandre Gustave Eiffel (born Bönickhausen; /ˈaɪfəl/; French pronunciation: ​[efɛl]; 15 December 1832 – 27 December 1923) was a French civil engineer and architect. A graduate of the prestigious École Centrale des Arts et Manufactures of France, he made his name with various bridges for the French railway network, most famously the Garabit viaduct....
56
+
57
+
58
+
59
+ **Score 1 Documents:**
60
+
61
+
62
+ *Document 1:*
63
+
64
+ - Title: Eiffel (programming language)
65
+
66
+ - Text: Eiffel is an ISO-standardized, object-oriented programming language designed by Bertrand Meyer (an object-orientation proponent and author of Object-Oriented Software Construction) and Eiffel Software. The design of the language is closely connected with the Eiffel programming method....
67
+
68
+
69
+
70
+ **Score 0 Documents:**
71
+
72
+
73
+ *Document 1:*
74
+
75
+ - Title: Lightning
76
+
77
+ - Text: Lightning is a sudden electrostatic discharge during an electrical storm between electrically charged regions of a cloud (called intra-cloud lightning or IC), between that cloud and another cloud (CC lightning), or between a cloud and the ground (CG lightning). The charged regions in the atmosphere temporarily equalize themselves through this discharge referred to as a strike if it hits an object ...
78
+
79
+
80
+ *Document 2:*
81
+
82
+ - Title: Nice
83
+
84
+ - Text: Nice (/ˈniːs/, French pronunciation: ​[nis]; Niçard Occitan: Niça [classical norm] or Nissa [nonstandard], Italian: Nizza or Nizza Marittima, Greek: Νίκαια, Latin: Nicaea) is the fifth most populous city in France, after Paris, Marseille, Lyon and Toulouse, and it is the capital of the Alpes Maritimes département. The urban area of Nice extends beyond the administrative city limits with a populati...
85
+
86
+
87
+
88
+ ---
89
+
90
+
91
+ #### Comparison Example 2
92
+
93
+ **Query:** europe solar power facility
94
+
95
+ **From:** dev split
96
+
97
+
98
+ **Score 1 Documents:**
99
+
100
+
101
+ *Document 1:*
102
+
103
+ - Title: Solar energy
104
+
105
+ - Text: Solar energy is radiant light and heat from the Sun harnessed using a range of ever-evolving technologies such as solar heating, photovoltaics, solar thermal energy, solar architecture and artificial photosynthesis.It is an important source of renewable energy and its technologies are broadly characterized as either passive solar or active solar depending on the way they capture and distribute sol...
106
+
107
+
108
+
109
+ **Score 0 Documents:**
110
+
111
+
112
+ *Document 1:*
113
+
114
+ - Title: Germany
115
+
116
+ - Text: Germany (/ˈdʒɜrməni/; German: Deutschland [ˈdɔʏtʃlant]), officially the Federal Republic of Germany (German: Bundesrepublik Deutschland, About this sound listen ), is a federal parliamentary republic in western-central Europe. It includes 16 constituent states and covers an area of 357,021 square kilometres (137,847 sq mi) with a largely temperate seasonal climate. Its capital and largest city is ...
117
+
118
+
119
+ *Document 2:*
120
+
121
+ - Title: Renewable energy
122
+
123
+ - Text: Renewable energy is generally defined as energy that comes from resources which are naturally replenished on a human timescale, such as sunlight, wind, rain, tides, waves, and geothermal heat....
124
+
125
+
126
+
127
+ ---
128
+
129
+
130
+ #### Comparison Example 3
131
+
132
+ **Query:** vietnam war facts
133
+
134
+ **From:** dev split
135
+
136
+
137
+ **Score 2 Documents:**
138
+
139
+
140
+ *Document 1:*
141
+
142
+ - Title: Gulf of Tonkin incident
143
+
144
+ - Text: The Gulf of Tonkin incident (Vietnamese: Sự kiện Vịnh Bắc Bộ), also known as the USS Maddox incident, is the name given to what were originally claimed to be two separate confrontations involving North Vietnam and the United States in the waters of the Gulf of Tonkin. The original American report blamed North Vietnam for both incidents, but eventually became very controversial with widespread clai...
145
+
146
+
147
+ *Document 2:*
148
+
149
+ - Title: Vietnam War
150
+
151
+ - Text: The Vietnam War (Vietnamese: Chiến tranh Việt Nam), also known as the Second Indochina War, and also known in Vietnam as Resistance War Against America (Vietnamese: Kháng chiến chống Mỹ) or simply the American War, was a Cold War-era proxy war that occurred in Vietnam, Laos, and Cambodia from 1 November 1955 to the fall of Saigon on 30 April 1975....
152
+
153
+
154
+
155
+ **Score 1 Documents:**
156
+
157
+
158
+ *Document 1:*
159
+
160
+ - Title: Agent Orange
161
+
162
+ - Text: Agent Orange—or Herbicide Orange (HO)—is one of the herbicides and defoliants used by the U.S. military as part of its herbicidal warfare program, Operation Ranch Hand, during the Vietnam War from 1961 to 1971. It was a mixture of equal parts of two herbicides, 2,4,5-T and 2,4-D.During the late 1940s and 1950s, the US and British collaborated on development of herbicides with potential application...
163
+
164
+
165
+ *Document 2:*
166
+
167
+ - Title: My Lai Massacre
168
+
169
+ - Text: The Mỹ Lai Massacre (Vietnamese: thảm sát Mỹ Lai [tʰɐ̃ːm ʂɐ̌ːt mǐˀ lɐːj], [mǐˀlɐːj]; /ˌmiːˈlaɪ/, /ˌmiːˈleɪ/, or /ˌmaɪˈlaɪ/) was the Vietnam War mass killing of between 347 and 504 unarmed civilians in South Vietnam on March 16, 1968. It was committed by U.S. Army soldiers from the Company C of the 1st Battalion, 20th Infantry Regiment, 11th Brigade of the 23rd (Americal) Infantry Division. Victims...
170
+
171
+
172
+ *Document 3:*
173
+
174
+ - Title: People's Army of Vietnam
175
+
176
+ - Text: The Vietnam People's Army (VPA; Vietnamese: Quân Đội Nhân Dân Việt Nam), also known as the Vietnamese People's Army and the People's Army of Vietnam, is the military forces of the Socialist Republic of Vietnam. VPA is a part of Vietnam People's Armed Forces. The VPA includes: Ground Force (including Strategic Rear Forces), Navy, Air Force, Border Defense Force, and Coast Guard. However, Vietnam d...
177
+
178
+
179
+
180
+ **Score 0 Documents:**
181
+
182
+
183
+ *Document 1:*
184
+
185
+ - Title: Bảo Đại
186
+
187
+ - Text: In the Vietnamese name below, Nguyễn Phúc is the surname.Bảo Đại (lit. "keeper of greatness", 22 October 1913 – 30 July 1997), born Nguyễn Phúc Vĩnh Thụy, was the 13th and final emperor of the Nguyễn Dynasty, which was the last dynasty of Vietnam. From 1926 to 1945, he was king of Annam. During this period, Annam was a protectorate within French Indochina, covering the central two-thirds of the ...
188
+
189
+
190
+ *Document 2:*
191
+
192
+ - Title: French Indochina
193
+
194
+ - Text: French Indo-China (now commonly known as French Indochina) (French: Indochine française; Khmer: សហភាពឥណ្ឌូចិន; Vietnamese: Đông Dương thuộc Pháp, pronounced [ɗoŋm zɰəŋ tʰuə̀k fǎp], frequently abbreviated to Đông Pháp; Lao: ຝຣັ່ງແຫຼັມອິນດູຈີນ), officially known as the Indo-Chinese Union (French: Union indochinoise) after 1887 and the Indo-Chinese Federation (French: Fédération indochinoise) after ...
195
+
196
+
197
+ *Document 3:*
198
+
199
+ - Title: Mainland Southeast Asia
200
+
201
+ - Text: Mainland Southeast Asia, also known as Indochina or the Indochinese Peninsula, refers to the continental portion of Southeast Asia lying east of India and roughly south or southwest of China....
202
+
203
+
204
+
205
+ ---
206
+
207
+
208
+ #### Comparison Example 4
209
+
210
+ **Query:** guitar chord minor
211
+
212
+ **From:** dev split
213
+
214
+
215
+ **Score 1 Documents:**
216
+
217
+
218
+ *Document 1:*
219
+
220
+ - Title: Flamenco
221
+
222
+ - Text: Flamenco (Spanish pronunciation: [flaˈmeŋko]) is a genre of music and dance native to the southern Spanish regions of Andalusia, Extremadura and Murcia. It includes cante (singing), toque (guitar playing), baile (dance) and jaleo, which refers to the vocalizations and rhythmic sounds of palmas (handclapping) and pitos (finger snapping). First mentioned in literature in 1774, the genre is thought t...
223
+
224
+
225
+ *Document 2:*
226
+
227
+ - Title: Guitar
228
+
229
+ - Text: The guitar is a popular musical instrument classified as a string instrument with anywhere from 4 to 18 strings, usually having 6. The sound is projected either acoustically or through electrical amplification (for an acoustic guitar or an electric guitar, respectively). It is typically played by strumming or plucking the strings with the right hand while fretting (or pressing against the fret) t...
230
+
231
+
232
+ *Document 3:*
233
+
234
+ - Title: Jazz guitar
235
+
236
+ - Text: The term jazz guitar may refer to either a type of guitar or to the variety of guitar playing styles used in the various genres which are commonly termed "jazz". The jazz-type guitar was born as a result of using electric amplification to increase the volume of conventional acoustic guitars.Conceived in the early 1930s, the electric guitar became a necessity as jazz musicians sought to amplify the...
237
+
238
+
239
+
240
+ **Score 0 Documents:**
241
+
242
+
243
+ *Document 1:*
244
+
245
+ - Title: Index of music articles
246
+
247
+ - Text: This page is a list articles related to music....
248
+
249
+
250
+ *Document 2:*
251
+
252
+ - Title: Interval (music)
253
+
254
+ - Text: In music theory, an interval is the difference between two pitches. An interval may be described as horizontal, linear, or melodic if it refers to successively sounding tones, such as two adjacent pitches in a melody, and vertical or harmonic if it pertains to simultaneously sounding tones, such as in a chord.In Western music, intervals are most commonly differences between notes of a diatonic sca...
255
+
256
+
257
+ *Document 3:*
258
+
259
+ - Title: Rhythm guitar
260
+
261
+ - Text: Rhythm guitar is a technique and role that performs a combination of two functions: to provide all or part of the rhythmic pulse in conjunction with singers or other instruments; and to provide all or part of the harmony, i.e. the chords, where a chord is a group of notes played together. Therefore, the basic technique of rhythm guitar is to hold down a series of chords with the fretting hand whil...
262
+
263
+
264
+
265
+ ---
266
+
267
+
268
+ #### Comparison Example 5
269
+
270
+ **Query:** guitar classical bach
271
+
272
+ **From:** dev split
273
+
274
+
275
+ **Score 1 Documents:**
276
+
277
+
278
+ *Document 1:*
279
+
280
+ - Title: Andrés Segovia
281
+
282
+ - Text: Andrés Segovia Torres, 1st Marquis of Salobreña (Spanish: [anˈdɾes seˈɣoβja ˈtores]) (21 February 1893 – 2 June 1987), known as Andrés Segovia, was a virtuoso Spanish classical guitarist from Linares, Spain. Regarded as one of the greatest guitarists of all time, he is seen as the grandfather of the classical guitar....
283
+
284
+
285
+
286
+ **Score 0 Documents:**
287
+
288
+
289
+ *Document 1:*
290
+
291
+ - Title: Carl Philipp Emanuel Bach
292
+
293
+ - Text: Carl Philipp Emanuel Bach (8 March 1714 – 14 December 1788) was a German Classical period musician and composer, the fifth child and second (surviving) son of Johann Sebastian Bach and Maria Barbara Bach. His second name was given in honor of his godfather Georg Philipp Telemann, a friend of Johann Sebastian Bach.C. P. E. Bach was an influential composer working at a time of transition between his...
294
+
295
+
296
+ *Document 2:*
297
+
298
+ - Title: Classical guitar
299
+
300
+ - Text: The classical guitar (also called the Spanish guitar, or less specifically, the nylon-string guitar) is the member of the guitar family used in classical music....
301
+
302
+
303
+ *Document 3:*
304
+
305
+ - Title: Classical period (music)
306
+
307
+ - Text: The dates of the Classical period in Western music are generally accepted as being between about 1750 and 1820. However, the term classical music is used in a colloquial sense as a synonym for Western art music, which describes a variety of Western musical styles from the ninth century to the present, and especially from the sixteenth or seventeenth to the nineteenth....
308
+
309
+
310
+
311
+ ---
312
+
313
+
314
+ ### Examples by Split and Score
315
+
316
+
317
+ #### DEV Split
318
+
319
+ **Scores in dev: [0, 1, 2]**
320
+
321
+
322
+ ##### Score 0 Examples (dev)
323
+
324
+
325
+ **Example 1:**
326
+
327
+ - Query: Eiffel
328
+
329
+ - Doc Title: Lightning
330
+
331
+ - Doc Text: Lightning is a sudden electrostatic discharge during an electrical storm between electrically charged regions of a cloud (called intra-cloud lightning or IC), between that cloud and another cloud (CC lightning), or between a cloud and the ground (CG lightning). The charged regions in the atmosphere ...
332
+
333
+
334
+ **Example 2:**
335
+
336
+ - Query: europe solar power facility
337
+
338
+ - Doc Title: Germany
339
+
340
+ - Doc Text: Germany (/ˈdʒɜrməni/; German: Deutschland [ˈdɔʏtʃlant]), officially the Federal Republic of Germany (German: Bundesrepublik Deutschland, About this sound listen ), is a federal parliamentary republic in western-central Europe. It includes 16 constituent states and covers an area of 357,021 square ki...
341
+
342
+
343
+ **Example 3:**
344
+
345
+ - Query: vietnam war facts
346
+
347
+ - Doc Title: Bảo Đại
348
+
349
+ - Doc Text: In the Vietnamese name below, Nguyễn Phúc is the surname.Bảo Đại (lit. "keeper of greatness", 22 October 1913 – 30 July 1997), born Nguyễn Phúc Vĩnh Thụy, was the 13th and final emperor of the Nguyễn Dynasty, which was the last dynasty of Vietnam. From 1926 to 1945, he was king of Annam. During th...
350
+
351
+
352
+ **Example 4:**
353
+
354
+ - Query: vietnam travel airports
355
+
356
+ - Doc Title: Charles de Gaulle Airport
357
+
358
+ - Doc Text: Paris Charles de Gaulle Airport (French: Aéroport de Paris-Charles-de-Gaulle, IATA: CDG, ICAO: LFPG), also known as Roissy Airport (or just Roissy in French), is one of the world's principal aviation centres, as well as France's largest international airport. It is named after Charles de Gaulle (189...
359
+
360
+
361
+ **Example 5:**
362
+
363
+ - Query: guitar chord minor
364
+
365
+ - Doc Title: Index of music articles
366
+
367
+ - Doc Text: This page is a list articles related to music....
368
+
369
+
370
+ **Example 6:**
371
+
372
+ - Query: guitar classical bach
373
+
374
+ - Doc Title: Carl Philipp Emanuel Bach
375
+
376
+ - Doc Text: Carl Philipp Emanuel Bach (8 March 1714 – 14 December 1788) was a German Classical period musician and composer, the fifth child and second (surviving) son of Johann Sebastian Bach and Maria Barbara Bach. His second name was given in honor of his godfather Georg Philipp Telemann, a friend of Johann ...
377
+
378
+
379
+ **Example 7:**
380
+
381
+ - Query: tango music instruments
382
+
383
+ - Doc Title: Accordion
384
+
385
+ - Doc Text: Accordions (from 19th-century German Akkordeon, from Akkord - "musical chord, concord of sounds") are a family of box-shaped musical instruments of the bellows-driven free-reed aerophone type, colloquially referred to as a squeezebox. A person who plays the accordion is called an accordionist....
386
+
387
+
388
+ **Example 8:**
389
+
390
+ - Query: bicycle benefits environment
391
+
392
+ - Doc Title: Bicycle
393
+
394
+ - Doc Text: A bicycle, often called a bike or cycle, is a human-powered, pedal-driven, single-track vehicle, having two wheels attached to a frame, one behind the other. A bicycle rider is called a cyclist, or bicyclist.Bicycles were introduced in the 19th century in Europe and, as of 2003, more than a billion ...
395
+
396
+
397
+ **Example 9:**
398
+
399
+ - Query: July, 1850 president died Millard Fillmore sworn following day
400
+
401
+ - Doc Title: 1850
402
+
403
+ - Doc Text: Year 1850 (MDCCCL) was a common year starting on Tuesday (link will display the full calendar) of the Gregorian calendar and a common year starting on Sunday of the 12-day-slower Julian calendar....
404
+
405
+
406
+ **Example 10:**
407
+
408
+ - Query: 1906 territory Papua island Australian
409
+
410
+ - Doc Title: Australian dollar
411
+
412
+ - Doc Text: The Australian dollar (sign: $; code: AUD) is the currency of the Commonwealth of Australia, including Christmas Island, Cocos (Keeling) Islands, and Norfolk Island, as well as the independent Pacific Island states of Kiribati, Nauru and Tuvalu. Within Australia it is almost always abbreviated with ...
413
+
414
+
415
+ ##### Score 1 Examples (dev)
416
+
417
+
418
+ **Example 1:**
419
+
420
+ - Query: Eiffel
421
+
422
+ - Doc Title: Eiffel (programming language)
423
+
424
+ - Doc Text: Eiffel is an ISO-standardized, object-oriented programming language designed by Bertrand Meyer (an object-orientation proponent and author of Object-Oriented Software Construction) and Eiffel Software. The design of the language is closely connected with the Eiffel programming method....
425
+
426
+
427
+ **Example 2:**
428
+
429
+ - Query: europe solar power facility
430
+
431
+ - Doc Title: Solar energy
432
+
433
+ - Doc Text: Solar energy is radiant light and heat from the Sun harnessed using a range of ever-evolving technologies such as solar heating, photovoltaics, solar thermal energy, solar architecture and artificial photosynthesis.It is an important source of renewable energy and its technologies are broadly charac...
434
+
435
+
436
+ **Example 3:**
437
+
438
+ - Query: vietnam war facts
439
+
440
+ - Doc Title: Agent Orange
441
+
442
+ - Doc Text: Agent Orange—or Herbicide Orange (HO)—is one of the herbicides and defoliants used by the U.S. military as part of its herbicidal warfare program, Operation Ranch Hand, during the Vietnam War from 1961 to 1971. It was a mixture of equal parts of two herbicides, 2,4,5-T and 2,4-D.During the late 1940...
443
+
444
+
445
+ **Example 4:**
446
+
447
+ - Query: guitar chord minor
448
+
449
+ - Doc Title: Flamenco
450
+
451
+ - Doc Text: Flamenco (Spanish pronunciation: [flaˈmeŋko]) is a genre of music and dance native to the southern Spanish regions of Andalusia, Extremadura and Murcia. It includes cante (singing), toque (guitar playing), baile (dance) and jaleo, which refers to the vocalizations and rhythmic sounds of palmas (hand...
452
+
453
+
454
+ **Example 5:**
455
+
456
+ - Query: guitar classical bach
457
+
458
+ - Doc Title: Andrés Segovia
459
+
460
+ - Doc Text: Andrés Segovia Torres, 1st Marquis of Salobreña (Spanish: [anˈdɾes seˈɣoβja ˈtores]) (21 February 1893 – 2 June 1987), known as Andrés Segovia, was a virtuoso Spanish classical guitarist from Linares, Spain. Regarded as one of the greatest guitarists of all time, he is seen as the grandfather of the...
461
+
462
+
463
+ **Example 6:**
464
+
465
+ - Query: tango music instruments
466
+
467
+ - Doc Title: Brass instrument
468
+
469
+ - Doc Text: A brass instrument is a musical instrument that produces sound by sympathetic vibration of air in a tubular resonator in sympathy with the vibration of the player's lips. Brass instruments are also called labrosones, literally meaning "lip-vibrated instruments".There are several factors involved in ...
470
+
471
+
472
+ **Example 7:**
473
+
474
+ - Query: bicycle benefits environment
475
+
476
+ - Doc Title: Cycling
477
+
478
+ - Doc Text: Cycling, also called bicycling or biking, is the use of bicycles for transport, recreation, exercise or sport. Persons engaged in cycling are referred to as "cyclists", "bikers", or less commonly, as "bicyclists". Apart from two-wheeled bicycles, "cycling" also includes the riding of unicycles, tric...
479
+
480
+
481
+ **Example 8:**
482
+
483
+ - Query: 1906 territory Papua island Australian
484
+
485
+ - Doc Title: History of Papua New Guinea
486
+
487
+ - Doc Text: The prehistory of Papua New Guinea can be traced back to about 60,000 years ago when people first migrated towards the Australian continent. The written history began when European navigators first sighted New Guinea in the early part of the 16th century....
488
+
489
+
490
+ **Example 9:**
491
+
492
+ - Query: seat Florida country Dade
493
+
494
+ - Doc Title: Dade City, Florida
495
+
496
+ - Doc Text: Dade City is a city in Pasco County, Florida. It is popular with tourists for its antique stores, restaurants and historic architecture including the Pasco County Courthouse, Hugh Embry Library, and Edwinola. The annual Kumquat Festival is hosted downtown and the surrounding area is a large producer...
497
+
498
+
499
+ **Example 10:**
500
+
501
+ - Query: animals lay eggs mammals
502
+
503
+ - Doc Title: Mammal
504
+
505
+ - Doc Text: Mammals (class Mammalia /məˈmeɪli.ə/ from Latin mamma "breast") are any members of a clade of endothermic amniotes distinguished from reptiles and birds by the possession of hair, three middle ear bones, mammary glands, and a neocortex (a region of the brain)....
506
+
507
+
508
+ ##### Score 2 Examples (dev)
509
+
510
+
511
+ **Example 1:**
512
+
513
+ - Query: Eiffel
514
+
515
+ - Doc Title: Eiffel Tower
516
+
517
+ - Doc Text: The Eiffel Tower (/ˈaɪfəl ˈtaʊər/ EYE-fəl TOWR; French: tour Eiffel [tuʁ‿ɛfɛl] About this sound listen) is an iron lattice tower located on the Champ de Mars in Paris, France. It was named after the engineer Alexandre Gustave Eiffel, whose company designed and built the tower....
518
+
519
+
520
+ **Example 2:**
521
+
522
+ - Query: vietnam war facts
523
+
524
+ - Doc Title: Gulf of Tonkin incident
525
+
526
+ - Doc Text: The Gulf of Tonkin incident (Vietnamese: Sự kiện Vịnh Bắc Bộ), also known as the USS Maddox incident, is the name given to what were originally claimed to be two separate confrontations involving North Vietnam and the United States in the waters of the Gulf of Tonkin. The original American report bl...
527
+
528
+
529
+ **Example 3:**
530
+
531
+ - Query: tango music instruments
532
+
533
+ - Doc Title: Concertina
534
+
535
+ - Doc Text: A concertina is a free-reed musical instrument, like the various accordions and the harmonica. It has a bellows, and buttons typically on both ends of it. When pressed, the buttons travel in the same direction as the bellows, unlike accordion buttons, which travel perpendicularly to the bellows.The ...
536
+
537
+
538
+ **Example 4:**
539
+
540
+ - Query: animals lay eggs mammals
541
+
542
+ - Doc Title: Echidna
543
+
544
+ - Doc Text: Echidnas /ɨˈkɪdnə/, sometimes known as spiny anteaters, belong to the family Tachyglossidae in the monotreme order of egg-laying mammals. The four extant species, together with the platypus, are the only surviving members of that order and are the only extant mammals that lay eggs. Their diet consis...
545
+
546
+
547
+ **Example 5:**
548
+
549
+ - Query: Operating systems to which Steve Jobs related
550
+
551
+ - Doc Title: Apple Lisa
552
+
553
+ - Doc Text: The Lisa is a personal computer designed by Apple Computer, Inc. during the early 1980s. It was the first personal computer to offer a graphical user interface in a machine aimed at individual business users. Development of the Lisa began in 1978. The Lisa sold poorly, with only 100,000 units sold.I...
554
+
555
+
556
+ **Example 6:**
557
+
558
+ - Query: countries which have won the FIFA world cup
559
+
560
+ - Doc Title: Brazil
561
+
562
+ - Doc Text: Brazil (/brəˈzɪl/; Portuguese: Brasil [bɾaˈziw] ), officially the Federative Republic of Brazil (Portuguese: República Federativa do Brasil, About this sound listen ), is the largest country in both South America and the Latin American region. It is the world's fifth-largest country, both by geogra...
563
+
564
+
565
+ **Example 7:**
566
+
567
+ - Query: Compilers that can compile both C and C++
568
+
569
+ - Doc Title: Cfront
570
+
571
+ - Doc Text: Cfront was the original compiler for C++ (then known as "C with Classes") from around 1983, which converted C++ to C; developed by Bjarne Stroustrup. The preprocessor did not understand all of the language and much of the code was written via translations. Cfront had a complete parser, built symbol ...
572
+
573
+
574
+ **Example 8:**
575
+
576
+ - Query: What is the longest river?
577
+
578
+ - Doc Title: Missouri River
579
+
580
+ - Doc Text: The Missouri River is the longest river in North America. Rising in the Rocky Mountains of western Montana, the Missouri flows east and south for 2,341 miles (3,767 km) before entering the Mississippi River north of St. Louis, Missouri. The river takes drainage from a sparsely populated, semi-arid w...
581
+
582
+
583
+ **Example 9:**
584
+
585
+ - Query: Who founded Intel?
586
+
587
+ - Doc Title: Intel
588
+
589
+ - Doc Text: Intel Corporation (commonly referred to as Intel) is an American multinational technology company headquartered in Santa Clara, California. Intel is one of the world's largest and highest valued semiconductor chip makers, based on revenue. It is the inventor of the x86 series of microprocessors, the...
590
+
591
+
592
+ **Example 10:**
593
+
594
+ - Query: What is the largest city in Australia?
595
+
596
+ - Doc Title: Sydney
597
+
598
+ - Doc Text: Sydney /ˈsɪdni/ is the state capital of New South Wales and the most populous city in Australia and Oceania. Located on Australia's east coast, the metropolis surrounds the world's largest natural harbour, and sprawls towards the Blue Mountains to the west. Residents of Sydney are known as \Sydneysi...
599
+
600
+
601
+ #### TEST Split
602
+
603
+ **Scores in test: [0, 1, 2]**
604
+
605
+
606
+ ##### Score 0 Examples (test)
607
+
608
+
609
+ **Example 1:**
610
+
611
+ - Query: Szechwan dish food cuisine
612
+
613
+ - Doc Title: Black cardamom
614
+
615
+ - Doc Text: Black cardamom, also known as hill cardamom, Bengal cardamom, greater cardamom, Indian cardamom, Nepal cardamom, winged cardamom, or brown cardamom, comes from either of two species in the family Zingiberaceae. Its seed pods have a strong camphor-like flavor, with a smoky character derived from the ...
616
+
617
+
618
+ **Example 2:**
619
+
620
+ - Query: roman architecture
621
+
622
+ - Doc Title: Antoninus Pius
623
+
624
+ - Doc Text: Antoninus Pius (Latin: Titus Fulvus Aelius Hadrianus Antoninus Augustus Pius; born 19 September, 86 AD – died 7 March, 161 AD), also known as Antoninus, was Roman Emperor from 138 to 161. He was one of the Five Good Emperors in the Nerva–Antonine dynasty and the Aurelii.He acquired the name Pius af...
625
+
626
+
627
+ **Example 3:**
628
+
629
+ - Query: finland car industry manufacturer saab sisu
630
+
631
+ - Doc Title: DKW
632
+
633
+ - Doc Text: DKW (Dampf-Kraft-Wagen, English: steam-driven car) is a defunct German car and motorcycle marque.In 1916, Danish engineer Jørgen Skafte Rasmussen founded a factory in Zschopau, Saxony, Germany, to produce steam fittings. That year he attempted to produce a steam-driven car, called the DKW. Although ...
634
+
635
+
636
+ **Example 4:**
637
+
638
+ - Query: france second world war normandy
639
+
640
+ - Doc Title: Normandy
641
+
642
+ - Doc Text: Normandy (/ˈnɔrməndi/; French: Normandie, pronounced [nɔʁ.mɑ̃.di], Norman: Normaundie, from Old French Normanz, plural of Normant, originally from the word for "northman" in several Scandinavian languages) is a geographical region of France corresponding to the former Duchy of Normandy.Since 1956, N...
643
+
644
+
645
+ **Example 5:**
646
+
647
+ - Query: social network group selection
648
+
649
+ - Doc Title: Evolutionary psychology
650
+
651
+ - Doc Text: Evolutionary psychology (EP) is a theoretical approach in the social and natural sciences that examines psychological structure from a modern evolutionary perspective. It seeks to identify which human psychological traits are evolved adaptations – that is, the functional products of natural selectio...
652
+
653
+
654
+ **Example 6:**
655
+
656
+ - Query: D-Day normandy invasion
657
+
658
+ - Doc Title: Bedford, Virginia
659
+
660
+ - Doc Text: Bedford is an incorporated town located within Bedford County in the U.S. state of Virginia. It serves as the county seat of Bedford County. As of the 2010 census, the population was 6,622....
661
+
662
+
663
+ **Example 7:**
664
+
665
+ - Query: web ranking scoring algorithm
666
+
667
+ - Doc Title: Algorithm
668
+
669
+ - Doc Text: In mathematics and computer science, an algorithm (/ˈælɡərɪðəm/ AL-gə-ri-dhəm) is a self-contained step-by-step set of operations to be performed. Algorithms exist that perform calculation, data processing, and automated reasoning.An algorithm is an effective method that can be expressed within a f...
670
+
671
+
672
+ **Example 8:**
673
+
674
+ - Query: virtual museums
675
+
676
+ - Doc Title: List of museums by country
677
+
678
+ - Doc Text: This list of museums is defined for this context as institutions (including nonprofit organizations, government entities, and private businesses) that collect and care for objects of cultural, artistic, scientific, or historical interest and make their collections or related exhibits available for p...
679
+
680
+
681
+ **Example 9:**
682
+
683
+ - Query: Indian food
684
+
685
+ - Doc Title: Rice
686
+
687
+ - Doc Text: Rice is the seed of the grass species Oryza sativa (Asian rice) or Oryza glaberrima (African rice). As a cereal grain, it is the most widely consumed staple food for a large part of the world's human population, especially in Asia....
688
+
689
+
690
+ **Example 10:**
691
+
692
+ - Query: composer museum
693
+
694
+ - Doc Title: Béla Bartók
695
+
696
+ - Doc Text: Béla Viktor János Bartók (/ˈbɑrtɒk/; Hungarian pronunciation: [ˈbeːlɒ ˈbɒrtoːk]; March 25, 1881 – September 26, 1945) was a Hungarian composer and pianist. He is considered one of the most important composers of the 20th century; he and Liszt are regarded as Hungary's greatest composers (Gillies 200...
697
+
698
+
699
+ ##### Score 1 Examples (test)
700
+
701
+
702
+ **Example 1:**
703
+
704
+ - Query: Szechwan dish food cuisine
705
+
706
+ - Doc Title: American Chinese cuisine
707
+
708
+ - Doc Text: American Chinese cuisine, known in the United States as simply Chinese cuisine, is a style of food developed by Americans of Chinese descent and served in many North American Chinese restaurants. The dishes typically served in restaurants cater to American tastes and differ significantly from Chines...
709
+
710
+
711
+ **Example 2:**
712
+
713
+ - Query: roman architecture
714
+
715
+ - Doc Title: Arch
716
+
717
+ - Doc Text: An arch is a curved structure that spans a space and may or may not support weight above it. Arch may be synonymous with vault but vault may be distinguished as a continuous arch forming a roof. Arches appeared as early as the 2nd millennium BC in Mesopotamian brick architecture and their systemati...
718
+
719
+
720
+ **Example 3:**
721
+
722
+ - Query: finland car industry manufacturer saab sisu
723
+
724
+ - Doc Title: Valmet
725
+
726
+ - Doc Text: The Valmet Corporation is a Finnish company, a leading global developer and supplier of services and technologies for the pulp, paper and energy industries. It was reborn through the demerger of the pulp, paper and power businesses from Metso Group in December 2013. Valmet is organized around three ...
727
+
728
+
729
+ **Example 4:**
730
+
731
+ - Query: france second world war normandy
732
+
733
+ - Doc Title: Invasion of Normandy
734
+
735
+ - Doc Text: The Invasion of Normandy was the invasion by and establishment of Western Allied forces in Normandy, during Operation Overlord in 1944 during World War II; the largest amphibious invasion to ever take place.D-Day, the day of the initial assaults, was Tuesday 6 June 1944. Allied land forces that saw ...
736
+
737
+
738
+ **Example 5:**
739
+
740
+ - Query: social network group selection
741
+
742
+ - Doc Title: Community
743
+
744
+ - Doc Text: A community is a social unit of any size that shares common values. Although embodied or face-to-face communities are usually small, larger or more extended communities such as a national community, international community and virtual community are also studied....
745
+
746
+
747
+ **Example 6:**
748
+
749
+ - Query: D-Day normandy invasion
750
+
751
+ - Doc Title: 1944
752
+
753
+ - Doc Text: 1944 (MCMXLIV)was a leap year starting on Saturday of the Gregorian calendar (dominical letter BA), the 1944th year of the Common Era (CE) and Anno Domini (AD) designations, the 944th year of the 2nd millennium, the 44th year of the 20th century, and the 5th year of the 1940s decade....
754
+
755
+
756
+ **Example 7:**
757
+
758
+ - Query: web ranking scoring algorithm
759
+
760
+ - Doc Title: Google Search
761
+
762
+ - Doc Text: Google Search, commonly referred to as Google Web Search or just Google, is a web search engine owned by Google Inc. It is the most-used search engine on the World Wide Web, handling more than three billion searches each day. As of February 2015 it is the most used search engine in the US with 64.5%...
763
+
764
+
765
+ **Example 8:**
766
+
767
+ - Query: virtual museums
768
+
769
+ - Doc Title: ArtCyclopedia
770
+
771
+ - Doc Text: Artcyclopedia is an online database of museum-quality fine art founded by Canadian John Malyon....
772
+
773
+
774
+ **Example 9:**
775
+
776
+ - Query: Indian food
777
+
778
+ - Doc Title: Cardamom
779
+
780
+ - Doc Text: Cardamom (/ˈkɑrdəməm/), sometimes called cardamon (mostly in the UK), is a spice made from the seeds of several plants in the genera Elettaria and Amomum in the family Zingiberaceae. Both genera are native to Bangladesh, Bhutan, India, Indonesia, Nepal, and Pakistan; they are recognised by their sma...
781
+
782
+
783
+ **Example 10:**
784
+
785
+ - Query: composer museum
786
+
787
+ - Doc Title: Edward Elgar
788
+
789
+ - Doc Text: Sir Edward William Elgar, 1st Baronet OM GCVO (2 June 1857 – 23 February 1934) was an English composer, many of whose works have entered the British and international classical concert repertoire. Among his best-known compositions are orchestral works including the Enigma Variations, the Pomp and Ci...
790
+
791
+
792
+ ##### Score 2 Examples (test)
793
+
794
+
795
+ **Example 1:**
796
+
797
+ - Query: Szechwan dish food cuisine
798
+
799
+ - Doc Title: Chinese cuisine
800
+
801
+ - Doc Text: Chinese cuisine includes styles originating from the diverse regions of China, as well as from Chinese people in other parts of the world. The history of Chinese cuisine in China stretches back for thousands of years and has changed from period to period and in each region according to climate, impe...
802
+
803
+
804
+ **Example 2:**
805
+
806
+ - Query: roman architecture
807
+
808
+ - Doc Title: Aedicula
809
+
810
+ - Doc Text: In ancient Roman religion, an aedicula (plural aediculae) is a small shrine. The word aedicula is the diminutive of the Latin aedes, a temple building or house.Many aediculae were household shrines that held small altars or statues of the Lares and Penates. The Lares were Roman deities protecting th...
811
+
812
+
813
+ **Example 3:**
814
+
815
+ - Query: D-Day normandy invasion
816
+
817
+ - Doc Title: Juno Beach
818
+
819
+ - Doc Text: Juno or Juno Beach was one of five beaches of the Allied invasion of German occupied France in the Normandy landings on 6 June 1944, during the Second World War. The beach spanned from Courseulles-sur-Mer, a village just east of the British beach Gold, to Saint-Aubin-sur-Mer, just west of the Britis...
820
+
821
+
822
+ **Example 4:**
823
+
824
+ - Query: Indian food
825
+
826
+ - Doc Title: Curry
827
+
828
+ - Doc Text: Curry (/ˈkʌri/, plural curries) is a dish originating in the cuisine of the Indian Subcontinent and Southeast Asia. The common feature is the incorporation of complex combinations of spices or herbs, usually including fresh or dried hot chillies . Some limit the use of the term curry to dishes prep...
829
+
830
+
831
+ **Example 5:**
832
+
833
+ - Query: List of films from the surrealist category
834
+
835
+ - Doc Title: Blue Velvet (film)
836
+
837
+ - Doc Text: Blue Velvet is a 1986 American neo-noir mystery film, written and directed by David Lynch. Blending psychological horror with film noir, the film stars Kyle MacLachlan, Isabella Rossellini, Dennis Hopper and Laura Dern. The title is taken from Bobby Vinton's 1963 song of the same name. Although init...
838
+
839
+
840
+ **Example 6:**
841
+
842
+ - Query: Einstein Relativity theory
843
+
844
+ - Doc Title: Spacetime
845
+
846
+ - Doc Text: In physics, spacetime (also space–time, space time or space–time continuum) is any mathematical model that combines space and time into a single interwoven continuum. The spacetime of our universe is usually interpreted from a Euclidean space perspective, which regards space as consisting of three d...
847
+
848
+
849
+ **Example 7:**
850
+
851
+ - Query: summer flowers
852
+
853
+ - Doc Title: Achillea millefolium
854
+
855
+ - Doc Text: Achillea millefolium, known commonly as yarrow /ˈjæroʊ/ or common yarrow, is a flowering plant in the family Asteraceae. It is native to temperate regions of the Northern Hemisphere in Asia, Europe, and North America. It has been introduced as a feed for live stock in places like New Zealand and Aus...
856
+
857
+
858
+ **Example 8:**
859
+
860
+ - Query: vietnam war movie
861
+
862
+ - Doc Title: The Deer Hunter
863
+
864
+ - Doc Text: The Deer Hunter is a 1978 American epic war drama film co-written and directed by Michael Cimino about a trio of Russian American steelworkers and their service in the Vietnam War. The film stars Robert De Niro, Christopher Walken, John Savage, John Cazale, Meryl Streep, and George Dzundza....
865
+
866
+
867
+ **Example 9:**
868
+
869
+ - Query: guitar origin blues
870
+
871
+ - Doc Title: Blind Blake
872
+
873
+ - Doc Text: Arthur "Blind" Blake (1896 – December 1, 1934) was an American blues and ragtime singer and guitarist. He is known for his series of recordings for Paramount Records between 1926 and 1932 and the mystery surrounding his life....
874
+
875
+
876
+ **Example 10:**
877
+
878
+ - Query: tango music composers
879
+
880
+ - Doc Title: Astor Piazzolla
881
+
882
+ - Doc Text: Ástor Pantaleón Piazzolla (Spanish pronunciation: [piasola], Italian pronunciation: [pjattsɔlla]; March 11, 1921 – July 4, 1992) was an Argentine tango composer, bandoneon player and arranger. His oeuvre revolutionized the traditional tango into a new style termed nuevo tango, incorporating elements...
883
+
884
+
885
+ ## CORE
886
+
887
+ **Has Scores 0, 1, 2**
888
+ **Score 2 = RELEVANT** ✓
889
+ **Score 1 = NOT RELEVANT** ✗
890
+
891
+ ## Special Notes
892
+
893
+ - **Original Purpose**: Entity retrieval - find relevant entities for queries
894
+ - **What Scores Actually Mean**:
895
+ - Score 2: Truly relevant entities (e.g., "Eiffel" → Eiffel Tower)
896
+ - Score 1: KEYWORD MATCHES ONLY (e.g., "Eiffel" → Eiffel (programming language), which shares the keyword but not the query's intent)
897
+ - Score 0: Not relevant
898
+ - **Major Problem**: Score 1 is just keyword matching, not intent matching!
899
+
900
+ ## Training Verdict
901
+
902
+ **BAD FOR TRAINING** ✗
903
+ - Score 1 is NOT relevant - just keyword overlap
904
+ - Only Score 2 is truly relevant
905
+ - For binary classification: use ONLY Score 2 as the positive class; treat Scores 0 and 1 as negative
906
+ - Using Score 1 as positive will teach the model to do keyword matching instead of relevance matching!
907
+
908
+ ## Observations
test_scores/dataset_reports/fever_analysis.md ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FEVER Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 123142
6
+
7
+ - Total corpus documents: 5416568
8
+
9
+ - Corpus documents loaded for examples: 50000
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | 109810 | 140085 | 1:140085 |
19
+
20
+ | dev | 6666 | 8079 | 1:8079 |
21
+
22
+ | test | 6666 | 7937 | 1:7937 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Only one score level (1) found in this dataset.**
28
+
29
+ - This means all labeled documents are considered equally relevant
30
+
31
+ - Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
32
+
33
+
34
+ ## Examples
35
+
36
+ ### Score Comparison Examples
37
+
38
+ ### Examples by Split and Score
39
+
40
+
41
+ #### TRAIN Split
42
+
43
+ **Scores in train: [1]**
44
+
45
+
46
+ ##### Score 1 Examples (train)
47
+
48
+
49
+ **Example 1:**
50
+
51
+ - Query: 10 Cloverfield Lane stars the actor John Gallagher, Jr.
52
+
53
+ - Doc Title: 10 Cloverfield Lane
54
+
55
+ - Doc Text: 10 Cloverfield Lane is a 2016 American science fiction psychological thriller film directed by Dan Trachtenberg , produced by J. J. Abrams and Lindsey Weber and written by Josh Campbell , Matthew Stucken and Damien Chazelle . The film stars John Goodman , Mary Elizabeth Winstead , and John Gallagher...
56
+
57
+
58
+ **Example 2:**
59
+
60
+ - Query: The 1991 NBA Finals had the Lakers in it.
61
+
62
+ - Doc Title: 1991 NBA Finals
63
+
64
+ - Doc Text: The 1991 NBA Finals was the championship round of the 1990 -- 91 NBA season . It was also the first NBA Finals broadcast by NBC after 17 years with CBS . The documentary `` Learning to Fly , '' narrated by Jeff Kaye , recaps Chicago 's successful first championship season . The theme song is `` Le...
65
+
66
+
67
+ **Example 3:**
68
+
69
+ - Query: 100 Greatest of All Time had timed episodes.
70
+
71
+ - Doc Title: 100 Greatest of All Time
72
+
73
+ - Doc Text: 100 Greatest of All Time was a television series of five one-hour episodes , produced and first aired by Tennis Channel in March 2012 . It presented a list of the `` 100 greatest tennis players of all time '' , both men and women . The series was hosted by Jack Nicklaus , Jerry Rice , Wayne Gretzky ...
74
+
75
+
76
+ **Example 4:**
77
+
78
+ - Query: In 1992, the 1992 Los Angeles riots occurred.
79
+
80
+ - Doc Title: 1992 Los Angeles riots
81
+
82
+ - Doc Text: The 1992 Los Angeles riots , also known as the Rodney King riots , the South Central riots , the 1992 Los Angeles civil disturbance , the 1992 Los Angeles civil unrest , and the Los Angeles uprising , were a series of riots , lootings , arsons , and civil disturbances that occurred in Los Angeles Co...
83
+
84
+
85
+ **Example 5:**
86
+
87
+ - Query: The 1992 Los Angeles riots were a series of events of civil disorder.
88
+
89
+ - Doc Title: 1992 Los Angeles riots
90
+
91
+ - Doc Text: The 1992 Los Angeles riots , also known as the Rodney King riots , the South Central riots , the 1992 Los Angeles civil disturbance , the 1992 Los Angeles civil unrest , and the Los Angeles uprising , were a series of riots , lootings , arsons , and civil disturbances that occurred in Los Angeles Co...
92
+
93
+
94
+ **Example 6:**
95
+
96
+ - Query: Melbourne is in Russia.
97
+
98
+ - Doc Title: 1956 Summer Olympics
99
+
100
+ - Doc Text: The 1956 Summer Olympics , officially known as the Games of the XVI Olympiad , were an international multi-sport event which was held in Melbourne , Victoria , Australia , in 1956 , apart from the equestrian events , which were held five months earlier in Stockholm . The 1956 Games were the first to...
101
+
102
+
103
+ **Example 7:**
104
+
105
+ - Query: R. Kelly is incapable of being a musician.
106
+
107
+ - Doc Title: 12 Play
108
+
109
+ - Doc Text: 12 Play is the debut studio album by American R&B and soul musician R. Kelly ; it was released on November 9 , 1993 , by Jive Records . The album follows his tenure with R&B group Public Announcement , with whom he released one album , Born into the 90 's ( 1992 ) . It went on to top the R&B albums ...
110
+
111
+
112
+ **Example 8:**
113
+
114
+ - Query: Brad Pitt produced at least two Best Picture Oscar winners.
115
+
116
+ - Doc Title: 12 Years a Slave (film)
117
+
118
+ - Doc Text: 12 Years a Slave is a 2013 period drama film and an adaptation of the 1853 slave narrative memoir Twelve Years a Slave by Solomon Northup , a New York State-born free African-American man who was kidnapped in Washington , D.C. , in 1841 and sold into slavery . Northup worked on plantations in the st...
119
+
120
+
121
+ **Example 9:**
122
+
123
+ - Query: 19 Kids and Counting was cancelled in 2001.
124
+
125
+ - Doc Title: 19 Kids and Counting
126
+
127
+ - Doc Text: 19 Kids and Counting ( formerly 17 Kids and Counting and 18 Kids and Counting ) was an American reality television show that aired on the cable channel TLC for seven years , until its cancellation in 2015 . The show featured the Duggar family : parents Jim Bob and Michelle Duggar and their nineteen ...
128
+
129
+
130
+ **Example 10:**
131
+
132
+ - Query: 19 Kids and Counting was cancelled.
133
+
134
+ - Doc Title: 19 Kids and Counting
135
+
136
+ - Doc Text: 19 Kids and Counting ( formerly 17 Kids and Counting and 18 Kids and Counting ) was an American reality television show that aired on the cable channel TLC for seven years , until its cancellation in 2015 . The show featured the Duggar family : parents Jim Bob and Michelle Duggar and their nineteen ...
137
+
138
+
139
+ #### DEV Split
140
+
141
+ **Scores in dev: [1]**
142
+
143
+
144
+ ##### Score 1 Examples (dev)
145
+
146
+
147
+ **Example 1:**
148
+
149
+ - Query: Damon Albarn has released something.
150
+
151
+ - Doc Title: 13 (Blur album)
152
+
153
+ - Doc Text: 13 is the sixth studio album by English alternative rock band Blur , released on 15 March 1999 . Jettisoning the Britpop sound of Blur 's early career , 13 explores experimental , psychedelic and electronic music . Recording took place from June to October 1998 in London and Reykjavík . The album ...
154
+
155
+
156
+ **Example 2:**
157
+
158
+ - Query: The New York Knicks are in the Prince of Wales Conference of the National Hockey League.
159
+
160
+ - Doc Title: 1969–70 NBA season
161
+
162
+ - Doc Text: The 1969 -- 70 NBA Season was the 24th season of the National Basketball Association . The season ended with the New York Knicks winning the NBA Championship , beating the Los Angeles Lakers 4 games to 3 in the NBA Finals ....
163
+
164
+
165
+ **Example 3:**
166
+
167
+ - Query: Aleister Crowley was European.
168
+
169
+ - Doc Title: 100 Greatest Britons
170
+
171
+ - Doc Text: 100 Greatest Britons was broadcast in 2002 by the BBC . The programme was based on a television poll conducted to determine whom the United Kingdom public considered the greatest British people in history . The series , Great Britons , included individual programmes featuring the individuals who fea...
172
+
173
+
174
+ **Example 4:**
175
+
176
+ - Query: Aleister Crowley was French.
177
+
178
+ - Doc Title: 100 Greatest Britons
179
+
180
+ - Doc Text: 100 Greatest Britons was broadcast in 2002 by the BBC . The programme was based on a television poll conducted to determine whom the United Kingdom public considered the greatest British people in history . The series , Great Britons , included individual programmes featuring the individuals who fea...
181
+
182
+
183
+ **Example 5:**
184
+
185
+ - Query: Nestor Carbonell played Mayor Anthony Garcia in The Dark Knight and The Dark Knight Rises during the 19th century.
186
+
187
+ - Doc Title: 19th century
188
+
189
+ - Doc Text: The 19th century ( 1 January 1801 -- 31 December 1900 ) was the century marked by the collapse of the Spanish , Napoleonic , Holy Roman and Mughal empires . This paved the way for the growing influence of the British Empire , the Russian Empire , the United States , the German Empire , the French co...
190
+
191
+
192
+ #### TEST Split
193
+
194
+ **Scores in test: [1]**
195
+
196
+
197
+ ##### Score 1 Examples (test)
198
+
199
+
200
+ **Example 1:**
201
+
202
+ - Query: John Goodman is not an actor.
203
+
204
+ - Doc Title: 10 Cloverfield Lane
205
+
206
+ - Doc Text: 10 Cloverfield Lane is a 2016 American science fiction psychological thriller film directed by Dan Trachtenberg , produced by J. J. Abrams and Lindsey Weber and written by Josh Campbell , Matthew Stucken and Damien Chazelle . The film stars John Goodman , Mary Elizabeth Winstead , and John Gallagher...
207
+
208
+
209
+ **Example 2:**
210
+
211
+ - Query: The Africa Cup of Nations is a friendly global soccer exhibition.
212
+
213
+ - Doc Title: 1957 African Cup of Nations
214
+
215
+ - Doc Text: The 1957 African Cup of Nations was the first edition of the Africa Cup of Nations , the football championship of Africa ( CAF ) . It was hosted by Sudan , and three teams participated : Egypt , Sudan , and Ethiopia . The tournament has the fewest goals to be scored , as well as the fewest games to ...
216
+
217
+
218
+ **Example 3:**
219
+
220
+ - Query: John Goodman starred in 10 Cloverfield Lane.
221
+
222
+ - Doc Title: 10 Cloverfield Lane
223
+
224
+ - Doc Text: 10 Cloverfield Lane is a 2016 American science fiction psychological thriller film directed by Dan Trachtenberg , produced by J. J. Abrams and Lindsey Weber and written by Josh Campbell , Matthew Stucken and Damien Chazelle . The film stars John Goodman , Mary Elizabeth Winstead , and John Gallagher...
225
+
226
+
227
+ **Example 4:**
228
+
229
+ - Query: Jimi Hendrix received training for air assault operations.
230
+
231
+ - Doc Title: 101st Airborne Division
232
+
233
+ - Doc Text: The 101st Airborne Division ( `` Screaming Eagles '' ) is an elite modular specialized light infantry division of the United States Army trained for air assault operations . It is the most potent and tactically mobile of the U.S. Army 's divisions , and has been referred to as `` the tip of the spea...
234
+
235
+
236
+ **Example 5:**
237
+
238
+ - Query: Jimi Hendrix was not trained for air assault operations.
239
+
240
+ - Doc Title: 101st Airborne Division
241
+
242
+ - Doc Text: The 101st Airborne Division ( `` Screaming Eagles '' ) is an elite modular specialized light infantry division of the United States Army trained for air assault operations . It is the most potent and tactically mobile of the U.S. Army 's divisions , and has been referred to as `` the tip of the spea...
243
+
244
+
245
+ **Example 6:**
246
+
247
+ - Query: B. R. Ambedkar was born in the 20th century.
248
+
249
+ - Doc Title: 19th century
250
+
251
+ - Doc Text: The 19th century ( 1 January 1801 -- 31 December 1900 ) was the century marked by the collapse of the Spanish , Napoleonic , Holy Roman and Mughal empires . This paved the way for the growing influence of the British Empire , the Russian Empire , the United States , the German Empire , the French co...
252
+
253
+
254
+ ## CORE
255
+
256
+ **Only Score 1 exists**
257
+ **Score 1 = RELEVANT** ✓
258
+
259
+ ## Special Notes
260
+
261
+ - **Original Purpose**: Fact verification - find evidence that supports OR refutes claims
262
+ - **What Score 1 Really Means**: Documents containing evidence to verify claims (support or refute)
263
+ - **Dataset Type**: Fact verification
264
+ - **Key Characteristic**: Documents that refute claims ARE relevant (e.g., "Melbourne is in Russia" → doc proving it's in Australia)
265
+
266
+ ## Training Verdict
267
+
268
+ **GOOD FOR TRAINING** ✓
269
+ - Score 1 documents provide actual evidence for/against claims
270
+ - True relevance matching (not just keyword matching)
271
+ - Can use for binary classification: Score 1 = positive class
272
+
273
+ ## Observations
274
+
275
+ ⚠️ **Training data only has Score 1:**
276
+
277
+ - All training documents are marked as equally relevant
278
+
279
+ - Model cannot learn to distinguish graded relevance levels from these labels alone
280
+
281
+ - May indicate collapsed/merged relevance levels
test_scores/dataset_reports/fiqa_analysis.md ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FIQA Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 6648
6
+
7
+ - Total corpus documents: 57638
8
+
9
+ - Corpus documents loaded for examples: 50000
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | 5500 | 14166 | 1:14166 |
19
+
20
+ | dev | 500 | 1238 | 1:1238 |
21
+
22
+ | test | 648 | 1706 | 1:1706 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Only one score level (1) found in this dataset.**
28
+
29
+ - This means all labeled documents are considered equally relevant
30
+
31
+ - Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
32
+
33
+
34
+ ## Examples
35
+
36
+ ### Score Comparison Examples (none: only one score level exists, so there is nothing to compare)
37
+
38
+ ### Examples by Split and Score
39
+
40
+
41
+ #### TRAIN Split
42
+
43
+ **Scores in train: [1]**
44
+
45
+
46
+ ##### Score 1 Examples (train)
47
+
48
+
49
+ **Example 1:**
50
+
51
+ - Query: What is considered a business expense on a business trip?
52
+
53
+ - Doc Text: The IRS Guidance pertaining to the subject. In general the best I can say is your business expense may be deductible. But it depends on the circumstances and what it is you want to deduct. Travel Taxpayers who travel away from home on business may deduct related expenses, including the cost of r...
54
+
55
+
56
+ **Example 2:**
57
+
58
+ - Query: Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip
59
+
60
+ - Doc Text: As a general rule, you must choose between a mileage deduction or an actual expenses deduction. The idea is that the mileage deduction is supposed to cover all costs of using the car. Exceptions include parking fees and tolls, which can be deducted separately under either method. You explicitly c...
61
+
62
+
63
+ **Example 3:**
64
+
65
+ - Query: Starting a new online business
66
+
67
+ - Doc Text: Most US states have rules that go something like this: You will almost certainly have to pay some registration fees, as noted above. Depending on how you organize, you may or may not need to file a separate tax return for the business. (If you're sole proprietor for tax purposes, then you file on S...
68
+
69
+
70
+ **Example 4:**
71
+
72
+ - Query: “Business day” and “due date” for bills
73
+
74
+ - Doc Text: You definitely have an argument for getting them to reverse the late fee, especially if it hasn't happened very often. (If you are late every month they may be less likely to forgive.) As for why this happens, it's not actually about business days, but instead it's based on when they know that you p...
75
+
76
+
77
+ **Example 5:**
78
+
79
+ - Query: New business owner - How do taxes work for the business vs individual?
80
+
81
+ - Doc Text: Through your question and then clarification through the comments, it looks like you have a U.S. LLC with at least two members. If you did not elect some other tax treatment, your LLC will be treated as a partnership by the IRS. The partnership should file a tax return on Form 1065. Then each par...
82
+
83
+
84
+ **Example 6:**
85
+
86
+ - Query: Hobby vs. Business
87
+
88
+ - Doc Text: Miscellaneous income -- same category used for hobbies....
89
+
90
+
91
+ **Example 7:**
92
+
93
+ - Query: Does U.S. tax code call for small business owners to count business purchases as personal income?
94
+
95
+ - Doc Text: "I am going to keep things very simple and explain the common-sense reason why the accountant is right: Also, my sister in law owns a small restaurant, where they claim their accountant informed them of the same thing, where a portion of their business purchases had to be counted as taxable personal...
96
+
97
+
98
+ **Example 8:**
99
+
100
+ - Query: How can I register a UK business without providing a business address?
101
+
102
+ - Doc Text: You don't have to provide your personal home address per se. You can provide a legal address where Companies house can send across paper correspondence to. Companies house legally requires an address because directors are liable to their shareholders(even if you are the only shareholder) and to stop...
103
+
104
+
105
+ **Example 9:**
106
+
107
+ - Query: What are 'business fundamentals'?
108
+
109
+ - Doc Text: From http://financial-dictionary.thefreedictionary.com/Business+Fundamentals The facts that affect a company's underlying value. Examples of business fundamentals include debt, cash flow, supply of and demand for the company's products, and so forth. For instance, if a company does not hav...
110
+
111
+
112
+ **Example 10:**
113
+
114
+ - Query: Business Investment Loss from prior year
115
+
116
+ - Doc Text: You need to give specific dates! In the United States, you have three years to file an amended tax return. https://www.irs.gov/uac/Newsroom/Ten-Facts-about-Amended-Tax-Returns Did the restaurant fail in 2012? If so, that's probably the year to take the loss. If you need to amend your 2012 return, wh...
117
+
118
+
119
+ #### DEV Split
120
+
121
+ **Scores in dev: [1]**
122
+
123
+
124
+ ##### Score 1 Examples (dev)
125
+
126
+
127
+ **Example 1:**
128
+
129
+ - Query: Claiming business expenses for a business with no income
130
+
131
+ - Doc Text: Yes you can claim your business deductions if you are not making any income yet. But first you should decide what structure you want to have for your business. Either a Company structure or a Sole Trader or Partnership. Company Structure If you choose a Company Structure (which is more expensive to ...
132
+
133
+
134
+ **Example 2:**
135
+
136
+ - Query: Transferring money from One business checking to another business checking
137
+
138
+ - Doc Text: "You should have separate files for each of the two businesses. The business that transfers money out should ""write check"" in its QB file. The business that receives money should ""make deposit"" in its QB file. (In QB you ""write check"" even when you make the payment by some other means like A...
139
+
140
+
141
+ **Example 3:**
142
+
143
+ - Query: Having a separate bank account for business/investing, but not a “business account?”
144
+
145
+ - Doc Text: "Having a separate checking account for the business makes sense. It simplifies documenting your income/expenses. You can ""explain"" every dollar entering and exiting the account without having to remember that some of them were for non-business items. My credit union allowed me to have a 2nd check...
146
+
147
+
148
+ **Example 4:**
149
+
150
+ - Query: Income tax exemptions for small business?
151
+
152
+ - Doc Text: Yes, you should be able to deduct at least some of these expenses. For expense incurred before you started the business: What Are Deductible Startup Costs? The IRS defines “startup costs” as deductible capital expenses that are used to pay for: 1) The cost of “investigating the creation or acquisit...
153
+
154
+
155
+ **Example 5:**
156
+
157
+ - Query: Do I need a business credit card?
158
+
159
+ - Doc Text: It can certainly help build a credit score, but remember that businesses gain credit differently from individuals. Depending on the country, there isn't usually a national register of business credit ratings the way there is for individuals. The credit record you'd be gaining is with your own bank...
160
+
161
+
162
+ **Example 6:**
163
+
164
+ - Query: Personal credit card for business expenses
165
+
166
+ - Doc Text: Early on, one might not be able to get credit for their business. For convenience, and the card perks, it makes sense to use the personal card. But for sake of a clean paper trail, I'd choose 1 card and use it exclusively, 100% for the business. Not one card here, one card there....
167
+
168
+
169
+ **Example 7:**
170
+
171
+ - Query: Borrow money to invest in a business venture with equity?
172
+
173
+ - Doc Text: It's clearly a risk, but is it any different than investing in your own business? Yes, it is different. If you own a business, you determine the path of the business. You determine how much risk the business takes. You can put in extra effort to try to make the business work. You can choose to l...
174
+
175
+
176
+ **Example 8:**
177
+
178
+ - Query: What's “wrong” with taking money from your own business?
179
+
180
+ - Doc Text: "I'm no expert on this, but I would say that, if you own the business entirely yourself, there is nothing terribly wrong with using it for your own purposes as you would any other asset that you own. What is wrong is not keeping accurate records that distinguish between your money and the business'...
181
+
182
+
183
+ **Example 9:**
184
+
185
+ - Query: Can Health-Releated Services be a Business Expense?
186
+
187
+ - Doc Text: Chris, since you own your own company, nobody can stop you from charging your personal expenses to your business account. IRS is not a huge fan of mixing business and personal expenses and this practice might indicate to them that you are not treating your business seriously, and it should classify ...
188
+
189
+
190
+ **Example 10:**
191
+
192
+ - Query: What forms do I need to fill out for a super basic LLC closing?
193
+
194
+ - Doc Text: If it is a sole proprietorship and you didn't make another mistake by explicitly asking the IRS to treat it as a corporation - there are no IRS forms to fill. You'll need to dissolve the LLC with your State, though, check the State's department of State/Corporations (depending on the State, the nam...
195
+
196
+
197
+ #### TEST Split
198
+
199
+ **Scores in test: [1]**
200
+
201
+
202
+ ##### Score 1 Examples (test)
203
+
204
+
205
+ **Example 1:**
206
+
207
+ - Query: How to deposit a cheque issued to an associate in my business into my business account?
208
+
209
+ - Doc Text: Just have the associate sign the back and then deposit it. It's called a third party cheque and is perfectly legal. I wouldn't be surprised if it has a longer hold period and, as always, you don't get the money if the cheque doesn't clear. Now, you may have problems if it's a large amount or you'r...
210
+
211
+
212
+ **Example 2:**
213
+
214
+ - Query: Can I send a money order from USPS as a business?
215
+
216
+ - Doc Text: Sure you can. You can fill in whatever you want in the From section of a money order, so your business name and address would be fine. The price only includes the money order itself. You can hand deliver it yourself if you want, but if you want to mail it, you'll have to provide an envelope and a ...
217
+
218
+
219
+ **Example 3:**
220
+
221
+ - Query: 1 EIN doing business under multiple business names
222
+
223
+ - Doc Text: You're confusing a lot of things here. Company B LLC will have it's sales run under Company A LLC, and cease operating as a separate entity These two are contradicting each other. If B LLC ceases to exist - it is not going to have it's sales run under A LLC, since there will be no sales to run for a...
224
+
225
+
226
+ **Example 4:**
227
+
228
+ - Query: Applying for and receiving business credit
229
+
230
+ - Doc Text: "I'm afraid the great myth of limited liability companies is that all such vehicles have instant access to credit. Limited liability on a company with few physical assets to underwrite the loan, or with insufficient revenue, will usually mean that the owners (or others) will be asked to stand suret...
231
+
232
+
233
+ **Example 5:**
234
+
235
+ - Query: What are the ins/outs of writing equipment purchases off as business expenses in a home based business?
236
+
237
+ - Doc Text: Most items used in business have to be depreciated; you get to deduct a small fraction of the cost each year depending on the lifetime of the item as per IRS rules. That is, you cannot assume a one-year life for an electronic item even if it will be obsolete in three months. Some items can be expe...
238
+
239
+
240
+ **Example 6:**
241
+
242
+ - Query: Intentions of Deductible Amount for Small Business
243
+
244
+ - Doc Text: "If your sole proprietorship losses exceed all other sources of taxable income, then you have what's called a Net Operating Loss (NOL). You will have the option to ""carry back"" and amend a return you filed in the last 2 years where you owed tax, or you can ""carry forward"" the losses and decrease...
245
+
246
+
247
+ **Example 7:**
248
+
249
+ - Query: How can I deposit a check made out to my business into my personal account?
250
+
251
+ - Doc Text: You should have a separate business account. Mixing business and personal funds is a bad practice. Shop around, you should be able to find a bank that will let you open a free checking account, especially if you are going to have minimal activity (e.g. less than 20 of checks per month) and perhaps m...
252
+
253
+
254
+ **Example 8:**
255
+
256
+ - Query: Filing personal with 1099s versus business s-corp?
257
+
258
+ - Doc Text: Depends whom the 1099 was issued to. If it was issued to your corporation - then its your corporation's income, not yours. Why would it go to your tax return? Your corporation and you are two separate legal entities. You will have to file the 1120S, whether you have corporate income or not, it has t...
259
+
260
+
261
+ **Example 9:**
262
+
263
+ - Query: Using credit card points to pay for tax deductible business expenses
264
+
265
+ - Doc Text: "For simplicity, let's start by just considering cash back. In general, cash back from credit cards for personal use is not taxable, but for business use it is taxable (sort of, I'll explain later). The reason is most personal purchases are made with after tax dollars; you typically aren't deducting...
266
+
267
+
268
+ **Example 10:**
269
+
270
+ - Query: What approaches are there for pricing a small business?
271
+
272
+ - Doc Text: I don't have any experience in this, but this is my academic understanding of business pricing. The LOWEST amount a seller would accept is the liquidation value. For a B&B, what would the value of the land, the house, the furnishings, accounts payable, etc. be if it had to be sold today, minus any ...
273
+
274
+
275
+ ## CORE
276
+
277
+ **Only Score 1 exists**
278
+ **Score 1 = RELEVANT** ✓
279
+
280
+ ## Special Notes
281
+
282
+ - **Original Purpose**: Financial opinion-based question answering
283
+ - **What Score 1 Really Means**: Documents that answer financial questions
284
+ - **Dataset Type**: Domain-specific Q&A (financial)
285
+ - **Key Characteristic**: Includes opinionated answers about financial topics
286
+
287
+ ## Training Verdict
288
+
289
+ **GOOD FOR TRAINING** ✓
290
+ - Score 1 documents genuinely answer the queries
291
+ - True relevance matching in financial domain
292
+ - Can use for binary classification: Score 1 = positive class
293
+
294
+ ## Observations
295
+
296
+ ⚠️ **Training data only has Score 1:**
297
+
298
+ - All training documents are marked as equally relevant
299
+
300
+ - Model cannot learn to distinguish relevance levels
301
+
302
+ - May indicate collapsed/merged relevance levels
test_scores/dataset_reports/hotpotqa_analysis.md ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HOTPOTQA Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 97852
6
+
7
+ - Total corpus documents: 5233329
8
+
9
+ - Corpus documents loaded for examples: 50000
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | 85000 | 170000 | 1:170000 |
19
+
20
+ | dev | 5447 | 10894 | 1:10894 |
21
+
22
+ | test | 7405 | 14810 | 1:14810 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Only one score level (1) found in this dataset.**
28
+
29
+ - This means all labeled documents are considered equally relevant
30
+
31
+ - Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
32
+
33
+
34
+ ## Examples
35
+
36
+ ### Score Comparison Examples (none: only one score level exists, so there is nothing to compare)
37
+
38
+ ### Examples by Split and Score
39
+
40
+
41
+ #### TRAIN Split
42
+
43
+ **Scores in train: [1]**
44
+
45
+
46
+ ##### Score 1 Examples (train)
47
+
48
+
49
+ **Example 1:**
50
+
51
+ - Query: What name was given to the son of the man who defeated the usurper Allectus ?
52
+
53
+ - Doc Title: Constantius Chlorus
54
+
55
+ - Doc Text: Constantius I (Latin: "Marcus Flavius Valerius Constantius Herculius Augustus" ; 31 March  250 25 July 306), commonly known as Constantius Chlorus (Greek: Κωνστάντιος Χλωρός , "Kōnstantios Khlōrós", literally "Constantius the Pale"), was "Caesar", a form of Roman co-emperor, from 293 to 306. He was ...
56
+
57
+
58
+ **Example 2:**
59
+
60
+ - Query: What profession does Lewis Milestone and All Quiet on the Western Front have in common?
61
+
62
+ - Doc Title: Lewis Milestone
63
+
64
+ - Doc Text: Lewis Milestone (born Leib Milstein; September 30, 1895 – September 25, 1980) was a Moldovan-born American motion picture director. He is known for directing "Two Arabian Knights" (1927) and "All Quiet on the Western Front" (1930), both of which received Academy Awards for Best Director. He also dir...
65
+
66
+
67
+ **Example 3:**
68
+
69
+ - Query: University of Alabama in Huntsville is a college located in what county?
70
+
71
+ - Doc Title: Huntsville, Alabama
72
+
73
+ - Doc Text: Huntsville is a city located primarily in Madison County in the Appalachian region of northern Alabama. Huntsville is the county seat of Madison County. The city extends west into neighboring Limestone County. Huntsville's population was 180,105 as of the 2010 census. Huntsville is the third-largest...
74
+
75
+
76
+ **Example 4:**
77
+
78
+ - Query: What 1937 magazine did "Bringing Up Baby" film star and one of classic Hollywood's definitive leading men appear in?
79
+
80
+ - Doc Title: Bringing Up Baby
81
+
82
+ - Doc Text: Bringing Up Baby is a 1938 American screwball comedy film directed by Howard Hawks, starring Katharine Hepburn and Cary Grant, and released by RKO Radio Pictures. The film tells the story of a paleontologist in a number of predicaments involving a scatterbrained woman and a leopard named Baby. The s...
83
+
84
+
85
+ **Example 5:**
86
+
87
+ - Query: Who founded the organization whose Boston branch excluded the Christian discussion group later housed in the Boston Young Men's Christian Union?
88
+
89
+ - Doc Title: YMCA
90
+
91
+ - Doc Text: The Young Men's Christian Association, commonly known as the YMCA or simply the Y, is a worldwide organization based in Geneva, Switzerland, with more than 2 billion beneficiaries from 125 national associations. It was founded on 6 June 1844 by George Williams in London and aims to put Christian pri...
92
+
93
+
94
+ **Example 6:**
95
+
96
+ - Query: What language family is the language of the tribe of the man who instructed Jeff Ball in?
97
+
98
+ - Doc Title: Choctaw
99
+
100
+ - Doc Text: The Choctaw (In the Choctaw language, Chahta) are a Native American people originally occupying what is now the Southeastern United States (modern-day Alabama, Florida, Mississippi, and Louisiana). Their Choctaw language belongs to the Muskogean language family group....
101
+
102
+
103
+ **Example 7:**
104
+
105
+ - Query: Dayton, Newark is part of the county in New Jersey having what population as of 2016?
106
+
107
+ - Doc Title: Essex County, New Jersey
108
+
109
+ - Doc Text: Essex County is a county in the northeastern part of the U.S. state of New Jersey. As of the 2016 Census estimate, the county's population was 796,914, making it the state's third-most populous county, an increase of 1.7% from the 2010 United States Census, when its population was enumerated at 783,...
110
+
111
+
112
+ **Example 8:**
113
+
114
+ - Query: What album produced by George Martin was supposed to contain a song that ended up unreleased until 1996?
115
+
116
+ - Doc Title: Help! (album)
117
+
118
+ - Doc Text: Help! is the fifth studio album by English rock band the Beatles, the soundtrack from their film "Help!", and released on 6 August 1965. Produced by George Martin, it was the fifth UK album release by the band, and contains fourteen songs in its original British form. Seven of these, including the s...
119
+
120
+
121
+ **Example 9:**
122
+
123
+ - Query: Which of the following is acclaimed for his "lyrical flow of his statements": Nâzım Hikmet or Arthur Miller?
124
+
125
+ - Doc Title: Arthur Miller
126
+
127
+ - Doc Text: Arthur Asher Miller (October 17, 1915 – February 10, 2005) was an American playwright, essayist, and figure in twentieth-century American theater. Among his most popular plays are "All My Sons" (1947), "Death of a Salesman" (1949), "The Crucible" (1953) and "A View from the Bridge" (1955, revised 19...
128
+
129
+
130
+ **Example 10:**
131
+
132
+ - Query: Michel Wachenheim a French ambassador and permanent representative of France of what specialized agency of the United Nations?
133
+
134
+ - Doc Title: International Civil Aviation Organization
135
+
136
+ - Doc Text: The International Civil Aviation Organization (ICAO) (pronounced ; French: "Organisation de l'aviation civile internationale" , OACI), is a specialized agency of the United Nations. It codifies the principles and techniques of international air navigation and fosters the planning and development of ...
137
+
138
+
139
+ #### DEV Split
140
+
141
+ **Scores in dev: [1]**
142
+
143
+
144
+ ##### Score 1 Examples (dev)
145
+
146
+
147
+ **Example 1:**
148
+
149
+ - Query: Daniel Márcio Fernandes plays for a club founded in which year ?
150
+
151
+ - Doc Title: Lillestrøm SK
152
+
153
+ - Doc Text: Lillestrøm Sportsklubb is a Norwegian football club from the city of Lillestrøm. It was founded in 1917, after the merger of two local football clubs. Their home ground is Åråsen Stadion, which has a capacity of 12,250 people, while the principal training ground is Lillestrøm stadion, or the indoor ...
154
+
155
+
156
+ **Example 2:**
157
+
158
+ - Query: How many members actually compete in the conference in which the Northwestern Wildcats compete?
159
+
160
+ - Doc Title: Big Ten Conference
161
+
162
+ - Doc Text: The Big Ten Conference (B1G), formerly Western Conference and Big Nine Conference, is the oldest Division I collegiate athletic conference in the United States. Despite its name, the conference consists of 14 members (as of 2016). They compete in the NCAA Division I; its football teams compete in th...
163
+
164
+
165
+ **Example 3:**
166
+
167
+ - Query: The composer of The String Quartet No. 3 in E-flat minor was awarded a lifetime pension by what emperor?
168
+
169
+ - Doc Title: Pyotr Ilyich Tchaikovsky
170
+
171
+ - Doc Text: Pyotr Ilyich Tchaikovsky ( ; Russian: Пётр Ильи́ч Чайко́вский ; 25 April/7 May 1840 – 25 October/6 November 1893), often anglicized as Peter Ilich Tchaikovsky, was a Russian composer of the romantic period, some of whose works are among the most popular music in the classical repertoire. He was the ...
172
+
173
+
174
+ **Example 4:**
175
+
176
+ - Query: Which Russian linguist was a member of the Kazan School?
177
+
178
+ - Doc Title: Nikolai Trubetzkoy
179
+
180
+ - Doc Text: Prince Nikolai Sergeyevich Trubetzkoy (also Troubetskoy; Russian: Никола́й Серге́евич Трубецко́й; Moscow, April 16, 1890 – Vienna, June 25, 1938) was a Russian linguist and historian whose teachings formed a nucleus of the Prague School of structural linguistics. He is widely considered to be the fo...
181
+
182
+
183
+ **Example 5:**
184
+
185
+ - Query: The election of Donald rump to the position that encompasses the duty of commander-in-chief of the United States Armed Forces was an expression of what political movement?
186
+
187
+ - Doc Title: President of the United States
188
+
189
+ - Doc Text: The President of the United States (informally referred to as "POTUS") is the head of state and head of government of the United States. The president directs the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces....
190
+
191
+
192
+ **Example 6:**
193
+
194
+ - Query: In Star Wars: A New Hope, what planet is the leader of the rebel alliance from?
195
+
196
+ - Doc Title: Star Wars (film)
197
+
198
+ - Doc Text: Star Wars (later retitled Star Wars: Episode IV – A New Hope) is a 1977 American epic space opera film written and directed by George Lucas. It is the first film in the original "Star Wars" trilogy, the first "Star Wars" movie in general, and the beginning of the "Star Wars" franchise. Starring Mark...
199
+
200
+
201
+ **Example 7:**
202
+
203
+ - Query: What primarily competed with Nintendo 64 and Sega Staurn has a horror-themed adventure game published by Jaleco in 1999?
204
+
205
+ - Doc Title: PlayStation (console)
206
+
207
+ - Doc Text: The PlayStation (officially abbreviated to PS, and commonly known as the PS1 or PSX) is a home video game console developed and marketed by Sony Computer Entertainment. The console was released on 3 December 1994 in Japan, 9 September 1995 in North America, 29 September 1995 in Europe, and for 15 No...
208
+
209
+
210
+ **Example 8:**
211
+
212
+ - Query: Franklin Armstrong is a comic strip character create by a man that died in what year?
213
+
214
+ - Doc Title: Charles M. Schulz
215
+
216
+ - Doc Text: Charles Monroe Schulz (November 26, 1922 – February 12, 2000), nicknamed Sparky, was an American cartoonist best known for the comic strip "Peanuts" (which featured the characters Charlie Brown and Snoopy, among others). He is widely regarded as one of the most influential cartoonists of all time, c...
217
+
218
+
219
+ **Example 9:**
220
+
221
+ - Query: Did Fred Savage and James Tinling both work during the Silent Era?
222
+
223
+ - Doc Title: Fred Savage
224
+
225
+ - Doc Text: Frederick Aaron "Fred" Savage (born July 9, 1976) is an American actor, television director, and producer. He is best known for his role as Kevin Arnold in the American television series "The Wonder Years", which ran from 1988 to 1993. He has earned several awards and nominations, such as People's C...
226
+
227
+
228
+ **Example 10:**
229
+
230
+ - Query: Cliff Clavin was a character on the show that was produced by what company?
231
+
232
+ - Doc Title: Cheers
233
+
234
+ - Doc Text: Cheers is an American sitcom that ran on NBC from September 30, 1982, to May 20, 1993, with a total of 275 half-hour episodes spanning over eleven seasons. The show was produced by Charles/Burrows/Charles Productions in association with Paramount Network Television. The show was created by the team ...
235
+
236
+
237
+ #### TEST Split
238
+
239
+ **Scores in test: [1]**
240
+
241
+
242
+ ##### Score 1 Examples (test)
243
+
244
+
245
+ **Example 1:**
246
+
247
+ - Query: Were Scott Derrickson and Ed Wood of the same nationality?
248
+
249
+ - Doc Title: Ed Wood
250
+
251
+ - Doc Text: Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director....
252
+
253
+
254
+ **Example 2:**
255
+
256
+ - Query: What screenwriter with credits for "Evolution" co-wrote a film starring Nicolas Cage and Téa Leoni?
257
+
258
+ - Doc Title: The Family Man
259
+
260
+ - Doc Text: The Family Man is a 2000 American romantic comedy-drama film directed by Brett Ratner, written by David Diamond and David Weissman, and starring Nicolas Cage and Téa Leoni. Cage's production company, Saturn Films, helped produce the film. The film centers on a man who sees what could have been had h...
261
+
262
+
263
+ **Example 3:**
264
+
265
+ - Query: The football manager who recruited David Beckham managed Manchester United during what timeframe?
266
+
267
+ - Doc Title: Alex Ferguson
268
+
269
+ - Doc Text: Sir Alexander Chapman Ferguson, CBE (born 31 December 1941) is a Scottish former football manager and player who managed Manchester United from 1986 to 2013. He is regarded by many players, managers and analysts to be one of the greatest and most successful managers of all time....
270
+
271
+
272
+ **Example 4:**
273
+
274
+ - Query: Brown State Fishing Lake is in a country that has a population of how many inhabitants ?
275
+
276
+ - Doc Title: Brown County, Kansas
277
+
278
+ - Doc Text: Brown County (county code BR) is a county located in the northeast portion of the U.S. state of Kansas. As of the 2010 census, the county population was 9,984. Its county seat and most populous city is Hiawatha. Brown County is the location of the Kickapoo Indian Reservation of Kansas, the majority ...
279
+
280
+
281
+ **Example 5:**
282
+
283
+ - Query: Are Giuseppe Verdi and Ambroise Thomas both Opera composers ?
284
+
285
+ - Doc Title: Giuseppe Verdi
286
+
287
+ - Doc Text: Giuseppe Fortunino Francesco Verdi (] ; 9 or 10 October 1813 – 27 January 1901) was an Italian opera composer....
288
+
289
+
290
+ **Example 6:**
291
+
292
+ - Query: Roger O. Egeberg was Assistant Secretary for Health and Scientific Affairs during the administration of a president that served during what years?
293
+
294
+ - Doc Title: Richard Nixon
295
+
296
+ - Doc Text: Richard Milhous Nixon (January 9, 1913 – April 22, 1994) was the 37th President of the United States from 1969 until 1974, when he resigned from office, the only U.S. president to do so. He had previously served as the 36th Vice President of the United States from 1953 to 1961, and prior to that as ...
297
+
298
+
299
+ **Example 7:**
300
+
301
+ - Query: What is the name for the adventure in "Tunnels and Trolls", a game designed by Ken St. Andre?
302
+
303
+ - Doc Title: Tunnels & Trolls
304
+
305
+ - Doc Text: Tunnels & Trolls (abbreviated "T&T") is a fantasy role-playing game designed by Ken St. Andre and first published in 1975 by Flying Buffalo. The second modern role-playing game published, it was written by Ken St. Andre to be a more accessible alternative to "Dungeons & Dragons" and is suitable for ...
306
+
307
+
308
+ **Example 8:**
309
+
310
+ - Query: Alexander Kerensky was defeated and destroyed by the Bolsheviks in the course of a civil war that ended when ?
311
+
312
+ - Doc Title: Russian Civil War
313
+
314
+ - Doc Text: The Russian Civil War (Russian: Гражда́нская война́ в Росси́и , "Grazhdanskaya voyna v Rossiyi" ; November 1917 – October 1922) was a multi-party war in the former Russian Empire immediately after the Russian Revolutions of 1917, as many factions vied to determine Russia's political future. The two ...
315
+
316
+
317
+ **Example 9:**
318
+
319
+ - Query: Ralph Hefferline was a psychology professor at a university that is located in what city?
320
+
321
+ - Doc Title: Columbia University
322
+
323
+ - Doc Text: Columbia University (Columbia; officially Columbia University in the City of New York), established in 1754, is a private Ivy League research university in Upper Manhattan, New York City, often cited as one of the world's most prestigious universities....
324
+
325
+
326
+ **Example 10:**
327
+
328
+ - Query: The battle in which Giuseppe Arimondi lost his life secured what for Ethiopia?
329
+
330
+ - Doc Title: Battle of Adwa
331
+
332
+ - Doc Text: The Battle of Adwa (Amharic: አድዋ; Amharic translated: Adowa, or sometimes by the Italian name "Adua") was fought on 2 March 1896/Yekatit 23, 1889 according to Ethiopian calendar between the Ethiopian Empire and the Kingdom of Italy near the town of Adwa, Ethiopia, in Tigray. This climactic battle of...
333
+
334
+
335
+ ## CORE
336
+
337
+ **Only Score 1 exists**
338
+ **Score 1 = RELEVANT** ✓
339
+
340
+ ## Special Notes
341
+
342
+ - **Original Purpose**: Multi-hop question answering requiring reasoning over multiple documents
343
+ - **What Score 1 Really Means**: Documents containing facts needed for multi-hop reasoning
344
+ - **Dataset Type**: Multi-hop Q&A
345
+ - **Key Characteristic**: Each query needs 2 Wikipedia articles to answer (the qrels average exactly 2 labeled documents per query in every split)
346
+
347
+ ## Training Verdict
348
+
349
+ **GOOD FOR TRAINING** ✓
350
+ - Score 1 documents provide necessary facts for answering
351
+ - True relevance matching for complex reasoning
352
+ - Can use for binary classification: Score 1 = positive class
353
+
354
+ ## Observations
355
+
356
+ ⚠️ **Training data only has Score 1:**
357
+
358
+ - All training documents are marked as equally relevant
359
+
360
+ - Model cannot learn graded relevance levels from these labels; the only signal is relevant (Score 1) vs. unlabeled (implicitly Score 0)
361
+
362
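+ - May indicate collapsed/merged relevance levels, although the dev and test splits also contain only Score 1, so the labels may simply be binary by design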
test_scores/dataset_reports/nfcorpus_analysis.md ADDED
@@ -0,0 +1,865 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NFCORPUS Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 3237
6
+
7
+ - Total corpus documents: 3633
8
+
9
+ - Corpus documents loaded for examples: 3633
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | 2590 | 110575 | 1:110575 |
19
+
20
+ | dev | 324 | 11385 | 1:10864, 2:521 |
21
+
22
+ | test | 323 | 12334 | 1:11758, 2:576 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Multiple score levels found: [1, 2]**
28
+
29
+ - Higher scores indicate higher relevance
30
+
31
+ - Score 0 typically means not relevant (no score-0 pairs appear in any split of this dataset)
32
+
33
+ - Score 1 typically means relevant
34
+
35
+ - Score 2 typically means highly relevant
36
+
37
+
38
+ ## Examples
39
+
40
+ ### Score Comparison Examples
41
+
42
+ #### Comparison Example 1
43
+
44
+ **Query:** Why Deep Fried Foods May Cause Cancer
45
+
46
+ **From:** dev split
47
+
48
+
49
+ **Score 2 Documents:**
50
+
51
+
52
+ *Document 1:*
53
+
54
+ - Title: Birth Weight, Head Circumference, and Prenatal Exposure to Acrylamide from Maternal Diet: The European Prospective Mother–Child Study (NewGeneris)
55
+
56
+ - Text: Background: Acrylamide is a common dietary exposure that crosses the human placenta. It is classified as a probable human carcinogen, and developmental toxicity has been observed in rodents. Objectives: We examined the associations between prenatal exposure to acrylamide and birth outcomes in a prospective European mother–child study. Methods: Hemoglobin (Hb) adducts of acrylamide and its metaboli...
57
+
58
+
59
+ *Document 2:*
60
+
61
+ - Title: A statistical regression model for the estimation of acrylamide concentrations in French fries for excess lifetime cancer risk assessment.
62
+
63
+ - Text: Human exposure to acrylamide (AA) through consumption of French fries and other foods has been recognized as a potential health concern. Here, we used a statistical non-linear regression model, based on the two most influential factors, cooking temperature and time, to estimate AA concentrations in French fries. The R(2) of the predictive model is 0.83, suggesting the developed model was significa...
64
+
65
+
66
+ *Document 3:*
67
+
68
+ - Title: Chronic intake of potato chips in humans increases the production of reactive oxygen radicals by leukocytes and increases plasma C-reactive protein...
69
+
70
+ - Text: BACKGROUND: Relatively high concentrations of acrylamide in commonly ingested food products, such as French fries, potato chips, or cereals, may constitute a potential risk to human health. OBJECTIVE: The objective of this pilot study was to investigate the possible connection between chronic ingestion of acrylamide-containing potato chips and oxidative stress or inflammation. DESIGN: Fourteen hea...
71
+
72
+
73
+
74
+ **Score 1 Documents:**
75
+
76
+
77
+ *Document 1:*
78
+
79
+ - Title: Review of epidemiologic studies of dietary acrylamide intake and the risk of cancer.
80
+
81
+ - Text: Conjectured associations between dietary acrylamide intake and cancer have been evaluated in more than 15 epidemiologic studies examining almost every major cancer site. We have critically reviewed the epidemiologic studies of estimated dietary acrylamide exposure and cancer. As substantially greater acrylamide exposure occurs through tobacco smoke than dietary exposure, we present the results sep...
82
+
83
+
84
+ *Document 2:*
85
+
86
+ - Title: Intake of fried meat and risk of cancer: a follow-up study in Finland.
87
+
88
+ - Text: It has been suggested that mutagens in fried meat may be involved in the cancer process. Therefore the relationships between intake of fried meat and subsequent risk of cancers at different sites were studied among 9,990 Finnish men and women, 15-99 years of age and initially free of cancer. The baseline study was carried out in 1966-1972, and cases of cancer were identified through data linkage w...
89
+
90
+
91
+ *Document 3:*
92
+
93
+ - Title: Dietary acrylamide exposure of the French population: results of the second French Total Diet Study.
94
+
95
+ - Text: Acrylamide is a heat-induced carcinogen compound that is found in some foods consequently to cooking or other thermal processes. In the second French Total Diet Study (TDS), acrylamide was analysed in 192 food samples collected in mainland France to be representative of the population diet and prepared "as consumed". Highest mean concentrations were found in potato chips/crisps (954 μg/kg), French...
96
+
97
+
98
+
99
+ ---
100
+
101
+
102
+ #### Comparison Example 2
103
+
104
+ **Query:** Living Longer by Reducing Leucine Intake
105
+
106
+ **From:** dev split
107
+
108
+
109
+ **Score 2 Documents:**
110
+
111
+
112
+ *Document 1:*
113
+
114
+ - Title: Insights into the beneficial effect of caloric/ dietary restriction for a healthy and prolonged life
115
+
116
+ - Text: Over the last several years, new evidence has kept pouring in about the remarkable effect of caloric restriction (CR) on the conspicuous bedfellows- aging and cancer. Through the use of various animal models, it is now well established that by reducing calorie intake one can not only increase life span but, also, lower the risk of various age related diseases such as cancer. Cancer cells are belie...
117
+
118
+
119
+ *Document 2:*
120
+
121
+ - Title: Macronutrient balance and lifespan
122
+
123
+ - Text: Dietary restriction (DR) without malnutrition is widely regarded to be a universal mechanism for prolonging lifespan. It is generally believed that the benefits of DR arise from eating fewer calories (termed caloric restriction, CR). Here we argue that, rather than calories, the key determinant of the relationship between diet and longevity is the balance of protein to non-protein energy ingested....
124
+
125
+
126
+ *Document 3:*
127
+
128
+ - Title: Dietary Restriction, Growth Factors and Aging: from yeast to humans
129
+
130
+ - Text: Dietary restriction (DR) and reduced growth factor signaling both elevate resistance to oxidative stress, reduce macromolecular damage, and increase lifespan in model organisms. In rodents, both DR and decreased growth factor signaling reduce the incidence of tumors and slow down cognitive decline and aging. DR reduces cancer and cardiovascular disease and mortality in monkeys, and reduces metabol...
131
+
132
+
133
+
134
+ **Score 1 Documents:**
135
+
136
+
137
+ *Document 1:*
138
+
139
+ - Title: mTOR is a key modulator of ageing and age-related disease
140
+
141
+ - Text: Many experts in the biology of ageing believe that pharmacological interventions to slow ageing are a matter of ‘when’ rather than ‘if’. A leading target for such interventions is the nutrient response pathway defined by the mechanistic target of rapamycin (mTOR). Inhibition of this pathway extends lifespan in model organisms and confers protection against a growing list of age-related pathologies...
142
+
143
+
144
+ *Document 2:*
145
+
146
+ - Title: Amino acid sensing and regulation of mTORC1.
147
+
148
+ - Text: Amino acids play fundamental roles in the cell both as the building blocks of new proteins and as metabolic precursors. To adapt to their limitation during periods of protein starvation, multiple adaptive mechanisms have evolved, including a rapid cessation of new protein synthesis, an increase in amino acid biosynthesis and transport, and autophagy. Here, we discuss what we currently know about h...
149
+
150
+
151
+ *Document 3:*
152
+
153
+ - Title: Caloric restriction, the traditional Okinawan diet, and healthy aging: the diet of the world's longest-lived people and its potential impact on mor...
154
+
155
+ - Text: Long-term caloric restriction (CR) is a robust means of reducing age-related diseases and extending life span in multiple species, but the effects in humans are unknown. The low caloric intake, long life expectancy, and the high prevalence of centenarians in Okinawa have been used as an argument to support the CR hypothesis in humans. However, no long-term, epidemiologic analysis has been conducte...
156
+
157
+
158
+
159
+ ---
160
+
161
+
162
+ #### Comparison Example 3
163
+
164
+ **Query:** Why are Cancer Rates so Low in India?
165
+
166
+ **From:** dev split
167
+
168
+
169
+ **Score 2 Documents:**
170
+
171
+
172
+ *Document 1:*
173
+
174
+ - Title: Curcumin and cancer: an "old-age" disease with an "age-old" solution.
175
+
176
+ - Text: Cancer is primarily a disease of old age, and that life style plays a major role in the development of most cancers is now well recognized. While plant-based formulations have been used to treat cancer for centuries, current treatments usually involve poisonous mustard gas, chemotherapy, radiation, and targeted therapies. While traditional plant-derived medicines are safe, what are the active prin...
177
+
178
+
179
+ *Document 2:*
180
+
181
+ - Title: Curcumin, a component of turmeric: from farm to pharmacy.
182
+
183
+ - Text: Curcumin, an active polyphenol of the golden spice turmeric, is a highly pleiotropic molecule with the potential to modulate the biological activity of a number of signaling molecules. Traditionally, this polyphenol has been used in Asian countries to treat such human ailments as acne, psoriasis, dermatitis, and rash. Recent studies have indicated that curcumin can target newly identified signalin...
184
+
185
+
186
+ *Document 3:*
187
+
188
+ - Title: Dietary turmeric potentially reduces the risk of cancer.
189
+
190
+ - Text: Turmeric, a plant rhizome that is often dried, ground and used as a cooking spice, has also been used medicinally for several thousand years. Curcumin, the phytochemical that gives turmeric its golden color, is responsible for most of the therapeutic effects of turmeric. In recent years curcumin has been studied for its effects on chronic diseases such as diabetes, Alzheimer's, and cancer. Though ...
191
+
192
+
193
+
194
+ **Score 1 Documents:**
195
+
196
+
197
+ *Document 1:*
198
+
199
+ - Title: Apoptosis induced by death receptors.
200
+
201
+ - Text: Death receptors belong to the TNF receptor family and are characterised by an intracellular death domain that serves to recruit adapter proteins such as TRADD and FADD and cysteine proteases such as Caspase-8. Activation of Caspase-8 on the aggregated receptor leads to apoptosis. Triggering of death receptors is mediated through the binding of specific ligands of the TNF family, which are homotrim...
202
+
203
+
204
+ *Document 2:*
205
+
206
+ - Title: Curcumin sensitizes tumor necrosis factor-related apoptosis-inducing ligand (TRAIL)-induced apoptosis through reactive oxygen species-mediated upre...
207
+
208
+ - Text: Curcumin exhibits anti-inflammatory and antitumor activities. Although its functional mechanism has not been elucidated so far, numerous studies have shown that curcumin induces apoptosis in cancer cells. In the present study, we show that subtoxic concentrations of curcumin sensitize human renal cancer cells to the tumor necrosis factor-related apoptosis inducing ligand (TRAIL)-mediated apoptosis...
209
+
210
+
211
+ *Document 3:*
212
+
213
+ - Title: New perspectives of curcumin in cancer prevention
214
+
215
+ - Text: Numerous natural compounds have been extensively investigated for their potential for cancer prevention over decades. Curcumin, from Curcuma longa, is a highly promising natural compound that can be potentially used for chemoprevention of multiple cancers. Curcumin modulates multiple molecular pathways involved in the lengthy carcinogenesis process to exert its chemopreventive effects through seve...
216
+
217
+
218
+
219
+ ---
220
+
221
+
222
+ #### Comparison Example 4
223
+
224
+ **Query:** Organic Milk and Prostate Cancer
225
+
226
+ **From:** dev split
227
+
228
+
229
+ **Score 2 Documents:**
230
+
231
+
232
+ *Document 1:*
233
+
234
+ - Title: Milk Intake in Early Life and Risk of Advanced Prostate Cancer
235
+
236
+ - Text: The authors investigated whether early-life residency in certain areas of Iceland marked by distinct differences in milk intake was associated with risk of prostate cancer in a population-based cohort of 8,894 men born between 1907 and 1935. Through linkage to cancer and mortality registers, the men were followed for prostate cancer diagnosis and mortality from study entry (in waves from 1967 to 1...
237
+
238
+
239
+ *Document 2:*
240
+
241
+ - Title: The experience of Japan as a clue to the etiology of breast and ovarian cancers: relationship between death from both malignancies and dietary prac...
242
+
243
+ - Text: Although breast and ovarian cancers are rare in Japan compared with other developed countries, the death rates for both are increasing. In Japan, dramatic lifestyle changes occurred after World War II. Over the past 50 years (1947-1997), the age-standardized death rates of breast and ovarian cancers increased about 2- and 4-fold, respectively, and the respective intake of milk, meat, and eggs incr...
244
+
245
+
246
+ *Document 3:*
247
+
248
+ - Title: Milk consumption is a risk factor for prostate cancer in Western countries: evidence from cohort studies.
249
+
250
+ - Text: We have previously found a positive association between milk consumption and prostate cancer risk using meta-analysis to analyze published case-control studies. In the present study, further meta-analysis was conducted to estimate the summary relative risk (RR) between the consumption of milk and dairy products and prostate cancer from cohort studies published between 1966- 2006. We found 18 relev...
251
+
252
+
253
+
254
+ **Score 1 Documents:**
255
+
256
+
257
+ *Document 1:*
258
+
259
+ - Title: The experience of Japan as a clue to the etiology of testicular and prostatic cancers.
260
+
261
+ - Text: In Japan dramatic lifestyle changes occurred after World War 2. To examine the experience of Japan as a clue to the etiology, trends in the mortality rates of testicular and prostatic cancers from 1947 to 1998 were related to changes in dietary practices. The male population born before 1945 had a peak in death from testicular cancer in their thirties or forties, whereas those born after 1946 had ...
262
+
263
+
264
+ *Document 2:*
265
+
266
+ - Title: Effect of diet and exercise on serum insulin, IGF-I, and IGFBP-1 levels and growth of LNCaP cells in vitro (United States).
267
+
268
+ - Text: OBJECTIVE: Accumulating evidence indicates that prostate cancer is associated with high levels of serum IGF-I. This study was conducted to determine whether a low-fat diet and exercise (DE) intervention may modulate the IGF axis and reduce prostate cancer cell growth in vitro. METHODS: Fasting serum was obtained from 14 men (age 60 +/- 3 years) participating in an 11-day DE program and from eight ...
269
+
270
+
271
+ *Document 3:*
272
+
273
+ - Title: Intensive lifestyle changes may affect the progression of prostate cancer.
274
+
275
+ - Text: PURPOSE: Men with prostate cancer are often advised to make changes in diet and lifestyle, although the impact of these changes has not been well documented. Therefore, we evaluated the effects of comprehensive lifestyle changes on prostate specific antigen (PSA), treatment trends and serum stimulated LNCaP cell growth in men with early, biopsy proven prostate cancer after 1 year. MATERIALS AND ME...
276
+
277
+
278
+
279
+ ---
280
+
281
+
282
+ #### Comparison Example 5
283
+
284
+ **Query:** Foods for Macular Degeneration
285
+
286
+ **From:** dev split
287
+
288
+
289
+ **Score 2 Documents:**
290
+
291
+
292
+ *Document 1:*
293
+
294
+ - Title: Blueberry anthocyanins: protection against ageing and light-induced damage in retinal pigment epithelial cells.
295
+
296
+ - Text: Retinal pigment epithelium (RPE) cells are vital for retinal health. However, they are susceptible to injury with ageing and exposure to excessive light, including UV (100-380 nm) and visible (380-760 nm) radiation. To evaluate the protective effect of blueberry anthocyanins on RPE cells, in vitro cell models of replicative senescent and light-induced damage were established in the present study. ...
297
+
298
+
299
+ *Document 2:*
300
+
301
+ - Title: Macular pigment and risk for age-related macular degeneration in subjects from a Northern European population.
302
+
303
+ - Text: PURPOSE: Age and advanced disease in the fellow eye are the two most important risk factors for age-related macular degeneration (AMD). In this study, the authors investigated the relationship between these variables and the optical density of macular pigment (MP) in a group of subjects from a northern European population. METHODS: The optical density of MP was measured psychophysically in 46 subj...
304
+
305
+
306
+ *Document 3:*
307
+
308
+ - Title: A 12-wk egg intervention increases serum zeaxanthin and macular pigment optical density in women.
309
+
310
+ - Text: Two carotenoids found in egg yolk, lutein and zeaxanthin, accumulate in the macular retina where they may reduce photostress. Increases in serum lutein and zeaxanthin were observed in previous egg interventions, but no study measured macular carotenoids. The objective of this project was to determine whether increased consumption of eggs would increase retinal lutein and zeaxanthin, or macular pig...
311
+
312
+
313
+
314
+ **Score 1 Documents:**
315
+
316
+
317
+ *Document 1:*
318
+
319
+ - Title: Iris color and macular pigment optical density.
320
+
321
+ - Text: The present study was designed to assess the relationship between iris color and macular pigment optical density. Both melanin and carotenoids (responsible for iris color and macular pigment composition, respectively) appear to protect the retina through similar mechanisms and higher concentrations may reduce the incidence of retinal degenerations. To evaluate this relationship, 95 subjects were e...
322
+
323
+
324
+ *Document 2:*
325
+
326
+ - Title: Nearwork-induced transient myopia (NITM) and permanent myopia--is there a link?
327
+
328
+ - Text: Myopia is a worldwide public health problem. However, its understanding is incomplete, and many of its preventative and therapeutic aspects remain controversial. Nearwork is a primary, environmentally based factor in the aetiology of permanent myopia (PM), with nearwork-induced transient myopia (NITM) being a possible contributory component. A relationship between PM and NITM has been suggested, b...
329
+
330
+
331
+ *Document 3:*
332
+
333
+ - Title: Variations in product choices of frequently purchased herbs: caveat emptor.
334
+
335
+ - Text: BACKGROUND: Patients who report use of herbs to their physicians may not be able to accurately describe the ingredients or recommended dosage because the products for the same herb may differ. The purpose of this study was to describe variations in label information of products for each of the 10 most commonly purchased herbs. METHODS: Products for each of 10 herbs were surveyed in a convenience s...
336
+
337
+
338
+
339
+ ---
340
+
341
+
342
+ ### Examples by Split and Score
343
+
344
+
345
+ #### TRAIN Split
346
+
347
+ **Scores in train: [1]**
348
+
349
+
350
+ ##### Score 1 Examples (train)
351
+
352
+
353
+ **Example 1:**
354
+
355
+ - Query: Breast Cancer Cells Feed on Cholesterol
356
+
357
+ - Doc Title: Content of low density lipoprotein receptors in breast cancer tissue related to survival of patients.
358
+
359
+ - Doc Text: The content of low density lipoprotein (LDL) receptors in tissue from primary breast cancers was determined and its prognostic information compared with that of variables of established prognostic importance. Frozen tumour specimens were selected, and tissue from 72 patients (32 of whom had died) we...
360
+
361
+
362
+ **Example 2:**
363
+
364
+ - Query: Using Diet to Treat Asthma and Eczema
365
+
366
+ - Doc Title: Vegetarian diet ameliorates symptoms of atopic dermatitis through reduction of the number of peripheral eosinophils and of PGE2 synthesis by monocy...
367
+
368
+ - Doc Text: Many patients with atopic dermatitis are dissatisfied with conventional treatments based on topical steroids and have experienced some traditional remedies and alternative therapies. However, most of such therapies have not been evaluated scientifically and clinically by specialists. This study was ...
369
+
370
+
371
+ **Example 3:**
372
+
373
+ - Query: Treating Asthma With Plants vs. Pills
374
+
375
+ - Doc Title: Dietary polyphenols in the prevention and treatment of allergic diseases.
376
+
377
+ - Doc Text: Allergic disorders encompass skin, food and respiratory allergies. Sensitization to a normally harmless allergen results in the immune system being biased to a predominant T-helper type 2 response. Re-exposure to the same allergen leads to a robust secretion of allergy-related mediators that eventua...
378
+
379
+
380
+ **Example 4:**
381
+
382
+ - Query: How Fruits and Vegetables Can Treat Asthma
383
+
384
+ - Doc Title: Diet and asthma, allergic rhinoconjunctivitis and atopic eczema symptom prevalence: an ecological analysis of the International Study of Asthma and...
385
+
386
+ - Doc Text: Several studies have suggested that the increasing prevalence of symptoms of asthma, rhinitis and eczema, could be associated with dietary factors. In the present paper, a global analysis of prevalence rates of wheeze, allergic rhinoconjunctivitis and atopic eczema was performed in relation to diet,...
387
+
388
+
389
+ **Example 5:**
390
+
391
+ - Query: How Fruits and Vegetables Can Prevent Asthma
392
+
393
+ - Doc Title: The association of diet with respiratory symptoms and asthma in schoolchildren in Taipei, Taiwan.
394
+
395
+ - Doc Text: This study aimed to evaluate the association of diet with respiratory symptoms and asthma in schoolchildren in Taipei, Taiwan. An in-class interview survey elicited experiences of asthma and respiratory symptoms and consumption frequencies of the major food categories in 2290 fifth graders. Respirat...
396
+
397
+
398
+ **Example 6:**
399
+
400
+ - Query: Our Tax Dollars Subsidize Unhealthy Foods
401
+
402
+ - Doc Title: Food prices and blood cholesterol.
403
+
404
+ - Doc Text: Cardiovascular diseases (CVD) cost Americans billions of dollars per year. High cholesterol levels, which are closely related to dietary habits, are a major contributor to CVD. In this article, we study whether changes in food prices are related to cholesterol levels and whether taxes or subsidies o...
405
+
406
+
407
+ **Example 7:**
408
+
409
+ - Query: Reducing Arsenic in Chicken and Rice
410
+
411
+ - Doc Title: Arsenic and lead in juice: apple, citrus, and apple-base.
412
+
413
+ - Doc Text: Exposure limits for arsenic and lead in drinking water have long been established by the U.S. Environmental Protection Agency and new regulations regarding the presence of these contaminants in bottled water went into effect in California in 2009. No comparable exposure limits or regulations are ava...
414
+
415
+
416
+ **Example 8:**
417
+
418
+ - Query: How Contaminated Are Our Children?
419
+
420
+ - Doc Title: Cancer and non-cancer health effects from food contaminant exposures for children and adults in California: a risk assessment
421
+
422
+ - Doc Text: Background In the absence of current cumulative dietary exposure assessments, this analysis was conducted to estimate exposure to multiple dietary contaminants for children, who are more vulnerable to toxic exposure than adults. Methods We estimated exposure to multiple food contaminants based on di...
423
+
424
+
425
+ **Example 9:**
426
+
427
+ - Query: Blood Type Diet Perceived as "Crass Fraud"
428
+
429
+ - Doc Title: Blood type diets lack supporting evidence: a systematic review.
430
+
431
+ - Doc Text: BACKGROUND: Diets that are based on the ABO blood group system have been promoted over the past decade and claim to improve health and decrease risk of disease. To our knowledge, the evidence to support the effectiveness of blood type diets has not previously been assessed in the scientific literatu...
432
+
433
+
434
+ **Example 10:**
435
+
436
+ - Query: Why Do Heart Doctors Favor Surgery and Drugs Over Diet?
437
+
438
+ - Doc Title: Cross-analysis of dietary prescriptions and adherence in 356 hypercholesterolaemic patients.
439
+
440
+ - Doc Text: BACKGROUND: One of the major issues in controlling serum cholesterol through dietetic intervention appears to be the need to improve patient adherence. AIMS: To explore the many questions regarding barriers to, and motivators for, cholesterol-lowering diet adherence. METHODS: We surveyed French gene...
441
+
442
+
443
+ #### DEV Split
444
+
445
+ **Scores in dev: [1, 2]**
446
+
447
+
448
+ ##### Score 1 Examples (dev)
449
+
450
+
451
+ **Example 1:**
452
+
453
+ - Query: Why Deep Fried Foods May Cause Cancer
454
+
455
+ - Doc Title: Review of epidemiologic studies of dietary acrylamide intake and the risk of cancer.
456
+
457
+ - Doc Text: Conjectured associations between dietary acrylamide intake and cancer have been evaluated in more than 15 epidemiologic studies examining almost every major cancer site. We have critically reviewed the epidemiologic studies of estimated dietary acrylamide exposure and cancer. As substantially greate...
458
+
459
+
460
+ **Example 2:**
461
+
462
+ - Query: Living Longer by Reducing Leucine Intake
463
+
464
+ - Doc Title: mTOR is a key modulator of ageing and age-related disease
465
+
466
+ - Doc Text: Many experts in the biology of ageing believe that pharmacological interventions to slow ageing are a matter of ‘when’ rather than ‘if’. A leading target for such interventions is the nutrient response pathway defined by the mechanistic target of rapamycin (mTOR). Inhibition of this pathway extends ...
467
+
468
+
469
+ **Example 3:**
470
+
471
+ - Query: Why are Cancer Rates so Low in India?
472
+
473
+ - Doc Title: Apoptosis induced by death receptors.
474
+
475
+ - Doc Text: Death receptors belong to the TNF receptor family and are characterised by an intracellular death domain that serves to recruit adapter proteins such as TRADD and FADD and cysteine proteases such as Caspase-8. Activation of Caspase-8 on the aggregated receptor leads to apoptosis. Triggering of death...
476
+
477
+
478
+ **Example 4:**
479
+
480
+ - Query: Peeks Behind the Egg Industry Curtain
481
+
482
+ - Doc Title: Characterization of bacteria, clostridia and Bacteroides in faeces of vegetarians using qPCR and PCR-DGGE fingerprinting.
483
+
484
+ - Doc Text: BACKGROUND/AIMS: This study aimed to investigate the quantitative and qualitative changes of bacteria, Bacteroides, Bifidobacterium and Clostridium cluster IV in faecal microbiota associated with a vegetarian diet. METHODS: Bacterial abundances were measured in faecal samples of 15 vegetarians and 1...
485
+
486
+
487
+ **Example 5:**
488
+
489
+ - Query: Organic Milk and Prostate Cancer
490
+
491
+ - Doc Title: The experience of Japan as a clue to the etiology of testicular and prostatic cancers.
492
+
493
+ - Doc Text: In Japan dramatic lifestyle changes occurred after World War 2. To examine the experience of Japan as a clue to the etiology, trends in the mortality rates of testicular and prostatic cancers from 1947 to 1998 were related to changes in dietary practices. The male population born before 1945 had a p...
494
+
495
+
496
+ **Example 6:**
497
+
498
+ - Query: Foods for Macular Degeneration
499
+
500
+ - Doc Title: Iris color and macular pigment optical density.
501
+
502
+ - Doc Text: The present study was designed to assess the relationship between iris color and macular pigment optical density. Both melanin and carotenoids (responsible for iris color and macular pigment composition, respectively) appear to protect the retina through similar mechanisms and higher concentrations ...
503
+
504
+
505
+ **Example 7:**
506
+
507
+ - Query: How Beans Help Our Bones
508
+
509
+ - Doc Title: Neuroprotective effect of the natural iron chelator, phytic acid in a cell culture model of Parkinson's disease.
510
+
511
+ - Doc Text: Disrupted iron metabolism and excess iron accumulation has been reported in the brains of Parkinson's disease (PD) patients. Because excessive iron can induce oxidative stress subsequently causing degradation of nigral dopaminergic neurons in PD, we determined the protective effect of a naturally oc...
512
+
513
+
514
+ **Example 8:**
515
+
516
+ - Query: Phosphate Additives in Chicken Banned Elsewhere
517
+
518
+ - Doc Title: The Prevalence of Phosphorus Containing Food Additives in Top Selling Foods in Grocery Stores
519
+
520
+ - Doc Text: Objective To determine the prevalence of phosphorus-containing food additives in best selling processed grocery products and to compare the phosphorus content of a subset of top selling foods with and without phosphorus additives. Design The labels of 2394 best selling branded grocery products in no...
521
+
522
+
523
+ **Example 9:**
524
+
525
+ - Query: How to Boost the Benefits of Exercise
526
+
527
+ - Doc Title: Antigenotoxic activity of watercress extract in an in vitro mammalian system using comet assay.
528
+
529
+ - Doc Text: Watercress (Cruciferae), an integral part of Mediterranean diets, is a nutritive food which is used in the treatment of several diseases. Oxidative DNA damage seems to play a crucial role in chronic, aging-related diseases and it is considered an important and probably carcinogenic factor. The aim o...
530
+
531
+
532
+ **Example 10:**
533
+
534
+ - Query: How to Treat Multiple Sclerosis With Diet
535
+
536
+ - Doc Title: The molecular basis of nutritional intervention in multiple sclerosis: a narrative review.
537
+
538
+ - Doc Text: It is commonly accepted that nutrition is one of the possible environmental factors involved in the pathogenesis of multiple sclerosis (MS), but its role as complementary MS treatment is unclear and largely disregarded. At present, MS therapy is not associated to a particular diet, probably due to l...
539
+
540
+
541
+ ##### Score 2 Examples (dev)
542
+
543
+
544
+ **Example 1:**
545
+
546
+ - Query: Why Deep Fried Foods May Cause Cancer
547
+
548
+ - Doc Title: Birth Weight, Head Circumference, and Prenatal Exposure to Acrylamide from Maternal Diet: The European Prospective Mother–Child Study (NewGeneris)
549
+
550
+ - Doc Text: Background: Acrylamide is a common dietary exposure that crosses the human placenta. It is classified as a probable human carcinogen, and developmental toxicity has been observed in rodents. Objectives: We examined the associations between prenatal exposure to acrylamide and birth outcomes in a pros...
551
+
552
+
553
+ **Example 2:**
554
+
555
+ - Query: Living Longer by Reducing Leucine Intake
556
+
557
+ - Doc Title: Insights into the beneficial effect of caloric/ dietary restriction for a healthy and prolonged life
558
+
559
+ - Doc Text: Over the last several years, new evidence has kept pouring in about the remarkable effect of caloric restriction (CR) on the conspicuous bedfellows- aging and cancer. Through the use of various animal models, it is now well established that by reducing calorie intake one can not only increase life s...
560
+
561
+
562
+ **Example 3:**
563
+
564
+ - Query: Why are Cancer Rates so Low in India?
565
+
566
+ - Doc Title: Curcumin and cancer: an "old-age" disease with an "age-old" solution.
567
+
568
+ - Doc Text: Cancer is primarily a disease of old age, and that life style plays a major role in the development of most cancers is now well recognized. While plant-based formulations have been used to treat cancer for centuries, current treatments usually involve poisonous mustard gas, chemotherapy, radiation, ...
569
+
570
+
571
+ **Example 4:**
572
+
573
+ - Query: Organic Milk and Prostate Cancer
574
+
575
+ - Doc Title: Milk Intake in Early Life and Risk of Advanced Prostate Cancer
576
+
577
+ - Doc Text: The authors investigated whether early-life residency in certain areas of Iceland marked by distinct differences in milk intake was associated with risk of prostate cancer in a population-based cohort of 8,894 men born between 1907 and 1935. Through linkage to cancer and mortality registers, the men...
578
+
579
+
580
+ **Example 5:**
581
+
582
+ - Query: Foods for Macular Degeneration
583
+
584
+ - Doc Title: Blueberry anthocyanins: protection against ageing and light-induced damage in retinal pigment epithelial cells.
585
+
586
+ - Doc Text: Retinal pigment epithelium (RPE) cells are vital for retinal health. However, they are susceptible to injury with ageing and exposure to excessive light, including UV (100-380 nm) and visible (380-760 nm) radiation. To evaluate the protective effect of blueberry anthocyanins on RPE cells, in vitro c...
587
+
588
+
589
+ **Example 6:**
590
+
591
+ - Query: How Beans Help Our Bones
592
+
593
+ - Doc Title: The role of phytic acid in legumes: antinutrient or beneficial function?
594
+
595
+ - Doc Text: This review describes the present state of knowledge about phytic acid (phytate), which is often present in legume seeds. The antinutritional effects of phytic acid primarily relate to the strong chelating associated with its six reactive phosphate groups. Its ability to complex with proteins and pa...
596
+
597
+
598
+ **Example 7:**
599
+
600
+ - Query: Phosphate Additives in Chicken Banned Elsewhere
601
+
602
+ - Doc Title: Effects of Polyphosphate Additives on Campylobacter Survival in Processed Chicken Exudates
603
+
604
+ - Doc Text: Campylobacter spp. are responsible for a large number of the bacterial food poisoning cases worldwide. Despite being sensitive to oxygen and nutritionally fastidious, Campylobacter spp. are able to survive in food processing environments and reach consumers in sufficient numbers to cause disease. To...
605
+
606
+
607
+ **Example 8:**
608
+
609
+ - Query: How to Boost the Benefits of Exercise
610
+
611
+ - Doc Title: Endurance exercise results in DNA damage as detected by the comet assay.
612
+
613
+ - Doc Text: To determine if 6 weeks of supplementation with antioxidants could alleviate exercise-induced DNA damage, we studied 21 runners during a 50 km ultramarathon. Subjects were randomly assigned to one of two groups: (1) placebos (PL) or (2) antioxidants (AO) (1000 mg vitamin C and 400 IU RRR-alpha-tocop...
614
+
615
+
616
+ **Example 9:**
617
+
618
+ - Query: How to Treat Multiple Sclerosis With Diet
619
+
620
+ - Doc Title: Improvement of endothelium-dependent coronary vasodilation after a single LDL apheresis in patients with hypercholesterolemia.
621
+
622
+ - Doc Text: The purpose of this study was to determine whether a single LDL apheresis would improve impaired endothelium-dependent dilation of the coronary artery in patients with hypercholesterolemia. Hypercholesterolemia is associated with impaired endothelial function, and human studies using cholesterol-low...
623
+
624
+
625
+ **Example 10:**
626
+
627
+ - Query: How to Get Kids to Eat Their Vegetables
628
+
629
+ - Doc Title: Attractive names sustain increased vegetable intake in schools.
630
+
631
+ - Doc Text: OBJECTIVE: This study will determine if the selective use of attractive names can be a sustainable, scalable means to increase the selection of vegetables in school lunchrooms. METHODS: Study 1 paired an attractive name with carrots in five elementary schools (n=147) and measured selection and consu...
632
+
633
+
634
+ #### TEST Split
635
+
636
+ **Scores in test: [1, 2]**
637
+
638
+
639
+ ##### Score 1 Examples (test)
640
+
641
+
642
+ **Example 1:**
643
+
644
+ - Query: Do Cholesterol Statin Drugs Cause Breast Cancer?
645
+
646
+ - Doc Title: Women and statin use: a women's health advocacy perspective.
647
+
648
+ - Doc Text: This paper is based on a longer report on the benefits, safety and modalities of information representation with regard to women and statin use, situated within the historical context of Women's Health Movement which has advocated for unbiased, appropriate medical research and prescribing for women ...
649
+
650
+
651
+ **Example 2:**
652
+
653
+ - Query: Exploiting Autophagy to Live Longer
654
+
655
+ - Doc Title: Why human lifespan is rapidly increasing: solving "longevity riddle" with "revealed-slow-aging" hypothesis
656
+
657
+ - Doc Text: Healthy life span is rapidly increasing and human aging seems to be postponed. As recently exclaimed in Nature, these findings are so perplexing that they can be dubbed the 'longevity riddle'. To explain current increase in longevity, I discuss that certain genetic variants such as hyper-active mTOR...
658
+
659
+
660
+ **Example 3:**
661
+
662
+ - Query: How to Reduce Exposure to Alkylphenols Through Your Diet
663
+
664
+ - Doc Title: Maternal meat and fat consumption during pregnancy and suspected atopic eczema in Japanese infants aged 3-4 months: the Osaka Maternal and Child He...
665
+
666
+ - Doc Text: Interest has increased in the possibility that maternal dietary intake during pregnancy might influence the development of allergic disorders in children. The present prospective study examined the association of maternal intake of selected foods high in fatty acids and specific types of fatty acids...
667
+
668
+
669
+ **Example 4:**
670
+
671
+ - Query: What’s Driving America’s Obesity Problem?
672
+
673
+ - Doc Title: Prevalence of physical activity and obesity in US counties, 2001–2011: a road map for action
674
+
675
+ - Doc Text: Background Obesity and physical inactivity are associated with several chronic conditions, increased medical care costs, and premature death. Methods We used the Behavioral Risk Factor Surveillance System (BRFSS), a state-based random-digit telephone survey that covers the majority of United States ...
676
+
677
+
678
+ **Example 5:**
679
+
680
+ - Query: Who Should be Careful About Curcumin?
681
+
682
+ - Doc Title: Curcumin: a new paradigm and therapeutic opportunity for the treatment of osteoarthritis: curcumin for osteoarthritis management
683
+
684
+ - Doc Text: The management of osteoarthritis represents a real challenge. This complex and multi-factorial disease evolves over decades and requires not only the alleviation of symptoms, i.e. pain and joint function but also the preservation of articular structure without side effects. Nutraceuticals are good c...
685
+
686
+
687
+ **Example 6:**
688
+
689
+ - Query: Foods for Glaucoma
690
+
691
+ - Doc Title: Blueberry anthocyanins: protection against ageing and light-induced damage in retinal pigment epithelial cells.
692
+
693
+ - Doc Text: Retinal pigment epithelium (RPE) cells are vital for retinal health. However, they are susceptible to injury with ageing and exposure to excessive light, including UV (100-380 nm) and visible (380-760 nm) radiation. To evaluate the protective effect of blueberry anthocyanins on RPE cells, in vitro c...
694
+
695
+
696
+ **Example 7:**
697
+
698
+ - Query: What is Actually in Chicken Nuggets?
699
+
700
+ - Doc Title: The Prevalence of Phosphorus Containing Food Additives in Top Selling Foods in Grocery Stores
701
+
702
+ - Doc Text: Objective To determine the prevalence of phosphorus-containing food additives in best selling processed grocery products and to compare the phosphorus content of a subset of top selling foods with and without phosphorus additives. Design The labels of 2394 best selling branded grocery products in no...
703
+
704
+
705
+ **Example 8:**
706
+
707
+ - Query: What Do Meat Purge and Cola Have in Common?
708
+
709
+ - Doc Title: The Prevalence of Phosphorus Containing Food Additives in Top Selling Foods in Grocery Stores
710
+
711
+ - Doc Text: Objective To determine the prevalence of phosphorus-containing food additives in best selling processed grocery products and to compare the phosphorus content of a subset of top selling foods with and without phosphorus additives. Design The labels of 2394 best selling branded grocery products in no...
712
+
713
+
714
+ **Example 9:**
715
+
716
+ - Query: Chronic Headaches and Pork Parasites
717
+
718
+ - Doc Title: Cognitive Changes and Quality of Life in Neurocysticercosis: A Longitudinal Study
719
+
720
+ - Doc Text: Background Few studies have focused on the cognitive morbidity of neurocysticercosis (NCC), one of the most common parasitic infections of the central nervous system. We longitudinally assessed the cognitive status and quality of life (QoL) of patients with incident symptomatic NCC cases and matched...
721
+
722
+
723
+ **Example 10:**
724
+
725
+ - Query: Stopping Heart Disease in Childhood
726
+
727
+ - Doc Title: Endothelial function in healthy 11-year-old children after dietary intervention with onset in infancy: the Special Turku Coronary Risk Factor Inter...
728
+
729
+ - Doc Text: BACKGROUND: Early childhood introduction of nutritional habits aimed at atherosclerosis prevention reduces children's serum total cholesterol concentration, but its effect on vascular endothelial function is unknown. METHODS AND RESULTS: Between 1990 and 1992, we randomized healthy 7-month-old infan...
730
+
731
+
732
+ ##### Score 2 Examples (test)
733
+
734
+
735
+ **Example 1:**
736
+
737
+ - Query: Do Cholesterol Statin Drugs Cause Breast Cancer?
738
+
739
+ - Doc Title: Elevated Levels of Cholesterol-Rich Lipid Rafts in Cancer Cells Are Correlated with Apoptosis Sensitivity Induced by Cholesterol-Depleting Agents
740
+
741
+ - Doc Text: Lipid rafts/caveolae are membrane platforms for signaling molecules that regulate various cellular functions, including cell survival. To better understand the role of rafts in tumor progression and therapeutics, we investigated the effect of raft disruption on cell viability and compared raft level...
742
+
743
+
744
+ **Example 2:**
745
+
746
+ - Query: Exploiting Autophagy to Live Longer
747
+
748
+ - Doc Title: Insights into the beneficial effect of caloric/ dietary restriction for a healthy and prolonged life
749
+
750
+ - Doc Text: Over the last several years, new evidence has kept pouring in about the remarkable effect of caloric restriction (CR) on the conspicuous bedfellows- aging and cancer. Through the use of various animal models, it is now well established that by reducing calorie intake one can not only increase life s...
751
+
752
+
753
+ **Example 3:**
754
+
755
+ - Query: How to Reduce Exposure to Alkylphenols Through Your Diet
756
+
757
+ - Doc Title: p-Nonyl-phenol: an estrogenic xenobiotic released from "modified" polystyrene.
758
+
759
+ - Doc Text: Alkylphenols are widely used as plastic additives and surfactants. We report the identification of an alkylphenol, nonylphenol, as an estrogenic substance released from plastic centrifuge tubes. This compound was extracted with methanol, purified by flash chromatography and reverse-phase high perfor...
760
+
761
+
762
+ **Example 4:**
763
+
764
+ - Query: What’s Driving America’s Obesity Problem?
765
+
766
+ - Doc Title: The role of exercise in the treatment of obesity.
767
+
768
+ - Doc Text: The United States is in the midst of a significant public health problem that relates to obesity and inactivity. This epidemic has far-ranging consequences for our workforce and our children and shows no signs of slowing in the near future. Significant research has been performed on the effects of e...
769
+
770
+
771
+ **Example 5:**
772
+
773
+ - Query: Who Should be Careful About Curcumin?
774
+
775
+ - Doc Title: Total and soluble oxalate content of some Indian spices.
776
+
777
+ - Doc Text: Spices, such as cinnamon, cloves, cardamom, garlic, ginger, cumin, coriander and turmeric are used all over the world as flavouring and colouring ingredients in Indian foods. Previous studies have shown that spices contain variable amounts of total oxalates but there are few reports of soluble oxala...
778
+
779
+
780
+ **Example 6:**
781
+
782
+ - Query: Foods for Glaucoma
783
+
784
+ - Doc Title: Antioxidant intake and primary open-angle glaucoma: a prospective study.
785
+
786
+ - Doc Text: The relation between dietary antioxidant intake and primary open-angle glaucoma risk was examined in participants aged over 40 years in the Nurses' Health Study (n = 76,200) and the Health Professionals Follow-up Study (n = 40,284). They were followed biennially from 1980 and 1986, respectively, to ...
787
+
788
+
789
+ **Example 7:**
790
+
791
+ - Query: What is Actually in Chicken Nuggets?
792
+
793
+ - Doc Title: The autopsy of chicken nuggets reads "chicken little".
794
+
795
+ - Doc Text: PURPOSE: To determine the contents of chicken nuggets from 2 national food chains. BACKGROUND: Chicken nuggets have become a major component of the American diet. We sought to determine the current composition of this highly processed food. METHODS: Randomly selected nuggets from 2 different nationa...
796
+
797
+
798
+ **Example 8:**
799
+
800
+ - Query: What Do Meat Purge and Cola Have in Common?
801
+
802
+ - Doc Title: Public health impact of dietary phosphorus excess on bone and cardiovascular health in the general population.
803
+
804
+ - Doc Text: This review explores the potential adverse impact of the increasing phosphorus content in the American diet on renal, cardiovascular, and bone health of the general population. Increasingly, studies show that phosphorus intakes in excess of the nutrient needs of a healthy population may significantl...
805
+
806
+
807
+ **Example 9:**
808
+
809
+ - Query: Chronic Headaches and Pork Parasites
810
+
811
+ - Doc Title: Clinical manifestations, diagnosis, and treatment of neurocysticercosis.
812
+
813
+ - Doc Text: Neurocysticercosis (NCC) is the most frequent parasitic disease of the human brain. Modern imaging studies, CT and MRI, have defined the diagnosis and characterization of the disease. Through these studies the therapeutic approach for each case may be individualized with the aid of antihelmintics, s...
814
+
815
+
816
+ **Example 10:**
817
+
818
+ - Query: Stopping Heart Disease in Childhood
819
+
820
+ - Doc Title: Relation of serum lipoprotein levels and systolic blood pressure to early atherosclerosis. The Bogalusa Heart Study.
821
+
822
+ - Doc Text: We assessed the relation of risk factors for cardiovascular disease to early atherosclerotic lesions in the aorta and coronary arteries in 35 persons (mean age at death, 18 years). Aortic involvement with fatty streaks was greater in blacks than in whites (37 vs. 17 percent, P less than 0.01). Howev...
823
+
824
+
825
+ ## CORE
826
+
827
+ **Has Scores 1 and 2 in dev/test, but only Score 1 in train**
828
+ **Score 2 = HIGHLY RELEVANT** ✓
829
+ **Score 1 = RELEVANT** ✓
830
+
831
+ ## Special Notes
832
+
833
+ - **Original Purpose**: Medical IR bridging layman queries to technical PubMed docs
834
+ - **What Scores Really Mean**:
835
+ - Score 2: Direct, highly relevant medical papers (best answers)
836
+ - Score 1: Related papers providing context/mechanisms (still relevant!)
837
+ - Example: "Reducing Leucine" query → Score 2: caloric restriction papers, Score 1: mTOR/amino acid papers (leucine activates mTOR!)
838
+ - **Dataset Type**: Medical domain Q&A (NutritionFacts.org)
839
+ - **CRITICAL ISSUE**: Train has collapsed scores (all 1s), test/dev have proper 1/2 distinction
840
+
841
+ ## Training Verdict
842
+
843
+ **GOOD FOR TRAINING** ✓ (with caveats)
844
+ - Both Score 1 and 2 are truly relevant
845
+ - Score 1 provides mechanistic/related info
846
+ - BUT: Must handle train/test mismatch (train lacks Score 2)
847
+ - Recommendation: Use as binary (1+2 = positive) OR use only dev/test splits
848
+
849
+ ## Observations
850
+
851
+ ⚠️ **Score mismatch between train and test:**
852
+
853
+ - Train has scores: [1]
854
+
855
+ - Test has scores: [1, 2]
856
+
857
+ - This could cause issues when training models: Score 2 appears in the test split but is never seen in the train split
858
+
859
+ ⚠️ **Training data only has Score 1:**
860
+
861
+ - All training documents are marked as equally relevant
862
+
863
+ - Model cannot learn to distinguish relevance levels
864
+
865
+ - May indicate collapsed/merged relevance levels
test_scores/dataset_reports/quora_analysis.md ADDED
File without changes
test_scores/dataset_reports/scidocs_analysis.md ADDED
@@ -0,0 +1,555 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SCIDOCS Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 1000
6
+
7
+ - Total corpus documents: 25657
8
+
9
+ - Corpus documents loaded for examples: 25657
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | N/A | N/A | N/A |
19
+
20
+ | dev | N/A | N/A | N/A |
21
+
22
+ | test | 1000 | 29928 | 0:25000, 1:4928 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Multiple score levels found: [0, 1]**
28
+
29
+ - Higher scores indicate higher relevance
30
+
31
+ - Score 0 typically means not relevant
32
+
33
+ - Score 1 typically means relevant
34
+
35
+ - Score 2 typically means highly relevant (not present in this dataset; only scores 0 and 1 occur)
36
+
37
+
38
+ ## Examples
39
+
40
+ ### Score Comparison Examples
41
+
42
+ #### Comparison Example 1
43
+
44
+ **Query:** A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect
45
+
46
+ **From:** test split
47
+
48
+
49
+ **Score 1 Documents:**
50
+
51
+
52
+ *Document 1:*
53
+
54
+ - Title: A hybrid of genetic algorithm and particle swarm optimization for recurrent network design
55
+
56
+ - Text: An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is thus called HGAPSO. In HGAPSO, individuals in a new generation are created, not only by crossover a...
57
+
58
+
59
+ *Document 2:*
60
+
61
+ - Title: A Hybrid EP and SQP for Dynamic Economic Dispatch with Nonsmooth Fuel Cost Function
62
+
63
+ - Text: Dynamic economic dispatch (DED) is one of the main functions of power generation operation and control. It determines the optimal settings of generator units with predicted load demand over a certain period of time. The objective is to operate an electric power system most economically while the system is operating within its security limits. This paper proposes a new hybrid methodology for solvin...
64
+
65
+
66
+ *Document 3:*
67
+
68
+ - Title: Genetic Fuzzy Systems - Evolutionary Tuning and Learning of Fuzzy Knowledge Bases
69
+
70
+ - Text: It's not surprisingly when entering this site to get the book. One of the popular books now is the genetic fuzzy systems evolutionary tuning and learning of fuzzy knowledge bases. You may be confused because you can't find the book in the book store around your city. Commonly, the popular book will be sold quickly. And when you have found the store to buy the book, it will be so hurt when you run ...
71
+
72
+
73
+
74
+ **Score 0 Documents:**
75
+
76
+
77
+ *Document 1:*
78
+
79
+ - Title: Separate face and body selectivity on the fusiform gyrus.
80
+
81
+ - Text: Recent reports of a high response to bodies in the fusiform face area (FFA) challenge the idea that the FFA is exclusively selective for face stimuli. We examined this claim by conducting a functional magnetic resonance imaging experiment at both standard (3.125 x 3.125 x 4.0 mm) and high resolution (1.4 x 1.4 x 2.0 mm). In both experiments, regions of interest (ROIs) were defined using data from ...
82
+
83
+
84
+ *Document 2:*
85
+
86
+ - Title: Scheduling for Reduced CPU Energy
87
+
88
+ - Text: The energy usage of computer systems is becoming more important, especially for battery operated systems. Displays, disks, and cpus, in that order, use the most energy. Reducing the energy used by displays and disks has been studied elsewhere; this paper considers a new method for reducing the energy used by the cpu. We introduce a new metric for cpu energy performance, millions-of-instructions-pe...
89
+
90
+
91
+ *Document 3:*
92
+
93
+ - Title: A data mining approach for location prediction in mobile environments
94
+
95
+ - Text: Mobility prediction is one of the most essential issues that need to be explored for mobility management in mobile computing systems. In this paper, we propose a new algorithm for predicting the next inter-cell movement of a mobile user in a Personal Communication Systems network. In the first phase of our threephase algorithm, user mobility patterns are mined from the history of mobile user traje...
96
+
97
+
98
+
99
+ ---
100
+
101
+
102
+ #### Comparison Example 2
103
+
104
+ **Query:** Bearish-Bullish Sentiment Analysis on Financial Microblogs
105
+
106
+ **From:** test split
107
+
108
+
109
+ **Score 1 Documents:**
110
+
111
+
112
+ *Document 1:*
113
+
114
+ - Title: SemEval-2015 Task 11: Sentiment Analysis of Figurative Language in Twitter
115
+
116
+ - Text: This report summarizes the objectives and evaluation of the SemEval 2015 task on the sentiment analysis of figurative language on Twitter (Task 11). This is the first sentiment analysis task wholly dedicated to analyzing figurative language on Twitter. Specifically, three broad classes of figurative language are considered: irony, sarcasm and metaphor. Gold standard sets of 8000 training tweets an...
117
+
118
+
119
+ *Document 2:*
120
+
121
+ - Title: Text mining of news-headlines for FOREX market prediction: A Multi-layer Dimension Reduction Algorithm with semantics and sentiment
122
+
123
+ - Text: In this paper a novel approach is proposed to predict intraday directional-movements of a currency-pair in the foreign exchange market based on the text of breaking financial news-headlines. The motivation behind this work is twofold: First, although market-prediction through text-mining is shown to be a promising area of work in the literature, the text-mining approaches utilized in it at this st...
124
+
125
+
126
+ *Document 3:*
127
+
128
+ - Title: BabelNet: The automatic construction, evaluation and application of a wide-coverage multilingual semantic network
129
+
130
+ - Text: a r t i c l e i n f o a b s t r a c t We present an automatic approach to the construction of BabelNet, a very large, wide-coverage multilingual semantic network. Key to our approach is the integration of lexicographic and encyclopedic knowledge from WordNet and Wikipedia. In addition, Machine Translation is applied to enrich the resource with lexical information for all languages. We first conduc...
131
+
132
+
133
+
134
+ **Score 0 Documents:**
135
+
136
+
137
+ *Document 1:*
138
+
139
+ - Title: A survey of software testing practices in alberta
140
+
141
+ - Text: Software organizations have typically de-emphasized the importance of software testing. In this paper, the results of a regional survey of software testing and software quality assurance techniques are described. Researchers conducted the study during the summer and fall of 2002 by surveying software organizations in the Province of Alberta. Results indicate that Alberta-based organizations tend t...
142
+
143
+
144
+ *Document 2:*
145
+
146
+ - Title: Analysis and Design of Average Current Mode Control Using a Describing-Function-Based Equivalent Circuit Model
147
+
148
+ - Text: This paper proposes a small-signal model for average current mode control based on an equivalent circuit. The model uses a three-terminal equivalent circuit model based on a linearized describing function method to include the feedback effect of the sideband frequency components of the inductor current. The model extends the results obtained in peak current mode control to average current mode con...
149
+
150
+
151
+ *Document 3:*
152
+
153
+ - Title: Using deep learning for short text understanding
154
+
155
+ - Text: Classifying short texts to one category or clustering semantically related texts is challenging, and the importance of both is growing due to the rise of microblogging platforms, digital news feeds, and the like. We can accomplish this classifying and clustering with the help of a deep neural network which produces compact binary representations of a short text, and can assign the same category to...
156
+
157
+
158
+
159
+ ---
160
+
161
+
162
+ #### Comparison Example 3
163
+
164
+ **Query:** Predicting defects in SAP Java code: An experience report
165
+
166
+ **From:** test split
167
+
168
+
169
+ **Score 1 Documents:**
170
+
171
+
172
+ *Document 1:*
173
+
174
+ - Title: Hipikat: a project memory for software development
175
+
176
+ - Text: Sociological and technical difficulties, such as a lack of informal encounters, can make it difficult for new members of noncollocated software development teams to learn from their more experienced colleagues. To address this situation, we have developed a tool, named Hipikat that provides developers with efficient and effective access to the group memory for a software development project that i...
177
+
178
+
179
+ *Document 2:*
180
+
181
+ - Title: A Metrics Suite for Object Oriented Design
182
+
183
+ - Text: Given the central role that software development plays in the delivery and application of information technology, managers are increasingly focusing on process improvement in the software development area. This demand has spurred the provision of a number of new and/or improved approaches to software development, with perhaps the most prominent being object-orientation (OO). In addition, the focus...
184
+
185
+
186
+ *Document 3:*
187
+
188
+ - Title: Two case studies of open source software development: Apache and Mozilla
189
+
190
+ - Text: According to its proponents, open source style software development has the capacity to compete successfully, and perhaps in many cases displace, traditional commercial development methods. In order to begin investigating such claims, we examine data from two major open source projects, the Apache web server and the Mozilla browser. By using email archives of source code change history and problem...
191
+
192
+
193
+
194
+ **Score 0 Documents:**
195
+
196
+
197
+ *Document 1:*
198
+
199
+ - Title: Object Detection Featuring 3D Audio Localization for Microsoft HoloLens - A Deep Learning based Sensor Substitution Approach for the Blind
200
+
201
+ - Text: Finding basic objects on a daily basis is a difficult but common task for blind people. This paper demonstrates the implementation of a wearable, deep learning backed, object detection approach in the context of visual impairment or blindness. The prototype aims to substitute the impaired eye of the user and replace it with technical sensors. By scanning its surroundings, the prototype provides a ...
202
+
203
+
204
+ *Document 2:*
205
+
206
+ - Title: English as a Formal Specification Language
207
+
208
+ - Text: PENG is a computer-processable controlled natural language designed for writing unambiguous and precise specifications. PENG covers a strict subset of standard English and is precisely defined by a controlled grammar and a controlled lexicon. In contrast to other controlled languages, the author does not need to know the grammatical restrictions explicitly. ECOLE, a look-ahead text editor, indicat...
209
+
210
+
211
+ *Document 3:*
212
+
213
+ - Title: Automatic Retraction and Full-Cycle Operation for a Class of Airborne Wind Energy Generators
214
+
215
+ - Text: Airborne wind energy systems aim to harvest the power of winds blowing at altitudes higher than what conventional wind turbines reach. They employ a tethered flying structure, usually a wing, and exploit the aerodynamic lift to produce electrical power. In the case of ground-based systems, where the traction force on the tether is used to drive a generator on the ground, a two-phase power cycle is...
216
+
217
+
218
+
219
+ ---
220
+
221
+
222
+ #### Comparison Example 4
223
+
224
+ **Query:** Active-Metric Learning for Classification of Remotely Sensed Hyperspectral Images
225
+
226
+ **From:** test split
227
+
228
+
229
+ **Score 1 Documents:**
230
+
231
+
232
+ *Document 1:*
233
+
234
+ - Title: Query by Committee
235
+
236
+ - Text: We propose an algorithm called query by commitee, in which a committee of students is trained on the same data set. The next query is chosen according to the principle of maximal disagreement. The algorithm is studied for two toy models: the high-low game and perceptron learning of another perceptron. As the number of queries goes to infinity, the committee algorithm yields asymptotically finite i...
237
+
238
+
239
+ *Document 2:*
240
+
241
+ - Title: Adaptive Manifold Learning
242
+
243
+ - Text: Manifold learning algorithms seek to find a low-dimensional parameterization of high-dimensional data. They heavily rely on the notion of what can be considered as local, how accurately the manifold can be approximated locally, and, last but not least, how the local structures can be patched together to produce the global parameterization. In this paper, we develop algorithms that address two key ...
244
+
245
+
246
+ *Document 3:*
247
+
248
+ - Title: Manifold-Learning-Based Feature Extraction for Classification of Hyperspectral Data: A Review of Advances in Manifold Learning
249
+
250
+ - Text: Advances in hyperspectral sensing provide new capability for characterizing spectral signatures in a wide range of physical and biological systems, while inspiring new methods for extracting information from these data. HSI data often lie on sparse, nonlinear manifolds whose geometric and topological structures can be exploited via manifold-learning techniques. In this article, we focused on demon...
251
+
252
+
253
+
254
+ **Score 0 Documents:**
255
+
256
+
257
+ *Document 1:*
258
+
259
+ - Title: Leveraging graph topology and semantic context for pharmacovigilance through twitter-streams
260
+
261
+ - Text: Adverse drug events (ADEs) constitute one of the leading causes of post-therapeutic death and their identification constitutes an important challenge of modern precision medicine. Unfortunately, the onset and effects of ADEs are often underreported complicating timely intervention. At over 500 million posts per day, Twitter is a commonly used social media platform. The ubiquity of day-to-day perso...
262
+
263
+
264
+ *Document 2:*
265
+
266
+ - Title: IMPLICIT SHAPE MODELS FOR OBJECT DETECTION IN 3D POINT CLOUDS
267
+
268
+ - Text: We present a method for automatic object localization and recognition in 3D point clouds representing outdoor urban scenes. The method is based on the implicit shape models (ISM) framework, which recognizes objects by voting for their center locations. It requires only few training examples per class, which is an important property for practical use. We also introduce and evaluate an improved vers...
269
+
270
+
271
+ *Document 3:*
272
+
273
+ - Title: Multi-task Learning for Maritime Traffic Surveillance from AIS Data Streams
274
+
275
+ - Text: In a world of global trading, maritime safety, security and efficiency are crucial issues. We propose a multi-task deep learning framework for vessel monitoring using Automatic Identification System (AIS) data streams. We combine recurrent neural networks with latent variable modeling and an embedding of AIS messages to a new representation space to jointly address key issues to be dealt with when...
276
+
277
+
278
+
279
+ ---
280
+
281
+
282
+ #### Comparison Example 5
283
+
284
+ **Query:** Ad Hoc Retrieval Experiments Using WordNet and Automatically Constructed Thesauri
285
+
286
+ **From:** test split
287
+
288
+
289
+ **Score 1 Documents:**
290
+
291
+
292
+ *Document 1:*
293
+
294
+ - Title: Concept Based Query Expansion
295
+
296
+ - Text: Query expansion methods have been studied for a long time - with debatable success in many instances. In this paper we present a probabilistic query expansion model based on a similarity thesaurus which was constructed automatically. A similarity thesaurus reflects domain knowledge about the particular collection from which it is constructed. We address the two important issues with query expansio...
297
+
298
+
299
+ *Document 2:*
300
+
301
+ - Title: An Association Thesaurus for Information Retrieval
302
+
303
+ - Text: Although commonly used in both commercial and experimental information retrieval systems, thesauri have not demonstrated consistent benefits for retrieval performance, and it is difficult to construct a thesaurus automatically for large text databases. In this paper, an approach, called PhraseFinder, is proposed to construct collection-dependent association thesauri automatically using large full-tex...
304
+
305
+
306
+ *Document 3:*
307
+
308
+ - Title: Experiments in Automatic Statistical Thesaurus Construction
309
+
310
+ - Text: A well constructed thesaurus has long been recognized as a valuable tool in the effective operation of an information retrieval system. This paper reports the results of experiments designed to determine the validity of an approach to the automatic construction of global thesauri (described originally by Crouch in [1] and [2] based on a clustering of the document collection. The authors validate t...
311
+
312
+
313
+
314
+ **Score 0 Documents:**
315
+
316
+
317
+ *Document 1:*
318
+
319
+ - Title: Build-to-order supply chain management : a literature review and framework for development
320
+
321
+ - Text: The build-to-order supply chain management (BOSC) strategy has recently attracted the attention of both researchers and practitioners, given its successful implementation in many companies including Dell computers, Compaq, and BMW. The growing number of articles on BOSC in the literature is an indication of the importance of the strategy and of its role in improving the competitiveness of an organ...
322
+
323
+
324
+ *Document 2:*
325
+
326
+ - Title: Team MIT Urban Challenge Technical Report
327
+
328
+ - Text: This technical report describes Team MIT's approach to the DARPA Urban Challenge. We have developed a novel strategy for using many inexpensive sensors, mounted on the vehicle periphery, and calibrated with a new cross-modal calibration technique. Lidar, camera, and radar data streams are processed using an innovative, locally smooth state representation that provides robust perception for real-t...
329
+
330
+
331
+ *Document 3:*
332
+
333
+ - Title: Platforms in healthcare innovation ecosystems: The lens of an innovation intermediary
334
+
335
+ - Text: Healthcare innovation has made progressive strides. Innovative solutions now tend to incorporate device integration, data collection and data analysis linked across a diverse range of actors building platform-centric healthcare ecosystems. The interconnectedness and inter-disciplinarity of the ecosystems bring with it a number of vital issues around how to strategically manage such a complex syste...
336
+
337
+
338
+
339
+ ---
340
+
341
+
342
+ ### Examples by Split and Score
343
+
344
+
345
+ #### TEST Split
346
+
347
+ **Scores in test: [0, 1]**
348
+
349
+
350
+ ##### Score 0 Examples (test)
351
+
352
+
353
+ **Example 1:**
354
+
355
+ - Query: A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect
356
+
357
+ - Doc Title: Separate face and body selectivity on the fusiform gyrus.
358
+
359
+ - Doc Text: Recent reports of a high response to bodies in the fusiform face area (FFA) challenge the idea that the FFA is exclusively selective for face stimuli. We examined this claim by conducting a functional magnetic resonance imaging experiment at both standard (3.125 x 3.125 x 4.0 mm) and high resolution...
360
+
361
+
362
+ **Example 2:**
363
+
364
+ - Query: Bearish-Bullish Sentiment Analysis on Financial Microblogs
365
+
366
+ - Doc Title: A survey of software testing practices in alberta
367
+
368
+ - Doc Text: Software organizations have typically de-emphasized the importance of software testing. In this paper, the results of a regional survey of software testing and software quality assurance techniques are described. Researchers conducted the study during the summer and fall of 2002 by surveying softwar...
369
+
370
+
371
+ **Example 3:**
372
+
373
+ - Query: Predicting defects in SAP Java code: An experience report
374
+
375
+ - Doc Title: Object Detection Featuring 3D Audio Localization for Microsoft HoloLens - A Deep Learning based Sensor Substitution Approach for the Blind
376
+
377
+ - Doc Text: Finding basic objects on a daily basis is a difficult but common task for blind people. This paper demonstrates the implementation of a wearable, deep learning backed, object detection approach in the context of visual impairment or blindness. The prototype aims to substitute the impaired eye of the...
378
+
379
+
380
+ **Example 4:**
381
+
382
+ - Query: Active-Metric Learning for Classification of Remotely Sensed Hyperspectral Images
383
+
384
+ - Doc Title: Leveraging graph topology and semantic context for pharmacovigilance through twitter-streams
385
+
386
+ - Doc Text: Adverse drug events (ADEs) constitute one of the leading causes of post-therapeutic death and their identification constitutes an important challenge of modern precision medicine. Unfortunately, the onset and effects of ADEs are often underreported complicating timely intervention. At over 500 milli...
387
+
388
+
389
+ **Example 5:**
390
+
391
+ - Query: Ad Hoc Retrieval Experiments Using WordNet and Automatically Constructed Thesauri
392
+
393
+ - Doc Title: Build-to-order supply chain management : a literature review and framework for development
394
+
395
+ - Doc Text: The build-to-order supply chain management (BOSC) strategy has recently attracted the attention of both researchers and practitioners, given its successful implementation in many companies including Dell computers, Compaq, and BMW. The growing number of articles on BOSC in the literature is an indic...
396
+
397
+
398
+ **Example 6:**
399
+
400
+ - Query: Underwater Acoustic Target Tracking: A Review
401
+
402
+ - Doc Title: Performance of Store Brands: A Cross-Country Analysis of Consumer Store Brand Preferences, Perceptions, and Risk
403
+
404
+ - Doc Text: This paper empirically studies consumer choice behavior in regard to store brands in the US, UK and Spain. Store brand market shares differ by country and they are usually much higher in Europe than in the US. However, there is surprisingly little work in marketing that empirically studies the reaso...
405
+
406
+
407
+ **Example 7:**
408
+
409
+ - Query: Unsupervised Diverse Colorization via Generative Adversarial Networks
410
+
411
+ - Doc Title: Active learning for on-road vehicle detection: a comparative study
412
+
413
+ - Doc Text: In recent years, active learning has emerged as a powerful tool in building robust systems for object detection using computer vision. Indeed, active learning approaches to on-road vehicle detection have achieved impressive results. While active learning approaches for object detection have been exp...
414
+
415
+
416
+ **Example 8:**
417
+
418
+ - Query: Lane Detection ( Part I ) : Mono-Vision Based Method
419
+
420
+ - Doc Title: The Ontological Foundation of REA Enterprise Information Systems
421
+
422
+ - Doc Text: Philosophers have studied ontologies for centuries in their search for a systematic explanation of existence: “What kind of things exist?” Recently, ontologies have emerged as a major research topic in the fields of artificial intelligence and knowledge management where they address the content issu...
423
+
424
+
425
+ **Example 9:**
426
+
427
+ - Query: Detection of distributed denial of service attacks using machine learning algorithms in software defined networks
428
+
429
+ - Doc Title: Sensing spatial distribution of urban land use by integrating points-of-interest and Google Word2Vec model
430
+
431
+ - Doc Text: Urban land use information plays an essential role in a wide variety of urban planning and environmental monitoring processes. During the past few decades, with the rapid technological development of remote sensing (RS), geographic information systems (GIS) and geospatial big data, numerous methods ...
432
+
433
+
434
+ **Example 10:**
435
+
436
+ - Query: Distributed Privacy-Preserving Collaborative Intrusion Detection Systems for VANETs
437
+
438
+ - Doc Title: The role of transparency in recommender systems
439
+
440
+ - Doc Text: Recommender Systems act as a personalized decision guides, aiding users in decisions on matters related to personal taste. Most previous research on Recommender Systems has focused on the statistical accuracy of the algorithms driving the systems, with little emphasis on interface issues and the use...
441
+
442
+
443
+ ##### Score 1 Examples (test)
444
+
445
+
446
+ **Example 1:**
447
+
448
+ - Query: A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect
449
+
450
+ - Doc Title: A hybrid of genetic algorithm and particle swarm optimization for recurrent network design
451
+
452
+ - Doc Text: An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is...
453
+
454
+
455
+ **Example 2:**
456
+
457
+ - Query: Bearish-Bullish Sentiment Analysis on Financial Microblogs
458
+
459
+ - Doc Title: SemEval-2015 Task 11: Sentiment Analysis of Figurative Language in Twitter
460
+
461
+ - Doc Text: This report summarizes the objectives and evaluation of the SemEval 2015 task on the sentiment analysis of figurative language on Twitter (Task 11). This is the first sentiment analysis task wholly dedicated to analyzing figurative language on Twitter. Specifically, three broad classes of figurative...
462
+
463
+
464
+ **Example 3:**
465
+
466
+ - Query: Predicting defects in SAP Java code: An experience report
467
+
468
+ - Doc Title: Hipikat: a project memory for software development
469
+
470
+ - Doc Text: Sociological and technical difficulties, such as a lack of informal encounters, can make it difficult for new members of noncollocated software development teams to learn from their more experienced colleagues. To address this situation, we have developed a tool, named Hipikat that provides develope...
471
+
472
+
473
+ **Example 4:**
474
+
475
+ - Query: Active-Metric Learning for Classification of Remotely Sensed Hyperspectral Images
476
+
477
+ - Doc Title: Query by Committee
478
+
479
+ - Doc Text: We propose an algorithm called query by committee, in which a committee of students is trained on the same data set. The next query is chosen according to the principle of maximal disagreement. The algorithm is studied for two toy models: the high-low game and perceptron learning of another perceptro...
480
+
481
+
482
+ **Example 5:**
483
+
484
+ - Query: Ad Hoc Retrieval Experiments Using WordNet and Automatically Constructed Thesauri
485
+
486
+ - Doc Title: Concept Based Query Expansion
487
+
488
+ - Doc Text: Query expansion methods have been studied for a long time - with debatable success in many instances. In this paper we present a probabilistic query expansion model based on a similarity thesaurus which was constructed automatically. A similarity thesaurus reflects domain knowledge about the particu...
489
+
490
+
491
+ **Example 6:**
492
+
493
+ - Query: Underwater Acoustic Target Tracking: A Review
494
+
495
+ - Doc Title: The challenges of building mobile underwater wireless networks for aquatic applications
496
+
497
+ - Doc Text: The large-scale mobile underwater wireless sensor network (UWSN) is a novel networking paradigm to explore aqueous environments. However, the characteristics of mobile UWSNs, such as low communication bandwidth, large propagation delay, floating node mobility, and high error probability, are signifi...
498
+
499
+
500
+ **Example 7:**
501
+
502
+ - Query: Unsupervised Diverse Colorization via Generative Adversarial Networks
503
+
504
+ - Doc Title: Infrared Colorization Using Deep Convolutional Neural Networks
505
+
506
+ - Doc Text: This paper proposes a method for transferring the RGB color spectrum to near-infrared (NIR) images using deep multi-scale convolutional neural networks. A direct and integrated transfer between NIR and RGB pixels is trained. The trained model does not require any user guidance or a reference image d...
507
+
508
+
509
+ **Example 8:**
510
+
511
+ - Query: Lane Detection ( Part I ) : Mono-Vision Based Method
512
+
513
+ - Doc Title: Real time detection of lane markers in urban streets
514
+
515
+ - Doc Text: We present a robust and real time approach to lane marker detection in urban streets. It is based on generating a top view of the road, filtering using selective oriented Gaussian filters, using RANSAC line fitting to give initial guesses to a new and fast RANSAC algorithm for fitting Bezier Splines...
516
+
517
+
518
+ **Example 9:**
519
+
520
+ - Query: Detection of distributed denial of service attacks using machine learning algorithms in software defined networks
521
+
522
+ - Doc Title: Intrusion detection by machine learning: A review
523
+
524
+ - Doc Text: The popularity of using Internet contains some risks of network attacks. Intrusion detection is one major research problem in network security, whose aim is to identify unusual access or attacks to secure internal networks. In literature, intrusion detection systems have been approached by various m...
525
+
526
+
527
+ **Example 10:**
528
+
529
+ - Query: Distributed Privacy-Preserving Collaborative Intrusion Detection Systems for VANETs
530
+
531
+ - Doc Title: Private Empirical Risk Minimization: Efficient Algorithms and Tight Error Bounds
532
+
533
+ - Doc Text: Convex empirical risk minimization is a basic tool in machine learning and statistics. We provide new algorithms and matching lower bounds for differentially private convex empirical risk minimization assuming only that each data point's contribution to the loss function is Lipschitz and that the do...
534
+
535
+
536
+ ## CORE
537
+
538
+ **Has Scores 0, 1**
539
+ **Score 1 = RELEVANT** ✓
540
+
541
+ ## Special Notes
542
+
543
+ - **Original Purpose**: Citation prediction - find papers to cite for a given paper
544
+ - **What Score 1 Really Means**: Papers that should be cited (topically related research)
545
+ - **Dataset Type**: Scientific citation recommendation
546
+ - **Key Characteristic**: Helps find relevant prior work for academic papers
547
+
548
+ ## Training Verdict
549
+
550
+ **GOOD FOR TRAINING** ✓
551
+ - Score 1 documents are citation-worthy papers
552
+ - True relevance for academic IR
553
+ - Can use for binary classification: Score 1 = positive, Score 0 = negative
554
+
555
+ ## Observations
test_scores/dataset_reports/scifact_analysis.md ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SCIFACT Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 1109
6
+
7
+ - Total corpus documents: 5183
8
+
9
+ - Corpus documents loaded for examples: 5183
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | 809 | 919 | 1:919 |
19
+
20
+ | dev | N/A | N/A | N/A |
21
+
22
+ | test | 300 | 339 | 1:339 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Only one score level (1) found in this dataset.**
28
+
29
+ - This means all labeled documents are considered equally relevant
30
+
31
+ - Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
32
+
33
+
34
+ ## Examples
35
+
36
+ ### Score Comparison Examples
37
+
38
+ ### Examples by Split and Score
39
+
40
+
41
+ #### TRAIN Split
42
+
43
+ **Scores in train: [1]**
44
+
45
+
46
+ ##### Score 1 Examples (train)
47
+
48
+
49
+ **Example 1:**
50
+
51
+ - Query: 0-dimensional biomaterials lack inductive properties.
52
+
53
+ - Doc Title: New opportunities: the use of nanotechnologies to manipulate and track stem cells.
54
+
55
+ - Doc Text: Nanotechnologies are emerging platforms that could be useful in measuring, understanding, and manipulating stem cells. Examples include magnetic nanoparticles and quantum dots for stem cell labeling and in vivo tracking; nanoparticles, carbon nanotubes, and polyplexes for the intracellular delivery ...
56
+
57
+
58
+ **Example 2:**
59
+
60
+ - Query: 1 in 5 million in UK have abnormal PrP positivity.
61
+
62
+ - Doc Title: Prevalent abnormal prion protein in human appendixes after bovine spongiform encephalopathy epizootic: large scale survey
63
+
64
+ - Doc Text: OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, an...
65
+
66
+
67
+ **Example 3:**
68
+
69
+ - Query: 1-1% of colorectal cancer patients are diagnosed with regional or distant metastases.
70
+
71
+ - Doc Title: Relation between Medicare screening reimbursement and stage at diagnosis for older patients with colon cancer.
72
+
73
+ - Doc Text: CONTEXT Medicare's reimbursement policy was changed in 1998 to provide coverage for screening colonoscopies for patients with increased colon cancer risk, and expanded further in 2001 to cover screening colonoscopies for all individuals. OBJECTIVE To determine whether the Medicare reimbursement poli...
74
+
75
+
76
+ **Example 4:**
77
+
78
+ - Query: 10% of sudden infant death syndrome (SIDS) deaths happen in newborns aged less than 6 months.
79
+
80
+ - Doc Title: Sudden infant death syndrome.
81
+
82
+ - Doc Text: Despite declines in prevalence during the past two decades, sudden infant death syndrome (SIDS) continues to be the leading cause of death for infants aged between 1 month and 1 year in developed countries. Behavioural risk factors identified in epidemiological studies include prone and side positio...
83
+
84
+
85
+ **Example 5:**
86
+
87
+ - Query: 32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.
88
+
89
+ - Doc Title: Liver transplantation and opioid dependence.
90
+
91
+ - Doc Text: ContextChronic hepatitis C is the leading cause for liver transplantation in the United States. Intravenous drug use, the major risk factor, accounts for approximately 60% of hepatitis C virus transmission. Information from the United Network of Organ Sharing (UNOS) does not address substance use am...
92
+
93
+
94
+ **Example 6:**
95
+
96
+ - Query: 4-PBA treatment decreases endoplasmic reticulum stress in response to general endoplasmic reticulum stress markers.
97
+
98
+ - Doc Title: Wolfram syndrome 1 and adenylyl cyclase 8 interact at the plasma membrane to regulate insulin production and secretion
99
+
100
+ - Doc Text: Endoplasmic reticulum (ER) stress causes pancreatic β-cell dysfunction and contributes to β-cell loss and the progression of type 2 diabetes. Wolfram syndrome 1 (WFS1) has been shown to be an important regulator of the ER stress signalling pathway; however, its role in β-cell function remains unclea...
101
+
102
+
103
+ **Example 7:**
104
+
105
+ - Query: 4-PBA treatment raises endoplasmic reticulum stress in response to general endoplasmic reticulum stress markers.
106
+
107
+ - Doc Title: Wolfram syndrome 1 and adenylyl cyclase 8 interact at the plasma membrane to regulate insulin production and secretion
108
+
109
+ - Doc Text: Endoplasmic reticulum (ER) stress causes pancreatic β-cell dysfunction and contributes to β-cell loss and the progression of type 2 diabetes. Wolfram syndrome 1 (WFS1) has been shown to be an important regulator of the ER stress signalling pathway; however, its role in β-cell function remains unclea...
110
+
111
+
112
+ **Example 8:**
113
+
114
+ - Query: 40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
115
+
116
+ - Doc Title: Effect of homocysteine lowering on mortality and vascular disease in advanced chronic kidney disease and end-stage renal disease: a randomized control
117
+
118
+ - Doc Text: CONTEXT High plasma homocysteine levels are a risk factor for mortality and vascular disease in observational studies of patients with chronic kidney disease. Folic acid and B vitamins decrease homocysteine levels in this population but whether they lower mortality is unknown. OBJECTIVE To determine...
119
+
120
+
121
+ **Example 9:**
122
+
123
+ - Query: 5'-nucleotidase metabolizes 6MP.
124
+
125
+ - Doc Title: Relapse specific mutations in NT5C2 in childhood acute lymphoblastic leukemia
126
+
127
+ - Doc Text: Relapsed childhood acute lymphoblastic leukemia (ALL) carries a poor prognosis, despite intensive retreatment, owing to intrinsic drug resistance. The biological pathways that mediate resistance are unknown. Here, we report the transcriptome profiles of matched diagnosis and relapse bone marrow spec...
128
+
129
+
130
+ **Example 10:**
131
+
132
+ - Query: 50% of patients exposed to radiation have activated markers of mesenchymal stem cells.
133
+
134
+ - Doc Title: KLF4-dependent phenotypic modulation of smooth muscle cells has a key role in atherosclerotic plaque pathogenesis
135
+
136
+ - Doc Text: Previous studies investigating the role of smooth muscle cells (SMCs) and macrophages in the pathogenesis of atherosclerosis have provided controversial results owing to the use of unreliable methods for clearly identifying each of these cell types. Here, using Myh11-CreERT2 ROSA floxed STOP eYFP Ap...
137
+
138
+
139
+ #### TEST Split
140
+
141
+ **Scores in test: [1]**
142
+
143
+
144
+ ##### Score 1 Examples (test)
145
+
146
+
147
+ **Example 1:**
148
+
149
+ - Query: 0-dimensional biomaterials show inductive properties.
150
+
151
+ - Doc Title: New opportunities: the use of nanotechnologies to manipulate and track stem cells.
152
+
153
+ - Doc Text: Nanotechnologies are emerging platforms that could be useful in measuring, understanding, and manipulating stem cells. Examples include magnetic nanoparticles and quantum dots for stem cell labeling and in vivo tracking; nanoparticles, carbon nanotubes, and polyplexes for the intracellular delivery ...
154
+
155
+
156
+ **Example 2:**
157
+
158
+ - Query: 1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.
159
+
160
+ - Doc Title: Rare Variants Create Synthetic Genome-Wide Associations
161
+
162
+ - Doc Text: Genome-wide association studies (GWAS) have now identified at least 2,000 common variants that appear associated with common diseases or related traits (http://www.genome.gov/gwastudies), hundreds of which have been convincingly replicated. It is generally thought that the associated markers reflect...
163
+
164
+
165
+ **Example 3:**
166
+
167
+ - Query: 1/2000 in UK have abnormal PrP positivity.
168
+
169
+ - Doc Title: Prevalent abnormal prion protein in human appendixes after bovine spongiform encephalopathy epizootic: large scale survey
170
+
171
+ - Doc Text: OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, an...
172
+
173
+
174
+ **Example 4:**
175
+
176
+ - Query: 5% of perinatal mortality is due to low birth weight.
177
+
178
+ - Doc Title: Estimates of global prevalence of childhood underweight in 1990 and 2015.
179
+
180
+ - Doc Text: CONTEXT One key target of the United Nations Millennium Development goals is to reduce the prevalence of underweight among children younger than 5 years by half between 1990 and 2015. OBJECTIVE To estimate trends in childhood underweight by geographic regions of the world. DESIGN, SETTING, AND PARTI...
181
+
182
+
183
+ **Example 5:**
184
+
185
+ - Query: A deficiency of vitamin B12 increases blood levels of homocysteine.
186
+
187
+ - Doc Title: Folic acid improves endothelial function in coronary artery disease via mechanisms largely independent of homocysteine lowering.
188
+
189
+ - Doc Text: BACKGROUND Homocysteine is a risk factor for coronary artery disease (CAD), although a causal relation remains to be proven. The importance of determining direct causality rests in the fact that plasma homocysteine can be safely and inexpensively reduced by 25% with folic acid. This reduction is max...
190
+
191
+
192
+ **Example 6:**
193
+
194
+ - Query: A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.
195
+
196
+ - Doc Title: Increased Microerythrocyte Count in Homozygous α+-Thalassaemia Contributes to Protection against Severe Malarial Anaemia
197
+
198
+ - Doc Text: BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia hom...
199
+
200
+
201
+ **Example 7:**
202
+
203
+ - Query: A total of 1,000 people in the UK are asymptomatic carriers of vCJD infection.
204
+
205
+ - Doc Title: Prevalent abnormal prion protein in human appendixes after bovine spongiform encephalopathy epizootic: large scale survey
206
+
207
+ - Doc Text: OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, an...
208
+
209
+
210
+ **Example 8:**
211
+
212
+ - Query: ADAR1 binds to Dicer to cleave pre-miRNA.
213
+
214
+ - Doc Title: ADAR1 Forms a Complex with Dicer to Promote MicroRNA Processing and RNA-Induced Gene Silencing
215
+
216
+ - Doc Text: Adenosine deaminases acting on RNA (ADARs) are involved in RNA editing that converts adenosine residues to inosine specifically in double-stranded RNAs. In this study, we investigated the interaction of the RNA editing mechanism with the RNA interference (RNAi) machinery and found that ADAR1 forms a...
217
+
218
+
219
+ **Example 9:**
220
+
221
+ - Query: AIRE is expressed in some skin tumors.
222
+
223
+ - Doc Title: Keratin-dependent regulation of Aire and gene expression in skin tumor keratinocytes
224
+
225
+ - Doc Text: Expression of the intermediate filament protein keratin 17 (K17) is robustly upregulated in inflammatory skin diseases and in many tumors originating in stratified and pseudostratified epithelia. We report that autoimmune regulator (Aire), a transcriptional regulator, is inducibly expressed in human...
226
+
227
+
228
+ **Example 10:**
229
+
230
+ - Query: ALDH1 expression is associated with better breast cancer outcomes.
231
+
232
+ - Doc Title: ALDH1 is a marker of normal and malignant human mammary stem cells and a predictor of poor clinical outcome.
233
+
234
+ - Doc Text: Application of stem cell biology to breast cancer research has been limited by the lack of simple methods for identification and isolation of normal and malignant stem cells. Utilizing in vitro and in vivo experimental systems, we show that normal and cancer human mammary epithelial cells with incre...
235
+
236
+
237
+ ## CORE
238
+
239
+ **Only Score 1 exists**
240
+ **Score 1 = RELEVANT** ✓
241
+
242
+ ## Special Notes
243
+
244
+ - **Original Purpose**: Scientific claim verification - find evidence to support/refute claims
245
+ - **What Score 1 Really Means**: Papers containing evidence for verifying scientific claims
246
+ - **Dataset Type**: Scientific fact checking
247
+ - **Key Characteristic**: Claims paired with abstracts that can verify them
248
+
249
+ ## Training Verdict
250
+
251
+ **GOOD FOR TRAINING** ✓
252
+ - Score 1 documents provide verification evidence
253
+ - True scientific relevance matching
254
+ - Can use for binary classification: Score 1 = positive class
255
+
256
+ ## Observations
257
+
258
+ ⚠️ **Training data only has Score 1:**
259
+
260
+ - All training documents are marked as equally relevant
261
+
262
+ - Model cannot learn to distinguish relevance levels
263
+
264
+ - May indicate collapsed/merged relevance levels
test_scores/dataset_reports/training_config.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "beir_training_config": {
3
+ "version": "2.0",
4
+ "created": "2024-01-09",
5
+ "purpose": "Three-category training with within-dataset sampling priority",
6
+ "datasets": {
7
+ "arguana": {
8
+ "use": true,
9
+ "provides_hard_negatives": false,
10
+ "splits_available": ["test"],
11
+ "score_to_category": {
12
+ "1": "positive",
13
+ "0": "easy_negative",
14
+ "unlabeled": "easy_negative"
15
+ },
16
+ "sampling": "within_dataset",
17
+ "notes": "Counter-arguments are relevant. Only test split (1406 queries)."
18
+ },
19
+ "climate-fever": {
20
+ "use": false,
21
+ "provides_hard_negatives": false,
22
+ "splits_available": ["test"],
23
+ "score_to_category": {
24
+ "1": "skip"
25
+ },
26
+ "sampling": "skip",
27
+ "notes": "Score 1 is questionable. DO NOT USE. Can provide easy negatives to others."
28
+ },
29
+ "dbpedia-entity": {
30
+ "use": true,
31
+ "provides_hard_negatives": true,
32
+ "splits_available": ["dev", "test"],
33
+ "score_to_category": {
34
+ "2": "positive",
35
+ "1": "hard_negative",
36
+ "0": "easy_negative",
37
+ "unlabeled": "easy_negative"
38
+ },
39
+ "sampling": "within_dataset",
40
+ "notes": "GOLD MINE! Score 1 = keyword match without intent = perfect hard negatives"
41
+ },
42
+ "fever": {
43
+ "use": true,
44
+ "provides_hard_negatives": false,
45
+ "splits_available": ["train", "dev", "test"],
46
+ "score_to_category": {
47
+ "1": "positive",
48
+ "0": "easy_negative",
49
+ "unlabeled": "easy_negative"
50
+ },
51
+ "sampling": "within_dataset",
52
+ "notes": "Evidence that supports OR refutes claims. Large dataset (140K train)."
53
+ },
54
+ "fiqa": {
55
+ "use": true,
56
+ "provides_hard_negatives": false,
57
+ "splits_available": ["train", "dev", "test"],
58
+ "score_to_category": {
59
+ "1": "positive",
60
+ "0": "easy_negative",
61
+ "unlabeled": "easy_negative"
62
+ },
63
+ "sampling": "within_dataset",
64
+ "notes": "Financial Q&A. 5500 train queries."
65
+ },
66
+ "hotpotqa": {
67
+ "use": true,
68
+ "provides_hard_negatives": false,
69
+ "splits_available": ["train", "dev", "test"],
70
+ "score_to_category": {
71
+ "1": "positive",
72
+ "0": "easy_negative",
73
+ "unlabeled": "easy_negative"
74
+ },
75
+ "sampling": "within_dataset",
76
+ "notes": "Multi-hop reasoning. 85K train queries."
77
+ },
78
+ "nfcorpus": {
79
+ "use": true,
80
+ "provides_hard_negatives": false,
81
+ "splits_available": ["train", "dev", "test"],
82
+ "score_to_category": {
83
+ "1": "positive",
84
+ "2": "positive",
85
+ "0": "easy_negative",
86
+ "unlabeled": "easy_negative"
87
+ },
88
+ "sampling": "within_dataset",
89
+ "notes": "Medical IR. Both 1&2 are relevant. WARNING: train has only 1s, test/dev have 1&2."
90
+ },
91
+ "quora": {
92
+ "use": true,
93
+ "provides_hard_negatives": false,
94
+ "splits_available": ["dev", "test"],
95
+ "score_to_category": {
96
+ "1": "positive",
97
+ "0": "easy_negative",
98
+ "unlabeled": "easy_negative"
99
+ },
100
+ "sampling": "within_dataset",
101
+ "notes": "Duplicate questions. 5K dev, 10K test queries."
102
+ },
103
+ "scidocs": {
104
+ "use": true,
105
+ "provides_hard_negatives": false,
106
+ "splits_available": ["test"],
107
+ "score_to_category": {
108
+ "1": "positive",
109
+ "0": "easy_negative",
110
+ "unlabeled": "easy_negative"
111
+ },
112
+ "sampling": "within_dataset",
113
+ "notes": "Citation prediction. 1000 test queries."
114
+ },
115
+ "scifact": {
116
+ "use": true,
117
+ "provides_hard_negatives": false,
118
+ "splits_available": ["train", "test"],
119
+ "score_to_category": {
120
+ "1": "positive",
121
+ "0": "easy_negative",
122
+ "unlabeled": "easy_negative"
123
+ },
124
+ "sampling": "within_dataset",
125
+ "notes": "Scientific claim verification. 920 train, 300 test."
126
+ },
127
+ "trec-covid": {
128
+ "use": true,
129
+ "provides_hard_negatives": false,
130
+ "splits_available": ["test"],
131
+ "score_to_category": {
132
+ "2": "positive",
133
+ "1": "positive",
134
+ "0": "easy_negative",
135
+ "unlabeled": "easy_negative"
136
+ },
137
+ "sampling": "within_dataset",
138
+ "notes": "COVID-19 IR. Both 1&2 are relevant. Only 50 test queries. Test qrels also contain score -1, which has no entry in score_to_category."
139
+ },
140
+ "webis-touche2020": {
141
+ "use": true,
142
+ "provides_hard_negatives": true,
143
+ "splits_available": ["test"],
144
+ "score_to_category": {
145
+ "2": "positive",
146
+ "1": "hard_negative",
147
+ "0": "easy_negative",
148
+ "unlabeled": "easy_negative"
149
+ },
150
+ "sampling": "within_dataset",
151
+ "notes": "GOLD MINE! Score 1 = wrong topic = perfect hard negatives. Only 49 test queries."
152
+ }
153
+ },
154
+ "global_settings": {
155
+ "sampling_factor": 10,
156
+ "min_easy_negatives": 5,
157
+ "max_easy_negatives": 100,
158
+ "sampling_priority": "within_dataset_first",
159
+ "cross_dataset_fallback": true,
160
+ "random_seed": 42,
161
+ "hard_negative_sources": ["dbpedia-entity", "webis-touche2020"]
162
+ }
163
+ }
164
+ }
test_scores/dataset_reports/trec-covid_analysis.md ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TREC-COVID Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 50
6
+
7
+ - Total corpus documents: 171332
8
+
9
+ - Corpus documents loaded for examples: 50000
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | N/A | N/A | N/A |
19
+
20
+ | dev | N/A | N/A | N/A |
21
+
22
+ | test | 50 | 66334 | 0:41661, 1:10456, 2:14217 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Multiple score levels found: [0, 1, 2]** (the test qrels additionally contain score -1; see "Scores in test" under the TEST split)
28
+
29
+ - Higher scores indicate higher relevance
30
+
31
+ - Score 0 typically means not relevant
32
+
33
+ - Score 1 typically means partially relevant
34
+
35
+ - Score 2 typically means highly relevant
36
+
37
+
38
+ ## Examples
39
+
40
+ ### Score Comparison Examples
41
+
42
+ #### Comparison Example 1
43
+
44
+ **Query:** what is the origin of COVID-19
45
+
46
+ **From:** test split
47
+
48
+
49
+ **Score 2 Documents:**
50
+
51
+
52
+ *Document 1:*
53
+
54
+ - Title: Intraspecies diversity of SARS-like coronaviruses in Rhinolophus sinicus and its implications for the origin of SARS coronaviruses in humans.
55
+
56
+ - Text: The Chinese rufous horseshoe bat (Rhinolophus sinicus) has been suggested to carry the direct ancestor of severe acute respiratory syndrome (SARS) coronavirus (SCoV), and the diversity of SARS-like CoVs (SLCoV) within this Rhinolophus species is therefore worth investigating. Here, we demonstrate the remarkable diversity of SLCoVs in R. sinicus and identify a strain with the same pattern of phylog...
57
+
58
+
59
+ *Document 2:*
60
+
61
+ - Title: Virus–Host Coevolution with a Focus on Animal and Human DNA Viruses
62
+
63
+ - Text: Viruses have been infecting their host cells since the dawn of life, and this extremely long-term coevolution gave rise to some surprising consequences for the entire tree of life. It is hypothesised that viruses might have contributed to the formation of the first cellular life form, or that even the eukaryotic cell nucleus originates from an infection by a coated virus. The continuous struggle b...
64
+
65
+
66
+ *Document 3:*
67
+
68
+ - Title: Animal source of the coronavirus continues to elude scientists.
69
+
70
+ - Text: ...
71
+
72
+
73
+
74
+ **Score 1 Documents:**
75
+
76
+
77
+ *Document 1:*
78
+
79
+ - Title: Deadly virus effortlessly hops species
80
+
81
+ - Text: Genetic engineering helps reveal origin of deadly 'flu...
82
+
83
+
84
+ *Document 2:*
85
+
86
+ - Title: Emerging infectious diseases
87
+
88
+ - Text: The spectrum of human pathogens and the infectious diseases they cause is continuously changing through evolution and changes in the way human populations interact with their environment and each other. New human pathogens most often emerge from an animal reservoir, emphasizing the central role that non-human reservoirs play in human infectious diseases. Pathogens may also re-emerge with new chara...
89
+
90
+
91
+ *Document 3:*
92
+
93
+ - Title: Latest SARS evidence
94
+
95
+ - Text: ...
96
+
97
+
98
+
99
+ **Score 0 Documents:**
100
+
101
+
102
+ *Document 1:*
103
+
104
+ - Title: The epidemiology of MERS-CoV
105
+
106
+ - Text: ...
107
+
108
+
109
+ *Document 2:*
110
+
111
+ - Title: Anesthesia and COVID-19: What We Should Know and What We Should Do.
112
+
113
+ - Text: Coronavirus disease 2019 (COVID-19), caused by Severe Acute Respiratory Syndrome-Coronavirus-2 (SARS-CoV-2), was first reported in Wuhan, Hubei, China, and has spread to more than 200 other countries around the world. COVID-19 is a highly contagious disease with continuous human-to-human transmission. The origin of the virus is unknown. Airway manipulations and intubations, which are common during...
114
+
115
+
116
+ *Document 3:*
117
+
118
+ - Title: Diseases of the Liver and Hepatobiliary System
119
+
120
+ - Text: ...
121
+
122
+
123
+
124
+ ---
125
+
126
+
127
+ #### Comparison Example 2
128
+
129
+ **Query:** how does the coronavirus respond to changes in the weather
130
+
131
+ **From:** test split
132
+
133
+
134
+ **Score 2 Documents:**
135
+
136
+
137
+ *Document 1:*
138
+
139
+ - Title: Seasonality of Respiratory Viral Infections.
140
+
141
+ - Text: The seasonal cycle of respiratory viral diseases has been widely recognized for thousands of years, as annual epidemics of the common cold and influenza disease hit the human population like clockwork in the winter season in temperate regions. Moreover, epidemics caused by viruses such as severe acute respiratory syndrome coronavirus (SARS-CoV) and the newly emerging SARS-CoV-2 occur during the wi...
142
+
143
+
144
+ *Document 2:*
145
+
146
+ - Title: Effect modification of environmental factors on influenza-associated mortality: a time-series study in two Chinese cities
147
+
148
+ - Text: BACKGROUND: Environmental factors have been associated with transmission and survival of influenza viruses but no studies have ever explored the role of environmental factors on severity of influenza infection. METHODS: We applied a Poisson regression model to the mortality data of two Chinese metropolitan cities located within the subtropical zone, to calculate the influenza associated excess mor...
149
+
150
+
151
+ *Document 3:*
152
+
153
+ - Title: Meteorological impacts on the incidence of COVID-19 in the U.S.
154
+
155
+ - Text: Since the World Health Organization has declared the current outbreak of the novel coronavirus (COVID-19) a global pandemic, some have been anticipating that the mitigation could happen in the summer like seasonal influenza, while medical solutions are still in a slow progress. Experimental studies have revealed a few evidences that coronavirus decayed quickly under the exposure of heat and humidi...
156
+
157
+
158
+
159
+ **Score 1 Documents:**
160
+
161
+
162
+ *Document 1:*
163
+
164
+ - Title: The association between the seasonality of pediatric pandemic influenza virus outbreak and ambient meteorological factors in Shanghai
165
+
166
+ - Text: BACKGROUND AND OBJECTIVES: The number of pediatric patients diagnosed with influenza types A and B is increasing annually, especially in temperate regions such as Shanghai (China). The onset of pandemic influenza viruses might be attributed to various ambient meteorological factors including temperature, relative humidity (Rh), and PM(1) concentrations, etc. The study aims to explore the correlati...
167
+
168
+
169
+ *Document 2:*
170
+
171
+ - Title: A relationship between acute respiratory illnesses and weather.
172
+
173
+ - Text: Weekly data from 7 years (2004-2010) of primary-care counts of acute respiratory illnesses (ARIs) and local weather readings were used to adjust a multivariate time-series vector error correction model with covariates (VECMX). Weather variables were included through a partial least squares index that consisted of weekly minimum temperature (coefficient = - 0·26), weekly median of relative humidity...
174
+
175
+
176
+ *Document 3:*
177
+
178
+ - Title: Evaluating the impact of the weather conditions on the influenza propagation
179
+
180
+ - Text: BACKGROUND: Predicting the details of how an epidemic evolves is highly valuable as health institutions need to better plan towards limiting the infection propagation effects and optimizing their prediction and response capabilities. Simulation is a cost- and time-effective way of predicting the evolution of the infection as the joint influence of many different factors: interaction patterns, pers...
181
+
182
+
183
+
184
+ **Score 0 Documents:**
185
+
186
+
187
+ *Document 1:*
188
+
189
+ - Title: Bayesian Analysis for Inference of an Emerging Epidemic: Citrus Canker in Urban Landscapes
190
+
191
+ - Text: Outbreaks of infectious diseases require a rapid response from policy makers. The choice of an adequate level of response relies upon available knowledge of the spatial and temporal parameters governing pathogen spread, affecting, amongst others, the predicted severity of the epidemic. Yet, when a new pathogen is introduced into an alien environment, such information is often lacking or of no use,...
192
+
193
+
194
+ *Document 2:*
195
+
196
+ - Title: Scientific Opinion on the welfare of cattle kept for beef production and the welfare in intensive calf farming systems
197
+
198
+ - Text: Information given in previous Opinions “Welfare of cattle kept for beef production” (SCAHAW, 2001) and “The risks of poor welfare in intensive calf farming systems” (EFSA, 2006) is updated and recent scientific evidence on the topics reviewed. Risks of poor welfare are identified using a structured analysis, and issues not identified in the SCAHAW (2001) beef Opinion, especially effects of housing...
199
+
200
+
201
+ *Document 3:*
202
+
203
+ - Title: Emergency Preparedness—Planning and Management
204
+
205
+ - Text: In general, emergencies can be divided into internal and external situations. Regardless of the nature of the emergency, the organizational response will be quite different when the emergency condition exists within the facility from when it occurs at some distant location. The actual emergency response programs of healthcare facilities indicate that much more effort has gone into a prepared respo...
206
+
207
+
208
+
209
+ ---
210
+
211
+
212
+ #### Comparison Example 3
213
+
214
+ **Query:** will SARS-CoV2 infected people develop immunity? Is cross protection possible?
215
+
216
+ **From:** test split
217
+
218
+
219
+ **Score 2 Documents:**
220
+
221
+
222
+ *Document 1:*
223
+
224
+ - Title: Serological differentiation between COVID-19 and SARS infections.
225
+
226
+ - Text: In response to the coronavirus disease 2019 (COVID-19) outbreak, caused by the SARS-CoV-2 virus, multiple diagnostic tests are required globally for acute disease diagnosis, contact tracing, monitoring of asymptomatic infection rates and assessing herd immunity. While PCR remains the frontline test of choice in the acute diagnostic setting, serological tests are urgently needed to fulfil the other...
227
+
228
+
229
+ *Document 2:*
230
+
231
+ - Title: The laboratory tests and host immunity of COVID-19 patients with different severity of illness.
232
+
233
+ - Text: BACKGROUND The Coronavirus Disease-2019 (COVID-19), infected by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), has caused a severe outbreak in China. The host immunity of COVID-19 patients is unknown. METHODS The routine laboratory tests and host immunity in COVID-19 patients with different severity of illness were compared after patient admission. RESULTS A total of 65 SARS-CoV-2-p...
234
+
235
+
236
+ *Document 3:*
237
+
238
+ - Title: Host Immune Response and Immunobiology of Human SARS-CoV-2 Infection
239
+
240
+ - Text: One of the most serious viral outbreaks of the decade, infecting humans, originated from the city of Wuhan, China, by the end of December 2019, has left the world shaken up. It is the successor infection of severe acute respiratory syndrome coronavirus (SARS-CoV) named as SARS-CoV-2 causing a disease called as COVID-19 (Coronavirus disease-19). Being one of the most severe diseases in terms of tra...
241
+
242
+
243
+
244
+ **Score 1 Documents:**
245
+
246
+
247
+ *Document 1:*
248
+
249
+ - Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mouse-adapted virus lacking E protein.
250
+
251
+ - Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously showed that recombinant SARS coronavirus (SARS-CoV) (Urbani strain based) lacking envelope (E) protein ...
252
+
253
+
254
+ *Document 2:*
255
+
256
+ - Title: Temporal Dynamics of Co-circulating Lineages of Porcine Reproductive and Respiratory Syndrome Virus
257
+
258
+ - Text: Porcine Reproductive and Respiratory Syndrome Virus (PRRSV) is the most important endemic pathogen in the U.S. swine industry. Despite control efforts involving improved biosecurity and different vaccination protocols, the virus continues to circulate and evolve. One of the foremost challenges in its control is high levels of genetic and antigenic diversity. Here, we quantify the co-circulation, e...
259
+
260
+
261
+ *Document 3:*
262
+
263
+ - Title: Proposal for vaccination against SARS coronavirus using avian infectious bronchitis virus strain H from The Netherlands
264
+
265
+ - Text: ...
266
+
267
+
268
+
269
+ **Score 0 Documents:**
270
+
271
+
272
+ *Document 1:*
273
+
274
+ - Title: Updated Vaccination Recommendations for Carnivores
275
+
276
+ - Text: ...
277
+
278
+
279
+ *Document 2:*
280
+
281
+ - Title: Deep sequencing identifies viral and wasp genes with potential roles in replication of Microplitis demolitor Bracovirus.
282
+
283
+ - Text: Viruses in the genus Bracovirus (BV) (Polydnaviridae) are symbionts of parasitoid wasps that specifically replicate in the ovaries of females. Recent analysis of expressed sequence tags from two wasp species, Cotesia congregata and Chelonus inanitus, identified transcripts related to 24 different nudivirus genes. These results together with other data strongly indicate that BVs evolved from a nudi...
284
+
285
+
286
+ *Document 3:*
287
+
288
+ - Title: BCG vaccine protection from severe coronavirus disease 2019 (COVID-19).
289
+
290
+ - Text: A series of epidemiological explorations has suggested a negative association between national bacillus Calmette-Guérin (BCG) vaccination policy and the prevalence and mortality of coronavirus disease 2019 (COVID-19). However, these comparisons are difficult to validate due to broad differences between countries such as socioeconomic status, demographic structure, rural vs. urban settings, time of...
291
+
292
+
293
+
294
+ ---
295
+
296
+
297
+ #### Comparison Example 4
298
+
299
+ **Query:** what causes death from Covid-19?
300
+
301
+ **From:** test split
302
+
303
+
304
+ **Score 2 Documents:**
305
+
306
+
307
+ *Document 1:*
308
+
309
+ - Title: Covid-19: Death rate in England and Wales reaches record high because of covid-19.
310
+
311
+ - Text: ...
312
+
313
+
314
+ *Document 2:*
315
+
316
+ - Title: Clinical Characteristics and Risk Factors for Mortality of COVID-19 Patients With Diabetes in Wuhan, China: A Two-Center, Retrospective Study.
317
+
318
+ - Text: OBJECTIVE Diabetes is common in COVID-19 patients and associated with unfavorable outcomes. We aimed to describe the characteristics and outcomes and to analyze the risk factors for in-hospital mortality of COVID-19 patients with diabetes. RESEARCH DESIGN AND METHODS This two-center retrospective study was performed at two tertiary hospitals in Wuhan, China. Confirmed COVID-19 patients with diabet...
319
+
320
+
321
+ *Document 3:*
322
+
323
+ - Title: Sudden Death from Pulmonary Causes
324
+
325
+ - Text: This chapter seeks to survey many of the common pathological entities identified in the lungs at autopsy and the potential role of pulmonary disease in formulating an opinion regarding the cause of death. Appreciation of pulmonary pathology in the medicolegal context is important as it frequently contributes to the immediate or underlying mechanisms of death. The primacy of the lungs in breathing ...
326
+
327
+
328
+
329
+ **Score 1 Documents:**
330
+
331
+
332
+ *Document 1:*
333
+
334
+ - Title: The kidney in COVID-19: protagonist or figurant?
335
+
336
+ - Text: The etiology of injury in COVID-19 patients is diverse and multifactorial. Autopsy and biopsy studies reveal, alongside podocyte and tubular cell anomalies, the presence of virion within the cells. Evidence suggests that, in addition to the direct cytopathic effect of SARS- CoV2 on the glomeruli and renal tubules, there is also the indirect effect of cell-mediated immunity, the cytokines storm and...
337
+
338
+
339
+ *Document 2:*
340
+
341
+ - Title: Why are more BAME people dying from COVID-19?
342
+
343
+ - Text: ...
344
+
345
+
346
+ *Document 3:*
347
+
348
+ - Title: Symptom burden and clinical profile of COVID-19 deaths: a rapid systematic review and evidence summary.
349
+
350
+ - Text: The spread of pandemic COVID-19 has created unprecedented need for information. The pandemic is the cause of significant mortality and with this the need for rapidly disseminated information for palliative care professionals regarding the prevalence of symptoms, their intensity, their resistance or susceptibility to symptom control and the mode of death for patients. METHODS We undertook a systema...
351
+
352
+
353
+
354
+ **Score 0 Documents:**
355
+
356
+
357
+ *Document 1:*
358
+
359
+ - Title: Herausforderungen in der Seuchenkontrolle und der jetzigen Pandemie durch verzerrte Verteilungen
360
+
361
+ - Text: BACKGROUND: The pandemic phase 3—mitigation—by the SARS-Coronavirus‑2 is currently taking on speed in Germany. Skewed distributions of key epidemiological parameters of the virus and patient care are a challenge for the control of the outbreak as well as keeping the system functional. METHODS: The skewed parameters—pre-patency period, incubation period, duration of viral shedding and time to admis...
362
+
363
+
364
+ *Document 2:*
365
+
366
+ - Title: Article 14 Prevention of Spread of Disease
367
+
368
+ - Text: Each contracting State agrees to take effective measures to prevent the spread by means of air navigation of cholera, typhus (epidemic), smallpox, yellow fever, plague, and such other communicable diseases as the contracting States shall from time to time decide to designate, and to that end contracting States will keep in close consultation with the agencies concerned with international regulatio...
369
+
370
+
371
+ *Document 3:*
372
+
373
+ - Title: Diseases of the Liver and Hepatobiliary System
374
+
375
+ - Text: ...
376
+
377
+
378
+
379
+ ---
380
+
381
+
382
+ #### Comparison Example 5
383
+
384
+ **Query:** what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
385
+
386
+ **From:** test split
387
+
388
+
389
+ **Score 2 Documents:**
390
+
391
+
392
+ *Document 1:*
393
+
394
+ - Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mouse-adapted virus lacking E protein.
395
+
396
+ - Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously showed that recombinant SARS coronavirus (SARS-CoV) (Urbani strain based) lacking envelope (E) protein ...
397
+
398
+
399
+ *Document 2:*
400
+
401
+ - Title: Protease Inhibitors: Candidate Drugs to Inhibit Severe Acute Respiratory Syndrome Coronavirus 2 Replication.
402
+
403
+ - Text: The number of patients infected with severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) has rapidly increased, although the WHO declared a pandemic. However, drugs that function against SARS-CoV-2 have not been established. SARS-CoV-2 has been suggested to bind angiotensin-converting enzyme 2, the receptor of the SARS coronavirus. SARS coronavirus and coronavirus 229E, the cause of the c...
404
+
405
+
406
+ *Document 3:*
407
+
408
+ - Title: Immunogenicity and protective efficacy in mice and hamsters of a β-propiolactone inactivated whole virus SARS-CoV vaccine.
409
+
410
+ - Text: The immunogenicity and efficacy of β-propiolactone (BPL) inactivated whole virion SARS-CoV (WI-SARS) vaccine was evaluated in BALB/c mice and golden Syrian hamsters. The vaccine preparation was tested with or without adjuvants. Adjuvant Systems AS01(B) and AS03(A) were selected and tested for their capacity to elicit high humoral and cellular immune responses to WI-SARS vaccine. We evaluated the e...
411
+
412
+
413
+
414
+ **Score 1 Documents:**
415
+
416
+
417
+ *Document 1:*
418
+
419
+ - Title: Comparison of SARS-CoV-2 spike protein binding to ACE2 receptors from human, pets, farm animals, and putative intermediate hosts.
420
+
421
+ - Text: The emergence of a novel coronavirus, SARS-CoV-2, resulted in a pandemic. Here, we used X-ray structures of human ACE2 bound to the receptor-binding domain (RBD) of the spike protein (S) from SARS-CoV-2 to predict its binding to ACE2 proteins from different animals, including pets, farm animals, and putative intermediate hosts of SARS-CoV-2. Comparing the interaction sites of ACE2 proteins known t...
422
+
423
+
424
+ *Document 2:*
425
+
426
+ - Title: Replication of SARS-CoV-2 in human respiratory epithelium.
427
+
428
+ - Text: Currently, there are four seasonal coronaviruses associated with relatively mild respiratory tract disease in humans. However, there are also a plethora of animal coronaviruses, which have the potential to cross the species border. This regularly results in the emergence of new viruses in humans. In 2002 SARS-CoV emerged, to rapidly disappear in May 2003. In 2012 MERS-CoV was identified as a possi...
429
+
430
+
431
+ *Document 3:*
432
+
433
+ - Title: Cholesterol-modifying drugs in COVID-19
434
+
435
+ - Text: Infection with SARS-CoV-2 is more likely to lead to poor outcomes in the elderly and those with cardiovascular disease, obesity or metabolic syndrome. Here we consider mechanisms by which dyslipidemia and the use of cholesterol-modifying drugs could influence the virus-host relationship. Cholesterol is essential for the assembly, replication and infectivity of enveloped virus particles; we highlig...
436
+
437
+
438
+
439
+ **Score 0 Documents:**
440
+
441
+
442
+ *Document 1:*
443
+
444
+ - Title: Extensive Literature Search on the ‘Effects of Copper intake levels in the gut microbiota profile of target animals, in particular piglets’
445
+
446
+ - Text: ...
447
+
448
+
449
+ *Document 2:*
450
+
451
+ - Title: Big Data and Biodefense: Prospects and Pitfalls
452
+
453
+ - Text: This chapter will provide an overview of how “big data” and “big data” analytics can be brought to bear on the pressing biodefense challenges of: (1) threat awareness; and (2) surveillance and detection. The chapter will also discuss potential problems that can arise by relying exclusively on “big data” approaches, which have properties and limitations inherent in their composition that may not be...
454
+
455
+
456
+ *Document 3:*
457
+
458
+ - Title: Types of Recombinant Vaccines
459
+
460
+ - Text: The original scientific strategy behind vaccinology has historically been to “isolate, inactivate, and inject,” first invoked by Louis Pasteur....
461
+
462
+
463
+
464
+ ---
465
+
466
+
467
+ ### Examples by Split and Score
468
+
469
+
470
+ #### TEST Split
471
+
472
+ **Scores in test: [-1, 0, 1, 2]**
473
+
474
+
475
+ ##### Score -1 Examples (test)
476
+
477
+ No examples found in loaded corpus.
478
+
479
+
480
+ ##### Score 0 Examples (test)
481
+
482
+
483
+ **Example 1:**
484
+
485
+ - Query: what is the origin of COVID-19
486
+
487
+ - Doc Title: The epidemiology of MERS-CoV
488
+
489
+ - Doc Text: ...
490
+
491
+
492
+ **Example 2:**
493
+
494
+ - Query: how does the coronavirus respond to changes in the weather
495
+
496
+ - Doc Title: Bayesian Analysis for Inference of an Emerging Epidemic: Citrus Canker in Urban Landscapes
497
+
498
+ - Doc Text: Outbreaks of infectious diseases require a rapid response from policy makers. The choice of an adequate level of response relies upon available knowledge of the spatial and temporal parameters governing pathogen spread, affecting, amongst others, the predicted severity of the epidemic. Yet, when a n...
499
+
500
+
501
+ **Example 3:**
502
+
503
+ - Query: will SARS-CoV2 infected people develop immunity? Is cross protection possible?
504
+
505
+ - Doc Title: Updated Vaccination Recommendations for Carnivores
506
+
507
+ - Doc Text: ...
508
+
509
+
510
+ **Example 4:**
511
+
512
+ - Query: what causes death from Covid-19?
513
+
514
+ - Doc Title: Herausforderungen in der Seuchenkontrolle und der jetzigen Pandemie durch verzerrte Verteilungen
515
+
516
+ - Doc Text: BACKGROUND: The pandemic phase 3—mitigation—by the SARS-Coronavirus‑2 is currently taking on speed in Germany. Skewed distributions of key epidemiological parameters of the virus and patient care are a challenge for the control of the outbreak as well as keeping the system functional. METHODS: The s...
517
+
518
+
519
+ **Example 5:**
520
+
521
+ - Query: what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
522
+
523
+ - Doc Title: Extensive Literature Search on the ‘Effects of Copper intake levels in the gut microbiota profile of target animals, in particular piglets’
524
+
525
+ - Doc Text: ...
526
+
527
+
528
+ **Example 6:**
529
+
530
+ - Query: what types of rapid testing for Covid-19 have been developed?
531
+
532
+ - Doc Title: Clinical Features and Chest CT Manifestations of Coronavirus Disease 2019 (COVID-19) in a Single-Center Study in Shanghai, China.
533
+
534
+ - Doc Text: OBJECTIVE. Confronting the new coronavirus infection known as coronavirus disease 2019 (COVID-19) is challenging and requires excluding patients with suspected COVID-19 who actually have other diseases. The purpose of this study was to assess the clinical features and CT manifestations of COVID-19 b...
535
+
536
+
537
+ **Example 7:**
538
+
539
+ - Query: are there serological tests that detect antibodies to coronavirus?
540
+
541
+ - Doc Title: A review of enzyme immunoassay for detection of antibody to Brucella abortus in cattle
542
+
543
+ - Doc Text: Enzyme immunoassay has gained wide acceptance for serological diagnosis of bovine brucellosis because of its ability to detect antibody of all isotypes unlike the conventional tests. The indirect enzyme immunoassay, however, presents several parameters that require careful analysis. These parameters...
544
+
545
+
546
+ **Example 8:**
547
+
548
+ - Query: how has lack of testing availability led to underreporting of true incidence of Covid-19?
549
+
550
+ - Doc Title: Implications of SARSr-CoV 2 infection in thalassemias: Do patients fall into the "high clinical risk" category?
551
+
552
+ - Doc Text: We're all flying blind regarding coronavirus, but it's fair to think if thalassemic patients are particularly vulnerable to SARS-COV-2 infection or are at potential higher risk of complications from COVID-19 than normal population, specially when they become older. The frustrating thing is that, rig...
553
+
554
+
555
+ **Example 9:**
556
+
557
+ - Query: how has COVID-19 affected Canada
558
+
559
+ - Doc Title: [Diagnostic steps and estimation of disease prognosis in COVID-19-infected patients].
560
+
561
+ - Doc Text: In December 2019, a new coronavirus outbreak hit his head in Wuhan, China. The identified pathogen is a new coronavirus named "severe acute respiratory syndrome coronavirus 2," or SARS-CoV-2, and the respiratory syndrome it elicits was named "coronavirus disease 2019," or COVID-19. The World Health ...
562
+
563
+
564
+ **Example 10:**
565
+
566
+ - Query: has social distancing had an impact on slowing the spread of COVID-19?
567
+
568
+ - Doc Title: Global trends in research related to social media in psychology: mapping and bibliometric analysis
569
+
570
+ - Doc Text: BACKGROUND: Social media, defined as interactive Web applications, have been on the rise globally, particularly among adults. The objective of this study was to investigate the trend of the literature related to the most used social network worldwide (i.e. Facebook, Twitter, LinkedIn, Snapchat, and ...
571
+
572
+
573
+ ##### Score 1 Examples (test)
574
+
575
+
576
+ **Example 1:**
577
+
578
+ - Query: what is the origin of COVID-19
579
+
580
+ - Doc Title: Deadly virus effortlessly hops species
581
+
582
+ - Doc Text: Genetic engineering helps reveal origin of deadly 'flu...
583
+
584
+
585
+ **Example 2:**
586
+
587
+ - Query: how does the coronavirus respond to changes in the weather
588
+
589
+ - Doc Title: The association between the seasonality of pediatric pandemic influenza virus outbreak and ambient meteorological factors in Shanghai
590
+
591
+ - Doc Text: BACKGROUND AND OBJECTIVES: The number of pediatric patients diagnosed with influenza types A and B is increasing annually, especially in temperate regions such as Shanghai (China). The onset of pandemic influenza viruses might be attributed to various ambient meteorological factors including tempera...
592
+
593
+
594
+ **Example 3:**
595
+
596
+ - Query: will SARS-CoV2 infected people develop immunity? Is cross protection possible?
597
+
598
+ - Doc Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mous
599
+
600
+ - Doc Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously sho...
601
+
602
+
603
+ **Example 4:**
604
+
605
+ - Query: what causes death from Covid-19?
606
+
607
+ - Doc Title: The kidney in COVID-19: protagonist or figurant?
608
+
609
+ - Doc Text: The etiology of injury in COVID-19 patients is diverse and multifactorial. Autopsy and biopsy studies reveal, alongside podocyte and tubular cell anomalies, the presence of virion within the cells. Evidence suggests that, in addition to the direct cytopathic effect of SARS- CoV2 on the glomeruli and...
610
+
611
+
612
+ **Example 5:**
613
+
614
+ - Query: what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
615
+
616
+ - Doc Title: Comparison of SARS-CoV-2 spike protein binding to ACE2 receptors from human, pets, farm animals, and putative intermediate hosts.
617
+
618
+ - Doc Text: The emergence of a novel coronavirus, SARS-CoV-2, resulted in a pandemic. Here, we used X-ray structures of human ACE2 bound to the receptor-binding domain (RBD) of the spike protein (S) from SARS-CoV-2 to predict its binding to ACE2 proteins from different animals, including pets, farm animals, and...
619
+
620
+
621
+ **Example 6:**
622
+
623
+ - Query: what types of rapid testing for Covid-19 have been developed?
624
+
625
+ - Doc Title: Understanding, verifying and implementing Emergency Use Authorization molecular diagnostics for the detection of SARS-CoV-2 RNA.
626
+
627
+ - Doc Text: The SARS-CoV-2 pandemic has brought a new wave of challenges to health care, particularly in the area of rapid diagnostic test development and implementation. Acute diagnosis of COVID-19 infection is critically dependent on detection of SARS-CoV-2 RNA from clinical specimens (e.g. nasopharyngeal swa...
628
+
629
+
630
+ **Example 7:**
631
+
632
+ - Query: are there serological tests that detect antibodies to coronavirus?
633
+
634
+ - Doc Title: Self-assembly into virus–like particles of the recombinant capsid protein of porcine circovirus type 3 and its application on antibodies detection
635
+
636
+ - Doc Text: PCV3 capsid protein (Cap) is an important antigen for diagnosis and vaccine development. To achieve high-level expression of recombinant PCV3 Cap in Escherichia coli (E. coli), the gene of wild-type entire Cap (wt-eCap) was amplified from clinical samples, and three optimized entire Cap (opti-eCap) ...
637
+
638
+
639
+ **Example 8:**
640
+
641
+ - Query: how has lack of testing availability led to underreporting of true incidence of Covid-19?
642
+
643
+ - Doc Title: Quarantine alone or in combination with other public health measures to control COVID-19: a rapid review.
644
+
645
+ - Doc Text: BACKGROUND Coronavirus disease 2019 (COVID-19) is a rapidly emerging disease that has been classified a pandemic by the World Health Organization (WHO). To support WHO with their recommendations on quarantine, we conducted a rapid review on the effectiveness of quarantine during severe coronavirus o...
646
+
647
+
648
+ **Example 9:**
649
+
650
+ - Query: how has COVID-19 affected Canada
651
+
652
+ - Doc Title: Health-related concerns and precautions during the COVID-19 pandemic: A comparison of Canadians with and without underlying health conditions.
653
+
654
+ - Doc Text: BACKGROUND The risk of experiencing adverse outcomes from the coronavirus disease 2019 (COVID-19), such as hospitalization, admission to intensive care units and death, is elevated for older individuals and those with certain underlying health conditions including diabetes, chronic conditions affect...
655
+
656
+
657
+ **Example 10:**
658
+
659
+ - Query: has social distancing had an impact on slowing the spread of COVID-19?
660
+
661
+ - Doc Title: Understanding the Impact of Face Mask Usage Through Epidemic Simulation of Large Social Networks
662
+
663
+ - Doc Text: Evidence from the 2003 SARS epidemic and 2009 H1N1 pandemic shows that face masks can be an effective non-pharmaceutical intervention in minimizing the spread of airborne viruses. Recent studies have shown that using face masks is correlated to an individual’s age and gender, where females and older...
664
+
665
+
666
+ ##### Score 2 Examples (test)
667
+
668
+
669
+ **Example 1:**
670
+
671
+ - Query: what is the origin of COVID-19
672
+
673
+ - Doc Title: Intraspecies diversity of SARS-like coronaviruses in Rhinolophus sinicus and its implications for the origin of SARS coronaviruses in humans.
674
+
675
+ - Doc Text: The Chinese rufous horseshoe bat (Rhinolophus sinicus) has been suggested to carry the direct ancestor of severe acute respiratory syndrome (SARS) coronavirus (SCoV), and the diversity of SARS-like CoVs (SLCoV) within this Rhinolophus species is therefore worth investigating. Here, we demonstrate th...
676
+
677
+
678
+ **Example 2:**
679
+
680
+ - Query: how does the coronavirus respond to changes in the weather
681
+
682
+ - Doc Title: Seasonality of Respiratory Viral Infections.
683
+
684
+ - Doc Text: The seasonal cycle of respiratory viral diseases has been widely recognized for thousands of years, as annual epidemics of the common cold and influenza disease hit the human population like clockwork in the winter season in temperate regions. Moreover, epidemics caused by viruses such as severe acu...
685
+
686
+
687
+ **Example 3:**
688
+
689
+ - Query: will SARS-CoV2 infected people develop immunity? Is cross protection possible?
690
+
691
+ - Doc Title: Serological differentiation between COVID-19 and SARS infections.
692
+
693
+ - Doc Text: In response to the coronavirus disease 2019 (COVID-19) outbreak, caused by the SARS-CoV-2 virus, multiple diagnostic tests are required globally for acute disease diagnosis, contact tracing, monitoring of asymptomatic infection rates and assessing herd immunity. While PCR remains the frontline test ...
694
+
695
+
696
+ **Example 4:**
697
+
698
+ - Query: what causes death from Covid-19?
699
+
700
+ - Doc Title: Covid-19: Death rate in England and Wales reaches record high because of covid-19.
701
+
702
+ - Doc Text: ...
703
+
704
+
705
+ **Example 5:**
706
+
707
+ - Query: what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
708
+
709
+ - Doc Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mous
710
+
711
+ - Doc Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously sho...
712
+
713
+
714
+ **Example 6:**
715
+
716
+ - Query: what types of rapid testing for Covid-19 have been developed?
717
+
718
+ - Doc Title: Hunting coronavirus severe acute respiratory syndrome coronavirus 2 (2019 novel coronavirus): From laboratory testing back to basic research
719
+
720
+ - Doc Text: The rapid spread of coronavirus disease 2019 (COVID-19) in many countries causes citizens of daily inconvenience and even life-threat for elderly population. The invasion of the main pathogen, severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2; 2019 novel coronavirus [2019-nCoV]), into huma...
721
+
722
+
723
+ **Example 7:**
724
+
725
+ - Query: are there serological tests that detect antibodies to coronavirus?
726
+
727
+ - Doc Title: SARS-CoV-2 serology: Test, test, test, but interpret with caution!
728
+
729
+ - Doc Text: SARS-CoV-2 serological tests are a subject of intense interest and have the potential to significantly enhance the diagnostic capability of healthcare services in the current pandemic. However, as with all novel assays, significant validation is required to understand the clinical relevance of resul...
730
+
731
+
732
+ **Example 8:**
733
+
734
+ - Query: how has lack of testing availability led to underreporting of true incidence of Covid-19?
735
+
736
+ - Doc Title: I Just Can’t Get Enough (of Experts): The Numbers of COVID-19 and the Need for a European Approach to Testing
737
+
738
+ - Doc Text: This article offers a reflection on the testing strategies deployed in the generation of epidemiological data in the European Union (EU). I will argue that, while in the early days of the pandemic, Member States proceeded to testing in a rather scattered way, the shortage of resources seems to have ...
739
+
740
+
741
+ **Example 9:**
742
+
743
+ - Query: how has COVID-19 affected Canada
744
+
745
+ - Doc Title: Communication, transparency key as Canada faces new coronavirus threat.
746
+
747
+ - Doc Text: ...
748
+
749
+
750
+ **Example 10:**
751
+
752
+ - Query: has social distancing had an impact on slowing the spread of COVID-19?
753
+
754
+ - Doc Title: Timing of Community Mitigation and Changes in Reported COVID-19 and Community Mobility - Four U.S. Metropolitan Areas, February 26-April 1, 2020.
755
+
756
+ - Doc Text: Community mitigation activities (also referred to as nonpharmaceutical interventions) are actions that persons and communities can take to slow the spread of infectious diseases. Mitigation strategies include personal protective measures (e.g., handwashing, cough etiquette, and face coverings) that ...
757
+
758
+
759
+ ## CORE
760
+
761
+ **Has Scores 0, 1, 2**
762
+ **Score 2 = HIGHLY RELEVANT** ✓
763
+ **Score 1 = RELEVANT** ✓
764
+
765
+ ## Special Notes
766
+
767
+ - **Original Purpose**: COVID-19 biomedical information retrieval
768
+ - **What Scores Really Mean**:
769
+ - Score 2: Papers that directly address the query topic (origins, immunity, mortality), including closely related coronavirus work
770
+ - Score 1: Partially relevant papers: related viral/respiratory work (influenza, SARS, general respiratory) and COVID-19 papers that touch on the query only indirectly
771
+ - Score 0: Papers that do not address the query: unrelated topics (cattle welfare, citrus disease, liver disease) as well as off-topic COVID-19 papers
772
+ - **Dataset Type**: Pandemic IR with graded relevance
773
+ - **CLEANED**: Removed 2 annotations with score -1 (treated as annotation errors)
774
+
775
+ ## Training Verdict
776
+
777
+ **GOOD FOR TRAINING** ✓
778
+ - Both Score 1 and 2 are truly relevant
779
+ - Clear graded relevance levels
780
+ - For binary labels: treat scores 1 and 2 as positive and score 0 as negative
781
+
782
+ ## Observations
test_scores/dataset_reports/webis-touche2020_analysis.md ADDED
@@ -0,0 +1,624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # WEBIS-TOUCHE2020 Dataset Analysis
2
+
3
+ ## Overview
4
+
5
+ - Total queries available: 49
6
+
7
+ - Total corpus documents: 382545
8
+
9
+ - Corpus documents loaded for examples: 50000
10
+
11
+
12
+ ## Split Statistics
13
+
14
+ | Split | Queries | Total Pairs | Score Distribution |
15
+
16
+ |-------|---------|-------------|-------------------|
17
+
18
+ | train | N/A | N/A | N/A |
19
+
20
+ | dev | N/A | N/A | N/A |
21
+
22
+ | test | 49 | 2214 | 0:1282, 1:296, 2:636 |
23
+
24
+
25
+ ## Score Analysis
26
+
27
+ **Multiple score levels found: [0, 1, 2]**
28
+
29
+
30
+ ## Examples
31
+
32
+ ### Score Comparison Examples
33
+
34
+ #### Comparison Example 1
35
+
36
+ **Query:** Should teachers get tenure?
37
+
38
+ **From:** test split
39
+
40
+
41
+ **Score 2 Documents:**
42
+
43
+
44
+ *Document 1:*
45
+
46
+ - Title: There should not be a teacher tenure.
47
+
48
+ - Text: Reason 1 - Teacher tenure creates complacency because teachers know they are unlikely to lose their jobs: If teachers know that they reached the period where they get special defence from most accusations - it would send the message to them that they can then do whatever they want to do in the classroom and really slack with their teaching duties. Reason 2 - Tenure makes it difficult to remove und...
49
+
50
+
51
+ *Document 2:*
52
+
53
+ - Title: There should not be a teacher tenure.
54
+
55
+ - Text: Quotes used in my debate are all included here. (I know this is not exactly allowed and it's really messy and confusing but 10,000 characters (approx. 1500 words) really isn’t enough for 10 rebuttals.) Rebuttal of ‘Reason 1’: ‘a’ (this is where Quote a. from the picture should go) Pro is presupposing that teachers will become complacent if they know they are unlikely to lose their jobs. However, 1...
56
+
57
+
58
+ *Document 3:*
59
+
60
+ - Title: There should not be a teacher tenure.
61
+
62
+ - Text: Thank you sara_ann_dee for her argument. In this debate, I will argue that ‘there should be a teacher tenure.’ Whether this tenure is reformed, or reduced to only applicable to some teachers, does not matter as long as I am able to prove that ‘teacher tenure’ should, in some form, exist. Voters should vote for me if I am able to do this. Although rebuttals are reserved for the next round, I urge t...
63
+
64
+
65
+
66
+ **Score 0 Documents:**
67
+
68
+
69
+ *Document 1:*
70
+
71
+ - Title: should students have to wear school uniforms
72
+
73
+ - Text: What do u think should students wear school uniforms...
74
+
75
+
76
+ *Document 2:*
77
+
78
+ - Title: There should not be a teacher tenure.
79
+
80
+ - Text: Hello and thank you for joining my debate I will be arguing to get rid of the teacher tenure....
81
+
82
+
83
+
84
+ ---
85
+
86
+
87
+ #### Comparison Example 2
88
+
89
+ **Query:** Is vaping with e-cigarettes safe?
90
+
91
+ **From:** test split
92
+
93
+
94
+ **Score 2 Documents:**
95
+
96
+
97
+ *Document 1:*
98
+
99
+ - Title: Bloomberg's Ban on E-Cigs
100
+
101
+ - Text: Electronic cigarettes comes with different cartridges including 6-18mg of nicotine and sometimes 0mg. This is to say that electronic cigarettes are safer to smoke than traditional cigarettes. Electronic cigarettes do not cause tar because of the fact that it does not contain tobacco and leave behind no tar. As a result, the main components of carcinogen are not present to create a problem that tra...
102
+
103
+
104
+ *Document 2:*
105
+
106
+ - Title: Bloomberg's Ban on E-Cigs
107
+
108
+ - Text: Whether smoking a cigarette or e-cig there is still nicotine In both and nicotine is highly addictive. E cigs are not a safer alternative to cigarettes because they are just as addictive. E-cigs may only be vapor but it is not undetectable. That wretched nicotine smell will linger on your clothes and in your hair. The smoke and vapor is bound to offend someone and I would not like to be sitting ou...
109
+
110
+
111
+ *Document 3:*
112
+
113
+ - Title: Should E-cigs and vapes be regulated
114
+
115
+ - Text: I've never claimed that I'm okay with "kids" acquiring an addiction to electronic cigarettes. I simply do not care. It's not only nicotine but other harmful substances that are present. You provided examples of why they're bad, Without any sources. I'm just saying that there shouldn't be any regulations because it will be utterly pointless and time consuming. Juul for example is a good way to star...
116
+
117
+
118
+
119
+ **Score 0 Documents:**
120
+
121
+
122
+ *Document 1:*
123
+
124
+ - Title: Mining is relatively safe.
125
+
126
+ - Text: Your assertion of the debate is that mining is a relatively safe occupation, but relative to what? It's not safe relative to toilet paper testers. In fact, I can actually bring statistics to show that it is actually far more dangerous. How many miners die per year? What percentage of miners get asbestos or other mining related illnesses? This is from an independent Indonesian website: Pada tahun p...
127
+
128
+
129
+ *Document 2:*
130
+
131
+ - Title: Mining is relatively safe.
132
+
133
+ - Text: Welcome to the debateFirst round is for defining and the last round is for conclusions. I define mining as a quest that involved digging at a very large scale to seek for precious stones or metals. I define relatively as 'compared to other occupations' and safe as 'compared to other occupations' as well.Thank you....
134
+
135
+
136
+ *Document 3:*
137
+
138
+ - Title: The legalization of Marijuana in the United States would benefit all Americans.
139
+
140
+ - Text: This drug should stay illegal for many reasons concerning the health and safety of Americans today. Saying no one ever died from smoking marijuana is like saying no one ever died from smoking tobacco. Today research shows that nearly 50 percent of teenagers try marijuana before they graduate from high school. Most young teens and children start using marijuana for many reasons, they are curious, o...
141
+
142
+
143
+
144
+ ---
145
+
146
+
147
+ #### Comparison Example 3
148
+
149
+ **Query:** Should corporal punishment be used in schools?
150
+
151
+ **From:** test split
152
+
153
+
154
+ **Score 2 Documents:**
155
+
156
+
157
+ *Document 1:*
158
+
159
+ - Title: Spanking In Schools
160
+
161
+ - Text: I will now summarize this debate and make a closing statement. Con has presented arguments that are self-contradicting: Should spanking "be banned from everywhere," including children's own homes, or is it "the parents [sic] choice" whether or not to spank their children? I have shown that spanking should not be banned universally, and I have also demonstrated that if parents are allowed to spank ...
162
+
163
+
164
+ *Document 2:*
165
+
166
+ - Title: Corporal punishment should be banned from schools
167
+
168
+ - Text: http://abcnews.go.com... This article shows the ineffectiveness of corporal punishment in schools. In closing, corporal punishment should be banned because it is ineffective and lowers students' IQ's. Other forms of punishment should be explored and tested. Good luck....
169
+
170
+
171
+
172
+ **Score 1 Documents:**
173
+
174
+
175
+ *Document 1:*
176
+
177
+ - Title: Corporal punishment in the families!!!
178
+
179
+ - Text: Spanking Has Its Place I believe many would argue that spanking a child is a form of corporal punishment and it should be regarded as child abuse, but there is also a whole other group that supports spankings and they say it works. I've tried to use spanking with my child and I found it didn't work and I didn't prefer that method, but I don't see it as child abuse either. Of course, when parents o...
180
+
181
+
182
+
183
+ **Score 0 Documents:**
184
+
185
+
186
+ *Document 1:*
187
+
188
+ - Title: corporal punishment
189
+
190
+ - Text: Sir, you are the one who instigated the debate. You have to present your position and I refute it. Not the other way around....
191
+
192
+
193
+ *Document 2:*
194
+
195
+ - Title: should schools teach sword fighting
196
+
197
+ - Text: i say schools should teach sword fighting because it would be self defense and would help students defend the school if someone broke in while classes were going on...
198
+
199
+
200
+
201
+ ---
202
+
203
+
204
+ #### Comparison Example 4
205
+
206
+ **Query:** Should social security be privatized?
207
+
208
+ **From:** test split
209
+
210
+
211
+ **Score 2 Documents:**
212
+
213
+
214
+ *Document 1:*
215
+
216
+ - Title: Abolish Social Security
217
+
218
+ - Text: Unpromising, chaotic, and fraud. Social security fits this description exactly; therefore social security must be abolished. I take an affirmative stance to abolish Social security for the following contentions; Contention 1; Social security has no prospect for America, Contention 2; America now and in the future will not be able to sustain the needs of the Social security system, and Contention 3...
219
+
220
+
221
+
222
+ **Score 0 Documents:**
223
+
224
+
225
+ *Document 1:*
226
+
227
+ - Title: Get rid of parole
228
+
229
+ - Text: When someone is convicted of murder should they get rid of parole and automatically send them to death row?...
230
+
231
+
232
+ *Document 2:*
233
+
234
+ - Title: Abolish Social Security
235
+
236
+ - Text: Due to the fact my opponent's account has been closed, this debate is null. I please ask you not to vote....
237
+
238
+
239
+ *Document 3:*
240
+
241
+ - Title: Obama Care is a disaster
242
+
243
+ - Text: “Unemployment insurance or subsidized coverage run by the government is destined to fail. It encourages laziness, has no accountability, and wins the voters' vote who is receiving it. […] If you don't work and don't plan on working, you shouldn't be allowed to have it.” Keep in mind unemployment insurance only goes to folks who are actively looking for work, a mom who needs help feeding her kids w...
244
+
245
+
246
+
247
+ ---
248
+
249
+
250
+ #### Comparison Example 5
251
+
252
+ **Query:** Is a college education worth it?
253
+
254
+ **From:** test split
255
+
256
+
257
+ **Score 2 Documents:**
258
+
259
+
260
+ *Document 1:*
261
+
262
+ - Title: College education is not necessary to become successful
263
+
264
+ - Text: College education is a waste of money and time. Most people who attend college do not get what they pay for. An associate degree or a bachelors degree cannot really get you a high paying job or "career". In order to get a "career" you must continue your education to receive a masters degree, a PhD or any other further education Many people without a college career have been extremely successful; F...
265
+
266
+
267
+
268
+ **Score 1 Documents:**
269
+
270
+
271
+ *Document 1:*
272
+
273
+ - Title: Going to college is worth
274
+
275
+ - Text: Consider a smart person who is determined to be a lawyer. This person has a full ride scholarship, so education is free, and it is necessary for this person's life dreams and happiness to be a lawyer. A degree is required to practice law [1]. You cannot become a lawyer without being able to practice law, and so, since classes cost no money, it is totally worth it for this person to go to college. ...
276
+
277
+
278
+
279
+ **Score 0 Documents:**
280
+
281
+
282
+ *Document 1:*
283
+
284
+ - Title: Should colleges be free
285
+
286
+ - Text: Colleges should not be free because of the following reasons: 1) How will the colleges pay for the teachers and professors that teach the students the course and material without having the students pay for their education fee? 2)Do you really you think with the capitalist society America is rooted on that anything at the most part will be free? Sure there's public education. You still have to pay...
287
+
288
+
289
+ *Document 2:*
290
+
291
+ - Title: Going to college is worth
292
+
293
+ - Text: I accept, and will be arguing that college is worth it for some people....
294
+
295
+
296
+ *Document 3:*
297
+
298
+ - Title: It's not worth having children
299
+
300
+ - Text: True, true... but there is one thing that redeems reproduction... In order for us humans to evolve, we are required by the laws of nature to knock each other up and get preggerz so we can continue the human race's existence. Unless, like, we gave birth to robots that ruin the woman's "anatomy" forever during birth. And so what if they cause grief? I'm sure there's plenty of masochistic parents out...
301
+
302
+
303
+
304
+ ---
305
+
306
+
307
+ ### Examples by Split and Score
308
+
309
+
310
+ #### TEST Split
311
+
312
+ **Scores in test: [0, 1, 2]**
313
+
314
+
315
+ ##### Score 0 Examples (test)
316
+
317
+
318
+ **Example 1:**
319
+
320
+ - Query: Should teachers get tenure?
321
+
322
+ - Doc Title: should students have to wear school uniforms
323
+
324
+ - Doc Text: What do u think should students wear school uniforms...
325
+
326
+
327
+ **Example 2:**
328
+
329
+ - Query: Is vaping with e-cigarettes safe?
330
+
331
+ - Doc Title: Mining is relatively safe.
332
+
333
+ - Doc Text: Your assertion of the debate is that mining is a relatively safe occupation, but relative to what? It's not safe relative to toilet paper testers. In fact, I can actually bring statistics to show that it is actually far more dangerous. How many miners die per year? What percentage of miners get asbestos or other mining related illnesses? This is from an independent Indonesian website: Pada tahun pertama saja kita melihat bahwa 19% dari semua penguji kertas toilet akan kontrak penyakit mematikan. Selama rentang 10 tahun kita telah melihat bahwa hanya di bawah setengah (48,2%) dari semua penguji kertas toilet mati [1.] Translation: "In the first year alone we see that 19% of all toilet paper testers will contract a lethal disease. Over a 10 year span we have seen that just under half (48.2%) of all toilet paper testers are dead from job-related illnesses." [1.] This was from an independent Chinese website where many toilet paper testers are employed. Upon doing the research of how many miners die over a 10 year span from job related illnesses you will that 48.2% figure is far larger than the amount of miners that die from job related illnesses over a 10 year period. www.npr.org " News " Health Apr 16, 2010 - "Mindful of the grief and anger surrounding the death of the 29 miners at the Upper Big ... in 2006, Attfield found that 4 percent of 1,000 miners tested had advanced pneumoconiosis." This is just one example in one particular case, but it gives you perspective of the 4% of miners battling job-related illnesses vs. the 19% of job-related illnesses that toilet paper testers suffer. [1.] http://m.inilah.com......
334
+
335
+
336
+ **Example 3:**
337
+
338
+ - Query: Should corporal punishment be used in schools?
339
+
340
+ - Doc Title: corporal punishment
341
+
342
+ - Doc Text: Sir, you are the one who instigated the debate. You have to present your position and I refute it. Not the other way around....
343
+
344
+
345
+ **Example 4:**
346
+
347
+ - Query: Should social security be privatized?
348
+
349
+ - Doc Title: Get rid of parole
350
+
351
+ - Doc Text: When someone is convicted of murder should they get rid of parole and automatically send them to death row?...
352
+
353
+
354
+ **Example 5:**
355
+
356
+ - Query: Is a college education worth it?
357
+
358
+ - Doc Title: Should colleges be free
359
+
360
+ - Doc Text: Colleges should not be free because of the following reasons: 1) How will the colleges pay for the teachers and professors that teach the students the course and material without having the students pay for their education fee? 2)Do you really you think with the capitalist society America is rooted on that anything at the most part will be free? Sure there's public education. You still have to pay taxes for that. Virtually, colleges cannot be free. This leads to my third point. 3)There's this thing called community college. It's cheap and inexpensive. Most people that go there either need to save money or just extend their studies before they transfer to a better college. 4)Scholarships help a lot with loans. Most students can get a scholarship. Not the big flashy "Bill Gate Scholarship", but a small one that pays for a part of your tuition. All in all, the way colleges run is that they provide education in return for money. Also, public colleges are funded by the state. Private colleges aren't. This is why they are usually more expensive and in return you get better education. And that's pretty much it for my side of the debate....
361
+
362
+
363
+ **Example 6:**
364
+
365
+ - Query: Should felons who have completed their sentence be allowed to vote?
366
+
367
+ - Doc Title: In a democratic society, felons out to retain the right to vote.
368
+
369
+ - Doc Text: I regret that my opponent has forfeited this round as I enjoyed the debate....
370
+
371
+
372
+ **Example 7:**
373
+
374
+ - Query: Should abortion be legal?
375
+
376
+ - Doc Title: Legal Abortion
377
+
378
+ - Doc Text: My argument is this: you are attempting to save innocent lives. Saving lives is heteronormative and normalises the queer - and makes them "other". Otherness creates conflict and arises dictatorships.Your case makes dictatorships, yes. As for the "no statistical support" - I provided quality sources for you and I quoted them (they are behind paywalls so I will not provide the full text because I doubt the authors want that) - just saying that my arguments are unsubstantiated is fallacious.In the end my opponent just says my argument doesn't make sense - never attacking. He also provides no reasons to vote in negation, just that doing so saves lives - I AGREE but saving lives is INHERENTLY flawed!Easily vote in affirmation because of the dropped arguments and a lack of constructive arguments on my opponent's side. Thank you ~...
379
+
380
+
381
+ **Example 8:**
382
+
383
+ - Query: Should students have to wear school uniforms?
384
+
385
+ - Doc Title: should students have to wear school uniforms
386
+
387
+ - Doc Text: What do u think should students wear school uniforms...
388
+
389
+
390
+ **Example 9:**
391
+
392
+ - Query: Should any vaccines be required for children?
393
+
394
+ - Doc Title: Vaccines do not cause autism
395
+
396
+ - Doc Text: To begin I would like to reiterate the framework of this debate. Namely, as the resolution specifies that vaccines do not cause autism, I simple need demonstrate some pathway by which autism can come about as a result of a vaccination. The negative is not required to demonstrate a direct link by which a vaccination directly leads to autism. Secondly, the negative need only show one instance in which a vaccination could bring about autism as the affirmative has already agreed with this proposition. High fevers in pregnant women can lead to autism In a recent landmark study "A team of UC Davis researchers has found that mothers who had a fever during pregnancy were more than twice as likely to have a child with autism " than mothers who did not have a fever or who took medication to counter its effect" [1] With this in mind let's look at some of the side effects associated with several vaccines. According to the center of Disease control[2] vaccines which have a possible side effect of high fever include but are not limited to, the vaccine for Haemophilus influenza type b, the vaccine for, Diphtheria, Tetanus, and acellular Pertussis and Gardasil the vaccine used to prevent HPV. Responses: My opponent makes that claim that the redacted 1998 Andrew Wakefield study implemented a poor methodology and fabricated data. I completely agree, this study was poorly conducted. However, this does not negate the fact the high fevers in women who are pregnant can result in autism and that vaccination can result in such fevers. Conclusion: I have demonstrated that high fever, a side effect of three different vaccinations can bring about autism as specified in the resolution. I have thus fulfilled my burden of proof and ask that you negate this resolution. 1) http://www.ucdmc.ucdavis.edu... 2) http://www.cdc.gov......
397
+
398
+
399
+ **Example 10:**
400
+
401
+ - Query: Should performance-enhancing drugs be accepted in sports?
402
+
403
+ - Doc Title: Anybody who used performance enhancing drugs in the MLB shouldn't be elected to the Hall of fame
404
+
405
+ - Doc Text: No, they cheated therefore they shouldn't make the hall of fame. It doesn't matter if a lot of players did it. They cheated therefore they should be penalized. The people who make the Hall of Fame should be in there because of their honor to the game, not for cheating and deceiving people so they can have a physical advantage. You said that there are many "incredible players" in the Mitchell Report. Don't you think maybe they were so incredible because they used steriods and performance enhancing drugs? If they didn't use them they wouldn't be as good as the stats show. I understand players are out there to get theirs and get as much money as they can, but what about the true players of the game who aren't using these performing enhancing drugs. They did the right thing and won't make the hall of fame because they might have decent numbers over their career but the ones who used performance enhancing drugs had better stats than them because they cheated....
406
+
407
+
408
+ ##### Score 1 Examples (test)
409
+
410
+
411
+ **Example 1:**
412
+
413
+ - Query: Should corporal punishment be used in schools?
414
+
415
+ - Doc Title: Corporal punishment in the families!!!
416
+
417
+ - Doc Text: Spanking Has Its Place I believe many would argue that spanking a child is a form of corporal punishment and it should be regarded as child abuse, but there is also a whole other group that supports spankings and they say it works. I've tried to use spanking with my child and I found it didn't work and I didn't prefer that method, but I don't see it as child abuse either. Of course, when parents over do it, or spank particularly hard, or hit children in inappropriate ways, then of course it is child abuse....
418
+
419
+
420
+ **Example 2:**
421
+
422
+ - Query: Is a college education worth it?
423
+
424
+ - Doc Title: Going to college is worth
425
+
426
+ - Doc Text: Consider a smart person who is determined to be a lawyer. This person has a full ride scholarship, so education is free, and it is necessary for this person's life dreams and happiness to be a lawyer. A degree is required to practice law [1]. You cannot become a lawyer without being able to practice law, and so, since classes cost no money, it is totally worth it for this person to go to college. 1. http://study.com......
427
+
428
+
429
+ **Example 3:**
430
+
431
+ - Query: Should abortion be legal?
432
+
433
+ - Doc Title: Legal Abortion
434
+
435
+ - Doc Text: This debate is "should abortion be legal?" Short answer no... Long answer noooooooo... Lol, but seriously, Abortion is murder any way that you look at it. I am excited for an intelligent debate...
436
+
437
+
438
+ **Example 4:**
439
+
440
+ - Query: Should students have to wear school uniforms?
441
+
442
+ - Doc Title: Should we be aloud too wear hats in school as in appropriate hats no GANG HATS OKKK
443
+
444
+ - Doc Text: I think we should be able to but also a little we shouldn't because you could get distracted and not do your work instead just play with your hat but you would get a chance to express yourself...
445
+
446
+
447
+ **Example 5:**
448
+
449
+ - Query: Should performance-enhancing drugs be accepted in sports?
450
+
451
+ - Doc Title: All drugs should be legalised, change my mind.
452
+
453
+ - Doc Text: I believe that all drugs should be legalised, whether it is dangerous or not its irrelevantI challenge anyone to change my mind.The only downside I see with the legalisation of drugs is that there might be a 1-2% more drug addicts....
454
+
455
+
456
+ **Example 6:**
457
+
458
+ - Query: Should birth control pills be available over the counter?
459
+
460
+ - Doc Title: birth control in high school
461
+
462
+ - Doc Text: i think birth control is very high in high school they are all too young and they are making the wrong choses and they should really think about there choses because if they have a babie in high school then they have to drop out and then they have to get a crappy job to take care of that babie so there fore there should not be any birth control in high school...
463
+
464
+
465
+ **Example 7:**
466
+
467
+ - Query: Is sexual orientation determined at birth?
468
+
469
+ - Doc Title: We are who we are based on our enviorment. Nature vs. Nurture Debate
470
+
471
+ - Doc Text: I agree that there is a combination at play but I believe that nurture plays a greater role. A child only learns from what he/she experiences. A child would not understand manners if a parent doesn't suggest and teach them as a child develops. The examples we observe help us distinguish right from wrong. This is the basis in which we create our own set values and discover who we are as an individual....
472
+
473
+
474
+ **Example 8:**
475
+
476
+ - Query: Should animals be used for scientific or commercial testing?
477
+
478
+ - Doc Title: Monkeys should be considered persons in the eyes of the law
479
+
480
+ - Doc Text: Amina Amjed The Case for Animal Rights Round 1 1.The use of animal for science, for commercial purposes, for agriculture purposes and for the purposes of sport hunting and trapping should be abolished. 2.We should regard animals just as we regard those who are unable to defend and protect themselves such as children, the disabled and the elderly. 3.The abuse of animals for science, for sports, for agriculture and for any other human entertainment or attainment of needs for humans is fundamentally wrong. 4.We, as humans, must stop treating animals as things and treating them as beings that are capable of feeling pain and suffering. 5.It is true, that the process of change regarding animals and the rights of animals is complicated and requires extensive amount of time and finance to bring about this change but even so it is much needed. 6.The idea of animal rights has reason if not just emotion. 7.We have a direct duty to ensure safety of animals from brutality, as we do to our children and to those humans around us who are incapable of helping themselves. 8.We should not overlook animals because they do not have the characteristics and components of a human being. 9.Using animals for scientific experiments and/or for sports" purposes is brutal and against morality. 10.Pain is pain wherever it occurs. 11.Some would try to justify the use, therefore abuse, of animals by referring to what is known as contractarianism. 12.Contractariansim is the belief, by some, that morality consists of a set of rules that individuals abide to voluntarily just as we do when we agree to a contract. 13.Animals cannot volunteer or provide consent to be or not be a part of a contract therefore they cannot be moral and not considered persons. 14.However, this is true for children as well. Children cannot consent to a contract without the permission and supervision of their parent and/or guardian. 
Just so, animals should be protected by their guardian(s), if one is present, with the same moral perspective as another human. 15.Theories such as the cruelty-kindness view and utilitarianism have been presented to justify the protection of animals but have not proved to be entirely effective. 16.Each view is flawed is some way to segregate animal and the rights that should be given to them. 17.By taking the inherent value approach, it is evidently clear that each individual is as valuable as the next and not just what that individual can do for others. Your value as an individual would not be regarded by what you can do for me and in return what I can do for you. 18.In that same respect, an animal is considered an individual who should be treated with the same rights as me and you regardless of what they can do for us. 19.We should not use and/or abuse animals just because by doing so, it provides for the good of others. By hurting or undermining one individual we inherently subject ourselves to condone acts of cruelty. 20.The rights view of moral theory rationally explains the domain of human morality. 21.It is true, that animals lack many attributes of a human such as reading, writing, building book cases or baking a cake. 22.However, it is also true that there are some humans who are incapable of doing some of the things that I mentioned above but we do not devalue them as an individual or say that they are not human. 23.As an individual, be it a human or a monkey, we feel things, want things, have certain expectations, we feel pain and excitement, we feel frustration and we also suffer from untimely death. 24.All who have inherent value have it equally regardless of them being human animals or not. 25.The fight for animal rights is analogous to that of equality for women and other minorities. 
26.In regards to animals that are used in the field of science, there is proof of devaluing these animals by testing on them routinely as if their value is reducible by their usefulness to others. These animals are treated without any respect and in doing so the rights of animals are violated. 27.In the same sense, farm animals are kept in stressful close confinements or in isolation which causes them pain and suffering. This treatment, rather mistreatment of farm animals is rooted in the view which lacks the acknowledgement of animals as individuals with independent value. Instead they are viewed as resource for "humans". 28.Killing one human for selfish means does not satisfy our moral values than why should that be true of killing or torturing animals. 29.A right, properly comprehended, is a claim that one party may exercise against another. The victim of this claim can potentially be a person, a community, or even all human kind. 30.It should than be understood that rights in general are in every case claims or potential claims within a community or moral agents. 31.Rights can be defended or claimed by those beings that can make moral claims against one another. 32.Human can make these moral choices whereas animals cannot. Animals are not beings that are capable of practicing or responding to moral claims. 33.But rights cannot simply depend on the presence of moral capacity. If that were true then we would have to agree that humans who are brain damaged or comatose lack the ability to respond to or exercise moral claims therefore they have no rights. 34.Non-human mammals have the same fundamental rights as a normal mammal to not be harmed or killed. 35.Those that have a subject of life, like normal mammals and non-human mammals, have inherent value. 36.Animals have the ability to feel pain, satisfaction, need, pleasure, disease and death, just as any human being does. 
37.Therefore, I argue, that animals, such as monkeys should be considered persons in the eyes of the law. 38.Rights of animals should be respected. 39.The use of animals in science; commercial animal agriculture; commercial and sport hunting and trapping should be indefinitely terminated....
481
+
482
+
483
+ **Example 9:**
484
+
485
+ - Query: Should gay marriage be legal?
486
+
487
+ - Doc Title: Gay marriage should be legal
488
+
489
+ - Doc Text: I think gay marriage should be legal because every body deserves to marry who they love....
490
+
491
+
492
+ **Example 10:**
493
+
494
+ - Query: Is drinking milk healthy for humans?
495
+
496
+ - Doc Title: Eating Meat is not Inherently Unhealthy or Immoral
497
+
498
+ - Doc Text: I apologize for the mishap in citation, upon inspection you'll notice 3 black periods after each source "..." this cut off the actual link, leading to the general website. This is undoubtedly a mistake caused by having to copy and paste my arguments after your FF. Actual citation is as follows; please feel free to verify: [1] http://authoritynutrition.com... [2] http://www.mayoclinic.org... [3] http://www.hsph.harvard.edu... [4] http://www.peta.org... [5] http://well.blogs.nytimes.com... [6] http://modernfarmer.com... For the purpose of my rebuttal I will refer to my previous 6 sources as numerically cited and future sources will be cited as 7 & up. In citing your sources they will be referenced as [Con #] Given that my arguments have been legitimated, your comment "Since there are no proper sources we can ignore the statistics, for they were wrong anyways." can be disregarded. I understand that this is a bit unfair to you given that you decided to arrogantly brush off my arguments and supply your own. You will now have the chance to refute my Round 2 argument. I recommend you actually read it this time, because there is clear evidence in your rebuttal you did not. Problematic Citation and Source Material: I apologize for the mishap in my Round 2 source material; however you'll find that everything is properly cited and credible. However in your entire argument you fail to properly cite anything you said. I have no idea which comment corresponds to which source; if this were an academic setting you would be charged with plagiarism. Your source material is also very questionable and contradictory, and some are irrelevant to the debate entirely. Because you did not openly disclose which arguments come from which sources I have to spend the majority of my rebuttal debunking your sources. I ask that you re-read my Round 1 post. Source [Con 1] enforces my position quote "The high level of meat and saturated fat consumption in the USA ... 
exceeds nutritional needs and contributes to high rates of chronic diseases such as cardiovascular disease, diabetes mellitus and some cancers." [Con 1] is describing SAD, in my position statement I clearly acknowledge that SAD is not relevant to the debate because over-consumption does not equal inherent unhealthiness. If over-consumption was a true indicator of inherent negative health affects the 8% of obese vegetarians you mention would show that over consuming vegetables causes the negative health effects associated with obesity. [Con 2] is problematic; the author is simply unqualified, her brief bio gives no indication of having a degree in anything, she is extremely biased and only provides one citation for her entire 15 point article. More importantly all of her points are contingent on the immorality of factory farming, which as I've stated in Round 2 does not divert from meat being inherently unhealthy or immoral because there are healthy and moral ways of raising cattle [2][5][6][Con 8]. The findings of [Con 3] can also be refuted, Point 4 Source [1] quote "It is true that processed meat is associated with an increased risk of cancer, especially colon cancer ... Two review studies, one that looked at data from 35 studies and the other from 25 studies, found that the effect for unprocessed red meat was very weak for men and nonexistent for women. " This implies it is the process, not the meat that causes cancer. [Con 4] is 42 pages long, I only read the first page and found that it, like [Con 1], was critical of SAD for its health effects. This does not prove meat is inherently bad, fish for example is low in fat and has positive health effects [3]. Also [Con 4] says Chimps eat monkeys, if sentience is such an issue then why don't chimps respect the sentience of monkeys? (I'll address this in more detail later). [Con 5] is from PETA an organization well known for its bias against eating meat. 
Like [Con 2] the arguments are almost entirely based on factory farming, which again is irrelevant in the face of free range farming [5][6][Con 8]. [Con 6] is PETA India, it has the same bias and credibility as [Con 2]. Point 4 for example says that meat is bad because if you under cook it then you could get bird flu. Ok, by this logic vegetables are bad because if you don't clean them you could get E. Coli. As I said in the beginning of Round 2 "Like all food the health factor is not necessarily the food itself but how it is prepared and produced." [Con 7] is my personal favorite. By citing this source you show how truly desperate you are to grasp at straws. This source made me understand why you did not disclose them in your debate, as it has no scientific value whatsoever, it simply bullet points propaganda without any evidence. Let's read some of the findings of SupremeMasterTV.com. Eating meat causes "Blue tongue disease", what is this? Oh right it's a disease that only affects livestock and not humans [7]. The other diseases are food borne illnesses from improper cooking. In your rebuttal you state "Studies have shown that an Ovo-Lacto Vegetarian diet is the most healthy diet of all." Meaning that those who drink milk and eat eggs along with vegetables have the best diet of all (this is not a vegan diet, in Round 1 you said you'd argue as a vegan). But Lo! [Con 7] says drinking milk is bad! It causes "Breast, prostate and testicular cancer from hormones present in milk; Listeria and Crohn's disease; Hormones and saturated fat lead to osteoporosis, obesity, diabetes and heart disease; Linked to higher incidences of multiple sclerosis." Still think drinking milk is cool? [Con 8] is written by the same author as [Con 4] and as such contains the exact same arguments. Also I don't think you actually read this article because the author lays out a plan for rational meat eating in the section "Toward Rational Meat Eating" so yea ... you're helping me with this one. 
[Con 9] I admit this article does seem legitimate and has sources. However this argument doesn't help you in the way you might think, it focuses on factory farming and it acknowledges meat can be environmentally friendly "Chicken is probably the best land animal to eat, certainly in terms of climate change impact. Fish have a low greenhouse gas impact but are being eaten in such large quantities that many are at risk of extinction." "Even limiting one's meat consumption to chicken yields major environmental benefits--not to mention health and financial benefits." So eating chicken yields health benefits ... Interesting. [Con 10] was about jacana birds being influenced by human scientists to commit infanticide. I don't see how this proves anything more than humans encouraged behavior that only humans would consider immoral. If anything this goes to show killing animals is not an issue of morality. For the sake of science (as suppose to survival) humans encouraged these birds kill each others babies. Are these scientists immoral for encouraging nature? Eating Meat Can be Healthy: For the bulk of this section please read Round 2. This section is for counter arguments. Given that none of your statistics are properly cited they can be seen as fiction. You clearly dramatize the negative health effects of meat, if meat was so bad then why would doctors recommend it as part of a balanced diet [2][3][8]? Quoting Con "Aquamarine animals can often cause toxins, for all those fish which are high in Omega 3 are also extremely high in mercury. Those which are not like Salmon may cause many other diseases, and may also create bowel issues." Ah yes, Harvard disagrees quote "Environmental Protection Agency " calculated that if 100,000 people ate farmed salmon twice a week for 70 years, the extra PCB intake could potentially cause 24 extra deaths from cancer"but would prevent at least 7,000 deaths from heart disease. 
Second, levels of PCBs and dioxins in fish are very low, similar to levels in meats, dairy products, and eggs. Third, more than 90 percent of the PCBs and dioxins in the U.S. food supply come from such non-seafood sources, including meats, dairy, eggs, and vegetables." [3] Morality of Meat: Appeal to Nature In Round 2 I specifically acknowledge the possible flaw in this argument "However just because an animal does something does not give us the right to copy them blindly. It is our advanced intelligence and our appeal to morality that sets us apart from the lesser animals." It"s important to note humans are animals, and as such comparison to other animals is legitimate (it's kind of how biology and anthropology work). You yourself make this a moral underpinning in Round 1 "" it is wrong to kill [animals], as it is wrong to kill humans." But why? What makes them like us? You argue mammals are sentient like us, they feel emotion like we do, so they should be spared. You think that just because chimps can love each as humans can love each other this somehow negates the fact that chimps are equally enthusiastic about killing lesser yet still sentient primates like monkeys [Con 4], humans eat monkeys too ya know. Because we can feel love somehow all mammals are entitled to our good graces. Even if I agreed with your pan-mammalism philosophy fish and chicken aren't mammals what claim to familiarity do they have? When it comes to rape and infanticide there are obvious pro-human arguments against it, if your only argument against eating mammals is 'well you wouldn't eat a human!' then I'm afraid the moral connection falls flat. New Citation: [7] https://www.princeton.edu... [8] http://www.mayoclinic.org......
499
+
500
+
501
+ ##### Score 2 Examples (test)
502
+
503
+
504
+ **Example 1:**
505
+
506
+ - Query: Should teachers get tenure?
507
+
508
+ - Doc Title: There should not be a teacher tenure.
509
+
510
+ - Doc Text: Reason 1 - Teacher tenure creates complacency because teachers know they are unlikely to lose their jobs: If teachers know that they reached the period where they get special defence from most accusations - it would send the message to them that they can then do whatever they want to do in the classroom and really slack with their teaching duties. Reason 2 - Tenure makes it difficult to remove under-performing teachers because the process involves months of legal wrangling by the principal, the school board, the union, and the courts: Most schools stop trying to fire a certain teacher because the proccess is just too difficult. " A June 1, 2009 study by the New Teacher Project found that 81% of school administrators knew a poorly performing tenured teacher at their school; however, 86% of administrators said they do not always pursue dismissal of teachers because of the costly and time consuming process. It can take up to 335 days to remove a tenured teacher in Michigan before the courts get involved. " (. http://teachertenure.procon.org...) (Patrick McGuinn, "Ringing the Bell for K-12 Teacher Tenure Reform," www. americanprogress. org). This quote means that 86 OUT OF 100 SCHOOL ADMINISTRATORS WANT A TEACHER TO BE FIRED - but will not do so because the proccess is to draining. But what does that leave our learning and growing generation with? Many teachers who do not care, teach well, or put effort in their work? That is certaintly what this is going to result into if we do not abolish it quickly. Also check out this statistic of who is in favor (people in general) "An Apr. -May 2011 survey of 2,600 Americans found that 49% oppose teacher tenure while 20% support it. Among teachers, 53% support tenure while 32% oppose it. According to a Sep. 2010 report by the Thomas B. Fordham Institute, 86% of education professors favor "making it easier to terminate unmotivated or incompetent teachers - even if they are tenured. 
” Of course you cannot expect most teachers to be against it sinse that it their profession and it effects them - but for bystanders with accurate and unbiased opinions, look how many people are against it. Also, "56% of school board presidents disagreed with the statement that teacher tenure ensures academic freedom. " (M. J. Stephey, "A Brief History of Tenure," www. time. com). Reason 3 - Most people are against teature tenure: "In an Oct. 1, 2006 survey, 91% of school board presidents either agreed or strongly agreed that tenure impedes the dismissal of under-performing teachers. 60% also believed that tenure does not promote fair evaluations. " (. http://teachertenure.procon.org...) This means that most teachers OF SUCH A LARGE PERCENTAGE are not in favor of the teacher tenure. Reason 4 - Teacher tenure does nothing to promote the education of children: "Former DC Schools Chancellor Michelle Rhee said in 2008, "Tenure is the holy grail of teacher unions, but it has no educational value for kids; it only benefits adults. ”("Rhee-Forming D. C. Schools," www. wsj. com). This piece of evidence means that the only people actually benefiting from this tenure are the teachers who are employed - not any students. Isint education suppost to be focused on the younger generation and their best interest? Since when did school become all about the teachers - this tenure undermines what it means to actually be a teacher. If anything, it is only a BAD THING for students - and why would we keep something in our school systems that MAKES THE GENERATIONS' LEARNING LESS VALUEABLE? It does not make any sense. Reason 5 - Tenure at the K-12 level is not earned, but given to nearly everyone: "To receive tenure at the university level, professors must show contributions to their fields by publishing research. At the K-12 level, teachers only need to "stick around” for a short period of time to receive tenure. 
A June 1, 2009 study by the New Teacher Project found that less than 1% of evaluated teachers were rated unsatisfactory. " (Marcus A. Winters, "Challenging Tenure in D. C. ," www. manhattan-institute. org). This statistic is absolutely upsetting and degrating. Basically, this quote is explaning how 99% of teachers have free protection handed to them if they just stay in that profession for a certain amount of time. What if that teacher was already slacking in many areas? Now we are going to award them for poor effort and teaching abilities? It is not fair to the students involved with these teachers and it is not fair that they do not actually have to WORK to recieve a benefit of protection unlike most other professions that require some form of acomplishment to recieve that/those benefits in question. Because "with most states granting tenure after three years, teachers have not had the opportunity to "show their worth, or their ineptitude. " (Rose Garrett, "What Is Teacher Tenure? ," www. education. com), (. http://teachertenure.procon.org...).Reason 6 - Tenure makes it costly for schools to remove a teacher with poor performance or who is guilty of wrongdoing: "It costs an average of $250,000 to fire a teacher in New York City. New York spent an estimated $30 million a year paying tenured teachers accused of incompetence and wrongdoing to report to reassignment centers (sometimes called "rubber rooms”) where they were paid to sit idly. Those rooms were shut down on June 28, 2010. " ("Rhee-Forming D. C. Schools," www. wsj. com), (Steven Brill, "The Rubber Room," New Yorker). This is just sad, now it even costs the school boards money for teachers not doing their job? Should'nt that be the opposite? Reason 7 - Tenure is not needed to recruit teachers: "Sacramento Charter High School, which does not offer tenure, had 900 teachers apply for 80 job openings. " (Nanette Asimov, "Teacher Job Security Fuels Prop. 74 Battle," San Francisco Chronicle). 
This quote further proves why tenure is pretty much useless and unfair because teachers DO NOT NEED TENURE to continue their job as a teacher at their shchool, past school, future school, or school they are applying for. Reason 8 - With job protections granted through court rulings, collective bargaining, and state and federal laws, teachers today no longer need tenure to protect them from dismissal: "For this reason, few other professions offer tenure because employees are adequately protected with existing laws. " (Tenure Reforms and NJSBA Policy: Report of the NJSBA Tenure Task Force," New Jersey School Boards Association website, www. njsba. org), (Scott McLeod, JD, PhD, "Does Teacher Tenure Have a Future? ," www. dangerouslyirrelevant. org). This is the most important fact out of all these because it shows how the WHOLE REASON teacher tenure is here in the first place is NOT NEEDED not have the protections that teachers have without tenure. The teacher tenure is not benefitial for anyone except teachers - they get unfair advantages in MANY ways, some I have just listed. Why should we let this continue if unnessisary? Citations: . http://teachertenure.procon.org...http://teachertenure.procon.org...http://teachertenure.procon.org...Wanda Marie Thibodeaux, "Pro & Cons of Teacher Tenure," www. ehow. comPatrick McGuinn, "Ringing the Bell for K-12 Teacher Tenure Reform," www. americanprogress. org. http://teachertenure.procon.org... "Rhee-Forming D. C. Schools," www. wsj. comMarcus A. Winters, "Challenging Tenure in D. C. ," www. manhattan-institute. orgM. J. Stephey, "A Brief History of Tenure," www. time. comRose Garrett, "What Is Teacher Tenure? ," www. education. com. http://teachertenure.procon.org... "Rhee-Forming D. C. Schools," www. wsj. comSteven Brill, "The Rubber Room," New YorkerTenure Reforms and NJSBA Policy: Report of the NJSBA Tenure Task Force," New Jersey School Boards Association website, www. njsba. 
orgScott McLeod, JD, PhD, "Does Teacher Tenure Have a Future? ," www. dangerouslyirrelevant. orgNanette Asimov, "Teacher Job Security Fuels Prop. 74 Battle," San Francisco Chronicle...
511
+
512
+
513
+ **Example 2:**
514
+
515
+ - Query: Is vaping with e-cigarettes safe?
516
+
517
+ - Doc Title: Bloomberg's Ban on E-Cigs
518
+
519
+ - Doc Text: Electronic cigarettes comes with different cartridges including 6-18mg of nicotine and sometimes 0mg. This is to say that electronic cigarettes are safer to smoke than traditional cigarettes. Electronic cigarettes do not cause tar because of the fact that it does not contain tobacco and leave behind no tar. As a result, the main components of carcinogen are not present to create a problem that traditional cigarettes that contain various chemicals, additives and smokes. Vapor is just vapor. It does not include any smell or lingering odor. It is far from affecting people around you while smoking electronic cigarette. Electronic cigarettes should not be banned because it does not pose any harm to its users and help people from quitting cigar....
520
+
521
+
522
+ **Example 3:**
523
+
524
+ - Query: Should corporal punishment be used in schools?
525
+
526
+ - Doc Title: Spanking In Schools
527
+
528
+ - Doc Text: I will now summarize this debate and make a closing statement. Con has presented arguments that are self-contradicting: Should spanking "be banned from everywhere," including children's own homes, or is it "the parents [sic] choice" whether or not to spank their children? I have shown that spanking should not be banned universally, and I have also demonstrated that if parents are allowed to spank their children, it is also their right to choose a school that will do the same in an appropriate manner as well. My opponent also mentions studies that have shown that spanking "causes problems later down the road," but he has failed to present any of the studies in his sources. On the other hand, I have cited sources that show that corporal punishment is very effective, and I have also presented a testimonial from a teacher who was grateful for the ability to apply it in class due to its effective nature. Con has thrown claims around willy-nilly throughout the debate, but has not been able to back up any of them. The one piece of evidence my opponent has presented, i. e. the case of a child who went home to her mother with bruises on her bottom, has been shown to be unconvincing at best. The mother of the child was upset that the school was not implementing corporal punishment correctly, not that the school was enacting corporal punishment. The child was most definitely at fault, and faced a few bruises on her bottom as opposed to something that might appear in her permanent record. If anything, spanking was the correct decision in this case; it just should have been implemented differently, which only shows that supporting corporal punishment is better than abolishing it, as it can then be refined and implemented better. 
In addition to successfully refuting all of Con's points, I have done the following:-I have shown that spanking is similar in many ways to other forms of punishment that society imposes; to oppose spanking in schools (or in general) would be to argue a much greater point about punishment in general. -I have shown that spanking provides teachers with another tool to keep their classes progressing smoothly. -I have shown that spanking has benefits, largely due to its immediacy and proven ability to keep children from misbehaving. -I have shown that spanking is actually a great alternative to other forms of punishment imposed in schools, in that it does not potentially ruin a child's future. In closing, from looking at the exchange between Con and me, one must conclude that corporal punishment in schools is appropriate if carried out correctly, just like any other form of punishment society must use at various levels of misbehavior, whether the punishment is to send a criminal to prison or even to punish a dog to keep it from becoming unruly. Spanking should not be eliminated from schools everywhere, and if anything, it should become a more widespread practice....
529
+
530
+
531
+ **Example 4:**
532
+
533
+ - Query: Should social security be privatized?
534
+
535
+ - Doc Title: Abolish Social Security
536
+
537
+ - Doc Text: Unpromising, chaotic, and fraud. Social security fits this description exactly; therefore social security must be abolished. I take an affirmative stance to abolish Social security for the following contentions; Contention 1; Social security has no prospect for America, Contention 2; America now and in the future will not be able to sustain the needs of the Social security system, and Contention 3; Individual workers should be able to invest their own retirement money. Under Social Security, lower and middle class individuals are forced to pay a significant portion of their income, approximately 12 percent for the alleged purpose of securing their retirement. That money is not saved or invested, but transferred directly to the program's current beneficiaries with the "promise" that when current taxpayers get old, the income of future taxpayers will be transferred to them. Since this scheme creates no wealth, any benefits one person receives in excess of his payments necessarily come at the expense of others. Under Social Security, whether an individual gets twice as much from others as was taken from them, or half as much, or nothing at all, is entirely at the discretion of politicians. They cannot count on Social Security for anything-except a massive drain on his income. Therefore, there is absolutely no way that the system can even guarantee future retirees the equal amount that they had previously contributed to Social security, making this system inequitable. Fixing the Social security system is essentially impossible. The government has increased the payroll tax 17 times since 1935 yet, the system is still crippling. Proving my point further, that Social security should be abolished immediately. In 2002, there were 186 million workers in America and 190 million retired people. This was the beginning of the end of the Social security system. The workforce can no longer acquire the money necessary to give to the retirement population. 
The evidence continues to mount. According to newyorktimes.com, by 2010, while 41 million new workers enter the workforce, a staggering 76 million workers will enter retirement. This is an unfathomable amount and impossible for the social security system to reach a solution for how these retired people are going to get money. How much, when, and in what form one should provide for retirement is highly individual-and is properly left to the individual's free judgment and action. Social Security deprives the individual of this freedom, and thus makes them less able to plan for the future, less able to provide for their retirements, less able to enjoy their most vital years, and less able to invest in themselves. If Social Security did not exist, individual workers could be free to use that 12 percent of their income as they choose making their ability to better their future incomparably greater. They could save for their retirement with a diversified, long-term, productive investment in stocks or bonds. Or they could reasonably choose not to devote all 12 percent to retirement. They might choose to work far past the age of 65 or choose to invest in their own productivity through additional education or starting a business. So the future of this individual's life is up to no one but themselves. This would encourage many Americans to work to earn money for a better future. To conclude my remarks, I urge my fellow debators to vote in affirmation....
538
+
539
+
540
+ **Example 5:**
541
+
542
+ - Query: Is a college education worth it?
543
+
544
+ - Doc Title: College education is not necessary to become successful
545
+
546
+ - Doc Text: College education is a waste of money and time. Most people who attend college do not get what they pay for. An associate degree or a bachelors degree cannot really get you a high paying job or "career". In order to get a "career" you must continue your education to receive a masters degree, a PhD or any other further education Many people without a college career have been extremely successful; For example, , Bill Gates , Andrew Carnegie, Federica Henry Royce and Abraham Lincoln ....
547
+
548
+
549
+ **Example 6:**
550
+
551
+ - Query: Should abortion be legal?
552
+
553
+ - Doc Title: Abortion should be illegal world wide
554
+
555
+ - Doc Text: Abortion is rang no one should murder there unborn child. If a women has a right to the choice of the abortion ,,well the baby has a choice and should have the right to live. Its Evil to kill a baby .there should be no argument because any rebuttal means your stance is kill a child infant you created. it makes no sense religious wise, morally makes no sense at all, and it shouldn't make legal sense because murders illegal....
556
+
557
+
558
+ **Example 7:**
559
+
560
+ - Query: Should students have to wear school uniforms?
561
+
562
+ - Doc Title: Students should wear school uniform.
563
+
564
+ - Doc Text: I think students should wear school uniforms. It is because in schools where students are allowed to wear whatever they want, there are school bullies that make fun of other students who may not have a good taste in fashion. But if they all wear the same school uniforms, they can't make fun of one anothers' fashion senses which prevents this type of bullying from happening. And also, if students can wear whatever they want, they might take a longer time to leave their houses for school due to taking too long to choose what they want to wear which may cause them to be late for school....
565
+
566
+
567
+ **Example 8:**
568
+
569
+ - Query: Should any vaccines be required for children?
570
+
571
+ - Doc Title: Resolved: State mandated administration of childhood vaccinations is justified.
572
+
573
+ - Doc Text: Not a full case yet.. Just some little points I put together... Governments should not have the right to intervene in the health decisions parents make for their children. 31% of parents believe they should have the right to refuse mandated school entry vaccinations for their children, according to a 2010 survey by the University of Michigan. Many parents hold religious beliefs against vaccination. Forcing such parents to vaccinate their children would violate the 1st Amendment which guarantees citizens the right to the free exercise of their religion. Vaccines are often unnecessary in many cases where the threat of death from disease is small. During the early nineteenth century, mortality for the childhood diseases whooping cough, measles, and scarlet fever fell drastically before immunization became available. This decreased mortality has been attributed to improved personal hygiene, water purification, effective sewage disposal, and better food hygiene and nutrition. Vaccines interfere with natural law and God's plan for humanity. Disease is a natural occurrence, and humans should not interfere with its trajectory. Common childhood vaccinations may cause rare yet serious reactions including anaphylactic shock, paralysis, and sudden death. This risk is not worth taking, especially considering most diseases vaccinated against are not necessarily life threatening. Vaccines can trigger auto-immune disorders such as arthritis, multiple sclerosis, lupus, Guillain-Barr� Syndrome (GBS), and other disorders. Vaccines can cause brain inflammation (encephalopathy) which can lead to death or permanent brain damage and disorders such as autism, ADD/ADHD, and other developmental problems. In addition, the vaccine additive thimerosal (found in most pre-1999 vaccines) has been associated specifically with the development of autism and is still found in certain meningococcal, tetanus, and flu vaccines such as the H1N1 vaccine. 
Vaccines clog and disrupt the lymphatic system with large foreign protein molecules (the active ingredients contained within vaccines) which may lead to lymphatic cancers such as leukemia and lymphoma. All vaccines cause immune system suppression, and can permanently damage the natural immune system. Unvaccinated children build and strengthen their immune systems through fighting off infection and developing natural immunity to diseases like measles and chickenpox. Artificial immunity, generated through vaccination, weakens the immune system and leaves children more vulnerable to all other diseases and infections. Children should not be required to receive the DPT (diphtheria, pertussis, tetanus) vaccine. Some studies have shown that children who receive the DPT vaccine exhibit shallow breathing which has been associated with sleep apnea and may be a causal factor in sudden infant death syndrome (SIDS). Studies of infants whose deaths were recorded as SIDS show a temporal relationship with DPT vaccination (these infants tended to die at similar time intervals in relation to when they were vaccinated). Children should not receive the hepatitis B vaccine. Hepatitis B is a blood-born disease and is primarily spread by sexual intercourse and intravenous drug use. Children are not at great risk of contracting the disease. In addition, researchers have found that immunization with the hepatitis B vaccine is associated with an increased risk of developing multiple sclerosis. Young girls should not receive mandatory vaccination for HPV (human papilloma virus). The vaccine was approved in 2006 and the long-term effects are unknown. Since approval, adverse side effects such as severe allergic reactions, Guillain-Barr� syndrome, spinal cord inflammation and pancreatitis have been reported to the US Vaccine Adverse Events Reporting System. 
Although these adverse reactions may be rare, they are not worth the risk since the vaccine only protects against two of the 15 strains of HPV that may cause cancer of the cervix (20-40 years after an individual is infected). Vaccines are promoted primarily to generate profits for manufacturers and financial donations for medical organizations that endorse vaccines. In 2003, a House Committee on Government Reform report revealed that the CDC Advisory Committee on Immunization Practices had members with significant financial ties to vaccine companies. The American Academy of Pediatrics, a leading pro-vaccination organization, receives millions of dollars from vaccine companies....
574
+
575
+
576
+ **Example 9:**
577
+
578
+ - Query: Should performance-enhancing drugs be accepted in sports?
579
+
580
+ - Doc Title: Performance Enhancing Drugs (PEDs) should be permitted in professional sports.
581
+
582
+ - Doc Text: I affirm that PEDs should be permitted in sports. Legalizing PEDs would serve to level the playing field. According to the New York Times, more than a third of the top Tour de France finishers since 1998 have been confirmed to be doping while racing, despite efforts to stop them (1). So, the statistics clearly indicate that those who obey the rules and do not dope have a relatively low chance of performing well. If those who did not not use Performance Enhancing Drugs in the Tour de France were instead allowed to dope, those racers would have had the same advantages as those who currently dope illegally, and the playing field would have been more even. One of the primary reasons people enjoy watching professional sports is because they showcase the highest level of human achievement. Allowing PEDs to be used would raise the level of competition and skill in professional sports. This would thus make achievements in sports seem even more unthinkable and impressive, which would thus make more people attend these sporting events. As described in Forbes, "A huge part of watching sports is witnessing the very peak of human athletic ability, and legalizing performance enhancing drugs would help athletes climb even higher (2)." This shows that Performance Enhancing Drugs can raise the level of sports, which can make sports even more of a spectacle. For example, during the 1990's, when Mark McGwire and Sammy Sosa used steroids, their respective games were usually sold out with people who wanted to see their heightened abilities (2). I will now refute my opponent's arguments. My opponent argues that PEDs provide with a large risk for athletes. However, Athletes who are using PEDs are already in a dangerous industry, and frequently become injured regardless of whether or not they use drugs. 
In the year 2013 alone, the NFL reported at least 152 concussions, and was forced to spend up to 765 million dollars to settle claims of traumatic head injury brought by former players (3). So, PEDs do not provide with a large injury risk compared to the inherent risk of professional sports. My opponent also argues that allowing PEDs takes away from true human achievement. However, being an athlete requires a high amount of skill regardless of whether or not you use PEDs. Hitting a home run, for example requires excellent timing and a perfect swing. Using PEDs may increase the distance of this home run, but hitting a it is still a spectacle of your own human, not chemical precision. So, performing in sports requires intricate skill, no matter what. It is thus clear that PEDs do not take away from human achievement. As such, I affirm. 1- http://www.nytimes.com... 2- http://www.forbes.com... 3- http://www.pbs.org......
583
+
584
+
585
+ **Example 10:**
586
+
587
+ - Query: Should birth control pills be available over the counter?
588
+
589
+ - Doc Title: Birth Control
590
+
591
+ - Doc Text: I believe that birth control should be allowed to be obtained without parental consent. When a girl is requesting to use birth control, she is being mature and responsible and it is her own right, not her parents, to make the decision to have sex or not. If someone is making the decision to have sex, they should be mature enough to make the decision to be safe about it. Further more, most teens would have sex if they want to regardless if they have protection or not, so it is better to be safe than sorry....
592
+
593
+
594
+ ## CORE
595
+
596
+ **Has Scores 0, 1, 2**
597
+ **Score 2 = HIGHLY RELEVANT** ✓
598
+ **Score 1 = NOT RELEVANT** ✗ (should be merged with Score 0!)
599
+
600
+ ## Special Notes
601
+
602
+ - **Original Purpose**: Argument retrieval for controversial topics and debates
603
+ - **What Scores Really Mean**:
604
+ - Score 2: Strong argumentative content directly addressing the query
605
+ - Score 1: Just keyword matching - wrong topic/context (uniforms→hats, schools→families)
606
+ - Score 0: Completely off-topic
607
+ - **Dataset Type**: Argument retrieval from debate portals (Debate.org, IDebate, etc.)
608
+ - **CRITICAL FLAW**: Score 1 is just keyword matching without intent matching!
609
+
610
+ ## Training Verdict
611
+
612
+ **BAD FOR TRAINING** ✗
613
+ - Score 1 should be treated as Score 0 (not relevant)
614
+ - Only Score 2 is actually relevant
615
+ - Similar issue to DBPedia-Entity - keyword matching ≠ relevance
616
+ - For binary: Score 2 as positive, Score 0+1 as negative
617
+
618
+ ## Training Recommendations
619
+
620
+ - If you must use it: merge Score 1 into Score 0
621
+ - Only use Score 2 as positive examples
622
+ - Better to skip this dataset due to bad Score 1 annotations
623
+
624
+ ## Observations
test_scores/examine_more_scores.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Examine more examples of score 1 vs score 2 documents to understand the pattern
3
+ """
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from collections import defaultdict
8
+
9
def load_queries(dataset_path):
    """Load the query texts of a BEIR-style dataset directory.

    Args:
        dataset_path: ``pathlib.Path`` of the dataset directory; queries are
            read from ``queries.jsonl`` inside it.

    Returns:
        dict mapping each record's ``_id`` to its ``text``. Empty when
        ``queries.jsonl`` does not exist.
    """
    queries = {}
    queries_file = dataset_path / "queries.jsonl"

    if queries_file.exists():
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(queries_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                q = json.loads(line)
                queries[q['_id']] = q['text']

    return queries
21
+
22
def load_corpus(dataset_path):
    """Load the documents of a BEIR-style dataset directory.

    Args:
        dataset_path: ``pathlib.Path`` of the dataset directory; documents
            are read from ``corpus.jsonl`` inside it.

    Returns:
        dict mapping each record's ``_id`` to ``{'title': ..., 'text': ...}``.
        A missing ``title`` or ``text`` field becomes ``''``. Empty when
        ``corpus.jsonl`` does not exist.
    """
    corpus = {}
    corpus_file = dataset_path / "corpus.jsonl"

    if corpus_file.exists():
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(corpus_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                doc = json.loads(line)
                corpus[doc['_id']] = {
                    'title': doc.get('title', ''),
                    'text': doc.get('text', ''),
                }

    return corpus
37
+
38
def examine_more_nfcorpus():
    """Print NFCorpus queries alongside their score-2 and score-1 documents.

    Reads queries, corpus and ``qrels/test.tsv`` from
    ``../beir_data/nfcorpus`` (relative to the current working directory),
    selects the queries whose judgments contain both score 1 and score 2,
    skips the first three of those and prints the next five, each with one
    score-2 document and up to two score-1 documents.

    Raises:
        FileNotFoundError: if the qrels file is missing.

    NOTE(review): block nesting was reconstructed from a flattened listing;
    confirm against the original file.
    """
    dataset_path = Path("../beir_data/nfcorpus")

    # Load data
    queries = load_queries(dataset_path)
    corpus = load_corpus(dataset_path)

    # qrels[query_id][doc_id] -> integer relevance score (test split)
    qrels = defaultdict(dict)
    qrels_file = dataset_path / "qrels" / "test.tsv"

    with open(qrels_file, 'r', encoding='utf-8') as f:
        next(f, None)  # skip header; default avoids StopIteration on an empty file
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                query_id, doc_id, score = parts
                qrels[query_id][doc_id] = int(score)

    # Queries judged with both score 1 and score 2
    mixed_queries = [
        query_id
        for query_id, docs in qrels.items()
        if {1, 2} <= set(docs.values())
    ]

    print("Examining 5 more examples from NFCorpus\n")

    # Skip first 3, examine next 5
    for query_id in mixed_queries[3:8]:
        if query_id not in queries:
            continue

        print("="*80)
        print(f"QUERY: {queries[query_id]}")
        print("-"*80)

        judged = qrels[query_id]
        score_2_docs = [doc_id for doc_id, score in judged.items() if score == 2]
        score_1_docs = [doc_id for doc_id, score in judged.items() if score == 1]

        print(f"\n[SCORE 2 - HIGHLY RELEVANT] (showing 1 of {len(score_2_docs)})")
        for doc_id in score_2_docs[:1]:
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"Title: {doc['title'][:150]}")
                print(f"Text snippet: {doc['text'][:250]}...")

        print(f"\n[SCORE 1 - SUPPOSEDLY RELEVANT] (showing 2 of {len(score_1_docs)})")
        for doc_id in score_1_docs[:2]:
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"\nTitle: {doc['title'][:150]}")
                print(f"Text snippet: {doc['text'][:250]}...")

        print("\n")
95
+
96
def examine_webis_touche():
    """Print Webis-Touche2020 queries with one document per score (2, 1, 0).

    Reads queries, corpus and ``qrels/test.tsv`` from
    ``../beir_data/webis-touche2020`` (relative to the current working
    directory) and prints up to three queries whose judgments contain all
    of the scores 0, 1 and 2.

    Raises:
        FileNotFoundError: if the qrels file is missing.

    NOTE(review): block nesting was reconstructed from a flattened listing;
    confirm against the original file.
    """
    # Same "../beir_data" root as the other loaders in these scripts; the
    # previous "beir_data/..." could not resolve from the same working
    # directory as examine_more_nfcorpus().
    dataset_path = Path("../beir_data/webis-touche2020")

    # Load data
    queries = load_queries(dataset_path)
    corpus = load_corpus(dataset_path)

    # qrels[query_id][doc_id] -> integer relevance score (test split)
    qrels = defaultdict(dict)
    qrels_file = dataset_path / "qrels" / "test.tsv"

    with open(qrels_file, 'r', encoding='utf-8') as f:
        next(f, None)  # skip header; default avoids StopIteration on an empty file
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                query_id, doc_id, score = parts
                qrels[query_id][doc_id] = int(score)

    print("\n" + "="*80)
    print("WEBIS-TOUCHE2020 EXAMPLES (has 0, 1, 2 scores)")
    print("="*80)

    count = 0
    for query_id, docs in qrels.items():
        if count >= 3:
            break  # enough examples; no need to scan the remaining qrels
        if not {0, 1, 2} <= set(docs.values()) or query_id not in queries:
            continue

        count += 1
        print(f"\nQUERY: {queries[query_id]}")
        print("-"*80)

        # One sample document per relevance level, highest first
        for level in (2, 1, 0):
            level_docs = [doc_id for doc_id, score in docs.items() if score == level]
            print(f"\n[SCORE {level}] (showing 1 of {len(level_docs)})")
            for doc_id in level_docs[:1]:
                if doc_id in corpus:
                    doc = corpus[doc_id]
                    print(f"Text: {doc['text'][:300]}...")
154
+
155
if __name__ == "__main__":
    # Script entry point: run the NFCorpus walkthrough, then Webis-Touche2020.
    for _walkthrough in (examine_more_nfcorpus, examine_webis_touche):
        _walkthrough()
test_scores/test_fiqa_relevance.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test FiQA score 1 documents to see how relevant they really are
3
+ """
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from collections import defaultdict
8
+ import random
9
+
10
def load_fiqa_data(base_path=None):
    """Load FiQA queries, corpus and train qrels.

    Args:
        base_path: dataset directory containing ``queries.jsonl``,
            ``corpus.jsonl`` and ``qrels/train.tsv``. Defaults to
            ``../beir_data/fiqa`` relative to the current working directory.

    Returns:
        Tuple ``(queries, corpus, train_qrels)``:
        ``queries`` maps query ``_id`` to its text, ``corpus`` maps document
        ``_id`` to ``{'title': ..., 'text': ...}``, and ``train_qrels`` is a
        ``defaultdict(list)`` mapping query id to the judged document ids in
        file order (the score column is not kept).

    Raises:
        FileNotFoundError: if any of the three files is missing.
    """
    base_path = Path("../beir_data/fiqa") if base_path is None else Path(base_path)

    # Load queries (UTF-8 explicitly, not the platform locale)
    queries = {}
    with open(base_path / "queries.jsonl", 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            q = json.loads(line)
            queries[q['_id']] = q['text']

    # Load corpus
    corpus = {}
    with open(base_path / "corpus.jsonl", 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            doc = json.loads(line)
            corpus[doc['_id']] = {
                'title': doc.get('title', ''),
                'text': doc.get('text', ''),
            }

    # Load train qrels: rows are "query_id<TAB>doc_id<TAB>score"
    train_qrels = defaultdict(list)
    with open(base_path / "qrels/train.tsv", 'r', encoding='utf-8') as f:
        next(f, None)  # skip header; default avoids StopIteration on an empty file
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                query_id, doc_id, _ = parts
                train_qrels[query_id].append(doc_id)

    return queries, corpus, train_qrels
42
+
43
def examine_fiqa_examples():
    """Print FiQA train statistics and sample query/document pairs.

    Prints the number of train queries and corpus documents, the min/max/avg
    number of judged documents per query, the first five train queries with
    up to two judged documents each, and finally, for the first of those
    queries, one judged document next to one randomly drawn unjudged
    document.

    NOTE(review): block nesting was reconstructed from a flattened listing
    (the separator after each example is assumed to sit inside the query
    loop); confirm against the original file.
    """
    queries, corpus, train_qrels = load_fiqa_data()

    print("="*80)
    print("FiQA Training Data Analysis")
    print("="*80)
    print(f"Total queries in train: {len(train_qrels)}")
    print(f"Total corpus size: {len(corpus)}")

    if not train_qrels:
        # min()/max(), the average and sample_queries[0] below are all
        # undefined without at least one judged query.
        print("No train qrels loaded; nothing to examine.")
        return

    # Get distribution of docs per query
    docs_per_query = [len(docs) for docs in train_qrels.values()]
    print(f"Docs per query: min={min(docs_per_query)}, max={max(docs_per_query)}, avg={sum(docs_per_query)/len(docs_per_query):.1f}")

    print("\n" + "="*80)
    print("EXAMINING SPECIFIC EXAMPLES")
    print("="*80)

    # Take first 5 queries with their documents
    sample_queries = list(train_qrels)[:5]

    for i, query_id in enumerate(sample_queries, 1):
        if query_id not in queries:
            continue

        print(f"\n[EXAMPLE {i}]")
        print(f"QUERY: {queries[query_id]}")
        print("-"*80)

        docs = train_qrels[query_id]
        print(f"Number of 'relevant' docs: {len(docs)}")

        # Show first 2 docs for this query
        for j, doc_id in enumerate(docs[:2], 1):
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"\nDoc {j} (ID: {doc_id}):")
                print(f"Title: {doc['title'][:100]}" if doc['title'] else "[No title]")
                print(f"Text: {doc['text'][:300]}...")

        print("\n" + "="*80)

    # Compare a judged document with a random document that has no judgment
    print("\nCOMPARING WITH RANDOM (UNLABELED) DOCUMENTS")
    print("="*80)

    test_query_id = sample_queries[0]
    if test_query_id in queries:
        print(f"\nQUERY: {queries[test_query_id]}")
        print("-"*80)

        labeled_in_order = train_qrels[test_query_id]
        labeled_docs = set(labeled_in_order)  # set for O(1) membership tests

        # Draw from the documents that carry no judgment for this query
        unlabeled_docs = [d for d in corpus if d not in labeled_docs]
        random_unlabeled = random.sample(unlabeled_docs, min(2, len(unlabeled_docs)))

        print("\n[LABELED AS RELEVANT (Score 1)]:")
        # Take the first judged doc in file order; slicing the set instead
        # would pick a different document from run to run.
        for doc_id in labeled_in_order[:1]:
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"Text: {doc['text'][:250]}...")

        print("\n[UNLABELED (Implicit Score 0)]:")
        for doc_id in random_unlabeled[:1]:
            if doc_id in corpus:
                doc = corpus[doc_id]
                print(f"Text: {doc['text'][:250]}...")
114
+
115
def analyze_relevance_quality():
    """Print up to four judged documents of one multi-document FiQA query.

    Takes the first train query that has more than two judged documents and
    prints its text followed by a 200-character snippet of each of its first
    four judged documents that are present in the corpus, so a reader can
    judge whether they are equally relevant. Nothing beyond the banner is
    printed when no such query exists or its text is unknown.

    NOTE(review): block nesting was reconstructed from a flattened listing
    (the closing [QUESTION] lines are assumed to print only when the query
    text is known); confirm against the original file.
    """
    queries, corpus, train_qrels = load_fiqa_data()

    banner = "="*80
    print("\n" + banner)
    print("RELEVANCE QUALITY ANALYSIS")
    print(banner)

    # First query (in qrels order) with more than two judged documents
    first_multi = next(
        ((qid, ids) for qid, ids in train_qrels.items() if len(ids) > 2),
        None,
    )
    if first_multi is None:
        return

    query_id, doc_ids = first_multi
    if query_id not in queries:
        return

    print(f"\nQuery with {len(doc_ids)} 'relevant' docs:")
    print(f"QUERY: {queries[query_id]}")
    print("-"*80)

    # Numbering follows the position in the qrels list, so it can have gaps
    # when a judged document is missing from the corpus.
    for position, doc_id in enumerate(doc_ids[:4], 1):
        if doc_id not in corpus:
            continue
        snippet = corpus[doc_id]['text'][:200]
        print(f"\nRelevant Doc {position}:")
        print(f"Text snippet: {snippet}...")

    print("\n[QUESTION]: Do all these docs seem EQUALLY relevant to the query?")
    print("If some are more relevant than others, then Score 1 is mixing different relevance levels!")
143
+
144
if __name__ == "__main__":
    # Script entry point: sample walkthrough first, then the quality check.
    for _step in (examine_fiqa_examples, analyze_relevance_quality):
        _step()
test_scores/test_nfcorpus_train_relevance.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test NFCorpus train score 1 documents to see if they're highly relevant or mixed
3
+ """
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from collections import defaultdict
8
+ import random
9
+
10
def load_nfcorpus_data(base_path=None, max_docs=5000):
    """Load NFCorpus queries, a capped corpus, and train/test qrels.

    Args:
        base_path: dataset directory containing ``queries.jsonl``,
            ``corpus.jsonl``, ``qrels/train.tsv`` and ``qrels/test.tsv``.
            Defaults to ``../beir_data/nfcorpus`` relative to the current
            working directory.
        max_docs: maximum number of corpus lines to read (a speed limit).
            Documents beyond the cap are absent from ``corpus``, so callers
            must check membership before looking a document up.

    Returns:
        Tuple ``(queries, corpus, train_qrels, test_qrels)``:
        ``queries`` maps query ``_id`` to its text, ``corpus`` maps document
        ``_id`` to ``{'title': ..., 'text': ...}``, ``train_qrels`` is a
        ``defaultdict(list)`` of judged document ids per query (score column
        dropped), and ``test_qrels`` is a ``defaultdict(dict)`` mapping
        query id to ``{doc_id: int score}``.

    Raises:
        FileNotFoundError: if any of the four files is missing.
    """
    base_path = Path("../beir_data/nfcorpus") if base_path is None else Path(base_path)

    # Load queries (UTF-8 explicitly, not the platform locale)
    queries = {}
    with open(base_path / "queries.jsonl", 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            q = json.loads(line)
            queries[q['_id']] = q['text']

    # Load corpus, stopping after max_docs lines. The previous `i > 5000`
    # test let one extra document through.
    corpus = {}
    with open(base_path / "corpus.jsonl", 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= max_docs:  # Limit for speed
                break
            if not line.strip():
                continue
            doc = json.loads(line)
            corpus[doc['_id']] = {
                'title': doc.get('title', ''),
                'text': doc.get('text', ''),
            }

    # Load train qrels: rows are "query_id<TAB>doc_id<TAB>score"
    train_qrels = defaultdict(list)
    with open(base_path / "qrels/train.tsv", 'r', encoding='utf-8') as f:
        next(f, None)  # skip header; default avoids StopIteration on an empty file
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                query_id, doc_id, _ = parts
                train_qrels[query_id].append(doc_id)

    # Also load test qrels (scores kept) to compare against train
    test_qrels = defaultdict(dict)
    with open(base_path / "qrels/test.tsv", 'r', encoding='utf-8') as f:
        next(f, None)  # skip header
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                query_id, doc_id, score = parts
                test_qrels[query_id][doc_id] = int(score)

    return queries, corpus, train_qrels, test_qrels
54
+
55
def _print_doc_preview(doc, title_chars=100, text_chars=250):
    """Print a truncated title (or a placeholder) and a text preview for one corpus doc."""
    print(f"Title: {doc['title'][:title_chars]}" if doc['title'] else "[No title]")
    print(f"Text: {doc['text'][:text_chars]}...")


def examine_train_examples():
    """Print sample NFCorpus train judgments next to one graded test example.

    Shows up to five random train queries with the first three judged docs
    of each, then the first test query that has both score-1 and score-2
    docs, so the two can be compared by eye. Docs that are not in the
    loaded corpus are skipped silently.
    """
    queries, corpus, train_qrels, test_qrels = load_nfcorpus_data()
    rule = "=" * 80

    print(rule)
    print("NFCorpus TRAIN Data Analysis")
    print(rule)
    print(f"Total queries in train: {len(train_qrels)}")
    print(f"Total corpus loaded: {len(corpus)}")

    # Get distribution of docs per query
    docs_per_query = [len(docs) for docs in train_qrels.values()]
    if docs_per_query:
        print(f"Docs per query: min={min(docs_per_query)}, max={max(docs_per_query)}, avg={sum(docs_per_query)/len(docs_per_query):.1f}")
    else:
        # min()/max() and the average would raise on an empty qrels set
        print("Docs per query: n/a (no train qrels loaded)")

    print("\n" + rule)
    print("EXAMINING TRAIN EXAMPLES (all Score 1)")
    print(rule)

    # Take 5 random queries from train
    sample_queries = random.sample(list(train_qrels.keys()), min(5, len(train_qrels)))

    for i, query_id in enumerate(sample_queries, 1):
        if query_id not in queries:
            continue

        print(f"\n[TRAIN EXAMPLE {i}]")
        print(f"QUERY: {queries[query_id]}")
        print("-" * 80)

        docs = train_qrels[query_id]
        print(f"Number of Score 1 docs in TRAIN: {len(docs)}")

        # Show first 3 docs to see variation in relevance
        for j, doc_id in enumerate(docs[:3], 1):
            if doc_id in corpus:
                print(f"\nTRAIN Doc {j} (Score 1):")
                _print_doc_preview(corpus[doc_id])

    print("\n" + rule)
    print("COMPARING WITH TEST SET (has Score 1 and 2)")
    print(rule)

    # Find a test query with both score 1 and 2; only the first match is shown
    for query_id, docs in test_qrels.items():
        if query_id not in queries:
            continue

        scores = set(docs.values())
        if not (1 in scores and 2 in scores):
            continue

        score_1_docs = [d for d, s in docs.items() if s == 1]
        score_2_docs = [d for d, s in docs.items() if s == 2]

        print("\n[TEST EXAMPLE]")
        print(f"QUERY: {queries[query_id]}")
        print("-" * 80)

        # Show Score 2 doc
        print("\nTEST Score 2 (Highly Relevant):")
        for doc_id in score_2_docs[:1]:
            if doc_id in corpus:
                _print_doc_preview(corpus[doc_id])

        # Show Score 1 doc
        print("\nTEST Score 1 (Relevant):")
        for doc_id in score_1_docs[:1]:
            if doc_id in corpus:
                _print_doc_preview(corpus[doc_id])

        break

    print("\n" + rule)
    print("QUESTION: Are TRAIN Score 1 docs more like TEST Score 1 or Score 2?")
    print(rule)
136
+
137
def check_quality_variation():
    """Print a random handful of train docs for the most heavily judged query.

    Picks the train query with the most judged docs (only queries with more
    than 10 qualify) and prints up to five randomly chosen docs for it so a
    reader can judge whether they all look equally relevant.
    """
    queries, corpus, train_qrels, test_qrels = load_nfcorpus_data()
    rule = "=" * 80

    print("\n" + rule)
    print("CHECKING VARIATION IN TRAIN SCORE 1 DOCUMENTS")
    print(rule)

    # Only queries with more than 10 judged docs are candidates
    candidates = [(qid, ids) for qid, ids in train_qrels.items() if len(ids) > 10]
    if not candidates:
        return

    # max() returns the first of equally large entries, matching a stable
    # descending sort followed by taking element 0
    query_id, doc_ids = max(candidates, key=lambda pair: len(pair[1]))
    if query_id not in queries:
        return

    print(f"\nQuery with {len(doc_ids)} Score 1 docs in TRAIN:")
    print(f"QUERY: {queries[query_id]}")
    print("-" * 80)

    # Show a sample of docs to see if they vary in relevance
    print("\nShowing 5 random Score 1 docs from TRAIN for this query:")

    chosen = random.sample(doc_ids, min(5, len(doc_ids)))
    for position, doc_id in enumerate(chosen, 1):
        if doc_id not in corpus:
            continue
        entry = corpus[doc_id]
        print(f"\nDoc {position}:")
        if entry['title']:
            print(f"Title: {entry['title'][:80]}")
        else:
            print("[No title]")
        print(f"Text: {entry['text'][:200]}...")

    print("\n[ANALYSIS]: Do these all seem EQUALLY relevant?")
    print("If some are clearly more relevant than others, then TRAIN Score 1 is mixing relevance levels!")


if __name__ == "__main__":
    examine_train_examples()
    check_quality_variation()
train_datasets_creation/create_training_data_ids_batched.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Create training data - IDS ONLY with BATCHING (best of both worlds)."""
2
+
3
+ import json
4
+ import sys
5
+ import random
6
+ from pathlib import Path
7
+ from collections import defaultdict
8
+ import time
9
+
10
def _load_qrels(qrels_path):
    """Read a 3-column qrels TSV (header skipped) into {query_id: {doc_id: int score}}."""
    qrels = defaultdict(dict)
    with open(qrels_path, 'r', encoding='utf-8') as f:
        next(f, None)  # Skip header; tolerate an empty file
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                qrels[parts[0]][parts[1]] = int(parts[2])
    return qrels


def _categorize_docs(docs, score_map, both_scores_positive):
    """Split one query's judged docs into (easy_pos, hard_pos, hard_neg, easy_neg) ID lists."""
    easy_positive_ids = []
    hard_positive_ids = []
    hard_negative_ids = []
    easy_negative_ids = []

    for doc_id, score in docs.items():
        # Scores missing from the config mapping count as easy negatives
        category = score_map.get(str(score), 'easy_negative')

        if category == 'positive':
            if not both_scores_positive:
                # Only one score is positive, treat all as easy positives
                easy_positive_ids.append(doc_id)
            elif score == 2:
                easy_positive_ids.append(doc_id)
            elif score == 1:
                hard_positive_ids.append(doc_id)
            # NOTE(review): when both 1 and 2 are positive, a doc whose score maps to
            # 'positive' but is neither 1 nor 2 is dropped, as in the original - confirm intent.
        elif category == 'hard_negative':
            hard_negative_ids.append(doc_id)
        elif category == 'easy_negative':
            easy_negative_ids.append(doc_id)

    return easy_positive_ids, hard_positive_ids, hard_negative_ids, easy_negative_ids


def _sample_batch_negatives(qid, batch_qids, qrels, judged_doc_ids, need_more):
    """Collect up to need_more distinct doc IDs judged for other queries in the batch.

    Docs already judged for ``qid`` are never returned, and no ID is returned
    twice even when several other queries share it.
    """
    other_batch_qids = [q for q in batch_qids if q != qid]
    random.shuffle(other_batch_qids)

    seen = set(judged_doc_ids)
    sampled = []
    for other_qid in other_batch_qids:
        if len(sampled) >= need_more:
            break
        for doc_id in qrels[other_qid]:
            if doc_id in seen:
                continue
            seen.add(doc_id)
            sampled.append(doc_id)
            if len(sampled) >= need_more:
                break
    return sampled


def process_dataset(dataset_name, config, negatives_ratio=10, batch_size=5000):
    """Process dataset in batches, storing only IDs.

    Reads ``../beir_data/<dataset_name>/qrels/merged.tsv`` and ``queries.jsonl``
    and writes one JSON line per query with at least one positive doc to
    ``datasets/<dataset_name>/training_ids.jsonl``.

    Args:
        dataset_name: Directory name of the dataset under ``../beir_data``.
        config: Dict with ``config['datasets'][dataset_name]['score_to_category']``
            mapping score strings to 'positive' / 'hard_negative' / 'easy_negative'.
        negatives_ratio: Target number of negatives per positive.
        batch_size: Kept for backward compatibility; the effective batch size
            is always derived from the number of queries (as in the original).

    Returns:
        None. Returns early if ``merged.tsv`` does not exist.

    Raises:
        KeyError: If ``dataset_name`` is missing from ``config['datasets']``.
    """
    print(f"\n{'='*50}\nProcessing: {dataset_name}")

    # Load qrels
    qrels_path = Path(f'../beir_data/{dataset_name}/qrels/merged.tsv')
    if not qrels_path.exists():
        print(f" ⚠️ No merged.tsv found")
        return

    print(f" Loading qrels...")
    qrels = _load_qrels(qrels_path)

    # Count scores for info
    score_counts = defaultdict(int)
    for docs in qrels.values():
        for score in docs.values():
            score_counts[score] += 1
    print(f" Loaded {len(qrels):,} queries, scores: {dict(score_counts)}")

    # Load queries
    print(f" Loading queries...")
    queries = {}
    with open(f'../beir_data/{dataset_name}/queries.jsonl', 'r', encoding='utf-8') as f:
        for line in f:
            q = json.loads(line)
            queries[q['_id']] = q['text']

    # Get score mapping from config
    score_map = config['datasets'][dataset_name]['score_to_category']

    # When both Score 1 and 2 are positive, 2 becomes "easy" and 1 "hard"
    both_scores_positive = (score_map.get('2') == 'positive'
                            and score_map.get('1') == 'positive')

    all_qids = list(qrels.keys())
    random.seed(42)

    # Create output file
    output_dir = Path(f'datasets/{dataset_name}')
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / 'training_ids.jsonl'

    # Adjust batch size for dataset size
    if len(all_qids) > 50000:
        batch_size = 5000
    elif len(all_qids) > 10000:
        batch_size = 2000
    else:
        # Small datasets go in one batch; never 0, since range() rejects a zero step
        batch_size = max(1, len(all_qids))

    print(f" Processing {len(all_qids):,} queries in batches of {batch_size:,}...")

    # Stats
    total_examples = 0
    total_easy_pos = 0
    total_hard_pos = 0
    total_hard_neg = 0
    total_easy_neg = 0

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for batch_start in range(0, len(all_qids), batch_size):
            batch_end = min(batch_start + batch_size, len(all_qids))
            batch_qids = all_qids[batch_start:batch_end]

            if len(all_qids) > 10000:  # Only show progress for large datasets
                print(f" Processing batch: queries {batch_start:,}-{batch_end:,}")

            for qid in batch_qids:
                if qid not in queries:
                    continue

                docs = qrels[qid]
                (easy_positive_ids, hard_positive_ids,
                 hard_negative_ids, easy_negative_ids) = _categorize_docs(
                    docs, score_map, both_scores_positive)

                total_positives = len(easy_positive_ids) + len(hard_positive_ids)
                if not total_positives:
                    continue

                # Negatives target is negatives_ratio per positive
                total_negatives_have = len(hard_negative_ids) + len(easy_negative_ids)
                total_negatives_needed = total_positives * negatives_ratio

                # Only sample from other queries IN THIS BATCH if we need more
                if total_negatives_have < total_negatives_needed:
                    need_more = total_negatives_needed - total_negatives_have
                    easy_negative_ids.extend(
                        _sample_batch_negatives(qid, batch_qids, qrels, docs.keys(), need_more))

                # Hard negatives are always kept; easy negatives fill what is left.
                # Clamp at 0: a negative slice bound would keep entries instead of none.
                easy_budget = max(0, total_negatives_needed - len(hard_negative_ids))

                # Write example directly to file (streaming)
                example = {
                    'query_id': qid,
                    'query_text': queries[qid],
                    'source_dataset': dataset_name,
                    'easy_positive_ids': easy_positive_ids,
                    'hard_positive_ids': hard_positive_ids,
                    'hard_negative_ids': hard_negative_ids,
                    'easy_negative_ids': easy_negative_ids[:easy_budget]
                }

                out_f.write(json.dumps(example) + '\n')

                # Update stats
                total_examples += 1
                total_easy_pos += len(example['easy_positive_ids'])
                total_hard_pos += len(example['hard_positive_ids'])
                total_hard_neg += len(example['hard_negative_ids'])
                total_easy_neg += len(example['easy_negative_ids'])

    # Print stats
    print(f" ✓ Created {total_examples:,} examples")
    print(f" Easy positives: {total_easy_pos:,}")
    print(f" Hard positives: {total_hard_pos:,}")
    print(f" Hard negatives: {total_hard_neg:,}")
    print(f" Easy negatives: {total_easy_neg:,}")
176
+
177
def main():
    """Build training-ID files for every enabled dataset, or for one named on the CLI."""
    rule = "=" * 50
    print(rule)
    print("TRAINING DATA CREATION - IDS + BATCHING")
    print(rule)
    print("Best of both worlds: IDs only (small files) + Batching (fast)")

    # Load config
    with open('../test_scores/dataset_reports/training_config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)['beir_training_config']

    # Keep only the datasets whose config entry has a truthy 'use' flag
    datasets = []
    for name, cfg in config['datasets'].items():
        if cfg['use']:
            datasets.append(name)

    # An optional first CLI argument narrows the run to a single dataset
    cli_args = sys.argv[1:]
    if cli_args:
        requested = cli_args[0]
        if requested not in datasets:
            print(f"❌ Dataset '{requested}' not found or disabled")
            print(f"Available: {', '.join(datasets)}")
            return
        datasets = [requested]
        print(f"Processing only: {requested}")

    print(f"Will process {len(datasets)} datasets")

    total_start = time.time()

    # Process each dataset, timing each one separately
    for idx, dataset_name in enumerate(datasets, 1):
        print(f"\n[{idx}/{len(datasets)}] {dataset_name}")
        dataset_start = time.time()
        process_dataset(dataset_name, config)
        dataset_time = time.time() - dataset_start
        print(f" Dataset completed in {dataset_time:.2f} seconds")

    total_time = time.time() - total_start
    print(f"\n✅ Complete! Total time: {total_time:.2f} seconds")
    print("\n📝 Output: datasets/<dataset>/training_ids.jsonl (IDs only)")
    print("💾 File sizes: ~100x smaller than full text")
    print("⚡ Speed: As fast as efficient version")


if __name__ == "__main__":
    main()
train_datasets_creation/make_dataset_for_train.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os

import pandas as pd

# Cap on the number of queries kept per dataset
MAX_QUERIES_PER_DATASET = 1000

# (column in training_ids.jsonl, label written to the CSV). Spelled out instead of
# label.rstrip('_ids'): rstrip strips a *set of characters*, not a suffix, and only
# gave the right answer here because no label happens to end in '_', 'i', 'd' or 's'.
LABEL_COLUMNS = [
    ('easy_positive_ids', 'easy_positive'),
    ('hard_positive_ids', 'hard_positive'),
    ('hard_negative_ids', 'hard_negative'),
    ('easy_negative_ids', 'easy_negative'),
]


def _read_jsonl(path):
    """Load a JSON-lines file into a DataFrame, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])


dfs = []
for fold in os.listdir('./datasets'):
    if fold == 'fiqa':
        continue
    if not os.path.isdir(os.path.join('./datasets', fold)):
        continue  # Skip stray files; only dataset directories hold training_ids.jsonl
    print(fold)
    corpus = _read_jsonl(f"../beir_data/{fold}/corpus.jsonl")
    df = _read_jsonl(f"./datasets/{fold}/training_ids.jsonl")

    # len(df) is the row (query) count; df.size is rows * columns and tripped this
    # cap far too early. Sample without replacement so no query appears twice.
    if len(df) > MAX_QUERIES_PER_DATASET:
        df = df.sample(MAX_QUERIES_PER_DATASET)

    # Flatten each query's four ID lists into one (query, doc, label) row per doc
    rows = []
    for _, row in df.iterrows():
        query_id = row['query_id']
        query_text = row['query_text']
        for column, label in LABEL_COLUMNS:
            for did in row[column]:
                rows.append({"query_id": query_id, "query_text": query_text,
                             'doc_id': did, 'label': label})
    if not rows:
        continue  # Nothing to merge; an empty frame has no 'doc_id' column

    df = pd.DataFrame(rows)
    # Inner join: doc IDs that are missing from the corpus file are dropped
    df = pd.merge(df, corpus[['_id', 'title', 'text']], left_on='doc_id', right_on='_id')
    df['dataset'] = fold

    dfs.append(df)

if dfs:
    pd.concat(dfs).to_csv('full_train_dataset.csv')
else:
    # pd.concat raises on an empty list
    print("No datasets produced any rows; full_train_dataset.csv not written")
train_datasets_creation/merge_all_splits.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Merge all qrels splits (train/dev/test) into one merged.tsv file for each dataset.
3
+ We don't care about splits - we want ALL the data!
4
+ """
5
+
6
+ import os
7
+ from pathlib import Path
8
+
9
def merge_qrels_for_dataset(dataset_path):
    """Merge all qrels files in a dataset into merged.tsv.

    Every ``*.tsv`` in ``<dataset_path>/qrels`` except ``merged.tsv`` itself is
    read; when the same (query_id, doc_id) pair appears more than once, the
    highest score wins. The result is written to ``qrels/merged.tsv`` sorted
    by query id, then doc id.

    Args:
        dataset_path: Dataset directory (``pathlib.Path`` or string).

    Returns:
        The number of unique (query_id, doc_id) entries written, or ``None``
        when there is no qrels directory or no split file to merge. In the
        latter case an existing ``merged.tsv`` is left untouched.

    Raises:
        ValueError: If a score column is not an integer.
    """
    qrels_dir = Path(dataset_path) / 'qrels'

    if not qrels_dir.exists():
        print(f" ⚠️ No qrels directory found")
        return None

    # Exclude a previous merge result up front, so a directory that contains only
    # merged.tsv is reported as empty instead of being truncated and crashing.
    # Sorting makes the header choice independent of filesystem order.
    split_files = sorted(p for p in qrels_dir.glob('*.tsv') if p.name != 'merged.tsv')

    if not split_files:
        print(f" ⚠️ No TSV files found")
        return None

    # Collect all unique entries: (qid, doc_id) -> score
    all_entries = {}
    header = None

    for tsv_file in split_files:
        print(f" Reading {tsv_file.name}...")

        with open(tsv_file, 'r', encoding='utf-8') as f:
            first_line = f.readline()

            # Get header from the first non-empty file
            if header is None and first_line:
                header = first_line.strip()

            # Process data lines (the header was consumed above)
            for line in f:
                if not line.strip():
                    continue

                parts = line.strip().split('\t')
                if len(parts) >= 3:
                    key = (parts[0], parts[1])
                    score = int(parts[2])

                    # Store or update if higher score
                    if key not in all_entries or score > all_entries[key]:
                        all_entries[key] = score

    if header is None:
        # Every split file was empty; fall back to the standard BEIR qrels header
        header = 'query-id\tcorpus-id\tscore'

    # Write merged file
    merged_file = qrels_dir / 'merged.tsv'

    with open(merged_file, 'w', encoding='utf-8') as out_f:
        out_f.write(header + '\n')

        # Write all entries sorted by query_id then doc_id
        for (qid, doc_id), score in sorted(all_entries.items()):
            out_f.write(f"{qid}\t{doc_id}\t{score}\n")

    print(f" ✓ Merged {len(all_entries)} unique entries into merged.tsv")
    print(f" From splits: {', '.join(p.stem for p in split_files)}")

    return len(all_entries)
72
+
73
def main():
    """Run the qrels merge for every dataset directory under ../beir_data."""
    beir_data_dir = Path('../beir_data')

    if not beir_data_dir.exists():
        print(f"Error: {beir_data_dir} not found!")
        return

    # Dataset directories, visited in sorted path order
    dataset_dirs = sorted(entry for entry in beir_data_dir.iterdir() if entry.is_dir())
    rule = "=" * 60

    print(f"Found {len(dataset_dirs)} datasets in beir_data")
    print(rule)

    total_entries = 0

    for dataset_dir in dataset_dirs:
        print(f"\nProcessing {dataset_dir.name}...")

        # The merge returns None when a dataset was skipped; count that as zero
        merged_count = merge_qrels_for_dataset(dataset_dir)
        total_entries += merged_count or 0

    print("\n" + rule)
    print(f"DONE! Merged {total_entries} total qrel entries across all datasets")
    print("All datasets now have a 'merged.tsv' file combining all splits")


if __name__ == "__main__":
    main()
train_dpo_hf.py DELETED
@@ -1,268 +0,0 @@
1
- """
2
- DPO Training Script for Phi-3 Mini on Hugging Face
3
- Optimized for A10G GPU with checkpoint saving and custom validation
4
- """
5
-
6
- import os
7
- import json
8
- import torch
9
- import pandas as pd
10
- from transformers import (
11
- AutoTokenizer,
12
- AutoModelForCausalLM,
13
- TrainingArguments,
14
- TrainerCallback,
15
- TrainerState,
16
- TrainerControl
17
- )
18
- from trl import DPOTrainer, DPOConfig
19
- from datasets import Dataset, load_dataset
20
- from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
21
- from typing import Dict, List
22
- import logging
23
- from datetime import datetime
24
-
25
- logging.basicConfig(level=logging.INFO)
26
- logger = logging.getLogger(__name__)
27
-
28
- # Configuration
29
- MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
30
- HF_USERNAME = os.environ.get("HF_USERNAME", "your-username") # Set via environment
31
- HF_TOKEN = os.environ.get("HF_TOKEN", None) # Set via environment
32
- PROJECT_NAME = "phi3-dpo-beir"
33
- OUTPUT_DIR = f"./{PROJECT_NAME}-checkpoints"
34
-
35
- class ValidationCallback(TrainerCallback):
36
- """Custom callback to evaluate model similar to evaluate.py"""
37
-
38
- def __init__(self, tokenizer, val_dataset, eval_freq=500):
39
- self.tokenizer = tokenizer
40
- self.val_dataset = val_dataset
41
- self.eval_freq = eval_freq
42
-
43
- def format_prompt_for_inference(self, query, document):
44
- """Format for inference matching evaluate.py style"""
45
- prompt = f"""You are an AI content analyst.
46
-
47
- Task:
48
- 1. Given the following content and a user query, decide if the content is relevant.
49
- 2. If it is relevant:
50
- - Extract the top 2-3 key sentences
51
- - Suggest 3-5 relevant tags
52
- - Provide a short explanation or content extension (~2-3 sentences)
53
-
54
- Format your response in JSON with:
55
- {{
56
- "relevant": true or false,
57
- "key_sentences": [...],
58
- "tags": [...],
59
- "expansion": "..."
60
- }}
61
-
62
- User Query:
63
- {query}
64
-
65
- Content:
66
- {document}
67
-
68
- Response:"""
69
- return prompt
70
-
71
- def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
72
- """Run validation every N steps"""
73
- if state.global_step % self.eval_freq == 0 and state.global_step > 0:
74
- logger.info(f"\n🔍 Running custom validation at step {state.global_step}")
75
-
76
- model = kwargs["model"]
77
- model.eval()
78
-
79
- # Sample a few validation examples
80
- sample_size = min(5, len(self.val_dataset))
81
- samples = self.val_dataset.shuffle().select(range(sample_size))
82
-
83
- correct = 0
84
- for sample in samples:
85
- # Extract query and document from prompt
86
- prompt_text = sample["prompt"]
87
- lines = prompt_text.split("\n")
88
- query_idx = lines.index("Query:") + 1
89
- doc_idx = lines.index("Document:") + 1
90
-
91
- query = lines[query_idx].strip()
92
- doc_parts = lines[doc_idx:]
93
- document = "\n".join(doc_parts).strip()
94
-
95
- # Generate response
96
- inference_prompt = self.format_prompt_for_inference(query, document)
97
- inputs = self.tokenizer(inference_prompt, return_tensors="pt", truncation=True)
98
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
99
-
100
- with torch.no_grad():
101
- outputs = model.generate(
102
- **inputs,
103
- max_new_tokens=256,
104
- temperature=0.1,
105
- do_sample=True,
106
- pad_token_id=self.tokenizer.eos_token_id
107
- )
108
-
109
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
110
- response = response[len(inference_prompt):].strip()
111
-
112
- # Simple accuracy check
113
- expected = sample["chosen"].lower()
114
- if expected in response.lower():
115
- correct += 1
116
-
117
- logger.info(f"Expected: {expected}, Got: {response[:100]}...")
118
-
119
- accuracy = correct / sample_size
120
- logger.info(f"✅ Validation accuracy: {accuracy:.2%}")
121
-
122
-
123
- return control
124
-
125
- def prepare_datasets():
126
- """Load and prepare DPO datasets"""
127
- logger.info("📊 Loading datasets...")
128
-
129
- # Load CSVs
130
- train_df = pd.read_csv("train.csv")
131
- val_df = pd.read_csv("val.csv") if os.path.exists("val.csv") else None
132
- test_df = pd.read_csv("test.csv") if os.path.exists("test.csv") else None
133
-
134
- # Convert to HF datasets
135
- train_dataset = Dataset.from_pandas(train_df)
136
- val_dataset = Dataset.from_pandas(val_df) if val_df is not None else None
137
- test_dataset = Dataset.from_pandas(test_df) if test_df is not None else None
138
-
139
- logger.info(f"✅ Loaded {len(train_dataset)} training examples")
140
- if val_dataset:
141
- logger.info(f"✅ Loaded {len(val_dataset)} validation examples")
142
-
143
- return train_dataset, val_dataset, test_dataset
144
-
145
- def get_model_and_tokenizer():
146
- """Load model and tokenizer with 4-bit quantization for A10G"""
147
- logger.info(f"🤖 Loading model: {MODEL_ID}")
148
-
149
- # Tokenizer
150
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
151
- tokenizer.pad_token = tokenizer.eos_token
152
- tokenizer.padding_side = "left" # Important for DPO
153
-
154
- # Model with 4-bit quantization
155
- model = AutoModelForCausalLM.from_pretrained(
156
- MODEL_ID,
157
- load_in_4bit=True,
158
- torch_dtype=torch.float16,
159
- device_map="auto",
160
- trust_remote_code=True
161
- )
162
-
163
- # Prepare for k-bit training
164
- model = prepare_model_for_kbit_training(model)
165
-
166
- return model, tokenizer
167
-
168
- def get_peft_config():
169
- """Get LoRA configuration optimized for A10G"""
170
- return LoraConfig(
171
- r=16,
172
- lora_alpha=32,
173
- target_modules=[
174
- "q_proj", "v_proj", "k_proj", "o_proj",
175
- "gate_proj", "up_proj", "down_proj"
176
- ],
177
- lora_dropout=0.1,
178
- bias="none",
179
- task_type="CAUSAL_LM",
180
- )
181
-
182
- def main():
183
- # Load datasets
184
- train_dataset, val_dataset, test_dataset = prepare_datasets()
185
-
186
- # Load model and tokenizer
187
- model, tokenizer = get_model_and_tokenizer()
188
-
189
- # LoRA config
190
- peft_config = get_peft_config()
191
-
192
- # Training arguments optimized for A10G
193
- training_args = DPOConfig(
194
- output_dir=OUTPUT_DIR,
195
- num_train_epochs=3,
196
- per_device_train_batch_size=2, # A10G can handle this
197
- per_device_eval_batch_size=2,
198
- gradient_accumulation_steps=4, # Effective batch size = 8
199
- gradient_checkpointing=True,
200
- learning_rate=5e-5,
201
- lr_scheduler_type="cosine",
202
- warmup_ratio=0.1,
203
- logging_steps=10,
204
- save_steps=100, # Save every 100 steps
205
- eval_steps=500,
206
- save_total_limit=5, # Keep last 5 checkpoints
207
- load_best_model_at_end=True,
208
- metric_for_best_model="loss",
209
- greater_is_better=False,
210
-
211
- # DPO specific
212
- beta=0.1, # DPO regularization
213
- max_prompt_length=512,
214
- max_length=768,
215
-
216
- # Optimization
217
- optim="paged_adamw_8bit",
218
- fp16=True,
219
-
220
- # Logging
221
- report_to="wandb" if os.environ.get("WANDB_API_KEY") else "none",
222
- run_name=f"{PROJECT_NAME}-{datetime.now().strftime('%Y%m%d-%H%M')}",
223
-
224
- # Hub integration
225
- push_to_hub=True if HF_TOKEN else False,
226
- hub_model_id=f"{HF_USERNAME}/{PROJECT_NAME}" if HF_TOKEN else None,
227
- hub_strategy="checkpoint", # Push every checkpoint
228
- hub_token=HF_TOKEN,
229
- )
230
-
231
- # Initialize trainer
232
- dpo_trainer = DPOTrainer(
233
- model=model,
234
- ref_model=None, # Will create a reference model copy
235
- args=training_args,
236
- train_dataset=train_dataset,
237
- eval_dataset=val_dataset,
238
- processing_class=tokenizer, # Changed from tokenizer to processing_class
239
- peft_config=peft_config,
240
- )
241
-
242
-
243
- # Add custom validation callback
244
- if val_dataset:
245
- val_callback = ValidationCallback(tokenizer, val_dataset)
246
- dpo_trainer.add_callback(val_callback)
247
-
248
- # Start training
249
- logger.info("🚀 Starting DPO training...")
250
- logger.info(f"💾 Checkpoints will be saved to: {OUTPUT_DIR}")
251
- if HF_TOKEN:
252
- logger.info(f"🤗 Model will be pushed to: https://huggingface.co/{HF_USERNAME}/{PROJECT_NAME}")
253
-
254
- dpo_trainer.train()
255
-
256
- # Save final model
257
- logger.info("💾 Saving final model...")
258
- dpo_trainer.save_model(f"{OUTPUT_DIR}/final")
259
-
260
- # Push to hub
261
- if HF_TOKEN:
262
- logger.info("🤗 Pushing final model to Hub...")
263
- dpo_trainer.push_to_hub()
264
-
265
- logger.info("✅ Training complete!")
266
-
267
- if __name__ == "__main__":
268
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_dpo_hf_fixed.py DELETED
@@ -1,366 +0,0 @@
1
- """
2
- DPO Training Script for Phi-3 Mini - Fixed version
3
- Handles dependency issues and provides cleaner error handling
4
- """
5
-
6
- import os
7
- import sys
8
- import json
9
- import warnings
10
- warnings.filterwarnings("ignore")
11
-
12
- # Set environment variables to avoid TensorFlow issues
13
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
14
- os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
15
-
16
- try:
17
- import torch
18
- import pandas as pd
19
- from transformers import (
20
- AutoTokenizer,
21
- AutoModelForCausalLM,
22
- TrainingArguments,
23
- TrainerCallback,
24
- TrainerState,
25
- TrainerControl
26
- )
27
- from trl import DPOTrainer
28
- from trl.trainer.dpo_config import DPOConfig
29
- from datasets import Dataset
30
- from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
31
- from datetime import datetime
32
- import logging
33
- except ImportError as e:
34
- print(f"Missing dependency: {e}")
35
- print("\nPlease install required packages:")
36
- print("pip install torch transformers trl peft datasets accelerate bitsandbytes pandas")
37
- print("\nIf you get Keras errors, also run:")
38
- print("pip install tf-keras")
39
- sys.exit(1)
40
-
41
- logging.basicConfig(level=logging.INFO)
42
- logger = logging.getLogger(__name__)
43
-
44
# Configuration: base model, project naming and local checkpoint directory.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
PROJECT_NAME = "phi3-dpo-beir"
OUTPUT_DIR = f"./{PROJECT_NAME}-checkpoints"

# Hub identity is read from the environment; HF_TOKEN is None when unset.
HF_USERNAME = os.environ.get("HF_USERNAME", "your-username")
HF_TOKEN = os.environ.get("HF_TOKEN")
50
-
51
class ValidationCallback(TrainerCallback):
    """Periodically spot-check generations on a few validation rows.

    Every ``eval_freq`` steps the callback samples up to five rows from
    ``val_dataset``, rebuilds an inference prompt from each row's ``prompt``
    text, generates a response and counts the row as correct when the
    lower-cased ``chosen`` text occurs in the lower-cased response.
    """

    # Upper bound on the number of validation rows generated per check.
    _MAX_SAMPLES = 5

    def __init__(self, tokenizer, val_dataset, eval_freq=500):
        """Store the tokenizer, the validation dataset and the check interval (in steps)."""
        self.tokenizer = tokenizer
        self.val_dataset = val_dataset
        self.eval_freq = eval_freq

    def format_prompt_for_inference(self, query, document):
        """Format for inference matching evaluate.py style.

        Args:
            query: The user query text.
            document: The candidate content to judge against the query.

        Returns:
            The full instruction prompt ending in ``Response:``.
        """
        prompt = f"""You are an AI content analyst.

Task:
1. Given the following content and a user query, decide if the content is relevant.
2. If it is relevant:
- Extract the top 2-3 key sentences
- Suggest 3-5 relevant tags
- Provide a short explanation or content extension (~2-3 sentences)

Format your response in JSON with:
{{
"relevant": true or false,
"key_sentences": [...],
"tags": [...],
"expansion": "..."
}}

User Query:
{query}

Content:
{document}

Response:"""
        return prompt

    @staticmethod
    def _split_prompt(prompt_text):
        """Extract ``(query, document)`` from a training prompt, or ``None``.

        The query is the single line following a ``Query:`` marker line; the
        document is everything after a ``Document:`` marker line. When a
        marker occurs more than once the last occurrence wins. ``None`` is
        returned when either marker is missing or ``Query:`` is the last line.
        """
        lines = prompt_text.split("\n")
        query_idx = -1
        doc_idx = -1
        for i, line in enumerate(lines):
            marker = line.strip()
            if marker == "Query:":
                query_idx = i + 1
            elif marker == "Document:":
                doc_idx = i + 1

        if query_idx == -1 or doc_idx == -1 or query_idx >= len(lines):
            return None
        return lines[query_idx].strip(), "\n".join(lines[doc_idx:]).strip()

    def _generate(self, model, query, document):
        """Generate a response for one query/document pair and return only the new text."""
        inference_prompt = self.format_prompt_for_inference(query, document)
        inputs = self.tokenizer(
            inference_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.1,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Slice by token count rather than by prompt character length: the
        # prompt may have been truncated to 512 tokens, in which case the
        # decoded prompt is shorter than `inference_prompt` and a character
        # slice would cut into (or entirely drop) the generated text.
        prompt_tokens = inputs["input_ids"].shape[1]
        return self.tokenizer.decode(
            outputs[0][prompt_tokens:], skip_special_tokens=True
        ).strip()

    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Run validation every N steps.

        Args:
            args: Training arguments supplied by the trainer (unused).
            state: Trainer state; ``state.global_step`` selects when to run.
            control: Trainer control object, returned unchanged.
            **kwargs: Must contain ``model``, the model being trained.

        Returns:
            The ``control`` object that was passed in.
        """
        if state.global_step <= 0 or state.global_step % self.eval_freq != 0:
            return control

        logger.info(f"\n🔍 Running custom validation at step {state.global_step}")

        model = kwargs["model"]
        was_training = model.training
        model.eval()
        try:
            # Sample validation examples (fixed seed, so the same rows each time).
            sample_size = min(self._MAX_SAMPLES, len(self.val_dataset))
            samples = self.val_dataset.shuffle(seed=42).select(range(sample_size))

            correct = 0
            evaluated = 0
            for sample in samples:
                try:
                    parsed = self._split_prompt(sample["prompt"])
                    if parsed is None:
                        continue
                    query, document = parsed

                    response = self._generate(model, query, document)

                    # Simple accuracy check
                    expected = sample["chosen"].lower()
                    evaluated += 1
                    if expected in response.lower():
                        correct += 1

                    logger.info(f"Expected: {expected}, Got: {response[:100]}...")
                except Exception as e:
                    # Best-effort check: one bad row must not stop training.
                    logger.error(f"Validation error: {e}")
                    continue

            # Divide by the rows actually scored so skipped or failed rows
            # are not silently counted as wrong answers.
            if evaluated > 0:
                accuracy = correct / evaluated
                logger.info(f"✅ Validation accuracy: {accuracy:.2%}")
            elif sample_size > 0:
                logger.warning("No validation samples could be evaluated")
        finally:
            # Leave the model in the mode it was in before the check.
            if was_training:
                model.train()

        return control
159
-
160
def prepare_datasets():
    """Read the DPO CSV splits from the working directory as HF datasets.

    Returns:
        A ``(train, val, test)`` tuple. ``val`` and ``test`` are ``None`` when
        their CSV file does not exist; all three are ``None`` when
        ``train.csv`` is missing.
    """
    logger.info("📊 Loading datasets...")

    # The training split is mandatory: bail out with a hint when it is absent.
    if not os.path.exists("train.csv"):
        logger.error("train.csv not found!")
        logger.info("Please run download_beir_datasets.py first or use generate_sample_data.py")
        return None, None, None

    # Read every available CSV first, then convert each frame to a Dataset.
    frames = [pd.read_csv("train.csv")]
    for optional_csv in ("val.csv", "test.csv"):
        frames.append(pd.read_csv(optional_csv) if os.path.exists(optional_csv) else None)

    train_dataset, val_dataset, test_dataset = (
        Dataset.from_pandas(frame) if frame is not None else None
        for frame in frames
    )

    logger.info(f"✅ Loaded {len(train_dataset)} training examples")
    if val_dataset:
        logger.info(f"✅ Loaded {len(val_dataset)} validation examples")

    return train_dataset, val_dataset, test_dataset
185
-
186
def get_model_and_tokenizer():
    """Load model and tokenizer with 4-bit quantization for A10G.

    Without CUDA the model is loaded in float32 on the CPU. With CUDA a 4-bit
    NF4 load is attempted first; if that raises, the model is reloaded in
    float16 without quantization.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    logger.info(f"🤖 Loading model: {MODEL_ID}")

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # Important for DPO

    # Check if CUDA is available
    if not torch.cuda.is_available():
        logger.warning("⚠️ CUDA not available. Loading model in CPU mode (will be slow!)")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float32,
            device_map="cpu",
            trust_remote_code=True,
        )
        return model, tokenizer

    # Model with 4-bit quantization
    try:
        # Local import keeps this function self-contained; the file already
        # depends on transformers.
        from transformers import BitsAndBytesConfig

        # Quantization options belong in a BitsAndBytesConfig handed over via
        # `quantization_config`; passing `load_in_4bit` / `bnb_4bit_*` as loose
        # from_pretrained keyword arguments is deprecated.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=quantization_config,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        model = prepare_model_for_kbit_training(model)
    except Exception as e:
        # Deliberately broad: any failure of the quantized path (missing
        # bitsandbytes, unsupported GPU, ...) triggers the unquantized load.
        logger.error(f"Failed to load model in 4-bit: {e}")
        logger.info("Falling back to full precision...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

    return model, tokenizer
229
-
230
def get_peft_config():
    """Get LoRA configuration optimized for A10G.

    Returns:
        A ``LoraConfig`` for causal-LM fine-tuning (rank 16, alpha 32,
        dropout 0.1, no bias training).
    """
    return LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=[
            # Phi-3 fuses its attention and MLP input projections into
            # `qkv_proj` and `gate_up_proj`; without these names only
            # `o_proj` and `down_proj` would receive adapters.
            "qkv_proj", "gate_up_proj",
            # Split projection names, kept for architectures that use them.
            "q_proj", "v_proj", "k_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
243
-
244
def main():
    """Run the full DPO fine-tuning pipeline.

    Loads the CSV datasets, the base model and tokenizer, builds the LoRA and
    DPO configuration, trains, saves the final adapter under
    ``{OUTPUT_DIR}/final`` and, when ``HF_TOKEN`` is set, pushes it to the Hub.
    Setup or training failures are logged and end the run early.
    """
    logger.info("="*60)
    logger.info("🚀 Starting DPO Training for Phi-3 Mini")
    logger.info("="*60)

    # Load datasets (the test split is loaded but not used during training)
    train_dataset, val_dataset, _test_dataset = prepare_datasets()
    if train_dataset is None:
        return

    # An absent or empty validation split disables evaluation entirely.
    has_eval = val_dataset is not None and len(val_dataset) > 0

    # Load model and tokenizer
    try:
        model, tokenizer = get_model_and_tokenizer()
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        return

    # LoRA config
    peft_config = get_peft_config()

    # `load_best_model_at_end` requires matching save/eval strategies and
    # save_steps to be a multiple of eval_steps, so both use one interval.
    checkpoint_steps = 100

    # Training arguments optimized for A10G
    training_args = DPOConfig(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=2,  # A10G can handle this
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,  # Effective batch size = 8
        gradient_checkpointing=True,
        learning_rate=5e-5,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        logging_steps=10,
        save_steps=checkpoint_steps,  # Save every 100 steps
        eval_strategy="steps" if has_eval else "no",
        eval_steps=checkpoint_steps if has_eval else None,
        save_total_limit=5,  # Keep last 5 checkpoints
        # Best-model selection is only possible when evaluation runs.
        load_best_model_at_end=has_eval,
        metric_for_best_model="loss",
        greater_is_better=False,

        # DPO specific
        beta=0.1,  # DPO regularization
        max_prompt_length=512,
        max_length=768,

        # Optimization
        optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",
        fp16=torch.cuda.is_available(),

        # Logging
        report_to="none",  # Disable wandb for simplicity
        run_name=f"{PROJECT_NAME}-{datetime.now().strftime('%Y%m%d-%H%M')}",

        # Hub integration
        push_to_hub=bool(HF_TOKEN),
        hub_model_id=f"{HF_USERNAME}/{PROJECT_NAME}" if HF_TOKEN else None,
        hub_strategy="checkpoint",  # Push every checkpoint
        hub_token=HF_TOKEN,
    )

    # Initialize trainer
    try:
        dpo_trainer = DPOTrainer(
            model=model,
            ref_model=None,  # Will create a reference model copy
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset if has_eval else None,
            processing_class=tokenizer,  # Changed from tokenizer to processing_class
            peft_config=peft_config,
        )
    except Exception as e:
        logger.error(f"Failed to initialize trainer: {e}")
        return

    # Add custom validation callback
    if has_eval:
        val_callback = ValidationCallback(tokenizer, val_dataset)
        dpo_trainer.add_callback(val_callback)

    # Start training
    logger.info("🚀 Starting DPO training...")
    logger.info(f"💾 Checkpoints will be saved to: {OUTPUT_DIR}")
    if HF_TOKEN:
        logger.info(f"🤗 Model will be pushed to: https://huggingface.co/{HF_USERNAME}/{PROJECT_NAME}")

    # Print some info about the data
    logger.info("\n📊 Data Statistics:")
    logger.info(f"Training samples: {len(train_dataset)}")
    if has_eval:
        logger.info(f"Validation samples: {len(val_dataset)}")

    # Show a sample (skipped for an empty dataset, where indexing would raise)
    if len(train_dataset) > 0:
        logger.info("\n📝 Sample training data:")
        sample = train_dataset[0]
        logger.info(f"Prompt (first 200 chars): {sample['prompt'][:200]}...")
        logger.info(f"Chosen: {sample['chosen']}")
        logger.info(f"Rejected: {sample['rejected']}")

    try:
        dpo_trainer.train()
    except KeyboardInterrupt:
        # Fall through on purpose so the partially trained model is still saved.
        logger.info("\n⚠️ Training interrupted by user")
    except Exception as e:
        logger.error(f"\n❌ Training failed: {e}")
        return

    # Save final model
    logger.info("💾 Saving final model...")
    dpo_trainer.save_model(f"{OUTPUT_DIR}/final")

    # Push to hub
    if HF_TOKEN:
        logger.info("🤗 Pushing final model to Hub...")
        try:
            dpo_trainer.push_to_hub()
        except Exception as e:
            # The model is already saved locally; a failed push is not fatal.
            logger.error(f"Failed to push to hub: {e}")

    logger.info("✅ Training complete!")
    logger.info(f"📁 Model saved to: {OUTPUT_DIR}/final")


if __name__ == "__main__":
    main()