Spaces:

amos1088
/

train-mbed

Paused

App Files Files Community

amos1088 committed 8 days ago

Commit

cae25d0

1 Parent(s): 005e7d1

no

Browse files

Files changed (36) hide show

app.py +386 -60
common/ask_claude_clean.py +253 -0
common/ask_openai.py +404 -0
common/global_config.py +58 -0
common/trash.py +248 -0
download_beir_datasets.py +0 -287
evaluate.py +0 -273
generate_sample_data.py +0 -90
inference_chatgpt_simple.py +89 -0
inference_server.py +0 -179
setup_environment.py +0 -120
test_scores/analyze_each_dataset.py +272 -0
test_scores/analyze_webis_only.py +269 -0
test_scores/beir_analysis.json +0 -0
test_scores/dataset_reports/FINAL_TRAINING_SUMMARY.md +189 -0
test_scores/dataset_reports/arguana_analysis.md +157 -0
test_scores/dataset_reports/climate-fever_analysis.md +122 -0
test_scores/dataset_reports/dbpedia-entity_analysis.md +908 -0
test_scores/dataset_reports/fever_analysis.md +281 -0
test_scores/dataset_reports/fiqa_analysis.md +302 -0
test_scores/dataset_reports/hotpotqa_analysis.md +362 -0
test_scores/dataset_reports/nfcorpus_analysis.md +865 -0
test_scores/dataset_reports/quora_analysis.md +0 -0
test_scores/dataset_reports/scidocs_analysis.md +555 -0
test_scores/dataset_reports/scifact_analysis.md +264 -0
test_scores/dataset_reports/training_config.json +164 -0
test_scores/dataset_reports/trec-covid_analysis.md +782 -0
test_scores/dataset_reports/webis-touche2020_analysis.md +624 -0
test_scores/examine_more_scores.py +157 -0
test_scores/test_fiqa_relevance.py +146 -0
test_scores/test_nfcorpus_train_relevance.py +176 -0
train_datasets_creation/create_training_data_ids_batched.py +220 -0
train_datasets_creation/make_dataset_for_train.py +24 -0
train_datasets_creation/merge_all_splits.py +102 -0
train_dpo_hf.py +0 -268
train_dpo_hf_fixed.py +0 -366

app.py CHANGED Viewed

@@ -13,13 +13,12 @@ import gradio as gr
 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
-from torch.utils.data import DataLoader
-from torch.nn import CrossEntropyLoss
 from trl import DPOTrainer, DPOConfig
 import warnings
 import subprocess
 import gc
 import psutil
 warnings.filterwarnings("ignore")
 # ===== CONFIG =====
@@ -111,12 +110,17 @@ current_model = None
 current_tokenizer = None
 train_df = None
 test_df = None
-training_status = {"status": "idle", "progress": 0, "logs": []}
 training_lock = threading.Lock()
 def format_prompt(query, title, content):
     """Format the prompt for the model"""
     return f"""you would get a query and document's title and content and return yes (if the document is relevant to the query)/ or no (if the document is not relevant to the query)
 Answer only yes / no.
 Document:
@@ -221,6 +225,21 @@ def get_gpu_memory_status():
     return f"GPU Memory: {allocated:.1f}GB allocated / {reserved:.1f}GB reserved / {total:.1f}GB total"
 def switch_model(model_id):
     """Switch to a different model"""
     global current_model, current_tokenizer, current_model_id, OUTPUT_DIR
@@ -289,25 +308,55 @@ def collate_fn(batch):
 def prepare_dpo_dataset(df):
-    """Convert yes/no labels to DPO format with chosen/rejected pairs"""
     dpo_data = []
     for _, row in df.iterrows():
-        prompt = row['prompt'] if 'prompt' in row else format_prompt(row['query'], row['title'], row['content'])
-        if row['label'] == 'yes':
             # For 'yes' examples, chosen is "yes" and rejected is "no"
             dpo_data.append({
                 'prompt': prompt,
                 'chosen': 'yes',
-                'rejected': 'no'
             })
         else:
             # For 'no' examples, chosen is "no" and rejected is "yes"
             dpo_data.append({
                 'prompt': prompt,
                 'chosen': 'no',
-                'rejected': 'yes'
             })
     return pd.DataFrame(dpo_data)
@@ -446,10 +495,19 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
     import numpy as np
     def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
-        """Compute accuracy metrics on a subset of eval data"""
         # Sample subset for faster evaluation
         indices = np.random.choice(len(eval_dataset), min(num_samples, len(eval_dataset)), replace=False)
         predictions_yes = 0
         predictions_no = 0
         correct = 0
@@ -457,7 +515,8 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         for idx in indices:
             item = eval_dataset[int(idx)]
             prompt = item['prompt']
-            true_choice = item['chosen']  # This is the correct answer
             # Tokenize and run inference
             inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -475,25 +534,45 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
                 no_logit = logits[no_token_id].item()
                 # Get prediction
-                prediction = " yes" if yes_logit > no_logit else " no"
-                if prediction == " yes":
                     predictions_yes += 1
                 else:
                     predictions_no += 1
-                if prediction.strip() == true_choice.strip():
                     correct += 1
         accuracy = correct / len(indices)
         yes_ratio = predictions_yes / len(indices)
         no_ratio = predictions_no / len(indices)
         return {
             'accuracy': accuracy,
             'yes_ratio': yes_ratio,
             'no_ratio': no_ratio,
-            'total_samples': len(indices)
         }
     class StatusCallback(TrainerCallback):
@@ -520,6 +599,17 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
                             f"No={metrics['no_ratio']:.1%}"
                         )
                         # Warn if model is biased
                         if metrics['yes_ratio'] < 0.2 or metrics['no_ratio'] < 0.2:
                             training_status["logs"].append(
@@ -555,9 +645,20 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         with training_lock:
             training_status["logs"].append(f"\n=== FINAL RESULTS ===")
-            training_status["logs"].append(f"Accuracy: {final_metrics['accuracy']:.2%}")
             training_status["logs"].append(f"Yes predictions: {final_metrics['yes_ratio']:.1%}")
             training_status["logs"].append(f"No predictions: {final_metrics['no_ratio']:.1%}")
         # Update global model reference
         current_model = dpo_trainer.model
@@ -566,10 +667,117 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
         # Push to hub if token available
         if HF_TOKEN and HF_USERNAME:
             try:
-                repo_name = f"{HF_USERNAME}/phi3-dpo-relevance"
-                current_model.push_to_hub(repo_name, use_auth_token=HF_TOKEN)
                 current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)
                 logger.info(f"Model pushed to hub: {repo_name}")
             except Exception as e:
                 logger.error(f"Failed to push to hub: {e}")
@@ -579,7 +787,7 @@ def train_model(train_df, val_df, epochs=3, batch_size=4, lr=2e-5, max_samples=N
 def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
-    """Run training from CSV file"""
     global train_df, test_df, training_status
     try:
@@ -591,30 +799,62 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
         df = pd.read_csv(csv_path)
         logger.info(f"Loaded {len(df)} samples from {csv_path}")
-        # Shuffle if requested
-        if shuffle_flag:
-            df = df.sample(frac=1).reset_index(drop=True)
-        # Check required columns
-        required_cols = ['query', 'title', 'content', 'label']
-        if not all(col in df.columns for col in required_cols):
-            # If prompt column exists, use it directly
-            if 'prompt' in df.columns and 'label' in df.columns:
-                logger.info("Using existing prompt column")
-            else:
-                raise ValueError(f"CSV must have columns: {required_cols} or ['prompt', 'label']")
-        else:
-            # Create prompt column if not exists
             if 'prompt' not in df.columns:
-                logger.info("Creating prompt column from query/title/content")
                 df['prompt'] = df.apply(
                     lambda row: format_prompt(row['query'], row['title'], row['content']),
                     axis=1
                 )
-        # Validate labels
-        if not all(label in ['yes', 'no'] for label in df['label'].unique()):
-            raise ValueError("Labels must be 'yes' or 'no'")
         # Split data
         split_idx = int(len(df) * split_ratio)
@@ -628,6 +868,8 @@ def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
         with training_lock:
             training_status["status"] = "training"
             training_status["logs"].append("Starting training...")
         # Conservative sample size for GPT-OSS-20B
         if current_model_id == "openai/gpt-oss-20b":
@@ -654,18 +896,17 @@ def run_inference(query, document_title, document_content, checkpoint="latest"):
     global current_model, current_tokenizer
     # Validate inputs
-    if not query or not query.strip():
         return "Error: Query cannot be empty"
-    if not document_title or not document_title.strip():
         return "Error: Document title cannot be empty"
-    if not document_content or not document_content.strip():
         return "Error: Document content cannot be empty"
-    # Truncate content if too long (warn user)
-    max_content_length = 1000
-    if len(document_content) > max_content_length:
-        logger.warning(f"Content truncated from {len(document_content)} to {max_content_length} characters")
-        document_content = document_content[:max_content_length]
     # Load model if needed
     if current_model is None:
@@ -740,8 +981,12 @@ def run_inference_by_row(split_choice, row_idx, checkpoint="latest"):
     row = df.iloc[int(row_idx)]
-    # Get data from row
-    if all(col in row for col in ['query', 'title', 'content']):
         query = str(row['query'])
         title = str(row['title'])
         content = str(row['content'])
@@ -759,10 +1004,22 @@ def run_inference_by_row(split_choice, row_idx, checkpoint="latest"):
     # Extract just the prediction (yes/no) from the result
     prediction = prediction_with_confidence.split()[0]  # Gets "yes" or "no" from "yes (confidence: X%)"
-    # Check if prediction matches actual
-    is_correct = prediction == actual
-    return f"Prediction: {prediction_with_confidence}\nActual: {actual}\nCorrect: {is_correct}"
 def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None, inference_batch_size=16, progress=gr.Progress()):
@@ -781,10 +1038,20 @@ def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None,
         df = pd.read_csv(csv_file.name)
         logger.info(f"Loaded {len(df)} samples from CSV for inference")
-        # Check required columns
-        required_cols = ['query', 'title', 'content']
-        if not all(col in df.columns for col in required_cols):
-            return None, f"CSV must have columns: {required_cols}"
         # Load appropriate model
         if model_choice == "base":
@@ -865,9 +1132,9 @@ def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None,
             prompts = []
             for _, row in batch_df.iterrows():
                 prompt = format_prompt(
-                    str(row['query']),
                     str(row['title']),
-                    str(row['content'])
                 )
                 prompts.append(prompt)
@@ -921,9 +1188,36 @@ def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None,
         # If labels exist, calculate accuracy and add is_right column
         if 'label' in df.columns:
-            df['is_right'] = df['prediction'] == df['label']
-            accuracy = df['is_right'].mean()
-            accuracy_text = f"\nAccuracy: {accuracy:.2%} ({df['is_right'].sum()}/{len(df)} correct)"
         else:
             accuracy_text = ""
@@ -955,7 +1249,7 @@ with gr.Blocks(title="Phi-3 Document Relevance Classifier") as demo:
     with gr.Tab("Training"):
         csv_input = gr.File(label="Upload Training CSV", file_types=[".csv"])
-        gr.Markdown("CSV should have columns: query, title, content, label (yes/no)")
         shuffle_flag = gr.Checkbox(label="Shuffle Dataset", value=True)
         split_slider = gr.Slider(0.5, 0.9, value=0.8, step=0.05, label="Train Split %")
         start_btn = gr.Button("Start Training", variant="primary")
@@ -1047,7 +1341,7 @@ with gr.Blocks(title="Phi-3 Document Relevance Classifier") as demo:
             file_types=[".csv"],
             file_count="single"
         )
-        gr.Markdown("CSV must have columns: `query`, `title`, `content`. Optional: `label` (for accuracy calculation)")
         with gr.Row():
             model_choice_radio = gr.Radio(
@@ -1093,6 +1387,38 @@ with gr.Blocks(title="Phi-3 Document Relevance Classifier") as demo:
             outputs=[csv_output_file, csv_results_text]
         )
     with gr.Tab("Model Settings"):
         gr.Markdown("### Model Selection")
         gr.Markdown("Choose which model to use for training and inference")

 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, prepare_model_for_kbit_training, LoraConfig, get_peft_model
 from trl import DPOTrainer, DPOConfig
 import warnings
 import subprocess
 import gc
 import psutil
 warnings.filterwarnings("ignore")
 # ===== CONFIG =====
 current_tokenizer = None
 train_df = None
 test_df = None
+training_status = {"status": "idle", "progress": 0, "logs": [], "model_repo": None}
+trained_models = []  # Keep track of all trained models
 training_lock = threading.Lock()
 def format_prompt(query, title, content):
     """Format the prompt for the model"""
+    # Truncate content if too long
+    if len(content) > 1000:
+        content = content[:1000] + "..."
     return f"""you would get a query and document's title and content and return yes (if the document is relevant to the query)/ or no (if the document is not relevant to the query)
 Answer only yes / no.
 Document:
     return f"GPU Memory: {allocated:.1f}GB allocated / {reserved:.1f}GB reserved / {total:.1f}GB total"
def get_trained_models_list():
    """Render a Markdown summary of every model recorded in ``trained_models``.

    Returns:
        A fixed placeholder message when ``trained_models`` is empty;
        otherwise a Markdown heading followed by one numbered entry per
        model showing its repo, accuracy, yes/no prediction ratios, beta,
        the last path component of its ``model_id`` and a Hugging Face link.
    """
    if not trained_models:
        return "No models trained yet in this session.\n\nPreviously trained models on HuggingFace:\n- amos1088/phi3-dpo-relevance"
    # Collect the pieces and join once instead of growing a string in place.
    sections = ["## Trained Models in This Session:\n\n"]
    for position, entry in enumerate(trained_models, start=1):
        base_name = entry['model_id'].split('/')[-1]
        sections.append(
            f"{position}. **{entry['repo']}**\n"
            f"   - Accuracy: {entry['accuracy']:.2%}\n"
            f"   - Predictions: Yes {entry['yes_ratio']:.1%}, No {entry['no_ratio']:.1%}\n"
            f"   - Beta: {entry['beta']}, Model: {base_name}\n"
            f"   - Link: https://huggingface.co/{entry['repo']}\n\n"
        )
    return "".join(sections)
 def switch_model(model_id):
     """Switch to a different model"""
     global current_model, current_tokenizer, current_model_id, OUTPUT_DIR
 def prepare_dpo_dataset(df):
     """Convert labelled relevance rows into DPO chosen/rejected pairs.

     Each row yields one record with the keys ``prompt``, ``chosen``,
     ``rejected`` and ``original_label``. The four-category labels
     (``easy_positive``, ``hard_positive``, ``easy_negative``,
     ``hard_negative``) and the plain ``yes``/``no`` labels are all accepted;
     string labels are matched ignoring case and surrounding whitespace.

     Args:
         df: DataFrame with a ``label`` column and either a ready-made
             ``prompt`` column, the new-format columns
             (``query_text``, ``title``, ``text``) or the old-format columns
             (``query``, ``title``, ``content``).

     Returns:
         A DataFrame with columns ``prompt``, ``chosen``, ``rejected`` and
         ``original_label`` (the label exactly as it appeared in ``df``).
         The columns are present even when ``df`` has no rows.

     Raises:
         ValueError: If a label is not one of the recognised values. Such
             rows used to be silently treated as negatives.
     """
     # Map 4 categories (and the legacy binary labels) to yes/no
     label_mapping = {
         'easy_positive': 'yes',
         'hard_positive': 'yes',
         'easy_negative': 'no',
         'hard_negative': 'no',
         'yes': 'yes',
         'no': 'no',
     }
     opposite = {'yes': 'no', 'no': 'yes'}
     dpo_data = []
     for _, row in df.iterrows():
         # Only touch the source columns when a prompt has to be built, so a
         # frame that already carries 'prompt' does not need them at all.
         if 'prompt' in row:
             prompt = row['prompt']
         elif 'query_text' in row:
             prompt = format_prompt(row['query_text'], row['title'], row['text'])
         else:
             prompt = format_prompt(
                 row.get('query', ''),
                 row.get('title', ''),
                 row.get('content', ''),
             )
         original_label = row['label']
         if isinstance(original_label, str):
             lookup_key = original_label.strip().lower()
         else:
             lookup_key = original_label
         mapped_label = label_mapping.get(lookup_key)
         if mapped_label is None:
             raise ValueError(
                 f"Unrecognized label {original_label!r}; "
                 f"expected one of: {sorted(label_mapping)}"
             )
         dpo_data.append({
             'prompt': prompt,
             'chosen': mapped_label,
             'rejected': opposite[mapped_label],
             'original_label': original_label,  # Keep original for analysis
         })
     return pd.DataFrame(
         dpo_data,
         columns=['prompt', 'chosen', 'rejected', 'original_label'],
     )
     import numpy as np
     def compute_accuracy_metrics(trainer, eval_dataset, num_samples=100):
+        """Compute accuracy metrics and confusion matrix on a subset of eval data"""
         # Sample subset for faster evaluation
         indices = np.random.choice(len(eval_dataset), min(num_samples, len(eval_dataset)), replace=False)
+        # Initialize confusion matrix counters
+        # Rows: true labels, Cols: predicted labels
+        confusion_matrix = {
+            'easy_positive': {'yes': 0, 'no': 0},
+            'hard_positive': {'yes': 0, 'no': 0},
+            'easy_negative': {'yes': 0, 'no': 0},
+            'hard_negative': {'yes': 0, 'no': 0}
+        }
         predictions_yes = 0
         predictions_no = 0
         correct = 0
         for idx in indices:
             item = eval_dataset[int(idx)]
             prompt = item['prompt']
+            true_choice = item['chosen']  # This is the correct answer (yes/no)
+            original_label = item.get('original_label', None)  # Get original 4-category label
             # Tokenize and run inference
             inputs = current_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
                 no_logit = logits[no_token_id].item()
                 # Get prediction
+                prediction = "yes" if yes_logit > no_logit else "no"
+                if prediction == "yes":
                     predictions_yes += 1
                 else:
                     predictions_no += 1
+                if prediction == true_choice:
                     correct += 1
+                # Update confusion matrix if we have original label
+                if original_label and original_label in confusion_matrix:
+                    confusion_matrix[original_label][prediction] += 1
         accuracy = correct / len(indices)
         yes_ratio = predictions_yes / len(indices)
         no_ratio = predictions_no / len(indices)
+        # Calculate per-category accuracies
+        category_accuracies = {}
+        for category in confusion_matrix:
+            total = confusion_matrix[category]['yes'] + confusion_matrix[category]['no']
+            if total > 0:
+                if category in ['easy_positive', 'hard_positive']:
+                    # For positive categories, correct prediction is 'yes'
+                    category_accuracies[category] = confusion_matrix[category]['yes'] / total
+                else:
+                    # For negative categories, correct prediction is 'no'
+                    category_accuracies[category] = confusion_matrix[category]['no'] / total
+            else:
+                category_accuracies[category] = 0.0
         return {
             'accuracy': accuracy,
             'yes_ratio': yes_ratio,
             'no_ratio': no_ratio,
+            'total_samples': len(indices),
+            'confusion_matrix': confusion_matrix,
+            'category_accuracies': category_accuracies
         }
     class StatusCallback(TrainerCallback):
                             f"No={metrics['no_ratio']:.1%}"
                         )
+                        # Add confusion matrix info if available
+                        if 'confusion_matrix' in metrics:
+                            training_status["logs"].append("\n=== Confusion Matrix ===")
+                            for category, preds in metrics['confusion_matrix'].items():
+                                total = preds['yes'] + preds['no']
+                                if total > 0:
+                                    acc = metrics['category_accuracies'][category]
+                                    training_status["logs"].append(
+                                        f"{category}: Yes={preds['yes']}, No={preds['no']} (Acc: {acc:.1%})"
+                                    )
                         # Warn if model is biased
                         if metrics['yes_ratio'] < 0.2 or metrics['no_ratio'] < 0.2:
                             training_status["logs"].append(
         with training_lock:
             training_status["logs"].append(f"\n=== FINAL RESULTS ===")
+            training_status["logs"].append(f"Overall Accuracy: {final_metrics['accuracy']:.2%}")
             training_status["logs"].append(f"Yes predictions: {final_metrics['yes_ratio']:.1%}")
             training_status["logs"].append(f"No predictions: {final_metrics['no_ratio']:.1%}")
+            # Add final confusion matrix
+            if 'confusion_matrix' in final_metrics:
+                training_status["logs"].append("\n=== Final Confusion Matrix ===")
+                for category, preds in final_metrics['confusion_matrix'].items():
+                    total = preds['yes'] + preds['no']
+                    if total > 0:
+                        acc = final_metrics['category_accuracies'][category]
+                        training_status["logs"].append(
+                            f"{category}: Yes={preds['yes']}, No={preds['no']} (Accuracy: {acc:.1%})"
+                        )
         # Update global model reference
         current_model = dpo_trainer.model
         # Push to hub if token available
         if HF_TOKEN and HF_USERNAME:
             try:
+                # Generate unique repo name with timestamp
+                from datetime import datetime
+                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+                model_short_name = current_model_id.split("/")[-1]
+                # Create descriptive repo name with training details
+                repo_name = f"{HF_USERNAME}/{model_short_name}-relevance-beta{int(training_args.beta)}-{timestamp}"
+                # Create model card with training information
+                model_card_content = f"""---
+tags:
+- document-relevance
+- dpo
+- {model_short_name}
+datasets:
+- custom-relevance-dataset
+metrics:
+- accuracy
+model-index:
+- name: {repo_name.split('/')[-1]}
+  results:
+  - task:
+      type: text-classification
+      name: Document Relevance Classification
+    metrics:
+    - type: accuracy
+      value: {final_metrics['accuracy']:.4f}
+      name: Validation Accuracy
+    - type: yes_ratio
+      value: {final_metrics['yes_ratio']:.4f}
+      name: Yes Prediction Ratio
+    - type: no_ratio
+      value: {final_metrics['no_ratio']:.4f}
+      name: No Prediction Ratio
+---
+# {model_short_name} Document Relevance Classifier
+This model was trained using DPO (Direct Preference Optimization) for document relevance classification.
+## Training Configuration
+- Base Model: {current_model_id}
+- DPO Beta: {training_args.beta}
+- Learning Rate: {training_args.learning_rate}
+- Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}
+- Epochs: {training_args.num_train_epochs}
+- Training Samples: {len(train_df)}
+- Validation Samples: {len(val_df)}
+## Performance Metrics
+- **Accuracy**: {final_metrics['accuracy']:.2%}
+- **Yes Predictions**: {final_metrics['yes_ratio']:.1%}
+- **No Predictions**: {final_metrics['no_ratio']:.1%}
+## Usage
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
+# Load base model
+model = AutoModelForCausalLM.from_pretrained("{current_model_id}")
+tokenizer = AutoTokenizer.from_pretrained("{current_model_id}")
+# Load adapter
+model = PeftModel.from_pretrained(model, "{HF_USERNAME}/{repo_name.split('/')[-1]}")
+```
+## Training Date
+{datetime.now().strftime('%Y-%m-%d %H:%M:%S UTC')}
+"""
+                # Push model with model card
+                current_model.push_to_hub(
+                    repo_name,
+                    use_auth_token=HF_TOKEN,
+                    commit_message=f"DPO training with beta={training_args.beta}, accuracy={final_metrics['accuracy']:.2%}"
+                )
                 current_tokenizer.push_to_hub(repo_name, use_auth_token=HF_TOKEN)
+                # Save model card
+                try:
+                    from huggingface_hub import HfApi
+                    api = HfApi()
+                    api.upload_file(
+                        path_or_fileobj=model_card_content.encode(),
+                        path_in_repo="README.md",
+                        repo_id=repo_name,
+                        repo_type="model",
+                        token=HF_TOKEN
+                    )
+                except:
+                    pass  # Model card upload is optional
                 logger.info(f"Model pushed to hub: {repo_name}")
+                # Save repo name to training status and trained models list
+                with training_lock:
+                    training_status["logs"].append(f"Model saved to: https://huggingface.co/{repo_name}")
+                    training_status["model_repo"] = repo_name
+                    # Add to trained models list
+                    trained_models.append({
+                        "repo": repo_name,
+                        "timestamp": timestamp,
+                        "accuracy": final_metrics['accuracy'],
+                        "yes_ratio": final_metrics['yes_ratio'],
+                        "no_ratio": final_metrics['no_ratio'],
+                        "beta": training_args.beta,
+                        "model_id": current_model_id
+                    })
             except Exception as e:
                 logger.error(f"Failed to push to hub: {e}")
 def run_training(csv_path, shuffle_flag=False, split_ratio=0.8):
+    """Run training from CSV file with balanced sampling"""
     global train_df, test_df, training_status
     try:
         df = pd.read_csv(csv_path)
         logger.info(f"Loaded {len(df)} samples from {csv_path}")
+        # Check required columns for new format
+        new_format_cols = ['query_text', 'title', 'text', 'label']
+        old_format_cols = ['query', 'title', 'content', 'label']
+        if all(col in df.columns for col in new_format_cols):
+            # New format with 4 categories
+            logger.info("Using new CSV format with 4 categories")
+            # Validate labels
+            valid_labels = ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']
+            if not all(label in valid_labels for label in df['label'].unique()):
+                raise ValueError(f"Labels must be one of: {valid_labels}")
+            # Create prompt column
+            if 'prompt' not in df.columns:
+                logger.info("Creating prompt column from query_text/title/text")
+                df['prompt'] = df.apply(
+                    lambda row: format_prompt(row['query_text'], row['title'], row['text']),
+                    axis=1
+                )
+            # Log original distribution
+            logger.info(f"Original label distribution: {df['label'].value_counts().to_dict()}")
+            # Balance the dataset by sampling equal amounts from each category
+            min_category_count = df['label'].value_counts().min()
+            logger.info(f"Balancing dataset to {min_category_count} samples per category")
+            balanced_dfs = []
+            for label in valid_labels:
+                category_df = df[df['label'] == label]
+                if len(category_df) > min_category_count:
+                    category_df = category_df.sample(n=min_category_count, random_state=42)
+                balanced_dfs.append(category_df)
+            df = pd.concat(balanced_dfs, ignore_index=True)
+            logger.info(f"Balanced dataset size: {len(df)}")
+            logger.info(f"Balanced label distribution: {df['label'].value_counts().to_dict()}")
+        elif all(col in df.columns for col in old_format_cols):
+            # Old format
+            logger.info("Using old CSV format")
             if 'prompt' not in df.columns:
                 df['prompt'] = df.apply(
                     lambda row: format_prompt(row['query'], row['title'], row['content']),
                     axis=1
                 )
+            # Validate labels for old format
+            if not all(label in ['yes', 'no'] for label in df['label'].unique()):
+                raise ValueError("Labels must be 'yes' or 'no' for old format")
+        else:
+            raise ValueError(f"CSV must have columns: {new_format_cols} or {old_format_cols}")
+        # Shuffle if requested
+        if shuffle_flag:
+            df = df.sample(frac=1).reset_index(drop=True)
         # Split data
         split_idx = int(len(df) * split_ratio)
         with training_lock:
             training_status["status"] = "training"
             training_status["logs"].append("Starting training...")
+            training_status["logs"].append(f"Train samples: {len(train_df)}")
+            training_status["logs"].append(f"Test samples: {len(test_df)}")
         # Conservative sample size for GPT-OSS-20B
         if current_model_id == "openai/gpt-oss-20b":
     global current_model, current_tokenizer
     # Validate inputs
+    if not query or not str(query).strip():
         return "Error: Query cannot be empty"
+    if not document_title or not str(document_title).strip():
         return "Error: Document title cannot be empty"
+    if not document_content or not str(document_content).strip():
         return "Error: Document content cannot be empty"
+    # Convert to strings to handle any data type
+    query = str(query)
+    document_title = str(document_title)
+    document_content = str(document_content)
     # Load model if needed
     if current_model is None:
     row = df.iloc[int(row_idx)]
+    # Get data from row - handle both old and new formats
+    if all(col in row for col in ['query_text', 'title', 'text']):
+        query = str(row['query_text'])
+        title = str(row['title'])
+        content = str(row['text'])
+    elif all(col in row for col in ['query', 'title', 'content']):
         query = str(row['query'])
         title = str(row['title'])
         content = str(row['content'])
     # Extract just the prediction (yes/no) from the result
     prediction = prediction_with_confidence.split()[0]  # Gets "yes" or "no" from "yes (confidence: X%)"
+    # Handle 4-category labels
+    if actual in ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']:
+        # Map to yes/no for comparison
+        label_mapping = {
+            'easy_positive': 'yes',
+            'hard_positive': 'yes',
+            'easy_negative': 'no',
+            'hard_negative': 'no'
+        }
+        mapped_actual = label_mapping[actual]
+        is_correct = prediction == mapped_actual
+        return f"Prediction: {prediction_with_confidence}\nActual: {actual} (mapped to: {mapped_actual})\nCorrect: {is_correct}"
+    else:
+        # Old format with yes/no
+        is_correct = prediction == actual
+        return f"Prediction: {prediction_with_confidence}\nActual: {actual}\nCorrect: {is_correct}"
 def run_csv_inference(csv_file, model_choice="finetuned", checkpoint_path=None, inference_batch_size=16, progress=gr.Progress()):
         df = pd.read_csv(csv_file.name)
         logger.info(f"Loaded {len(df)} samples from CSV for inference")
+        # Check required columns - support both old and new formats
+        new_format_cols = ['query_text', 'title', 'text']
+        old_format_cols = ['query', 'title', 'content']
+        if all(col in df.columns for col in new_format_cols):
+            # New format
+            query_col = 'query_text'
+            content_col = 'text'
+        elif all(col in df.columns for col in old_format_cols):
+            # Old format
+            query_col = 'query'
+            content_col = 'content'
+        else:
+            return None, f"CSV must have columns: {new_format_cols} or {old_format_cols}"
         # Load appropriate model
         if model_choice == "base":
             prompts = []
             for _, row in batch_df.iterrows():
                 prompt = format_prompt(
+                    str(row[query_col]),
                     str(row['title']),
+                    str(row[content_col])
                 )
                 prompts.append(prompt)
         # If labels exist, calculate accuracy and add is_right column
         if 'label' in df.columns:
+            # Handle 4-category labels
+            if df['label'].iloc[0] in ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']:
+                # Map 4 categories to yes/no for accuracy calculation
+                label_mapping = {
+                    'easy_positive': 'yes',
+                    'hard_positive': 'yes',
+                    'easy_negative': 'no',
+                    'hard_negative': 'no'
+                }
+                df['mapped_label'] = df['label'].map(label_mapping)
+                df['is_right'] = df['prediction'] == df['mapped_label']
+                # Calculate per-category accuracy
+                accuracy_text = "\n=== Overall Results ==="
+                accuracy = df['is_right'].mean()
+                accuracy_text += f"\nOverall Accuracy: {accuracy:.2%} ({df['is_right'].sum()}/{len(df)} correct)"
+                accuracy_text += "\n\n=== Per-Category Results ==="
+                for category in ['easy_positive', 'hard_positive', 'easy_negative', 'hard_negative']:
+                    category_df = df[df['label'] == category]
+                    if len(category_df) > 0:
+                        category_acc = category_df['is_right'].mean()
+                        category_count = len(category_df)
+                        correct_count = category_df['is_right'].sum()
+                        accuracy_text += f"\n{category}: {category_acc:.2%} ({correct_count}/{category_count} correct)"
+            else:
+                # Old format with yes/no labels
+                df['is_right'] = df['prediction'] == df['label']
+                accuracy = df['is_right'].mean()
+                accuracy_text = f"\nAccuracy: {accuracy:.2%} ({df['is_right'].sum()}/{len(df)} correct)"
         else:
             accuracy_text = ""
     with gr.Tab("Training"):
         csv_input = gr.File(label="Upload Training CSV", file_types=[".csv"])
+        gr.Markdown("CSV should have columns: query_text, title, text, label (easy_positive/hard_positive/easy_negative/hard_negative)")
         shuffle_flag = gr.Checkbox(label="Shuffle Dataset", value=True)
         split_slider = gr.Slider(0.5, 0.9, value=0.8, step=0.05, label="Train Split %")
         start_btn = gr.Button("Start Training", variant="primary")
             file_types=[".csv"],
             file_count="single"
         )
+        gr.Markdown("CSV must have columns: `query_text`, `title`, `text`. Optional: `label` (for accuracy calculation)")
         with gr.Row():
             model_choice_radio = gr.Radio(
             outputs=[csv_output_file, csv_results_text]
         )
+    with gr.Tab("Trained Models"):
+        gr.Markdown("### Model Training History")
+        gr.Markdown("View all models trained in this session and their performance metrics")
+        models_display = gr.Markdown(get_trained_models_list())
+        refresh_btn = gr.Button("Refresh Model List", variant="secondary")
+        # Update CSV inference dropdown with trained models
+        model_selector = gr.Dropdown(
+            label="Select Trained Model for Inference",
+            choices=["Latest"] + [m["repo"] for m in trained_models],
+            value="Latest",
+            interactive=True
+        )
+        def refresh_models_list():
+            models_text = get_trained_models_list()
+            choices = ["Latest", "amos1088/phi3-dpo-relevance"] + [m["repo"] for m in trained_models]
+            return models_text, gr.update(choices=choices)
+        refresh_btn.click(
+            refresh_models_list,
+            outputs=[models_display, model_selector]
+        )
+        # Auto-refresh after training
+        training_status_display = gr.Textbox(visible=False)
+        training_status_display.change(
+            lambda: (get_trained_models_list(), gr.update(choices=["Latest"] + [m["repo"] for m in trained_models])),
+            outputs=[models_display, model_selector]
+        )
     with gr.Tab("Model Settings"):
         gr.Markdown("### Model Selection")
         gr.Markdown("Choose which model to use for training and inference")

common/ask_claude_clean.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+Clean Claude API wrapper - Class-based, simple, no hidden errors
+"""
+import anthropic
+import json
+import base64
+from typing import Optional, Dict, List, Union, Any
+from pathlib import Path
+# Import config - adjust path for hackathon folder
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+try:
+    from common.global_config import anthropic_client
+except ImportError:
+    # Fallback if config not available
+    anthropic_client = None
class ClaudeClient:
    """Small wrapper around an Anthropic client's Messages API.

    The wrapper expands ``{"type": "file", "path": ...}`` content items into
    real content blocks, assembles the request parameters (tools, web search,
    thinking) and normalises the response into a plain dict.
    """

    # Suffixes that are read as UTF-8 text and sent as a text block.
    _TEXT_EXTS = frozenset(
        {'.txt', '.py', '.js', '.json', '.md', '.html', '.svg', '.xml', '.csv'}
    )

    # Lower-cased suffix -> media type. ".jpg" must map to "image/jpeg";
    # deriving the type from the suffix would yield the invalid "image/jpg".
    _IMAGE_MEDIA_TYPES = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }

    # Optional web-search settings copied verbatim onto the tool definition.
    _WEB_SEARCH_OPTIONS = ("max_uses", "allowed_domains", "blocked_domains", "user_location")

    def __init__(self, api_key: Optional[str] = None):
        """Use a client built from *api_key*, else the globally configured one.

        Raises:
            ValueError: if no api_key is given and no global client exists.
        """
        if api_key:
            self.client = anthropic.Anthropic(api_key=api_key)
        elif anthropic_client:
            self.client = anthropic_client
        else:
            raise ValueError("No anthropic client available. Pass api_key or configure global_config client.")

    async def ask(
        self,
        messages: List[Dict],
        model: str = "claude-sonnet-4-20250514",
        system: Optional[str] = None,
        max_tokens: int = 20000,
        temperature: float = 1.0,
        tools: Optional[List[Dict]] = None,
        tool_choice: Optional[Dict] = None,
        web_search_config: Optional[Dict] = None,
        thinking: Optional[Dict] = None,
        stream: bool = False,
    ) -> Dict[str, Any]:
        """Call the Messages API and return a normalised result dict.

        Args:
            messages: Conversation turns; list-valued content may contain
                ``{"type": "file", "path": ...}`` items, which are expanded.
            model: Model id. Overridden when ``web_search_config`` is given.
            system: Optional system prompt.
            max_tokens: Completion token limit.
            temperature: Sampling temperature. Forced to 1.0 with ``thinking``.
            tools: Tool definitions; entries without ``"type"`` are wrapped as
                ``"custom"`` tools.
            tool_choice: Passed through unchanged when truthy.
            web_search_config: When truthy, adds a web-search tool built from
                this dict and switches the model.
            thinking: Passed through unchanged when truthy.
            stream: Accepted for compatibility; not used by this method.

        Returns:
            Dict with keys ``response`` (text, or the raw response object when
            the model requested a tool), ``raw``, ``messages`` (as sent),
            ``usage``, ``model``, ``has_tool_use`` and ``tool_calls``.

        Raises:
            FileNotFoundError: if a referenced file path does not exist.
        """
        # Copy every message dict: file expansion and the empty-content patch
        # below rewrite "content", which must not leak into the caller's data.
        messages = self._process_files([dict(msg) for msg in messages])

        # Empty user turns are replaced with a placeholder greeting.
        for msg in messages:
            if msg.get("role") == "user" and not msg.get("content"):
                msg["content"] = "Hello! How can I help you today?"

        params = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": messages
        }
        if system:
            params["system"] = system

        # Fresh list so appending the web-search tool never mutates `tools`.
        all_tools = list(tools) if tools else []
        if web_search_config:
            web_tool = {
                "type": "web_search_20250305",
                "name": web_search_config.get("name", "web_search"),
            }
            for option in self._WEB_SEARCH_OPTIONS:
                if option in web_search_config:
                    web_tool[option] = web_search_config[option]
            all_tools.append(web_tool)
            # Web search always runs on this model, whatever `model` says.
            params["model"] = "claude-opus-4-20250514"

        if all_tools:
            formatted_tools = []
            for tool in all_tools:
                if "type" not in tool and "name" in tool:
                    # Bare {name, description, input_schema} -> API tool format.
                    formatted_tools.append({
                        "type": "custom",
                        "name": tool["name"],
                        "description": tool.get("description", ""),
                        "input_schema": tool.get("input_schema", {})
                    })
                else:
                    formatted_tools.append(tool)
            params["tools"] = formatted_tools

        if tool_choice:
            params["tool_choice"] = tool_choice

        if thinking:
            params["thinking"] = thinking
            params["temperature"] = 1.0  # Override for thinking

        # NOTE(review): create() is not awaited. With the synchronous
        # anthropic.Anthropic client built in __init__ this call blocks the
        # event loop for the duration of the request.
        if web_search_config or thinking:
            response = self.client.beta.messages.create(**params)
        else:
            response = self.client.messages.create(**params)

        tool_calls = [
            {
                "id": getattr(block, "id", None),
                "name": block.name,
                "input": block.input,
            }
            for block in getattr(response, "content", None) or []
            if getattr(block, "type", None) == "tool_use"
        ]
        has_tool_use = bool(tool_calls)

        return {
            # For tool use the caller gets the full response object; otherwise
            # just the concatenated text.
            "response": response if has_tool_use else self._extract_text(response),
            "raw": response,  # Keep raw for compatibility
            "messages": messages,
            "usage": getattr(response, "usage", None),
            "model": getattr(response, "model", model),
            "has_tool_use": has_tool_use,
            "tool_calls": tool_calls,
        }

    def _process_files(self, messages: List[Dict]) -> List[Dict]:
        """Expand ``{"type": "file"}`` items in list-valued message content.

        Rewrites ``msg["content"]`` on the given dicts and returns the same
        list; string-valued content is left untouched.
        """
        for msg in messages:
            if not isinstance(msg.get("content"), list):
                continue
            msg["content"] = [
                self._read_file(item["path"])
                if isinstance(item, dict) and item.get("type") == "file"
                else item
                for item in msg["content"]
            ]
        return messages

    def _read_file(self, file_path: str) -> Dict:
        """Read *file_path* and return it as a text, document or image block.

        Raises:
            FileNotFoundError: if the path does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        suffix = path.suffix.lower()

        if suffix == '.pdf':
            data = base64.b64encode(path.read_bytes()).decode()
            return {
                "type": "document",
                "source": {"type": "base64", "media_type": "application/pdf", "data": data}
            }

        if suffix in self._IMAGE_MEDIA_TYPES:
            data = base64.b64encode(path.read_bytes()).decode()
            return {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": self._IMAGE_MEDIA_TYPES[suffix],
                    "data": data,
                }
            }

        # Known text suffixes and anything unrecognised are read as UTF-8 text.
        content = path.read_text(encoding='utf-8')
        return {"type": "text", "text": f"File: {path.name}\n\n{content}"}

    def _extract_text(self, response) -> str:
        """Join the text of all ``text`` blocks in *response* with spaces."""
        if not hasattr(response, "content"):
            return str(response)
        parts = [
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ]
        return " ".join(parts).strip()
# Shared client, created on the first ask_claude() call.
_default_client = None


# Backwards compatible function for tests
async def ask_claude(messages: List[Dict], **kwargs) -> Any:
    """Forward to ``ClaudeClient.ask`` on a lazily created shared client.

    The shared client uses the globally configured ``anthropic_client`` when
    it is available, otherwise one built from the ``ANTHROPIC_API_KEY``
    environment variable.

    Raises:
        ValueError: if neither a global client nor the environment variable
            is available.
    """
    global _default_client
    if _default_client is None:
        if anthropic_client:
            _default_client = ClaudeClient()
        else:
            import os

            api_key = os.getenv('ANTHROPIC_API_KEY')
            if not api_key:
                raise ValueError("No anthropic client available. Set ANTHROPIC_API_KEY or configure global_config client.")
            _default_client = ClaudeClient(api_key=api_key)
    return await _default_client.ask(messages, **kwargs)

common/ask_openai.py ADDED Viewed

	@@ -0,0 +1,404 @@

+"""
+Simple OpenAI wrapper - no lies, just works
+NOW WITH PROPER TOOL CALLING FLOW
+AND VISION SUPPORT
+"""
+import json
+import base64
+from openai import OpenAI
+from typing import Dict, List, Any, Optional, Callable, Union
+import os
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from common.global_config import openai_client, secrets
class OpenAIClient:
    """Minimal wrapper around the OpenAI chat-completions endpoint."""

    def __init__(self, api_key: Optional[str] = None):
        """Use a client built from *api_key*, else the globally configured one.

        Raises:
            ValueError: if no api_key is given and no global client exists.
        """
        if api_key:
            self.client = OpenAI(api_key=api_key)
        elif openai_client:
            self.client = openai_client
        else:
            raise ValueError("No OpenAI client. Pass api_key or set up global_config.")

    def ask(
        self,
        messages: List[Dict],
        model: str = "gpt-4o-mini",
        temperature: float = 0.7,
        max_tokens: int = 4000,
        response_format: Optional[Dict] = None,
        tools: Optional[List[Dict]] = None,
        tool_choice: Any = None,
        strict_schema: bool = False,
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """Send *messages* and return the reply as a plain dict.

        Args:
            messages: Conversation turns. Never modified by this method.
            model: Model id.
            temperature: Sampling temperature.
            max_tokens: Completion token limit.
            response_format: Passed through when truthy.
            tools: Passed through when truthy.
            tool_choice: Passed through when truthy.
            strict_schema: With ``response_format`` of type ``json_object``,
                parse the reply as JSON and ask the model to repair it when
                parsing fails.
            max_retries: Total number of attempts (API errors and JSON
                repairs share the same budget).

        Returns:
            On success: ``content`` (str, or parsed JSON in strict mode),
            ``model``, ``usage`` and, when present, ``tool_calls``.
            On failure: a dict with an ``error`` key instead; exceptions from
            the API call are captured as ``last_error`` rather than raised.
        """
        # Private copy: repair turns appended during JSON retries must not
        # leak into the caller's list.
        conversation = list(messages)

        params = {
            "model": model,
            "messages": conversation,
            "temperature": temperature,
            "max_tokens": max_tokens
        }
        if response_format:
            params["response_format"] = response_format
        if tools:
            params["tools"] = tools
        if tool_choice:
            params["tool_choice"] = tool_choice

        parse_json = bool(
            strict_schema and response_format and response_format.get("type") == "json_object"
        )

        last_error = None
        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(**params)
                message = response.choices[0].message
                content = message.content

                if parse_json:
                    try:
                        content = json.loads(content)
                    except json.JSONDecodeError as e:
                        if attempt < max_retries - 1:
                            # Show the model its own reply plus the parser
                            # error, then spend another attempt.
                            conversation.append({"role": "assistant", "content": content})
                            conversation.append({"role": "user", "content": f"Invalid JSON. Error: {str(e)}. Please return valid JSON."})
                            continue
                        return {
                            "error": f"Invalid JSON after {max_retries} attempts",
                            "last_response": content,
                            "attempts": attempt + 1
                        }

                result = {
                    "content": content,
                    "model": response.model,
                    "usage": {
                        "prompt_tokens": response.usage.prompt_tokens,
                        "completion_tokens": response.usage.completion_tokens,
                        "total_tokens": response.usage.total_tokens
                    }
                }

                if message.tool_calls:
                    result["tool_calls"] = [
                        {
                            "id": tc.id,
                            "name": tc.function.name,
                            "arguments": json.loads(tc.function.arguments)
                        }
                        for tc in message.tool_calls
                    ]
                return result

            except Exception as e:
                # Deliberate best-effort: any failure consumes one attempt and
                # is reported in the returned error dict instead of raising.
                last_error = str(e)

        return {
            "error": f"Failed after {max_retries} attempts",
            "last_error": last_error,
            "attempts": max_retries
        }
# Module-level convenience entry point
def ask_gpt(messages, **kwargs):
    """Build a default OpenAIClient and return the result of its ask() call."""
    return OpenAIClient().ask(messages, **kwargs)
# Helper functions for tool calling
def format_tool_message(tool_call_id: str, result: Any) -> Dict[str, Any]:
    """Build the ``tool`` role message that reports *result* for a tool call.

    Strings are sent unchanged; every other value is serialised with
    ``json.dumps``.
    """
    if isinstance(result, str):
        payload = result
    else:
        payload = json.dumps(result)
    return {"role": "tool", "content": payload, "tool_call_id": tool_call_id}
def format_assistant_with_tools(content: str, tool_calls: List[Dict]) -> Dict[str, Any]:
    """Build an ``assistant`` message, attaching tool calls when there are any.

    Each entry of *tool_calls* needs ``id``, ``name`` and ``arguments``; dict
    arguments are JSON-encoded, anything else is passed through as given.
    """
    message = {"role": "assistant", "content": content}
    if not tool_calls:
        return message

    wire_calls = []
    for call in tool_calls:
        call_id = call['id']
        fn_name = call['name']
        fn_args = call['arguments']
        if isinstance(fn_args, dict):
            fn_args = json.dumps(fn_args)
        wire_calls.append({
            "id": call_id,
            "type": "function",
            "function": {"name": fn_name, "arguments": fn_args},
        })
    message["tool_calls"] = wire_calls
    return message
def ask_gpt_with_validation(messages, expected_format="json", max_retries=3, **kwargs):
    """Ask GPT, validate the reply format, and retry with a repair prompt.

    Args:
        messages: Messages to send. The list itself is not modified here;
            retries work on a copy.
        expected_format: "json" (reply must be a JSON object) or "text".
        max_retries: Number of validation rounds before giving up.
        **kwargs: Forwarded to ``OpenAIClient.ask``. For "json", a
            ``json_object`` response format with strict parsing is filled in
            unless the caller supplied ``response_format``.

    Returns:
        The successful result dict from ``OpenAIClient.ask`` (with ``content``
        parsed to a dict for "json"), or a dict with ``error``, ``attempts``
        and ``last_error`` when every round failed.
    """
    client = OpenAIClient()
    conversation = list(messages)
    wants_json = expected_format == "json"

    # Force JSON format if expected
    if wants_json and "response_format" not in kwargs:
        kwargs["response_format"] = {"type": "json_object"}
        kwargs["strict_schema"] = True

    last_error = None
    for attempt in range(max_retries):
        result = client.ask(conversation, **kwargs)
        failed_reply = result.get('last_response')

        if "error" in result:
            last_error = result.get("last_error") or result["error"]
        else:
            content = result.get('content')
            if wants_json:
                # A caller-supplied response_format may leave the reply as a
                # string; parse it here so it can still validate.
                if isinstance(content, str):
                    try:
                        parsed = json.loads(content)
                    except json.JSONDecodeError as e:
                        last_error = f"Invalid JSON: {e}"
                    else:
                        if isinstance(parsed, dict):
                            result["content"] = content = parsed
                if isinstance(content, dict):
                    return result
            elif expected_format == "text":
                return result
            failed_reply = content

        # Only ask for a repair when there is a reply to repair; after a pure
        # API failure the same conversation is simply sent again.
        if attempt < max_retries - 1 and failed_reply:
            conversation.append({
                "role": "assistant",
                "content": str(failed_reply)
            })
            if wants_json:
                repair = "Your response was not valid JSON. Please return a properly formatted JSON object."
            else:
                repair = "Please try again with a clear response."
            conversation.append({
                "role": "user",
                "content": repair
            })

    # All retries failed
    return {
        "error": f"Failed validation after {max_retries} attempts",
        "attempts": max_retries,
        "last_error": last_error
    }
def encode_image_to_base64(image_path: Union[str, Path]) -> str:
    """Return the file at *image_path* as a base64 string.

    Raises:
        FileNotFoundError: if the path does not exist.
    """
    source = Path(image_path)
    if not source.exists():
        raise FileNotFoundError(f"Image not found: {source}")
    raw_bytes = source.read_bytes()
    return base64.b64encode(raw_bytes).decode('utf-8')
def ask_gpt_with_image(
    text_prompt: str,
    image_path: Union[str, Path],
    model: str = "gpt-4.1-mini",
    system_prompt: str = None,
    detail: str = "high",
    max_tokens: int = 4000,
    temperature: float = 0.7,
    response_format: Dict = None,
    **kwargs  # may also carry run_id / agent_name / tool_name, which are dropped
) -> Dict[str, Any]:
    """Send a text prompt plus one image to GPT via ``ask_gpt``.

    Args:
        text_prompt: Question or instruction about the image.
        image_path: Path to the image file; it is inlined as a base64 data URL.
        model: Model id.
        system_prompt: Optional system message placed before the user turn.
        detail: Value for the image ``detail`` field.
        max_tokens: Completion token limit.
        temperature: Sampling temperature.
        response_format: Forwarded only when truthy.
        **kwargs: Forwarded to ``ask_gpt`` except for the keys ``run_id``,
            ``agent_name`` and ``tool_name``.

    Returns:
        Whatever ``ask_gpt`` returns for the assembled request.
    """
    encoded_image = encode_image_to_base64(image_path)

    # Pick the data-URL media type from the suffix; unknown suffixes are
    # labelled as JPEG.
    known_types = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    extension = Path(image_path).suffix.lower()
    media_type = known_types[extension] if extension in known_types else 'image/jpeg'

    text_part = {"type": "text", "text": text_prompt}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{media_type};base64,{encoded_image}",
            "detail": detail
        }
    }

    messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
    messages.append({"role": "user", "content": [text_part, image_part]})

    api_kwargs = {
        'model': model,
        'max_tokens': max_tokens,
        'temperature': temperature
    }
    if response_format:
        api_kwargs['response_format'] = response_format

    # Context-only keys are not request parameters and are filtered out.
    context_only = ['run_id', 'agent_name', 'tool_name']
    api_kwargs.update({key: value for key, value in kwargs.items() if key not in context_only})

    return ask_gpt(messages=messages, **api_kwargs)
def parse_json_response(response: Union[str, Dict]) -> Dict[str, Any]:
    """Parse JSON out of an API reply, tolerating a Markdown code fence.

    Args:
        response: Raw reply text, or a result dict with a ``content`` key.

    Returns:
        The parsed JSON value. A result dict that carries an ``error`` key is
        returned unchanged. Content that is already a parsed dict or list is
        returned as-is. Unusable input yields ``{"error": ..., "raw": ...}``
        rather than raising.
    """
    if isinstance(response, dict):
        if 'error' in response:
            return response  # Return error as-is
        content = response.get('content', '')
    else:
        content = response

    empty = {
        "error": "Empty response from API",
        "raw": ""
    }
    if not content:
        return empty
    # Strict-mode replies arrive already parsed; there is nothing left to do.
    if isinstance(content, (dict, list)):
        return content
    if not isinstance(content, str):
        return {
            "error": f"Unsupported response type: {type(content).__name__}",
            "raw": ""
        }
    if not content.strip():
        return empty

    # Drop a surrounding ``` fence. The closing-fence check is guarded so a
    # reply consisting only of the opening fence cannot index an empty list.
    stripped = content.strip()
    if stripped.startswith('```'):
        lines = stripped.split('\n')[1:]
        if lines and lines[-1].startswith('```'):
            lines = lines[:-1]
        content = '\n'.join(lines)

    try:
        return json.loads(content)
    except json.JSONDecodeError as e:
        return {
            "error": f"JSON parse error: {str(e)}",
            "raw": content[:500] + "..." if len(content) > 500 else content
        }
def validate_against_schema(data: Dict, schema: Dict) -> Dict[str, Any]:
    """Check that *data* contains every key that *schema* names.

    Only key presence is checked, recursively through nested dicts in the
    schema; non-dict schema values are not inspected.

    Args:
        data: Parsed data to validate.
        schema: Expected key structure.

    Returns:
        ``{"valid": True, "data": data}`` when nothing is missing, otherwise
        ``{"valid": False, "missing": [...], "data": data}`` where each entry
        is a dotted key path or a "<path> should be dict" note.
    """
    def _problems(expected, actual, prefix):
        # Yields problems depth-first, in schema key order.
        if not isinstance(expected, dict):
            return
        if not isinstance(actual, dict):
            yield f"{prefix} should be dict"
            return
        for name, sub_schema in expected.items():
            dotted = f"{prefix}.{name}" if prefix else name
            if name in actual:
                yield from _problems(sub_schema, actual[name], dotted)
            else:
                yield dotted

    missing = list(_problems(schema, data, ""))

    report = {"valid": not missing}
    if missing:
        report["missing"] = missing
    report["data"] = data
    return report

common/global_config.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import anthropic
+import json
+import os
+from pathlib import Path
+from google.cloud import bigquery
+from openai import OpenAI
# Base directories
BASE_DIR = Path(__file__).parent
PROJECT_ROOT = BASE_DIR.parent  # dual_agent_system directory

# Load secrets and scrapers config (both files live next to this module).
# Context managers close the handles promptly instead of leaving them to the
# garbage collector.
secret_path = BASE_DIR / "secrets.json"
scrapers_path = BASE_DIR / "scrapers.json"

with open(secret_path, encoding="utf-8") as _secrets_file:
    secrets = json.load(_secrets_file)
with open(scrapers_path, encoding="utf-8") as _scrapers_file:
    scrapers = json.load(_scrapers_file)

# API Clients
anthropic_client = anthropic.Anthropic(api_key=secrets["anthropic_api_key"])
bqclient = bigquery.Client.from_service_account_info(secrets["AI_TWIN_CREDENTIALS_GCP"])
openai_client = OpenAI(api_key=secrets["openai_api_key"])

# Paths
PATHS = {
    "logs": PROJECT_ROOT / "logs",
    "tests": PROJECT_ROOT / "tests",
    "files": PROJECT_ROOT / "files",
    "raw_files": PROJECT_ROOT / "files" / "raw_files_file_handler",
    "tables": PROJECT_ROOT / "files" / "tables",
    "vector_stores": PROJECT_ROOT / "files" / "vector_stores"
}

# Ensure directories exist. parents=True keeps this independent of the order
# of PATHS (nested entries no longer rely on "files" being created first).
for path in PATHS.values():
    path.mkdir(parents=True, exist_ok=True)

# API Tokens (missing entries resolve to None)
TOKENS = {
    "apify": secrets.get("apify"),
    "anthropic": secrets.get("anthropic_api_key"),
    "slack": secrets.get("slack"),
    "openai": secrets.get("openai_api_key"),
    "perplexity": secrets.get("perplexity")
}

# Scraper endpoints
SCRAPER_ENDPOINTS = {
    "youtube_search": scrapers["youtube"]["youtube_scraper"]["endpoint_url"],
    "youtube_transcript": scrapers["youtube"]["youtube_transcript_scraper"]["endpoint_url"],
    "fast_youtube_channel_scraper": scrapers["youtube"]["fast_youtube_channel_scraper"]["endpoint_url"],
    "linkedin_post_search_scraper": scrapers["linkedin"]["linkedin_post_search_scraper"]["endpoint_url"],
    "linkedin_everything_scraper": scrapers["linkedin"]["linkedin_everything_scraper"]["endpoint_url"],
    "tiktok_search_scraper": scrapers["tiktok"]["tiktok_search_scraper"]["endpoint_url"],
    "twitter_search": scrapers["twitter"]["twitter_search_scraper"]["endpoint_url"]
}

common/trash.py ADDED Viewed

	@@ -0,0 +1,248 @@

+xample_data=[
+  {
+    "role": "user",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "I want to brainstorm an AI-driven personal finance coach. Show me search interest trends over the last six months first.",
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "assistant",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "That’s a fantastic idea! Understanding market interest is key. Here’s the trend:",
+        "is_expanded": True
+      },
+      {
+        "widget_type": "line-chart",
+        "content": {
+          "title": "Search Interest: AI Finance Coaches",
+          "data": {
+            "labels": ["Nov 2024", "Dec 2024", "Jan 2025", "Feb 2025", "Mar 2025", "Apr 2025"],
+            "datasets": [
+              { "label": "Search Volume", "data": [80, 95, 110, 130, 150, 175] }
+            ]
+          },
+          "options": {
+            "fill": False,
+            "tension": 0.2,
+            "showPoints": True
+          }
+        },
+        "is_expanded": False
+      }
+    ]
+  },
+  {
+    "role": "user",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Trend looks promising. Now, can we compare feature popularity for Budget Tracking, Investment Advice, and Spending Alerts?",
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "assistant",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Sure—you can see how those features scored in our survey:",
+        "is_expanded": True
+      },
+      {
+        "widget_type": "bar-chart",
+        "content": {
+          "title": "Feature Popularity Scores",
+          "data": {
+            "labels": ["Budget Tracking", "Investment Advice", "Spending Alerts"],
+            "datasets": [
+              { "label": "Survey Score", "data": [90, 75, 85] }
+            ]
+          },
+          "options": {
+            "horizontal": False,
+            "stacked": False
+          }
+        },
+        "is_expanded": False
+      },
+      {
+        "widget_type": "text",
+        "content": "Budget Tracking leads the pack, which makes sense given demand for cashflow insights.",
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "user",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Great insight. Let's brainstorm some MVP feature ideas based on those results.",
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "assistant",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Here are some potential MVP features to focus on:",
+        "is_expanded": True
+      },
+      {
+        "widget_type": "thinking",
+        "content": {
+          "title": "Brainstormed MVP Features",
+          "steps": [
+            { "description": "Real-time budget categorization" },
+            { "description": "AI-driven savings recommendations" },
+            { "description": "Voice-activated expense logging" }
+          ],
+          "conclusion": "Three strong MVP ideas identified.",
+          "options": {
+            "expandable": True,
+            "showStepNumbers": True
+          }
+        },
+        "is_expanded": False
+      }
+    ]
+  },
+  {
+    "role": "user",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "I'd like to prototype real-time budget categorization and voice-activated expense logging first.",
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "assistant",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Perfect. Here’s a timeline for prototyping those two features:",
+        "is_expanded": True
+      },
+      {
+        "widget_type": "timeline",
+        "content": {
+          "title": "Prototyping Timeline",
+          "events": [
+            { "date": "2025-05-01", "description": "Real-time Budgeting Prototype" },
+            { "date": "2025-05-15", "description": "Voice Logging Prototype" }
+          ],
+          "options": {
+            "showDates": True,
+            "orientation": "vertical"
+          }
+        },
+        "is_expanded": False
+      }
+    ]
+  },
+  {
+    "role": "user",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "That timeline works for me. What launch approach should we take—internal pilot or closed beta?",
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "assistant",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Industry benchmarks suggest many opt for a closed beta. Here’s the typical split:",
+        "is_expanded": True
+      },
+      {
+        "widget_type": "pie-chart",
+        "content": {
+          "title": "Launch Strategy Preference",
+          "data": {
+            "labels": ["Internal Pilot", "Closed Beta"],
+            "datasets": [
+              {
+                "data": [40, 60],
+                "backgroundColor": ["#36A2EB", "#FF6384"]
+              }
+            ]
+          },
+          "options": {
+            "doughnut": False,
+            "showPercentage": True
+          }
+        },
+        "is_expanded": False
+      },
+      {
+        "widget_type": "suggestion",
+        "content": {
+          "title": "Select Launch Approach",
+          "suggestions": [
+            { "label": "Internal Pilot" },
+            { "label": "Closed Beta" }
+          ],
+          "options": {
+            "multiSelect": False,
+            "showButtons": True
+          }
+        },
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "user",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Let's go with a closed beta group.",
+        "is_expanded": True
+      }
+    ]
+  },
+  {
+    "role": "assistant",
+    "content": [
+      {
+        "widget_type": "text",
+        "content": "Excellent choice! Let’s outline the next steps for a successful closed beta:",
+        "is_expanded": True
+      },
+      {
+        "widget_type": "suggestion",
+        "content": {
+          "title": "Next Steps",
+          "suggestions": [
+            { "label": "Recruit Beta Users" },
+            { "label": "Draft NDAs" },
+            { "label": "Set Up Feedback Channels" },
+            { "label": "Monitor Key Metrics" }
+          ],
+          "options": {
+            "multiSelect": True,
+            "showButtons": True
+          }
+        },
+        "is_expanded": True
+      }
+    ]
+  }
+]
+# Location of the widget examples JSON file.
+# NOTE(review): nothing visible here actually loads the file — confirm where data_path is consumed.
+data_path= "widget-examples.json"

download_beir_datasets.py DELETED Viewed

@@ -1,287 +0,0 @@
-import os
-import pandas as pd
-import json
-import random
-from sklearn.model_selection import train_test_split
-from beir import util
-from beir.datasets.data_loader import GenericDataLoader
-from app import format_prompt
-# BEIR datasets with corpus sizes
-# Keys are BEIR dataset identifiers (interpolated into the download URL below);
-# the "size"/"domain" values are descriptive strings that this file only prints.
-BEIR_DATASETS = {
-    # Smaller datasets (good for testing)
-    "scifact": {"size": "5K docs", "domain": "Scientific"},
-    "nfcorpus": {"size": "3.6K docs", "domain": "Medical"},
-    "fiqa": {"size": "57K docs", "domain": "Financial QA"},
-    # Medium datasets
-    "scidocs": {"size": "25K docs", "domain": "Scientific"},
-    "arguana": {"size": "8.7K docs", "domain": "Argument mining"},
-    "quora": {"size": "523K docs", "domain": "Duplicate questions"},
-    # Large datasets
-    "msmarco": {"size": "8.8M docs", "domain": "Web search"},
-    "trec-covid": {"size": "171K docs", "domain": "COVID research"},
-    "hotpotqa": {"size": "5.2M docs", "domain": "Multi-hop QA"},
-}
-def download_beir_dataset(dataset_name, data_path="datasets"):
-    """Download a BEIR dataset and load its test split.
-
-    Args:
-        dataset_name: BEIR dataset identifier (e.g. ``"scifact"``). It is
-            interpolated into the download URL and used as the output
-            sub-directory name.
-        data_path: Directory under which the dataset is unpacked.
-
-    Returns:
-        The ``(corpus, queries, qrels)`` tuple produced by
-        ``GenericDataLoader(...).load(split="test")``.
-    """
-    print(f"\nDownloading {dataset_name}...")
-    # A name missing from BEIR_DATASETS is still downloadable; only this
-    # informational line differs (previously it raised KeyError before downloading).
-    print(f"Info: {BEIR_DATASETS.get(dataset_name, 'not listed in BEIR_DATASETS')}")
-    # Download dataset
-    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
-    out_dir = os.path.join(data_path, dataset_name)
-    # Separate name so the data_path argument is not silently rebound.
-    dataset_dir = util.download_and_unzip(url, out_dir)
-    print(f"Downloaded to: {dataset_dir}")
-    # Load the data
-    corpus, queries, qrels = GenericDataLoader(dataset_dir).load(split="test")
-    print("Loaded:")
-    print(f"  - Corpus: {len(corpus)} documents")
-    print(f"  - Queries: {len(queries)} queries")
-    print(f"  - Qrels: {len(qrels)} query-doc relevance pairs")
-    # Average number of judged-relevant (score > 0) documents per query
-    relevant_counts = [
-        sum(1 for score in judged.values() if score > 0)
-        for judged in qrels.values()
-    ]
-    avg_docs = sum(relevant_counts) / len(relevant_counts) if relevant_counts else 0
-    print(f"  - Avg relevant docs per query: {avg_docs:.1f}")
-    return corpus, queries, qrels
-def _build_retrieval_example(query_text, doc_id, doc, relevance_score, label):
-    """Build one training record for a (query, document) pair."""
-    return {
-        "query": query_text,
-        "document_id": doc_id,
-        "document": doc["text"][:1000],  # Truncate long documents
-        "title": doc.get("title", ""),
-        "relevance_score": relevance_score,
-        "label": label,
-        "task": "retrieve_all_relevant"
-    }
-def create_retrieval_training_data(corpus, queries, qrels, num_samples=1000):
-    """Create labelled (query, document) examples for a retrieval model.
-
-    For each of the first *num_samples* queries that has an entry in *qrels*,
-    up to five documents with a score above 0 become "relevant" examples, and
-    up to five randomly drawn documents that are not judged relevant for that
-    query become "not_relevant" examples.
-
-    Args:
-        corpus: Mapping of doc id to a dict with "text" and optional "title".
-        queries: Mapping of query id to query text.
-        qrels: Mapping of query id to ``{doc_id: score}``.
-        num_samples: Number of queries (in iteration order) to consider.
-
-    Returns:
-        A list of example dicts; positives precede negatives for each query.
-    """
-    print(f"\nCreating {num_samples} training examples...")
-    training_data = []
-    # Built once: the corpus does not change between queries.
-    all_doc_ids = list(corpus)
-    for qid, query_text in list(queries.items())[:num_samples]:
-        if qid not in qrels:
-            continue
-        # Get relevant documents (set gives O(1) membership during sampling)
-        relevant_docs = [doc_id for doc_id, score in qrels[qid].items() if score > 0]
-        relevant_set = set(relevant_docs)
-        # Sample some non-relevant documents; skip duplicates so the same
-        # negative pair is never emitted twice, and skip sampling entirely for
-        # an empty corpus (random.choice would raise IndexError).
-        non_relevant_docs = []
-        if all_doc_ids:
-            for _ in range(min(10, len(relevant_docs) * 2)):
-                doc_id = random.choice(all_doc_ids)
-                if doc_id not in relevant_set and doc_id not in non_relevant_docs:
-                    non_relevant_docs.append(doc_id)
-        # Create positive examples
-        for doc_id in relevant_docs[:5]:  # Limit to 5 per query
-            if doc_id in corpus:
-                training_data.append(
-                    _build_retrieval_example(query_text, doc_id, corpus[doc_id], qrels[qid][doc_id], "relevant")
-                )
-        # Create negative examples (ids were drawn from the corpus, so they exist in it)
-        for doc_id in non_relevant_docs[:5]:
-            training_data.append(
-                _build_retrieval_example(query_text, doc_id, corpus[doc_id], 0, "not_relevant")
-            )
-    return training_data
-def show_examples(corpus, queries, qrels, n=3):
-    """Print the first *n* queries with up to three of their top-scored relevant documents."""
-    rule = "=" * 80
-    print("\n" + rule)
-    print("EXAMPLES OF RETRIEVAL TASK")
-    print(rule)
-    for number, (qid, query_text) in enumerate(list(queries.items())[:n], start=1):
-        print(f"\n📌 Query {number}: {query_text}")
-        if qid not in qrels:
-            continue
-        # Judged-relevant documents, highest relevance score first
-        ranked = sorted(
-            ((doc_id, score) for doc_id, score in qrels[qid].items() if score > 0),
-            key=lambda pair: pair[1],
-            reverse=True,
-        )
-        print(f"   Found {len(ranked)} relevant documents:")
-        for doc_id, score in ranked[:3]:
-            if doc_id not in corpus:
-                continue
-            doc = corpus[doc_id]
-            print(f"\n   ✓ Doc ID: {doc_id} (Relevance: {score})")
-            print(f"     Title: {doc.get('title', 'No title')}")
-            print(f"     Text: {doc['text'][:200]}...")
-def main(dataset_name):
-    """Download one BEIR dataset, print examples, and write training examples as JSONL.
-
-    The output file is ``beir_<dataset_name>_training.jsonl`` in the working
-    directory. Any exception is caught and reported on stdout together with
-    a hint to install the ``beir`` package.
-    """
-    rule = "=" * 80
-    for banner_line in (
-        rule,
-        "BEIR - Real Retrieval Ranking Datasets",
-        "Task: Given query, find ALL relevant docs from thousands!",
-        rule,
-    ):
-        print(banner_line)
-    try:
-        # Download, preview, then build the training examples
-        corpus, queries, qrels = download_beir_dataset(dataset_name)
-        show_examples(corpus, queries, qrels)
-        training_data = create_retrieval_training_data(corpus, queries, qrels, num_samples=100)
-        # One JSON object per line
-        output_file = f"beir_{dataset_name}_training.jsonl"
-        with open(output_file, 'w') as out:
-            out.writelines(json.dumps(example) + '\n' for example in training_data)
-        print(f"\n✅ Saved {len(training_data)} training examples to {output_file}")
-        # Explain the task
-        for explanation_line in (
-            "\n" + rule,
-            "THE RETRIEVAL TASK:",
-            rule,
-            "1. You have a QUERY (e.g., 'coronavirus treatment options')",
-            "2. You have a CORPUS of thousands of documents",
-            "3. You must find ALL relevant documents (not just one!)",
-            "4. Some queries have 1 relevant doc, others have 100+",
-            "5. You must rank them by relevance",
-            "\nThis is MUCH harder than binary classification!",
-        ):
-            print(explanation_line)
-    except Exception as e:
-        print(f"\nError: {e}")
-        print("\nInstall BEIR first:")
-        print("pip install beir")
-if __name__ == "__main__":
-    # Choose dataset
-    print("\nAvailable datasets:")
-    for dataset_name, info in BEIR_DATASETS.items():
-        print(f"  - {dataset_name}: {info['size']} ({info['domain']})")
-    # Download smaller datasets for training
-    selected_datasets = ["scifact", "nfcorpus", "fiqa"]  # Smaller datasets
-    #
-    # for dataset_name in selected_datasets:
-    #     try:
-    #         main(dataset_name)
-    #     except Exception as e:
-    #         print(f"Error processing {dataset_name}: {e}")
-    #         continue
-    # Process downloaded datasets: build yes/no (query, document) rows from every
-    # dataset directory found under ./datasets.
-    if os.path.exists('datasets'):
-        all_rows = []
-        for dataset_name in os.listdir('datasets'):
-            print(f"\nProcessing {dataset_name}...")
-            dataset_path = f"datasets/{dataset_name}/{dataset_name}"
-            # Check if required files exist
-            if not all(os.path.exists(f"{dataset_path}/{f}") for f in ["corpus.jsonl", "queries.jsonl", "qrels"]):
-                print(f"Skipping {dataset_name} - missing required files")
-                continue
-            # Load corpus and queries; `with` closes the files instead of leaking the handles
-            print(f"Loading corpus for {dataset_name}...")
-            with open(f"{dataset_path}/corpus.jsonl", "r", encoding="utf-8") as corpus_file:
-                corpus = pd.DataFrame([json.loads(line) for line in corpus_file])
-            with open(f"{dataset_path}/queries.jsonl", "r", encoding="utf-8") as queries_file:
-                queries = pd.DataFrame([json.loads(line) for line in queries_file])
-            # Process qrels files
-            qrels_files = [f for f in os.listdir(f"{dataset_path}/qrels") if f.endswith('.tsv') and not f.startswith('my_')]
-            if not qrels_files:
-                # pd.concat raises ValueError on an empty list
-                print(f"Skipping {dataset_name} - no qrels files")
-                continue
-            qrels_df = pd.concat([pd.read_csv(f"{dataset_path}/qrels/{qrel_file}", sep="\t") for qrel_file in qrels_files])
-            corpus['_id'] = corpus['_id'].astype(str)
-            queries['_id'] = queries['_id'].astype(str)
-            qrels_df['corpus-id'] = qrels_df['corpus-id'].astype(str)
-            qrels_df['query-id'] = qrels_df['query-id'].astype(str)
-            # Every judged-relevant (query, document) pair, collected BEFORE the score
-            # filter below, so a sampled "no" row is never a pair that is actually relevant.
-            relevant_mask = qrels_df['score'] > 0
-            relevant_pairs = set(zip(qrels_df.loc[relevant_mask, 'query-id'], qrels_df.loc[relevant_mask, 'corpus-id']))
-            qrels_df = pd.merge(left=qrels_df,right=corpus,left_on='corpus-id',right_on='_id',suffixes=('','_doc'))
-            qrels_df = pd.merge(left=qrels_df, right=queries, left_on='query-id', right_on='_id',suffixes=('','_query'))
-            # Only the strongest judgements are used as positives
-            qrels_df = qrels_df[qrels_df['score']==2]
-            if qrels_df.empty:
-                continue
-            # Two independent shuffles: row1 supplies the positive pair, row2 a random other query
-            qrels_df1 = qrels_df.sample(frac=1).reset_index(drop=True)[0:5000]
-            qrels_df2 = qrels_df.sample(frac=1).reset_index(drop=True)[0:5000]
-            for (_,row1),(_,row2) in zip(qrels_df1.iterrows(),qrels_df2.iterrows()):
-                query_text = row1['text_query']
-                title = row1['title']
-                content = row1['text']
-                all_rows.append({
-                    'query': query_text,
-                    'title': title,
-                    'content': content,
-                    'prompt': format_prompt(query_text, title, content),
-                    'label': 'yes',
-                    'dataset': dataset_name,
-                    'score': row1['score'],
-                    'query-id':row1['query-id'],
-                    'doc_id': row1['corpus-id']
-                })
-                # Negative: same document paired with row2's query. Skip it when that pair
-                # is itself judged relevant (this includes row2 having the same query as
-                # row1), otherwise a true positive would be labelled 'no'.
-                if (row2['query-id'], row1['corpus-id']) in relevant_pairs:
-                    continue
-                negative_query = row2['text_query']
-                all_rows.append({
-                    'query': negative_query,
-                    'title': title,
-                    'content': content,
-                    'prompt': format_prompt(negative_query, title, content),
-                    'label': 'no',
-                    'dataset': dataset_name,
-                    'score': 0,
-                    'query-id': row2['query-id'],
-                    'doc_id': row1['corpus-id']
-                })
-        # Write once, after all datasets are processed (the rows accumulate across datasets)
-        if all_rows:
-            df = pd.DataFrame(all_rows)
-            print(f"\nTotal examples collected: {len(df)}")
-            print(f"Label distribution: {df['label'].value_counts().to_dict()}")
-            print(f"Dataset distribution: {df['dataset'].value_counts().to_dict()}")
-            # Save the full set plus a small sample for manual inspection
-            df.to_csv('train_beir.csv', index=False)
-            # min() avoids ValueError when fewer than 50 rows were collected
-            df.sample(min(50, len(df))).to_csv('sampled_beir.csv')

evaluate.py DELETED Viewed

@@ -1,273 +0,0 @@
-"""
-Evaluation Script - Compare Base vs Fine-tuned Phi-4-mini
-Optionally use GPT-4o as a judge
-"""
-import os
-import json
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from peft import PeftModel
-import logging
-from datetime import datetime
-from typing import Dict, List
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Base checkpoint used for both the base run and as the host of the LoRA adapter.
-# NOTE(review): the module docstring and FINETUNED_DIR say "Phi-4", but MODEL_ID is a
-# Phi-3 checkpoint — confirm the adapter in FINETUNED_DIR was trained on this base.
-MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"  # Using stable Phi-3
-# Directory from which the fine-tuned adapter is loaded if it exists.
-FINETUNED_DIR = "./phi4-lora-finetuned"
-def format_prompt(query, content):
-    """Return the content-analysis prompt with *query* and *content* filled in."""
-    # Doubled braces are literal braces in the JSON skeleton shown to the model.
-    template = """You are an AI content analyst.
-Task:
-1. Given the following content and a user query, decide if the content is relevant.
-2. If it is relevant:
-   - Extract the top 2-3 key sentences
-   - Suggest 3-5 relevant tags
-   - Provide a short explanation or content extension (~2-3 sentences)
-Format your response in JSON with:
-{{
-  "relevant": true or false,
-  "key_sentences": [...],
-  "tags": [...],
-  "expansion": "..."
-}}
-User Query:
-{query}
-Content:
-{content}
-Response:"""
-    return template.format(query=query, content=content)
-def generate_response(model, tokenizer, prompt, max_new_tokens=512):
-    """Generate a completion for *prompt* and return only the newly generated text.
-
-    Args:
-        model: Causal LM exposing ``generate`` and ``device``.
-        tokenizer: Tokenizer matching *model*.
-        prompt: Prompt text; tokenized with truncation to 1024 tokens.
-        max_new_tokens: Upper bound on the number of generated tokens.
-
-    Returns:
-        The decoded continuation, stripped of surrounding whitespace.
-    """
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-    inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            temperature=0.1,
-            do_sample=True,
-            top_p=0.9,
-            pad_token_id=tokenizer.eos_token_id
-        )
-    # Extract only the generated part. Slice by token count, not by len(prompt):
-    # the decoded text does not start with the exact prompt when the prompt was
-    # truncated to max_length or when detokenization alters whitespace, so a
-    # character-based slice would cut into (or leave prompt text in) the answer.
-    prompt_length = inputs["input_ids"].shape[1]
-    generated_ids = outputs[0][prompt_length:]
-    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
-def extract_json_from_response(response):
-    """Parse the outermost ``{...}`` span of *response* as JSON.
-
-    Args:
-        response: Raw model output; anything that is not a ``str`` yields None.
-
-    Returns:
-        The decoded JSON value, or None when no brace-delimited span exists or
-        the span is not valid JSON.
-    """
-    if not isinstance(response, str):
-        return None
-    # Find JSON-like content: first '{' through last '}'
-    start = response.find('{')
-    end = response.rfind('}') + 1
-    if start < 0 or end <= start:
-        return None
-    try:
-        return json.loads(response[start:end])
-    except ValueError:
-        # json.JSONDecodeError is a ValueError. Catching only this (instead of a
-        # bare except) no longer swallows KeyboardInterrupt/SystemExit.
-        return None
-def evaluate_models(test_file="test_set.jsonl"):
-    """Compare the base model with its LoRA fine-tuned variant on a JSONL test set.
-
-    Each line of *test_file* is a JSON object with "query", "content" and
-    "expected_output" keys. If the file does not exist, a three-example file is
-    written first. Per-example outputs are printed and all results are saved to
-    ``evaluation_results_<timestamp>.json``.
-
-    Args:
-        test_file: Path of the JSONL test set.
-
-    Returns:
-        A list with one result dict per example, holding the inputs, the expected
-        output, and each model's raw response and parsed JSON (None if unparsable
-        or if no fine-tuned model was found).
-    """
-    logger.info("Loading models for evaluation...")
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    tokenizer.pad_token = tokenizer.eos_token
-    # Load base model
-    logger.info("Loading base model...")
-    base_model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        device_map="auto",
-        load_in_4bit=True,
-        torch_dtype=torch.float16,
-        trust_remote_code=True
-    )
-    # Load fine-tuned model if available
-    finetuned_model = None
-    if os.path.exists(FINETUNED_DIR):
-        logger.info("Loading fine-tuned model...")
-        finetuned_model = PeftModel.from_pretrained(base_model, FINETUNED_DIR)
-    else:
-        logger.warning(f"Fine-tuned model not found at {FINETUNED_DIR}")
-        logger.warning("Run train.py first to create the fine-tuned model")
-    # Create test set if it doesn't exist
-    if not os.path.exists(test_file):
-        logger.info("Creating example test set...")
-        test_examples = [
-            {
-                "query": "What are smart contracts?",
-                "content": "Smart contracts are self-executing contracts with the terms directly written into code. They run on blockchain networks and automatically execute when predetermined conditions are met. This eliminates the need for intermediaries and ensures transparency.",
-                "expected_output": {
-                    "relevant": True,
-                    "key_sentences": [
-                        "Smart contracts are self-executing contracts with the terms directly written into code.",
-                        "They run on blockchain networks and automatically execute when predetermined conditions are met."
-                    ],
-                    "tags": ["smart-contracts", "blockchain", "automation", "decentralized"],
-                    "expansion": "This content explains the fundamental concept of smart contracts, emphasizing their self-executing nature and role in eliminating intermediaries through blockchain-based automation."
-                }
-            },
-            {
-                "query": "How does machine learning work?",
-                "content": "Pizza is a popular Italian dish consisting of a round, flat base of dough topped with tomatoes, cheese, and various toppings. It originated in Naples and has become one of the world's most popular foods.",
-                "expected_output": {
-                    "relevant": False,
-                    "key_sentences": [],
-                    "tags": [],
-                    "expansion": ""
-                }
-            },
-            {
-                "query": "What is DeFi?",
-                "content": "Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries like banks. DeFi protocols use smart contracts to provide services such as lending, borrowing, and trading. Users maintain control of their assets through cryptocurrency wallets.",
-                "expected_output": {
-                    "relevant": True,
-                    "key_sentences": [
-                        "Decentralized Finance (DeFi) refers to financial services built on blockchain technology that operate without traditional intermediaries like banks.",
-                        "DeFi protocols use smart contracts to provide services such as lending, borrowing, and trading."
-                    ],
-                    "tags": ["defi", "blockchain", "smart-contracts", "lending", "decentralized-finance"],
-                    "expansion": "This content provides a clear overview of DeFi, explaining how it leverages blockchain and smart contracts to recreate traditional financial services in a decentralized manner."
-                }
-            }
-        ]
-        with open(test_file, 'w', encoding='utf-8') as f:
-            for example in test_examples:
-                f.write(json.dumps(example) + '\n')
-    # Load test data
-    logger.info(f"Loading test data from {test_file}...")
-    test_data = []
-    with open(test_file, 'r', encoding='utf-8') as f:
-        for line in f:
-            test_data.append(json.loads(line))
-    # Evaluate
-    results = []
-    for i, example in enumerate(test_data):
-        logger.info(f"\nEvaluating example {i+1}/{len(test_data)}...")
-        prompt = format_prompt(example["query"], example["content"])
-        # Get base model response. PEFT injects the adapter layers into the wrapped
-        # base model in place, so once the adapter is loaded a call on base_model
-        # would also run the adapter. Generate the baseline with the adapter
-        # explicitly disabled so the comparison is really base vs fine-tuned.
-        if finetuned_model is not None:
-            with finetuned_model.disable_adapter():
-                base_response = generate_response(finetuned_model, tokenizer, prompt)
-        else:
-            base_response = generate_response(base_model, tokenizer, prompt)
-        base_json = extract_json_from_response(base_response)
-        # Get fine-tuned model response if available
-        finetuned_response = None
-        finetuned_json = None
-        if finetuned_model is not None:
-            finetuned_response = generate_response(finetuned_model, tokenizer, prompt)
-            finetuned_json = extract_json_from_response(finetuned_response)
-        result = {
-            "example_id": i + 1,
-            "query": example["query"],
-            "content": example["content"][:100] + "...",  # Truncate for display
-            "expected": example["expected_output"],
-            "base_response": base_response,
-            "base_json": base_json,
-            "finetuned_response": finetuned_response,
-            "finetuned_json": finetuned_json
-        }
-        results.append(result)
-        # Print comparison
-        print(f"\n{'='*80}")
-        print(f"Example {i+1}: {example['query']}")
-        print(f"{'='*80}")
-        print(f"\n📄 Content: {example['content'][:100]}...")
-        print(f"\n✅ Expected Output:")
-        print(json.dumps(example['expected_output'], indent=2))
-        print(f"\n🔹 Base Model Output:")
-        print(base_response[:500] + "..." if len(base_response) > 500 else base_response)
-        if finetuned_response:
-            print(f"\n🔸 Fine-tuned Model Output:")
-            print(finetuned_response[:500] + "..." if len(finetuned_response) > 500 else finetuned_response)
-    # Save results
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    results_file = f"evaluation_results_{timestamp}.json"
-    with open(results_file, 'w', encoding='utf-8') as f:
-        json.dump(results, f, indent=2)
-    logger.info(f"\n✅ Evaluation complete! Results saved to: {results_file}")
-    # Generate summary: how many responses contained parsable JSON
-    print(f"\n{'='*80}")
-    print("EVALUATION SUMMARY")
-    print(f"{'='*80}")
-    base_valid_json = sum(1 for r in results if r['base_json'] is not None)
-    finetuned_valid_json = sum(1 for r in results if r['finetuned_json'] is not None)
-    print(f"Base model valid JSON responses: {base_valid_json}/{len(results)}")
-    if finetuned_model is not None:
-        print(f"Fine-tuned model valid JSON responses: {finetuned_valid_json}/{len(results)}")
-    print("\n💡 To use GPT-4o as a judge:")
-    print("1. Copy the results from the evaluation file")
-    print("2. Ask GPT-4o to compare base vs fine-tuned outputs")
-    print("3. Request scores for relevance detection, tag quality, and key sentence extraction")
-    return results
-def create_gpt4_evaluation_prompt(results):
-    """Assemble a judging prompt that lists every result for GPT-4 to score.
-
-    Each entry of *results* must provide "example_id", "query", "content",
-    "expected", "base_response" and "finetuned_response"; the fine-tuned line
-    is omitted when that response is falsy.
-    """
-    header = """You are an expert evaluator comparing two versions of a document analysis model.
-For each example below, score both the Base Model and Fine-tuned Model outputs on:
-1. Relevance Detection (0-5): How accurately did it determine if the content is relevant?
-2. Tag Quality (0-5): How good are the suggested tags?
-3. Key Sentences (0-5): How well did it extract the most important sentences?
-4. Expansion Quality (0-5): How insightful is the content expansion?
-Provide an overall winner for each example and a final recommendation.
-EVALUATION DATA:
-"""
-    pieces = [header]
-    for entry in results:
-        pieces.append(f"\n\nExample {entry['example_id']}:\n")
-        pieces.append(f"Query: {entry['query']}\n")
-        pieces.append(f"Content: {entry['content']}\n")
-        pieces.append(f"Expected: {json.dumps(entry['expected'], indent=2)}\n")
-        pieces.append(f"Base Output: {entry['base_response']}\n")
-        if entry['finetuned_response']:
-            pieces.append(f"Fine-tuned Output: {entry['finetuned_response']}\n")
-    pieces.append("\n\nPlease evaluate and provide scores.")
-    return "".join(pieces)
-if __name__ == "__main__":
-    # Run the comparison, then write a judging prompt for the first three examples.
-    results = evaluate_models()
-    if results:
-        prompt_path = "gpt4_evaluation_prompt.txt"
-        judge_prompt = create_gpt4_evaluation_prompt(results[:3])
-        with open(prompt_path, "w", encoding="utf-8") as prompt_file:
-            prompt_file.write(judge_prompt)
-        print("\n📝 GPT-4 evaluation prompt saved to: gpt4_evaluation_prompt.txt")
-        print("Copy this to ChatGPT to get detailed scoring!")

generate_sample_data.py DELETED Viewed

@@ -1,90 +0,0 @@
-"""
-Generate sample train/val/test data for testing if CSV files don't exist
-"""
-import pandas as pd
-import os
-def generate_sample_data():
-    """Write train.csv, val.csv and test.csv with sample DPO relevance pairs.
-
-    Five hand-written (query, document, relevant) samples are repeated 20 times,
-    turned into prompt/chosen/rejected rows, and split 70% / 15% / remainder in
-    order (no shuffling). Files are written to the current working directory.
-    """
-    # Sample data
-    samples = [
-        {
-            "query": "Do Cholesterol Statin Drugs Cause Breast Cancer?",
-            "document": "Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear.",
-            "relevant": True
-        },
-        {
-            "query": "What is machine learning?",
-            "document": "Pizza is a popular Italian dish consisting of a round, flat base of dough topped with tomatoes, cheese, and various toppings.",
-            "relevant": False
-        },
-        {
-            "query": "How does deep learning work?",
-            "document": "Deep learning is a subset of machine learning that uses neural networks with multiple layers to progressively extract higher-level features from raw input.",
-            "relevant": True
-        },
-        {
-            "query": "What are the symptoms of COVID-19?",
-            "document": "The stock market showed significant volatility today with major indices closing lower due to concerns about inflation.",
-            "relevant": False
-        },
-        {
-            "query": "What is blockchain technology?",
-            "document": "Blockchain is a distributed ledger technology that maintains a secure and decentralized record of transactions. It consists of blocks linked together using cryptography.",
-            "relevant": True
-        }
-    ]
-    # Create DPO format data
-    dpo_data = []
-    for sample in samples * 20:  # Repeat to create more samples
-        prompt = f"""you would get a query and document's title and content and return Relevant/Irrelevant.
-Query:
-{sample['query']}
-Document:
-title: Document
-content: {sample['document']}"""
-        # The preferred answer matches the sample's ground truth; the rejected one is its opposite.
-        chosen, rejected = ("Relevant", "Irrelevant") if sample['relevant'] else ("Irrelevant", "Relevant")
-        dpo_data.append({"prompt": prompt, "chosen": chosen, "rejected": rejected})
-    # Split into train/val/test by position
-    total = len(dpo_data)
-    train_end = int(0.7 * total)
-    val_end = train_end + int(0.15 * total)
-    splits = {
-        "train.csv": dpo_data[:train_end],
-        "val.csv": dpo_data[train_end:val_end],
-        "test.csv": dpo_data[val_end:],
-    }
-    # Save to CSV
-    for filename, rows in splits.items():
-        pd.DataFrame(rows).to_csv(filename, index=False)
-    print("✅ Generated sample data:")
-    for filename, rows in splits.items():
-        print(f"   - {filename}: {len(rows)} samples")
-if __name__ == "__main__":
-    # Generate sample data only when train.csv is absent, so existing data is never overwritten.
-    if os.path.exists("train.csv"):
-        print("✅ Data files already exist")
-    else:
-        print("⚠️  train.csv not found. Generating sample data...")
-        generate_sample_data()

inference_chatgpt_simple.py ADDED Viewed

	@@ -0,0 +1,89 @@

+#!/usr/bin/env python3
+"""
+Simplified ChatGPT inference script for yes/no classification
+Forces single token responses for consistent results
+"""
+import os
+import pandas as pd
+from openai import OpenAI
+import time
+from datetime import datetime
+from app import format_prompt
+# Initialize OpenAI client
+# SECURITY: API keys must never be committed to source control. The key that was
+# hard-coded on this line is exposed in the repository history and must be revoked
+# and rotated. The key is now read from the OPENAI_API_KEY environment variable.
+MICROSERVICES_FIVERR_OS_FIVERR_OS_BACKEND_CREDENTIALS_OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+client = OpenAI(api_key=MICROSERVICES_FIVERR_OS_FIVERR_OS_BACKEND_CREDENTIALS_OPENAI_API_KEY)
+def get_prediction(query, title, content, model="gpt-4.1"):
+    """Get a yes/no relevance prediction from ChatGPT.
+
+    The prompt is built by ``format_prompt(query, title, content)`` and sent as a
+    single user message. No token limit or temperature is set on the request.
+
+    Args:
+        query: Query text.
+        title: Document title.
+        content: Document text.
+        model: Chat model name passed to the API.
+
+    Returns:
+        'yes' or 'no', or 'error' when the API call fails or the reply is
+        anything else.
+    """
+    prompt = format_prompt(query, title, content)
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "user", "content": prompt}
+            ],
+        )
+        raw = response.choices[0].message.content or ""
+    except Exception as e:
+        print(f"API Error: {e}")
+        return 'error'
+    # Normalize case, whitespace and trailing punctuation so "Yes." counts as "yes"
+    prediction = raw.strip().lower().rstrip('.!')
+    # Ensure it's yes or no
+    if prediction not in ('yes', 'no'):
+        # Print the raw reply: previously the value was overwritten with 'error'
+        # before printing, which hid what the model actually answered.
+        print(f"Unexpected response: {raw!r}")
+        return 'error'
+    return prediction
+def main():
+    """Classify a per-label sample of the training CSV with ChatGPT and report accuracy.
+
+    Reads ``train_datasets_creation/full_train_dataset.csv``, samples up to 20 rows
+    per distinct ``label``, asks ``get_prediction`` for each row, prints a confusion
+    matrix and the accuracy, and saves the rows with predictions to a timestamped CSV.
+    """
+    csv_path ="train_datasets_creation/full_train_dataset.csv"
+    # Load CSV
+    print(f"Loading {csv_path}...")
+    df = pd.read_csv(csv_path)
+    # Up to 20 rows per label; min() avoids the ValueError that sample(20) raises
+    # for a label with fewer than 20 rows.
+    dfs = []
+    for label in df['label'].unique():
+        group = df[df['label']==label]
+        dfs.append(group.sample(min(20, len(group))))
+    df = pd.concat(dfs).reset_index()
+    # Process each row
+    predictions = []
+    for idx, row in df.iterrows():
+        # Show progress
+        if idx % 10 == 0:
+            print(f"Progress: {idx}/{len(df)}")
+        # Get prediction
+        pred = get_prediction(
+            str(row['query_text']),
+            str(row['title']),
+            str(row['text'])
+        )
+        predictions.append(pred)
+        # Rate limit
+        time.sleep(0.5)
+    # Add predictions
+    df['prediction'] = predictions
+    conf_matrix = pd.crosstab(
+        index=df['label'],  # True labels
+        columns=df['prediction'],  # Predicted labels
+        rownames=['Actual'],
+        colnames=['Predicted']
+    )
+    # A row is correct when 'yes' meets a positive label or 'no' meets a negative one;
+    # 'error' predictions count as incorrect.
+    is_positive = df['label'].isin(['easy_positive', 'hard_positive'])
+    is_negative = df['label'].isin(['easy_negative', 'hard_negative'])
+    accuracy = ((df['prediction']=='yes') & is_positive) | ((df['prediction']=='no') & is_negative)
+    print(conf_matrix)
+    print(accuracy.mean())
+    output = f"chatgpt_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+    df.to_csv(output, index=False)
+    print(f"\nSaved to: {output}")
+    # Show summary
+    print("\nResults:")
+    print(df['prediction'].value_counts())
+if __name__ == "__main__":
+    main()

inference_server.py DELETED Viewed

@@ -1,179 +0,0 @@
-"""
-Inference server to test model while training
-Can load checkpoints on-the-fly
-"""
-from flask import Flask, request, jsonify
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from peft import PeftModel
-import json
-import os
-import glob
-import logging
-app = Flask(__name__)
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Global model and tokenizer
-model = None
-tokenizer = None
-current_checkpoint = None
-MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
-CHECKPOINT_DIR = "./phi3-dpo-beir-checkpoints"
def load_latest_checkpoint():
    """Load the newest training checkpoint into the module-level globals.

    "Newest" is the ``checkpoint-*`` entry under ``CHECKPOINT_DIR`` with the
    largest ``os.path.getctime``.

    Returns:
        True if that checkpoint is now loaded (or already was), False if no
        checkpoint exists or loading raised an exception.
    """
    global model, tokenizer, current_checkpoint

    candidates = glob.glob(f"{CHECKPOINT_DIR}/checkpoint-*")
    if not candidates:
        logger.warning("No checkpoints found")
        return False

    newest = max(candidates, key=os.path.getctime)
    if newest == current_checkpoint:
        # Nothing newer than what is already in memory.
        return True

    logger.info(f"Loading checkpoint: {newest}")
    base_model_kwargs = {
        "load_in_4bit": True,
        "torch_dtype": torch.float16,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    try:
        # Tokenizer comes from the base model; reuse EOS as the pad token.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        tokenizer.pad_token = tokenizer.eos_token

        # Base weights first, then the LoRA adapter stored in the checkpoint.
        quantized_base = AutoModelForCausalLM.from_pretrained(MODEL_ID, **base_model_kwargs)
        model = PeftModel.from_pretrained(quantized_base, newest)
        model.eval()

        current_checkpoint = newest
        logger.info(f"✅ Loaded checkpoint: {newest}")
        return True
    except Exception as e:
        logger.error(f"Failed to load checkpoint: {e}")
        return False
def format_prompt(query, document):
    """Build the relevance-analysis prompt for one query/document pair.

    The instructions ask the model for a JSON object with ``relevant``,
    ``key_sentences``, ``tags`` and ``expansion`` fields; the query and the
    document are inserted verbatim near the end, followed by ``Response:``.
    """
    prompt_lines = [
        "You are an AI content analyst.",
        "Task:",
        "1. Given the following content and a user query, decide if the content is relevant.",
        "2. If it is relevant:",
        "   - Extract the top 2-3 key sentences",
        "   - Suggest 3-5 relevant tags",
        "   - Provide a short explanation or content extension (~2-3 sentences)",
        "Format your response in JSON with:",
        "{",
        '  "relevant": true or false,',
        '  "key_sentences": [...],',
        '  "tags": [...],',
        '  "expansion": "..."',
        "}",
        "User Query:",
        f"{query}",
        "Content:",
        f"{document}",
        "Response:",
    ]
    return "\n".join(prompt_lines)
@app.route('/health', methods=['GET'])
def health():
    """Report liveness and which checkpoint, if any, is currently loaded."""
    payload = {
        "status": "healthy",
        "model_loaded": model is not None,
        "current_checkpoint": current_checkpoint,
    }
    return jsonify(payload)
@app.route('/inference', methods=['POST'])
def inference():
    """Run the loaded model on one query/document pair.

    Expects a JSON object body with non-empty ``query`` and ``document``
    fields. Passing ``?reload=true`` forces a checkpoint refresh first.

    Returns:
        200 with ``checkpoint``, ``result`` (the parsed JSON object found in
        the model output, or ``{"raw_response": ...}`` when none parses) and
        ``raw_response``; 400 when ``query``/``document`` is missing; 503 when
        no checkpoint could be loaded.
    """
    global model, tokenizer

    # Load latest checkpoint if needed. Compare against None explicitly
    # rather than relying on the truthiness of a model object.
    if model is None or request.args.get('reload') == 'true':
        if not load_latest_checkpoint():
            return jsonify({"error": "No model available"}), 503

    # A missing, unparsable or non-object body is treated as "no fields"
    # so the caller gets the 400 below instead of an unhandled exception.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    query = data.get('query', '')
    document = data.get('document', '')
    if not query or not document:
        return jsonify({"error": "Missing query or document"}), 400

    # Format prompt
    prompt = format_prompt(query, document)

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) is wrong whenever the prompt was truncated to max_length
    # or does not round-trip character-for-character through the tokenizer.
    generated_tokens = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Try to parse the outermost {...} span as JSON; otherwise return raw text.
    result = {"raw_response": response}
    start = response.find('{')
    end = response.rfind('}') + 1
    if start >= 0 and end > start:
        try:
            result = json.loads(response[start:end])
        except json.JSONDecodeError:
            pass  # keep the raw-response fallback

    return jsonify({
        "checkpoint": current_checkpoint,
        "result": result,
        "raw_response": response
    })
@app.route('/checkpoints', methods=['GET'])
def list_checkpoints():
    """List checkpoint directory names, highest ``getctime`` first, plus the active one."""
    found = sorted(
        glob.glob(f"{CHECKPOINT_DIR}/checkpoint-*"),
        key=os.path.getctime,
        reverse=True,
    )
    names = [os.path.basename(path) for path in found]

    active = None
    if current_checkpoint:
        active = os.path.basename(current_checkpoint)

    return jsonify({"checkpoints": names, "current": active})
-if __name__ == '__main__':
-    # Try to load initial checkpoint
-    load_latest_checkpoint()
-    # Run server
-    app.run(host='0.0.0.0', port=5000, debug=False)

setup_environment.py DELETED Viewed

@@ -1,120 +0,0 @@
-"""
-Setup script to ensure all dependencies are correctly installed
-"""
-import subprocess
-import sys
-import os
def run_command(cmd):
    """Run *cmd* through the shell.

    Returns True when the command exits with status 0 and False when it
    exits with a non-zero status.
    """
    try:
        subprocess.run(cmd, shell=True, check=True)
    except subprocess.CalledProcessError:
        return False
    return True
def _run_python(*args):
    """Run the current interpreter with *args* (no shell); True on exit status 0."""
    try:
        subprocess.check_call([sys.executable, *args])
    except (subprocess.CalledProcessError, OSError):
        return False
    return True


def _pip(*args):
    """Run ``python -m pip <args>`` with the current interpreter; True on success."""
    return _run_python("-m", "pip", *args)


def main():
    """Install and sanity-check the dependencies needed for DPO training.

    Steps: verify Python >= 3.8 (exits with status 1 otherwise), pin
    protobuf, install tf-keras, the core and optional BEIR dependencies,
    report CUDA availability, test-import the key packages, and generate
    sample data when ``train.csv`` is absent.

    Commands are executed as argument lists without a shell. Passed through
    a shell, a requirement such as ``torch>=2.0.0`` has its ``>`` read as an
    output redirection, which drops the version pin and creates a stray
    file; an interpreter path containing spaces would also break.
    """
    print("🔧 Setting up environment for DPO training...")
    print("="*60)

    # Python version check
    python_version = sys.version_info
    print(f"Python version: {python_version.major}.{python_version.minor}.{python_version.micro}")
    if python_version < (3, 8):
        print("❌ Python 3.8+ is required!")
        sys.exit(1)

    # Fix protobuf issues
    print("\n📦 Fixing protobuf issues...")
    _pip("uninstall", "-y", "protobuf")
    _pip("install", "protobuf==3.20.3")

    # Install tf-keras for compatibility
    print("\n📦 Installing tf-keras for compatibility...")
    _pip("install", "tf-keras")

    # Core dependencies
    print("\n📦 Installing core dependencies...")
    dependencies = [
        "torch>=2.0.0",
        "transformers>=4.36.0",
        "datasets",
        "accelerate>=0.25.0",
        "peft>=0.7.0",
        "trl>=0.7.0",
        "bitsandbytes>=0.41.0",
        "pandas",
        "scipy",
        "sentencepiece",  # Required for some tokenizers
        "protobuf==3.20.3",  # Specific version to avoid issues
    ]
    for dep in dependencies:
        print(f"Installing {dep}...")
        if not _pip("install", dep):
            print(f"⚠️ Failed to install {dep}, continuing...")

    # BEIR dependencies (optional): failures are deliberately not reported
    print("\n📦 Installing BEIR dependencies (optional)...")
    for dep in ["beir", "scikit-learn", "tqdm"]:
        print(f"Installing {dep}...")
        _pip("install", dep)

    # Check CUDA
    print("\n🔍 Checking CUDA availability...")
    try:
        import torch
        if torch.cuda.is_available():
            print("✅ CUDA is available!")
            print(f"   Device: {torch.cuda.get_device_name(0)}")
            print(f"   CUDA version: {torch.version.cuda}")
        else:
            print("⚠️ CUDA not available. Training will be slow on CPU.")
    except Exception as e:
        print(f"⚠️ Could not check CUDA: {e}")

    # Test imports
    print("\n🧪 Testing imports...")
    test_imports = [
        "torch",
        "transformers",
        "trl",
        "peft",
        "datasets",
        "accelerate",
        "bitsandbytes",
        "pandas"
    ]
    failed = []
    for module in test_imports:
        try:
            __import__(module)
        except ImportError as e:
            print(f"❌ {module}: {e}")
            failed.append(module)
        else:
            print(f"✅ {module}")

    if failed:
        print(f"\n⚠️ Some imports failed: {', '.join(failed)}")
        print("Try running: pip install --upgrade " + " ".join(failed))
    else:
        print("\n✅ All imports successful!")

    # Generate sample data if needed. The helper reports failure through its
    # return value, so test that instead of wrapping the call in try/except.
    if not os.path.exists("train.csv"):
        print("\n📊 Generating sample data...")
        if not _run_python("generate_sample_data.py"):
            print("⚠️ Could not generate sample data")

    print("\n✅ Setup complete!")
    print("\nTo start training, run:")
    print(f"  {sys.executable} train_dpo_hf_fixed.py")
-if __name__ == "__main__":
-    main()

test_scores/analyze_each_dataset.py ADDED Viewed

	@@ -0,0 +1,272 @@

+"""
+Create individual markdown reports for each BEIR dataset
+Analyze Score 1 vs Score 2 meanings with examples
+"""
+import json
+import os
+import random
+from pathlib import Path
+from collections import defaultdict
def _load_queries(queries_file):
    """Return ``{query_id: text}`` read from a ``queries.jsonl`` file ({} if absent)."""
    queries = {}
    if queries_file.exists():
        with open(queries_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    q = json.loads(line)
                    queries[q['_id']] = q['text']
    return queries


def _load_corpus(corpus_file, limit):
    """Return ``(docs, total)``: the first *limit* documents by id, and the document count."""
    docs = {}
    total = 0
    if corpus_file.exists():
        with open(corpus_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                # Lines past the limit are counted but never parsed, which
                # keeps memory and time bounded on very large corpora.
                if total < limit:
                    doc = json.loads(line)
                    docs[doc['_id']] = {
                        'title': doc.get('title', ''),
                        'text': doc.get('text', '')
                    }
                total += 1
    return docs, total


def _iter_qrels(qrel_file):
    """Yield ``(query_id, doc_id, score)`` rows from a qrels TSV file.

    A first line starting with "query" is treated as a header. Rows with
    fewer than three columns or a non-numeric score are skipped.
    """
    with open(qrel_file, 'r', encoding='utf-8') as f:
        for line_no, raw in enumerate(f):
            line = raw.strip()
            if line_no == 0 and line.startswith("query"):
                continue
            parts = line.split('\t')
            if len(parts) < 3:
                continue
            if len(parts) == 3:
                query_id, doc_id, score = parts
            else:
                # 4+ columns (TREC format): doc id and score are columns 3 and 4
                query_id, doc_id, score = parts[0], parts[2], parts[3]
            try:
                score = int(float(score))
            except ValueError:
                continue
            yield query_id, doc_id, score


def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset.

    Args:
        dataset_name: Name of the dataset directory under *beir_base_path*.
        beir_base_path: Directory that contains one sub-directory per dataset.

    Returns:
        The report as a markdown string, or None when the dataset directory
        does not exist. When the dataset has no ``qrels`` directory a short
        report saying so is returned.

    Raises:
        json.JSONDecodeError / KeyError: on malformed ``queries.jsonl`` or
            ``corpus.jsonl`` records.
    """
    dataset_path = Path(beir_base_path) / dataset_name
    if not dataset_path.exists():
        return None

    splits = ("train", "dev", "test")
    corpus_limit = 50000            # documents kept in memory for examples
    max_examples_per_score = 10

    report = [
        f"# {dataset_name.upper()} Dataset Analysis\n",
        "## Overview\n",
    ]

    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)

    queries = _load_queries(dataset_path / "queries.jsonl")
    report.append(f"- Total queries available: {len(queries)}\n")

    corpus, corpus_count = _load_corpus(dataset_path / "corpus.jsonl", corpus_limit)
    report.append(f"- Total corpus documents: {corpus_count}\n")
    report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")

    # Analyze each split
    split_data = {}
    for split in splits:
        qrel_file = qrels_path / f"{split}.tsv"
        if not qrel_file.exists():
            continue
        score_dist = {}
        queries_set = set()
        total_pairs = 0
        by_query = {}   # query_id -> score -> [doc_id, ...]
        by_score = {}   # score -> query_id -> [doc_id, ...]
        for query_id, doc_id, score in _iter_qrels(qrel_file):
            queries_set.add(query_id)
            score_dist[score] = score_dist.get(score, 0) + 1
            total_pairs += 1
            # Only pairs whose query text and document are both loaded can
            # be shown as examples.
            if query_id in queries and doc_id in corpus:
                by_query.setdefault(query_id, {}).setdefault(score, []).append(doc_id)
                by_score.setdefault(score, {}).setdefault(query_id, []).append(doc_id)
        split_data[split] = {
            "queries": len(queries_set),
            "total_pairs": total_pairs,
            "score_dist": score_dist,
            "examples": by_query,
            "examples_by_score": by_score,
        }

    # Write split statistics. The table is emitted as ONE report entry:
    # entries are joined with blank lines, and a blank line between rows
    # stops Markdown from rendering them as a table.
    table = [
        "| Split | Queries | Total Pairs | Score Distribution |",
        "|-------|---------|-------------|-------------------|",
    ]
    for split in splits:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            table.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |")
        else:
            table.append(f"| {split} | N/A | N/A | N/A |")
    report.append("\n## Split Statistics\n")
    report.append("\n".join(table) + "\n")

    # Analyze score meanings
    report.append("\n## Score Analysis\n")
    unique_scores = set()
    for data in split_data.values():
        unique_scores.update(data["score_dist"].keys())
    if not unique_scores:
        report.append("**No relevance scores found in any split.**\n")
    elif len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    # Add examples
    report.append("\n## Examples\n")

    # Queries judged at more than one score level make the best comparisons
    report.append("### Score Comparison Examples\n")
    queries_with_multiple_scores = [
        (split_name, query_id, scores_dict)
        for split_name, data in split_data.items()
        for query_id, scores_dict in data["examples"].items()
        if len(scores_dict) > 1
    ]
    # Show up to 5 comparison examples
    for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
        report.append(f"#### Comparison Example {i}\n")
        report.append(f"**Query:** {queries[query_id]}\n")
        report.append(f"**From:** {split_name} split\n\n")
        for score in sorted(scores_dict, reverse=True):
            report.append(f"**Score {score} Documents:**\n")
            # Show up to 3 docs per score
            for shown, doc_id in enumerate(scores_dict[score][:3], 1):
                doc = corpus[doc_id]
                report.append(f"\n*Document {shown}:*\n")
                if doc['title']:
                    report.append(f"- Title: {doc['title'][:200]}\n")
                text_preview = doc['text'][:400].replace('\n', ' ')
                report.append(f"- Text: {text_preview}...\n")
            report.append("\n")
        report.append("---\n\n")

    # Show examples PER SPLIT
    report.append("### Examples by Split and Score\n")
    for split in splits:
        if split not in split_data:
            continue
        data = split_data[split]
        report.append(f"\n#### {split.upper()} Split\n")
        split_scores = sorted(data["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")
        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")
            examples_shown = 0
            for query_id, doc_ids in data["examples_by_score"].get(score, {}).items():
                if examples_shown >= max_examples_per_score:
                    break
                examples_shown += 1
                report.append(f"\n**Example {examples_shown}:**\n")
                report.append(f"- Query: {queries[query_id]}\n")
                # Show the first judged document for this query
                doc = corpus[doc_ids[0]]
                if doc['title']:
                    report.append(f"- Doc Title: {doc['title']}\n")
                text_preview = doc['text'].replace('\n', ' ')
                report.append(f"- Doc Text: {text_preview}...\n")
            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")

    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")
    report.append("\n## Observations\n")

    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())
        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")

    # Check for single score in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if train_scores == {1}:
            report.append("⚠️ **Training data only has Score 1:**\n")
            report.append("- All training documents are marked as equally relevant\n")
            report.append("- Model cannot learn to distinguish relevance levels\n")
            report.append("- May indicate collapsed/merged relevance levels\n")

    return "\n".join(report)
def main():
    """Write one ``<dataset>_analysis.md`` report per dataset under ``../beir_data``.

    Paths are relative to the current working directory. Reports go to
    ``../test_scores/dataset_reports``, which is created if needed.
    Datasets for which ``create_dataset_report`` returns nothing are skipped.
    """
    beir_base = "../beir_data"
    if not os.path.isdir(beir_base):
        # Without this check os.listdir() below fails with a bare traceback.
        print(f"BEIR data directory not found: {beir_base}")
        return

    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    # parents=True: the intermediate "test_scores" directory may not exist
    # yet when the script is run from a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)

    # List all datasets
    datasets = sorted(
        entry for entry in os.listdir(beir_base)
        if os.path.isdir(os.path.join(beir_base, entry))
    )

    print("="*80)
    print("Creating individual dataset reports...")
    print("="*80)
    for dataset in datasets:
        print(f"Processing {dataset}...")
        report = create_dataset_report(dataset, beir_base)
        if not report:
            continue
        # Save report
        output_file = output_dir / f"{dataset}_analysis.md"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"  ✓ Report saved to: {output_file}")
    print(f"\nAll reports saved in: {output_dir}")
+if __name__ == "__main__":
+    main()

test_scores/analyze_webis_only.py ADDED Viewed

	@@ -0,0 +1,269 @@

+"""
+Analyze ONLY webis-touche2020 dataset - SAME FORMAT AS analyze_each_dataset.py
+"""
+import json
+import os
+import random
+from pathlib import Path
+from collections import defaultdict
def _load_queries(queries_file):
    """Return ``{query_id: text}`` read from a ``queries.jsonl`` file ({} if absent)."""
    queries = {}
    if queries_file.exists():
        with open(queries_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    q = json.loads(line)
                    queries[q['_id']] = q['text']
    return queries


def _load_corpus(corpus_file, limit):
    """Return ``(docs, total)``: the first *limit* documents by id, and the document count."""
    docs = {}
    total = 0
    if corpus_file.exists():
        with open(corpus_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                # Lines past the limit are counted but never parsed, which
                # keeps memory and time bounded on very large corpora.
                if total < limit:
                    doc = json.loads(line)
                    docs[doc['_id']] = {
                        'title': doc.get('title', ''),
                        'text': doc.get('text', '')
                    }
                total += 1
    return docs, total


def _iter_qrels(qrel_file):
    """Yield ``(query_id, doc_id, score)`` rows from a qrels TSV file.

    A first line starting with "query" is treated as a header. Rows with
    fewer than three columns or a non-numeric score are skipped.
    """
    with open(qrel_file, 'r', encoding='utf-8') as f:
        for line_no, raw in enumerate(f):
            line = raw.strip()
            if line_no == 0 and line.startswith("query"):
                continue
            parts = line.split('\t')
            if len(parts) < 3:
                continue
            if len(parts) == 3:
                query_id, doc_id, score = parts
            else:
                # 4+ columns (TREC format): doc id and score are columns 3 and 4
                query_id, doc_id, score = parts[0], parts[2], parts[3]
            try:
                score = int(float(score))
            except ValueError:
                continue
            yield query_id, doc_id, score


def create_dataset_report(dataset_name, beir_base_path):
    """Create a detailed markdown report for a single dataset.

    Args:
        dataset_name: Name of the dataset directory under *beir_base_path*.
        beir_base_path: Directory that contains one sub-directory per dataset.

    Returns:
        The report as a markdown string, or None when the dataset directory
        does not exist. When the dataset has no ``qrels`` directory a short
        report saying so is returned.

    Raises:
        json.JSONDecodeError / KeyError: on malformed ``queries.jsonl`` or
            ``corpus.jsonl`` records.
    """
    dataset_path = Path(beir_base_path) / dataset_name
    if not dataset_path.exists():
        return None

    splits = ("train", "dev", "test")
    corpus_limit = 50000            # documents kept in memory for examples
    max_examples_per_score = 10

    report = [
        f"# {dataset_name.upper()} Dataset Analysis\n",
        "## Overview\n",
    ]

    # Check which splits exist
    qrels_path = dataset_path / "qrels"
    if not qrels_path.exists():
        report.append("No qrels directory found.\n")
        return "\n".join(report)

    queries = _load_queries(dataset_path / "queries.jsonl")
    report.append(f"- Total queries available: {len(queries)}\n")

    corpus, corpus_count = _load_corpus(dataset_path / "corpus.jsonl", corpus_limit)
    report.append(f"- Total corpus documents: {corpus_count}\n")
    report.append(f"- Corpus documents loaded for examples: {len(corpus)}\n")

    # Analyze each split
    split_data = {}
    for split in splits:
        qrel_file = qrels_path / f"{split}.tsv"
        if not qrel_file.exists():
            continue
        score_dist = {}
        queries_set = set()
        total_pairs = 0
        by_query = {}   # query_id -> score -> [doc_id, ...]
        by_score = {}   # score -> query_id -> [doc_id, ...]
        for query_id, doc_id, score in _iter_qrels(qrel_file):
            queries_set.add(query_id)
            score_dist[score] = score_dist.get(score, 0) + 1
            total_pairs += 1
            # Only pairs whose query text and document are both loaded can
            # be shown as examples.
            if query_id in queries and doc_id in corpus:
                by_query.setdefault(query_id, {}).setdefault(score, []).append(doc_id)
                by_score.setdefault(score, {}).setdefault(query_id, []).append(doc_id)
        split_data[split] = {
            "queries": len(queries_set),
            "total_pairs": total_pairs,
            "score_dist": score_dist,
            "examples": by_query,
            "examples_by_score": by_score,
        }

    # Write split statistics. The table is emitted as ONE report entry:
    # entries are joined with blank lines, and a blank line between rows
    # stops Markdown from rendering them as a table.
    table = [
        "| Split | Queries | Total Pairs | Score Distribution |",
        "|-------|---------|-------------|-------------------|",
    ]
    for split in splits:
        if split in split_data:
            data = split_data[split]
            scores_str = ", ".join([f"{k}:{v}" for k, v in sorted(data["score_dist"].items())])
            table.append(f"| {split} | {data['queries']} | {data['total_pairs']} | {scores_str} |")
        else:
            table.append(f"| {split} | N/A | N/A | N/A |")
    report.append("\n## Split Statistics\n")
    report.append("\n".join(table) + "\n")

    # Analyze score meanings
    report.append("\n## Score Analysis\n")
    unique_scores = set()
    for data in split_data.values():
        unique_scores.update(data["score_dist"].keys())
    if not unique_scores:
        report.append("**No relevance scores found in any split.**\n")
    elif len(unique_scores) == 1:
        report.append(f"**Only one score level ({list(unique_scores)[0]}) found in this dataset.**\n")
        report.append("- This means all labeled documents are considered equally relevant\n")
        report.append("- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)\n")
    else:
        report.append(f"**Multiple score levels found: {sorted(unique_scores)}**\n")

    # Add examples
    report.append("\n## Examples\n")

    # Queries judged at more than one score level make the best comparisons
    report.append("### Score Comparison Examples\n")
    queries_with_multiple_scores = [
        (split_name, query_id, scores_dict)
        for split_name, data in split_data.items()
        for query_id, scores_dict in data["examples"].items()
        if len(scores_dict) > 1
    ]
    # Show up to 5 comparison examples
    for i, (split_name, query_id, scores_dict) in enumerate(queries_with_multiple_scores[:5], 1):
        report.append(f"#### Comparison Example {i}\n")
        report.append(f"**Query:** {queries[query_id]}\n")
        report.append(f"**From:** {split_name} split\n\n")
        for score in sorted(scores_dict, reverse=True):
            report.append(f"**Score {score} Documents:**\n")
            # Show up to 3 docs per score
            for shown, doc_id in enumerate(scores_dict[score][:3], 1):
                doc = corpus[doc_id]
                report.append(f"\n*Document {shown}:*\n")
                if doc['title']:
                    report.append(f"- Title: {doc['title'][:200]}\n")
                text_preview = doc['text'][:400].replace('\n', ' ')
                report.append(f"- Text: {text_preview}...\n")
            report.append("\n")
        report.append("---\n\n")

    # Show examples PER SPLIT
    report.append("### Examples by Split and Score\n")
    for split in splits:
        if split not in split_data:
            continue
        data = split_data[split]
        report.append(f"\n#### {split.upper()} Split\n")
        split_scores = sorted(data["score_dist"].keys())
        report.append(f"**Scores in {split}: {split_scores}**\n")
        for score in split_scores:
            report.append(f"\n##### Score {score} Examples ({split})\n")
            examples_shown = 0
            for query_id, doc_ids in data["examples_by_score"].get(score, {}).items():
                if examples_shown >= max_examples_per_score:
                    break
                examples_shown += 1
                report.append(f"\n**Example {examples_shown}:**\n")
                report.append(f"- Query: {queries[query_id]}\n")
                # Show the first judged document for this query
                doc = corpus[doc_ids[0]]
                if doc['title']:
                    report.append(f"- Doc Title: {doc['title']}\n")
                text_preview = doc['text'].replace('\n', ' ')
                report.append(f"- Doc Text: {text_preview}...\n")
            if examples_shown == 0:
                report.append("No examples found in loaded corpus.\n")

    # Add sections to be filled
    report.append("\n## Training Recommendations\n")
    report.append("[TO BE FILLED BASED ON ANALYSIS]\n")
    report.append("\n## Observations\n")

    # Check for train/test mismatch
    if "train" in split_data and "test" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        test_scores = set(split_data["test"]["score_dist"].keys())
        if train_scores != test_scores:
            report.append("⚠️ **Score mismatch between train and test:**\n")
            report.append(f"- Train has scores: {sorted(train_scores)}\n")
            report.append(f"- Test has scores: {sorted(test_scores)}\n")
            report.append("- This could cause issues when training models\n")

    # Check for single score in train
    if "train" in split_data:
        train_scores = set(split_data["train"]["score_dist"].keys())
        if train_scores == {1}:
            report.append("⚠️ **Training data only has Score 1:**\n")
            report.append("- All training documents are marked as equally relevant\n")
            report.append("- Model cannot learn to distinguish relevance levels\n")
            report.append("- May indicate collapsed/merged relevance levels\n")

    return "\n".join(report)
def main():
    """Write the analysis report for the webis-touche2020 dataset only.

    Paths are relative to the current working directory: the dataset is read
    from ``../beir_data`` and the report is written to
    ``../test_scores/dataset_reports``, which is created if needed.
    """
    beir_base = "../beir_data"
    output_dir = Path(beir_base).parent / "test_scores" / "dataset_reports"
    # parents=True: the intermediate "test_scores" directory may not exist
    # yet when the script is run from a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)

    # ONLY PROCESS WEBIS-TOUCHE2020
    dataset = "webis-touche2020"
    print("="*80)
    print(f"Creating report for {dataset} ONLY...")
    print("="*80)
    print(f"Processing {dataset}...")

    report = create_dataset_report(dataset, beir_base)
    if not report:
        # Don't claim a report was saved when the dataset directory is missing.
        print(f"  ✗ Dataset not found under {beir_base}; no report written.")
        return

    # Save report
    output_file = output_dir / f"{dataset}_analysis.md"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"  ✓ Report saved to: {output_file}")
    print(f"\nReport saved in: {output_dir}")
+if __name__ == "__main__":
+    main()

test_scores/beir_analysis.json ADDED Viewed

File without changes

test_scores/dataset_reports/FINAL_TRAINING_SUMMARY.md ADDED Viewed

	@@ -0,0 +1,189 @@

+# BEIR Datasets Training Summary
+## Overview
+Comprehensive analysis of all 12 BEIR datasets for three-category training (positive, hard_negative, easy_negative) to train a tiny LLM retriever.
+## Three-Category Training System
+### Categories:
+1. **Positive**: Relevant documents (Score 1 or 2 depending on dataset)
+2. **Hard Negative**: Documents with keywords but wrong intent (Score 1 from DBPedia-Entity and Webis-Touche2020)
+3. **Easy Negative**: Clearly irrelevant documents (Score 0 or from other queries within same dataset)
+### Sampling Strategy:
+- **Adaptive Ratio**: For each query, count hard negatives (H), then sample H × 10 easy negatives
+- **Within-Dataset Priority**: Sample negatives from same dataset first (harder, same domain)
+- **Cross-Dataset Fallback**: Only use other datasets' docs if needed
+## Dataset-by-Dataset Analysis
+### 1. ArguAna
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: Only 1
+- **Score 1 Meaning**: RELEVANT (counter-arguments to the query)
+- **Splits**: test only (1406 queries)
+- **Training Strategy**: Binary classification
+- **Special Notes**: Counter-arguments are considered relevant (opposing stance, same topic)
+### 2. Climate-FEVER
+- **Verdict**: SKIP (NO RELIABLE POSITIVE EXAMPLES) ✗
+- **Scores Available**: Only 1
+- **Score 1 Meaning**: QUESTIONABLY RELEVANT (often just keyword matching)
+- **Splits**: test only (1535 queries)
+- **Training Strategy**: DO NOT USE for training
+- **Special Notes**: Can provide easy negatives to other datasets
+### 3. DBPedia-Entity
+- **Verdict**: GOLD MINE FOR HARD NEGATIVES ✓🏆
+- **Scores Available**: 0, 1, 2
+- **Score 1 Meaning**: HARD NEGATIVE (keyword match without intent)
+- **Score 2 Meaning**: POSITIVE (true entity relevance)
+- **Score 0 Meaning**: EASY NEGATIVE
+- **Splits**: dev (67 queries), test (400 queries)
+- **Training Categories**: Score 2 → positive, Score 1 → hard_negative, Score 0 → easy_negative
+- **Special Notes**: Critical for teaching models not to be fooled by keyword matching!
+### 4. FEVER
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: Only 1
+- **Score 1 Meaning**: RELEVANT (evidence supporting OR refuting claims)
+- **Splits**: train (140085 queries), dev (6666 queries), test (6666 queries)
+- **Training Strategy**: Binary classification
+- **Special Notes**: Documents that refute claims ARE relevant (fact verification task)
+### 5. FiQA
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: Only 1
+- **Score 1 Meaning**: RELEVANT (answers to financial questions)
+- **Splits**: train (5500 queries), dev (500 queries), test (648 queries)
+- **Training Strategy**: Binary classification
+- **Special Notes**: Financial Q&A dataset with opinionated answers
+### 6. HotPotQA
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: Only 1
+- **Score 1 Meaning**: RELEVANT (documents for multi-hop reasoning)
+- **Splits**: train (85000 queries), dev (5447 queries), test (7405 queries)
+- **Training Strategy**: Binary classification
+- **Special Notes**: Multi-hop Q&A requiring reasoning over multiple documents
+### 7. NFCorpus
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: 1, 2
+- **Score 1 Meaning**: RELEVANT (related medical papers)
+- **Score 2 Meaning**: HIGHLY RELEVANT (direct answers)
+- **Splits**: train (110575 queries), dev (324 queries), test (323 queries)
+- **Training Strategy**: Binary (1+2 as positive) or graded relevance
+- **Special Notes**: WARNING - train has only Score 1, test/dev have 1 and 2
+### 8. Quora
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: Only 1
+- **Score 1 Meaning**: RELEVANT (duplicate questions)
+- **Splits**: dev (5000 queries), test (10000 queries)
+- **Training Strategy**: Binary classification
+- **Special Notes**: Duplicate question detection - excellent for semantic similarity
+### 9. SciDocs
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: 0, 1
+- **Score 1 Meaning**: RELEVANT (citation-worthy papers)
+- **Splits**: test only (1000 queries)
+- **Training Strategy**: Binary classification
+- **Special Notes**: Citation prediction for scientific papers
+### 10. SciFact
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: Only 1
+- **Score 1 Meaning**: RELEVANT (evidence for scientific claims)
+- **Splits**: train (920 queries), test (300 queries)
+- **Training Strategy**: Binary classification
+- **Special Notes**: Scientific claim verification
+### 11. TREC-COVID
+- **Verdict**: GOOD FOR TRAINING ✓
+- **Scores Available**: 0, 1, 2
+- **Score 1 Meaning**: RELEVANT (related COVID-19 info)
+- **Score 2 Meaning**: HIGHLY RELEVANT (direct COVID-19 answers)
+- **Splits**: test only (50 queries)
+- **Training Strategy**: Binary (1+2 as positive) or graded relevance
+- **Special Notes**: No training data - only test split
+### 12. Webis-Touche2020
+- **Verdict**: GOLD MINE FOR HARD NEGATIVES ✓🏆
+- **Scores Available**: 0, 1, 2
+- **Score 1 Meaning**: HARD NEGATIVE (wrong topic/context)
+- **Score 2 Meaning**: POSITIVE (proper arguments)
+- **Score 0 Meaning**: EASY NEGATIVE
+- **Splits**: test only (49 queries)
+- **Training Categories**: Score 2 → positive, Score 1 → hard_negative, Score 0 → easy_negative
+- **Special Notes**: Critical for teaching models topic relevance! (e.g., school uniforms → hats is wrong topic)
+## Summary Statistics
+### Overall Results:
+- **Datasets Providing Positives**: 11/12 datasets (92%)
+- **Datasets Providing Hard Negatives**: 2/12 datasets (17%) - THE GOLD MINES!
+- **Skip Completely**: 1/12 dataset (8%)
+### Category Breakdown:
+#### Standard Positive Providers (9 datasets):
+1. **ArguAna** - Counter-arguments (test only)
+2. **FEVER** - Fact verification (140K train!)
+3. **FiQA** - Financial Q&A (5.5K train)
+4. **HotPotQA** - Multi-hop reasoning (85K train!)
+5. **NFCorpus** - Medical IR (110K train, Score 1&2 both positive)
+6. **Quora** - Duplicate questions (dev/test)
+7. **SciDocs** - Citation prediction (test only)
+8. **SciFact** - Scientific claims (train/test)
+9. **TREC-COVID** - COVID-19 IR (test only, Score 1&2 both positive)
+#### Hard Negative Gold Mines (2 datasets):
+1. **DBPedia-Entity** - Score 1 = keyword match failures
+2. **Webis-Touche2020** - Score 1 = wrong topic/context
+#### Skip (1 dataset):
+1. **Climate-FEVER** - No reliable positives
+## Key Findings
+1. **Hard Negatives Are Gold**: Only 2 datasets (DBPedia-Entity, Webis-Touche2020) provide hard negatives
+   - These are CRITICAL for teaching models not to be fooled by keyword matching
+   - Score 1 in these datasets = perfect training examples of what NOT to retrieve
+2. **Within-Dataset Sampling Is Key**:
+   - Easy negatives from same dataset are harder (same domain)
+   - Cross-dataset negatives are too easy (completely different domains)
+   - Example: for a COVID query, other COVID papers make hard negatives, whereas cooking recipes are trivially easy to reject
+3. **Adaptive Sampling Strategy**:
+   - Count hard negatives (H) for each query
+   - Sample H × 10 easy negatives
+   - Creates balanced difficulty gradient
+## Training Recommendations
+### Three-Category Training Implementation:
+1. **Use the JSON config** (`training_config.json`) with:
+   - `score_to_category`: Maps scores to positive/hard_negative/easy_negative
+   - `provides_hard_negatives`: Identifies DBPedia-Entity and Webis-Touche2020
+   - `sampling`: Within-dataset priority
+2. **Training Data Priority**:
+   - **MUST INCLUDE**: DBPedia-Entity and Webis-Touche2020 (for hard negatives)
+   - **Large Training Sets**: FEVER (140K), HotPotQA (85K), NFCorpus (110K)
+   - **Domain Variety**: FiQA (financial), NFCorpus (medical), SciFact (scientific)
+3. **Sampling Strategy**:
+   ```python
+   for query in dataset:
+       positives = get_positives(query)  # Score 1 or 2
+       hard_negatives = get_hard_negatives(query)  # Score 1 from DBPedia/Webis
+       num_easy = len(hard_negatives) * 10
+       easy_negatives = sample_within_dataset(query, num_easy)
+   ```
+4. **Skip Only**:
+   - Climate-FEVER (no reliable positive examples)

test_scores/dataset_reports/arguana_analysis.md ADDED Viewed

	@@ -0,0 +1,157 @@

+# ARGUANA Dataset Analysis
+## Overview
+- Total queries available: 1406
+- Total corpus documents: 8674
+- Corpus documents loaded for examples: 8674
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | N/A | N/A | N/A |
+| dev | N/A | N/A | N/A |
+| test | 1406 | 1406 | 1:1406 |
+## Score Analysis
+**Only one score level (1) found in this dataset.**
+- This means all labeled documents are considered equally relevant
+- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
+## Examples
+### Score Comparison Examples
+### Examples by Split and Score
+#### TEST Split
+**Scores in test: [1]**
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: Being vegetarian helps the environment  Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation  At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: “it takes four football fields of land (about 1.6 hectares) to feed each Canadian” and “one apple tree produces enough fruit to make 320 pies.” Think about it — a couple of apple trees and a few rows of wheat on a mere fraction of a hectare could produce enough food for one person! [1]  The 2006 U.N. Food and Agriculture Organization (FAO) report concluded that worldwide livestock farming generates 18% of the planet's greenhouse gas emissions — by comparison, all the world's cars, trains, planes and boats account for a combined 13% of greenhouse gas emissions. [2]  As a result of the above point producing meat damages the environment. The demand for meat drives deforestation. Daniel Cesar Avelino of Brazil's Federal Public Prosecution Office says “We know that the single biggest driver of deforestation in the Amazon is cattle.” This clearing of tropical rainforests such as the Amazon for agriculture is estimated to produce 17% of the world's greenhouse gas emissions. 
[3] Not only this but the production of meat takes a lot more energy than it ultimately gives us chicken meat production consumes energy in a 4:1 ratio to protein output; beef cattle production requires an energy input to protein output ratio of 54:1.  The same is true with water use due to the same phenomenon of meat being inefficient to produce in terms of the amount of grain needed to produce the same weight of meat, production requires a lot of water. Water is another scarce resource that we will soon not have enough of in various areas of the globe. Grain-fed beef production takes 100,000 liters of water for every kilogram of food. Raising broiler chickens takes 3,500 liters of water to make a kilogram of meat. In comparison, soybean production uses 2,000 liters for kilogram of food produced; rice, 1,912; wheat, 900; and potatoes, 500 liters. [4] This is while there are areas of the globe that have severe water shortages. With farming using up to 70 times more water than is used for domestic purposes: cooking and washing. A third of the population of the world is already suffering from a shortage of water. [5] Groundwater levels are falling all over the world and rivers are beginning to dry up. Already some of the biggest rivers such as China’s Yellow river do not reach the sea. [6]  With a rising population becoming vegetarian is the only responsible way to eat.  [1] Stephen Leckie, ‘How Meat-centred Eating Patterns Affect Food Security and the Environment’, International development research center  [2] Bryan Walsh, Meat: Making Global Warming Worse, Time magazine, 10 September 2008 .  [3] David Adam, Supermarket suppliers ‘helping to destroy Amazon rainforest’, The Guardian, 21st June 2009.  [4] Roger Segelken, U.S. could feed 800 million people with grain that livestock eat, Cornell Science News, 7th August 1997.  
[5] Fiona Harvey, Water scarcity affects one in three, FT.com, 21st August 2003  [6] Rupert Wingfield-Hayes, Yellow river ‘drying up’, BBC News, 29th July 2004
+- Doc Title: animals environment general health health general weight philosophy ethics
+- Doc Text: You don’t have to be vegetarian to be green. Many special environments have been created by livestock farming – for example chalk down land in England and mountain pastures in many countries. Ending livestock farming would see these areas go back to woodland with a loss of many unique plants and animals. Growing crops can also be very bad for the planet, with fertilisers and pesticides polluting rivers, lakes and seas. Most tropical forests are now cut down for timber, or to allow oil palm trees to be grown in plantations, not to create space for meat production.  British farmer and former editor Simon Farrell also states: “Many vegans and vegetarians rely on one source from the U.N. calculation that livestock generates 18% of global carbon emissions, but this figure contains basic mistakes. It attributes all deforestation from ranching to cattle, rather than logging or development. It also muddles up one-off emissions from deforestation with on-going pollution.”  He also refutes the statement of meat production inefficiency: “Scientists have calculated that globally the ratio between the amounts of useful plant food used to produce meat is about 5 to 1. If you feed animals only food that humans can eat — which is, indeed, largely the case in the Western world — that may be true. But animals also eat food we can't eat, such as grass. So the real conversion figure is 1.4 to 1.” [1] At the same time eating a vegetarian diet may be no more environmentally friendly than a meat based diet if it is not sustainably sourced or uses perishable fruit and vegetables that are flown in from around the world. Eating locally sourced food can has as big an impact as being vegetarian. [2]  [1] Tara Kelly, Simon Fairlie: How Eating Meat Can Save the World, 12 October 2010  [2] Lucy Siegle, ‘It is time to become a vegetarian?’ The Observer, 18th May 2008...
+**Example 2:**
+- Query: It is immoral to kill animals  As evolved human beings it is our moral duty to inflict as little pain as possible for our survival. So if we do not need to inflict pain to animals in order to survive, we should not do it. Farm animals such as chickens, pigs, sheep, and cows are sentient living beings like us - they are our evolutionary cousins and like us they can feel pleasure and pain. The 18th century utilitarian philosopher Jeremy Bentham even believed that animal suffering was just as serious as human suffering and likened the idea of human superiority to racism. It is wrong to farm and kill these animals for food when we do not need to do so. The methods of farming and slaughter of these animals are often barbaric and cruel - even on supposedly 'free range' farms. [1] Ten billion animals were slaughtered for human consumption each year, stated PETA. And unlike the farms long time ago, where animals roamed freely, today, most animals are factory farmed: crammed into cages where they can barely move and fed a diet adulterated with pesticides and antibiotics. These animals spend their entire lives in their “prisoner cells” so small that they can't even turn around. Many suffer serious health problems and even death because they are selectively bred to grow or produce milk or eggs at a far greater rate than their bodies are capable of coping with. At the slaughterhouse, there were millions of others who are killed every year for food.  Further on Tom Regan explains that all duties regarding animals are indirect duties to one another from a philosophical point of view. He illustrates it with an analogy regarding children: “Children, for example, are unable to sign contracts and lack rights. But they are protected by the moral contract nonetheless because of the sentimental interests of others. So we have, then, duties involving these children, duties regarding them, but no duties to them. 
Our duties in their case are indirect duties to other human beings, usually their parents.” [2] With this he supports the theory that animals must be protected from suffering, as it is moral to protect any living being from suffering, not because we have a moral contract with them, but mainly due to respect of life and recognition of suffering itself.  [1] Claire Suddath, A brief history of Veganism, Time, 30 October 2008  [2] Tom Regan, The case for animal rights, 1989
+- Doc Title: animals environment general health health general weight philosophy ethics
+- Doc Text: There is a great moral difference between humans and animals. Unlike animals, humans are capable of rational thought and can alter the world around them. Other creatures were put on this earth for mankind to use, and that includes eating meat. For all these reasons we say that men and women have rights and that animals don’t. This means that eating meat is in no way like murder. It is natural for human beings to farm, kill, and eat other species. In the wild there is a brutal struggle for existence. The fact that we humans have succeeded in that struggle by exploiting our natural environment means that we have a natural right over lower species. In fact farming animals is much less brutal than the pain and hardship that animals inflict on each other naturally in the wild.  Eating meat does not need to mean cruelty to animals. There are a growing number of organic and free-range farms that can provide meat without cruelty to animals. Similarly, it might be reasonable to argue for an extension of animal welfare laws to protect farm animals - but that does not mean that it is wrong in principle to eat meat....
+**Example 3:**
+- Query: Vegetarianism is healthier  There are significant health benefits to 'going veggie'; a vegetarian diet contains high quantities of fibre, vitamins, and minerals, and is low in fat. (A vegan diet is even better since eggs and dairy products are high in cholesterol.) The risk of contracting many forms of cancer is increased by eating meat: in 1996 the American Cancer Society recommended that red meat should be excluded from the diet entirely. Eating meat also increases the risk of heart disease - vegetables contain no cholesterol, which can build up to cause blocked arteries in meat-eaters. An American study found out that: “that men in the highest quintile of red-meat consumption — those who ate about 5 oz. of red meat a day, roughly the equivalent of a small steak had a 31% higher risk of death over a 10-year period than men in the lowest-consumption quintile, who ate less than 1 oz. of red meat per day, or approximately three slices of corned beef.” [1] A vegetarian diet reduces the risk for chronic degenerative diseases such as obesity, high blood pressure, diabetes and types of cancer including colon, breast, stomach, and lung cancer because of it's low fat/cholesterol content. There are plenty of vegetarian sources of protein, such as beans and bean curd; and spinach is one of the best sources of iron.  [1] Tiffany Sharples, ‘The Growing Case Against Red Meat’, Time, 23rd March 2009
+- Doc Title: animals environment general health health general weight philosophy ethics
+- Doc Text: The key to good health is a balanced diet, not a meat- and fish-free diet. Meat and fish are good sources of protein, iron, and other vitamins and minerals. Most of the health benefits of a vegetarian diet derive from its being high in fibre and low in fat and cholesterol. These can be achieved by avoiding fatty and fried foods, eating only lean grilled meat and fish, and including a large amount of fruit and vegetables in your diet along with meat and fish. In general, raw, unprocessed meat from the muscle is made up of the following: protein 15 - 22 % Fat 3 - 15 % Minerals, carbohydrates 1 - 5 % Water 65 - 75 %, all things that we need in moderation. [1] A meat- and fish-free diet is unbalanced and makes it more likely that you will go short of protein, iron and some minerals such as B12 for which we are primarily dependent on animal foodstuffs. Also, a vegetarian diet, in the West, is a more expensive option - a luxury for the middle classes. Fresh fruit and vegetables are extremely expensive compared to processed meats, bacon, burgers, sausages etc.  [1] Bell, ‘Nutrition &amp; Well-Being’...
+**Example 4:**
+- Query: Being vegetarian reduces risks of food poisoning  Almost all dangerous types of food poisoning are passed on through meat or eggs. So Campylobacter bacteria, the most common cause of food poisoning in England, are usually found in raw meat and poultry, unpasteurised milk and untreated water. Salmonella come from raw meat, poultry and dairy products and most cases of escherichia coli (E-Coli) food poisoning occur after eating undercooked beef or drinking unpasteurised milk. [1]  Close contact between humans and animals also leads to zoonosis – diseases such as bird ‘flu which can be passed on from animals to humans. Using animal brains in the processed feed for livestock led to BSE in cattle and to CJD in humans who ate beef from infected cows.  [1] Causes of food poisoning, nhs.co.uk, 23rd June 2009
+- Doc Title: animals environment general health health general weight philosophy ethics
+- Doc Text: Food safety and hygiene are very important for everyone, and governments should act to ensure that high standards are in place particularly in restaurants and other places where people get their food from. But food poisoning can occur anywhere “People don't like to admit that the germs might have come from their own home” [1] and while meat is particularly vulnerable to contamination there are bacteria that can be transmitted on vegetables, for example Listeria monocytogenes can be transmitted raw vegetables. [2]  Almost three-quarters of zoonotic transmissions are caused by pathogens of wildlife origin; even some that could have been caused by livestock such as avian flu could equally have come from wild animals. There is little we can do about the transmission of such diseases except by reducing close contact. Thus changing to vegetarianism may reduce such diseases by reducing contact but would not eliminate them. [3]  Just as meat production can raise health issues, so does the arable farming of plants – examples include GM crops and worries about pesticide residues on fruit and vegetables. The important thing is not whether the diet is meat based or vegetarian; just that we should ensure all food is produced in a safe and healthy way.  [1] ‘ 10 ways to prevent food poisoning’, nhs.co.uk, 28th November 2010.  [2] Food Poisoning, emedicinehealth.  [3] Ulrich Desselberger, ‘The significance of zoonotic transmission of viruses in human disease’, Microbiology Today, November 2009....
+**Example 5:**
+- Query: There are problems with being vegetarian  A vegetarian or vegan diet may result in a person not getting enough iron. This is because, although you can get iron from foods such as pulses, green leafy vegetables and nuts, the iron in these foods isn't absorbed so easily. The symptoms of this feeling breathless after little exercise, feeling tired and a short attention span and poor concentration. [1] These symptoms could negatively affect proficiency in school and the ability to perform well at work ultimately leading to a loss of productivity which has both personal effects and broader effects for the economy. Other conditions include frequently becoming ill, frequently becoming depressed, and malnourishment.  [1] Bupa's Health Information Team, ‘Iron-deficiency anaemia’, bupa.co.uk, March 2010,
+- Doc Title: animals environment general health health general weight philosophy ethics
+- Doc Text: The problems with fatigue, apathetic behaviour and concentration are mostly a result from a lack of iron in the diet. However as with any diet this is only a problem when not eating the right things, this regularly means that such iron deficiency can be a problem in the developing world where vegetarians have little choice – usually eating little else except what they grow, normally just cereals. “Although the iron stores of vegetarians are sometimes reduced, the incidence of iron-deficiency anaemia in vegetarians is not significantly different from that in the general population”, there are plenty of sources of iron that can be eaten by vegetarians such as legumes and whole grains that are a substantial part of most western vegetarian’s diets meaning it is not a problem. [1] Research done in Australia concludes that "There was no significant difference between mean daily iron intakes of vegetarians and omnivores". [2]  [1] David Ogilvie, Nutrition: Iron and Vegetarian Diets, Vegetarian Network Victoria, September 2010.  [2] Madeleine J Ball and Melinda A Bartlett, ‘Dietary intake and iron status of Australian vegetarian women’, American Society for Clinical Nutrition, 1999...
+**Example 6:**
+- Query: Survival of the fittest  It is natural for human beings to farm, kill, and eat other species. In the wild there is a brutal struggle for existence as is shown by Darwin’s On the Origin of the Species. [1] The fact that we humans have succeeded in that struggle by exploiting our natural environment means that we have a natural right over lower species. The concept of survival of the fittest may seem outdated but it is still the defining order of nature. In fact farming animals is much less brutal than the pain and hardship that animals inflict on each other naturally in the wild.  [1] Darwin, Charles, On the Origin of Species by Means of Natural Selection, or the Preservation of Favoured Races in the Struggle for Life., Literature.org
+- Doc Title: animals environment general health health general weight philosophy ethics
+- Doc Text: To suggest that battery farms are in some way 'natural' is absurd - they are unnatural and cruel. To eat meat is to perpetuate animal suffering on a huge scale - a larger, crueler, and more systematic scale than anything found in the wild. Furthermore, the very fact of humanity's 'superiority' over other animals means they have the reason and moral instinct to stop exploiting other species. If an alien species from another planet, much more intelligent and powerful than humans, came and colonized the earth and farmed (and force-fed) human beings in battery farm conditions we would think it was morally abhorrent. If this would be wrong, then is it not wrong for we 'superior' humans to farm 'lower' species on earth simply because of our ability to do so?...
+**Example 7:**
+- Query: Humans can choose their own nutrition plan  Humans are omnivores – we are meant to eat both meat and plants. Like our early ancestors we have sharp canine teeth for tearing animal flesh and digestive systems adapted to eating meat and fish as well as vegetables. Our stomachs are also adapted to eating both meat and vegetable matter. All of this means that eating meat is part of being human. Only in a few western countries are people self-indulgent enough to deny their nature and get upset about a normal human diet. We were made to eat both meat and vegetables - cutting out half of this diet will inevitably mean we lose that natural balance. Eating meat is entirely natural. Like many other species, human beings were once hunters. In the wild animals kill and are killed, often very brutally and with no idea of “rights”. As mankind has progressed over thousands of years we have largely stopped hunting wild animals. Instead we have found kinder and less wasteful ways of getting the meat in our diets through domestication. Farm animals today are descended from the animals we once hunted in the wild.
+- Doc Title: animals environment general health health general weight philosophy ethics
+- Doc Text: Human evolved as omnivores over thousands of years. Yet since the invention of farming there is no longer a need for us to be omnivores. Even if we wished to we could no longer collect, hunt and eat our food in the same way as our ancestors as we could not support the human population. We have outstripped the pace of our evolution and if we do not want to be turning ever more land over to farming we have get our food from the most efficient sources, which means being vegetarian....
+**Example 8:**
+- Query: Animal research causes severe harm to the animals involved  The point of animal research is that animals are harmed. Even if they don’t suffer in the experiment, almost all are killed afterwards. With 115 million animals used a year this is a big problem. Releasing medical research animals in to the wild would be dangerous for them, and they would not be usable as pets. [4]. The only solution is that they are wild from birth.  It is obvious that it’s not in the interest of animals to be killed or harmed. Research should be banned in order to prevent the deaths of millions of animals.
+- Doc Title: animals science science general ban animal testing junior
+- Doc Text: What then is the interest of the animal? If releasing these animals into the wild would kill them then surely it is humane to put them down after the experiment. It must also be remembered that the interest of the animal is not the main and is outweighed by the benefits to humans. [5]...
+**Example 9:**
+- Query: It would send out a consistent message  Most countries have animal welfare laws to prevent animal cruelty but have laws like the UK’s Animals (Scientific Procedures) Act 1986, [10] that stop animal testing being a crime. This makes means some people can do things to animals, but not others. If the government are serious about animal abuse, why allow anyone to do it?
+- Doc Title: animals science science general ban animal testing junior
+- Doc Text: There is a moral difference between harm for the sake of harming an animal and harm in order to save lives. Lifesaving drugs is a very different purpose to betting or enjoyment that animal welfare laws are aimed at....
+**Example 10:**
+- Query: Animals shouldn’t be harmed  The difference between us and other animals is a matter of degree rather than type [2]. Their bodies resemble ours, as do their ways of conveying meaning. They recoil from pain, appear to express fear of a tormentor, and appear to take pleasure in activities; a point clear to anyone who has observed a pet dog on hearing the word “walk”.  We believe other people experience feelings like us because they are like us in appearance and behaviour. An animal sharing our anatomical, physiological, and behavioural characteristics is surely likely to have feelings like us.  If people have a right to not be harmed, we must ask ourselves what makes animals different? If animals feel what we feel, and suffer like us, to condemn one to testing because of them being of a different species is similar to racism or sexism.[3]
+- Doc Title: animals science science general ban animal testing junior
+- Doc Text: The right of a human not to be harmed is based not on appearance but on not harming others. Animals don’t participate in this.  Animals won’t stop hunting because of the pain and feelings of other animals. Even if animal testing were to be abolished people would still eat meat, and kill animals for other less worthwhile reasons than animal testing....
+## CORE
+**Only Score 1 exists**
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: Find the best counterargument to any argument
+- **What Score 1 Really Means**: Documents that argue AGAINST the query's position (opposing stance, same topic)
+- **Dataset Type**: Argumentation/Debate
+- **Unique Characteristic**: Counterarguments ARE relevant (same topic, different stance)
+## Training Verdict
+**GOOD FOR TRAINING** ✓
+- Score 1 documents are truly relevant (topically related counterarguments)
+- Can use for binary classification: Score 1 = positive class
+## Observations

test_scores/dataset_reports/climate-fever_analysis.md ADDED Viewed

	@@ -0,0 +1,122 @@

+# CLIMATE-FEVER Dataset Analysis
+## Overview
+- Total queries available: 1535
+- Total corpus documents: 5416593
+- Corpus documents loaded for examples: 50000
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | N/A | N/A | N/A |
+| dev | N/A | N/A | N/A |
+| test | 1535 | 4681 | 1:4681 |
+## Score Analysis
+**Only one score level (1) found in this dataset.**
+- This means all labeled documents are considered equally relevant
+- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
+## Examples
+### Score Comparison Examples
+### Examples by Split and Score
+#### TEST Split
+**Scores in test: [1]**
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: Duffy pointed out that his chart was from a single tide gauge station, near San Francisco, and that sea levels rise at different rates around the world
+- Doc Title: 1906 San Francisco earthquake
+- Doc Text: The 1906 San Francisco earthquake struck the coast of Northern California at 5:12 a.m. on April 18 with an estimated moment magnitude of 7.8 and a maximum Mercalli intensity of XI ( Extreme ) . Severe shaking was felt from Eureka on the North Coast to the Salinas Valley , an agricultural region to the south of the San Francisco Bay Area . Devastating fires soon broke out in the city and lasted for several days . As a result , about 3,000 people died and over 80 % of the city of San Francisco was destroyed . The events are remembered as one of the worst and deadliest natural disasters in the history of the United States . The death toll remains the greatest loss of life from a natural disaster in California 's history and high in the lists of American urban disasters ....
+**Example 2:**
+- Query: While such a storm does not happen in New England anymore, it happened again there in 1675, with elderly eyewitnesses comparing it to the 1635 storm.
+- Doc Title: 1993 Storm of the Century
+- Doc Text: The 1993 Storm of the Century ( also known as the 93 Super Storm or the Great Blizzard of 1993 ) was a large cyclonic storm that formed over the Gulf of Mexico on March 12 , 1993 . The storm eventually dissipated in the North Atlantic Ocean on March 15 , 1993 . It was unique for its intensity , massive size , and wide-reaching effects . At its height , the storm stretched from Canada to the Gulf of Mexico . The cyclone moved through the Gulf of Mexico and then through the eastern United States before moving onto Canada .   Heavy snow was first reported in highland areas as far south as Alabama and northern Georgia , with Union County , Georgia reporting up to 35 inches of snow in the north Georgia mountains . Birmingham , Alabama , reported a rare 13 in of snow .  The Florida Panhandle reported up to 4 in , with hurricane-force wind gusts and record low barometric pressures . Between Louisiana and Cuba , the hurricane-force winds produced high storm surges across Northwestern Florida which , in combination with scattered tornadoes , killed dozens of people .   Record cold temperatures were seen across portions of the south and east of the US in the wake of this storm . In the United States , the storm was responsible for the loss of electric power to more than 10 million households . An estimated 40 percent of the country 's population experienced the effects of the storm with a total of 208 fatalities ....
+**Example 3:**
+- Query: While volcanic eruptions are natural events, it was the timing of these that had such a noticeable effect on the trend
+- Doc Title: 1257 Samalas eruption
+- Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time .   This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone .   The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
+**Example 4:**
+- Query: ‘While volcanic eruptions are natural events, it was the timing of these that had such a noticeable effect on the trend.
+- Doc Title: 1257 Samalas eruption
+- Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time .   This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone .   The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
+**Example 5:**
+- Query: A drop in volcanic activity caused warming.
+- Doc Title: 1257 Samalas eruption
+- Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time .   This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone .   The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
+**Example 6:**
+- Query: A drop of volcanic activity in the early 20th century may have had a warming effect.
+- Doc Title: 1257 Samalas eruption
+- Doc Text: The 1257 Samalas eruption was a major volcanic eruption of the Samalas volcano , next to Mount Rinjani on the island of Lombok , Indonesia . It left a large caldera next to Rinjani volcano , the lake Segara Anak . This volcanic eruption reached a volcanic explosivity index of 7 , making it one of the largest eruptions of the Holocene . The occurrence of this eruption was originally deduced from ice core information revealing a spike in sulfate deposition around the year 1257 ; in 2013 the source of this eruption at Samalas was definitively proven as a result of historical records from that time .   This eruption proceeded during four distinct phases , alternately creating eruption columns reaching up for tens of kilometres and pyroclastic flows that buried large parts of Lombok , destroying human habitations including the city of Pamatan . Ash from the eruption fell as far as Java . The volume of the deposits left exceeds 10 km3 . Later activity from the volcano created additional volcanic centres in the caldera , including the present day active Barujari cone .   The eruption was witnessed by humans on Lombok , who recorded its effects on text written on palm leaves . The aerosols injected by the volcano into the atmosphere reduced the solar radiation reaching the surface of the Earth , triggering a volcanic winter and cooling lasting for several years . It is possible that the 1257 Samalas eruption helped trigger the Little Ice Age ....
+## CORE
+**Only Score 1 exists**
+**Score 1 = QUESTIONABLY RELEVANT** ⚠️
+## Special Notes
+- **Original Purpose**: Fact verification - find evidence that supports/refutes climate claims
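+- **What Score 1 Really Means**: Documents that supposedly provide evidence for the claim but often just share keywords with it
+- **Major Problem**: Documents often don't actually address the claim (e.g., the query "A drop in volcanic activity caused warming." is paired with the "1257 Samalas eruption" document, which says nothing about a drop in volcanic activity)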
+- **Dataset Type**: Fact verification (but poorly executed in BEIR version)
+## Training Verdict
+**BAD FOR TRAINING** ✗
+- Score 1 documents often merely match keywords without addressing the claim
+- High false positive rate - many "relevant" docs aren't actually relevant
+- Training on this data would teach the model keyword matching instead of true relevance
+## Observations

test_scores/dataset_reports/dbpedia-entity_analysis.md ADDED Viewed

	@@ -0,0 +1,908 @@

+# DBPEDIA-ENTITY Dataset Analysis
+## Overview
+- Total queries available: 467
+- Total corpus documents: 4635922
+- Corpus documents loaded for examples: 50000
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | N/A | N/A | N/A |
+| dev | 67 | 5673 | 0:4268, 1:1024, 2:381 |
+| test | 400 | 43515 | 0:28229, 1:8785, 2:6501 |
+## Score Analysis
+**Multiple score levels found: [0, 1, 2]**
+## Examples
+### Score Comparison Examples
+#### Comparison Example 1
+**Query:** Eiffel
+**From:** dev split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Eiffel Tower
+- Text: The Eiffel Tower (/ˈaɪfəl ˈtaʊər/ EYE-fəl TOWR; French: tour Eiffel [tuʁ‿ɛfɛl] About this sound listen) is an iron lattice tower located on the Champ de Mars in Paris, France. It was named after the engineer Alexandre Gustave Eiffel, whose company designed and built the tower....
+*Document 2:*
+- Title: Gustave Eiffel
+- Text: Alexandre Gustave Eiffel (born Bönickhausen; /ˈaɪfəl/; French pronunciation: [efɛl]; 15 December 1832 – 27 December 1923) was a French civil engineer and architect. A graduate of the prestigious École Centrale des Arts et Manufactures of France, he made his name with various bridges for the French railway network, most famously the Garabit viaduct....
+**Score 1 Documents:**
+*Document 1:*
+- Title: Eiffel (programming language)
+- Text: Eiffel is an ISO-standardized, object-oriented programming language designed by Bertrand Meyer (an object-orientation proponent and author of Object-Oriented Software Construction) and Eiffel Software. The design of the language is closely connected with the Eiffel programming method....
+**Score 0 Documents:**
+*Document 1:*
+- Title: Lightning
+- Text: Lightning is a sudden electrostatic discharge during an electrical storm between electrically charged regions of a cloud (called intra-cloud lightning or IC), between that cloud and another cloud (CC lightning), or between a cloud and the ground (CG lightning). The charged regions in the atmosphere temporarily equalize themselves through this discharge referred to as a strike if it hits an object ...
+*Document 2:*
+- Title: Nice
+- Text: Nice (/ˈniːs/, French pronunciation: [nis]; Niçard Occitan: Niça [classical norm] or Nissa [nonstandard], Italian: Nizza or Nizza Marittima, Greek: Νίκαια, Latin: Nicaea) is the fifth most populous city in France, after Paris, Marseille, Lyon and Toulouse, and it is the capital of the Alpes Maritimes département. The urban area of Nice extends beyond the administrative city limits with a populati...
+---
+#### Comparison Example 2
+**Query:** europe solar power facility
+**From:** dev split
+**Score 1 Documents:**
+*Document 1:*
+- Title: Solar energy
+- Text: Solar energy is radiant light and heat from the Sun harnessed using a range of ever-evolving technologies such as solar heating, photovoltaics, solar thermal energy, solar architecture and artificial photosynthesis.It is an important source of renewable energy and its technologies are broadly characterized as either passive solar or active solar depending on the way they capture and distribute sol...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Germany
+- Text: Germany (/ˈdʒɜrməni/; German: Deutschland [ˈdɔʏtʃlant]), officially the Federal Republic of Germany (German: Bundesrepublik Deutschland, About this sound listen ), is a federal parliamentary republic in western-central Europe. It includes 16 constituent states and covers an area of 357,021 square kilometres (137,847 sq mi) with a largely temperate seasonal climate. Its capital and largest city is ...
+*Document 2:*
+- Title: Renewable energy
+- Text: Renewable energy is generally defined as energy that comes from resources which are naturally replenished on a human timescale, such as sunlight, wind, rain, tides, waves, and geothermal heat....
+---
+#### Comparison Example 3
+**Query:** vietnam war facts
+**From:** dev split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Gulf of Tonkin incident
+- Text: The Gulf of Tonkin incident (Vietnamese: Sự kiện Vịnh Bắc Bộ), also known as the USS Maddox incident, is the name given to what were originally claimed to be two separate confrontations involving North Vietnam and the United States in the waters of the Gulf of Tonkin. The original American report blamed North Vietnam for both incidents, but eventually became very controversial with widespread clai...
+*Document 2:*
+- Title: Vietnam War
+- Text: The Vietnam War (Vietnamese: Chiến tranh Việt Nam), also known as the Second Indochina War, and also known in Vietnam as Resistance War Against America (Vietnamese: Kháng chiến chống Mỹ) or simply the American War, was a Cold War-era proxy war that occurred in Vietnam, Laos, and Cambodia from 1 November 1955 to the fall of Saigon on 30 April 1975....
+**Score 1 Documents:**
+*Document 1:*
+- Title: Agent Orange
+- Text: Agent Orange—or Herbicide Orange (HO)—is one of the herbicides and defoliants used by the U.S. military as part of its herbicidal warfare program, Operation Ranch Hand, during the Vietnam War from 1961 to 1971. It was a mixture of equal parts of two herbicides, 2,4,5-T and 2,4-D.During the late 1940s and 1950s, the US and British collaborated on development of herbicides with potential application...
+*Document 2:*
+- Title: My Lai Massacre
+- Text: The Mỹ Lai Massacre (Vietnamese: thảm sát Mỹ Lai [tʰɐ̃ːm ʂɐ̌ːt mǐˀ lɐːj], [mǐˀlɐːj]; /ˌmiːˈlaɪ/, /ˌmiːˈleɪ/, or /ˌmaɪˈlaɪ/) was the Vietnam War mass killing of between 347 and 504 unarmed civilians in South Vietnam on March 16, 1968. It was committed by U.S. Army soldiers from the Company C of the 1st Battalion, 20th Infantry Regiment, 11th Brigade of the 23rd (Americal) Infantry Division. Victims...
+*Document 3:*
+- Title: People's Army of Vietnam
+- Text: The Vietnam People's Army (VPA; Vietnamese: Quân Đội Nhân Dân Việt Nam), also known as the Vietnamese People's Army and the People's Army of Vietnam, is the military forces of the  Socialist Republic of Vietnam. VPA is a part of Vietnam People's Armed Forces. The VPA includes: Ground Force (including Strategic Rear Forces), Navy, Air Force, Border Defense Force, and Coast Guard. However, Vietnam d...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Bảo Đại
+- Text: In the Vietnamese name below, Nguyễn Phúc is the surname.Bảo Đại (lit. "keeper of greatness", 22 October 1913 – 30 July 1997), born Nguyễn Phúc Vĩnh Thụy, was the 13th and final   emperor of the Nguyễn Dynasty, which was the last dynasty of Vietnam. From 1926 to 1945, he was king of Annam. During this period, Annam was a protectorate within French Indochina, covering the central two-thirds of the ...
+*Document 2:*
+- Title: French Indochina
+- Text: French Indo-China (now commonly known as French Indochina) (French: Indochine française; Khmer:  សហភាពឥណ្ឌូចិន; Vietnamese: Đông Dương thuộc Pháp, pronounced [ɗoŋm zɰəŋ tʰuə̀k fǎp], frequently abbreviated to Đông Pháp; Lao: ຝຣັ່ງແຫຼັມອິນດູຈີນ), officially known as the Indo-Chinese Union (French: Union indochinoise) after 1887 and the Indo-Chinese Federation (French: Fédération indochinoise) after ...
+*Document 3:*
+- Title: Mainland Southeast Asia
+- Text: Mainland Southeast Asia, also known as Indochina or the Indochinese Peninsula, refers to the continental portion of Southeast Asia lying east of India and roughly south or southwest of China....
+---
+#### Comparison Example 4
+**Query:** guitar chord minor
+**From:** dev split
+**Score 1 Documents:**
+*Document 1:*
+- Title: Flamenco
+- Text: Flamenco (Spanish pronunciation: [flaˈmeŋko]) is a genre of music and dance native to the southern Spanish regions of Andalusia, Extremadura and Murcia. It includes cante (singing), toque (guitar playing), baile (dance) and jaleo, which refers to the vocalizations and rhythmic sounds of palmas (handclapping) and pitos (finger snapping). First mentioned in literature in 1774, the genre is thought t...
+*Document 2:*
+- Title: Guitar
+- Text: The guitar is a popular musical instrument classified as a string instrument with anywhere from 4 to 18 strings, usually having 6. The sound is projected either acoustically or through electrical amplification (for an acoustic guitar or an electric guitar, respectively).  It is typically played by strumming or plucking the strings with the right hand while fretting (or pressing against the fret) t...
+*Document 3:*
+- Title: Jazz guitar
+- Text: The term jazz guitar may refer to either a type of guitar or to the variety of guitar playing styles used in the various genres which are commonly termed "jazz". The jazz-type guitar was born as a result of using electric amplification to increase the volume of conventional acoustic guitars.Conceived in the early 1930s, the electric guitar became a necessity as jazz musicians sought to amplify the...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Index of music articles
+- Text: This page is a list articles related to music....
+*Document 2:*
+- Title: Interval (music)
+- Text: In music theory, an interval is the difference between two pitches. An interval may be described as horizontal, linear, or melodic if it refers to successively sounding tones, such as two adjacent pitches in a melody, and vertical or harmonic if it pertains to simultaneously sounding tones, such as in a chord.In Western music, intervals are most commonly differences between notes of a diatonic sca...
+*Document 3:*
+- Title: Rhythm guitar
+- Text: Rhythm guitar is a technique and role that performs a combination of two functions: to provide all or part of the rhythmic pulse in conjunction with singers or other instruments; and to provide all or part of the harmony, i.e. the chords, where a chord is a group of notes played together. Therefore, the basic technique of rhythm guitar is to hold down a series of chords with the fretting hand whil...
+---
+#### Comparison Example 5
+**Query:** guitar classical bach
+**From:** dev split
+**Score 1 Documents:**
+*Document 1:*
+- Title: Andrés Segovia
+- Text: Andrés Segovia Torres, 1st Marquis of Salobreña (Spanish: [anˈdɾes seˈɣoβja ˈtores]) (21 February 1893 – 2 June 1987), known as Andrés Segovia, was a virtuoso Spanish classical guitarist from Linares, Spain. Regarded as one of the greatest guitarists of all time, he is seen as the grandfather of the classical guitar....
+**Score 0 Documents:**
+*Document 1:*
+- Title: Carl Philipp Emanuel Bach
+- Text: Carl Philipp Emanuel Bach (8 March 1714 – 14 December 1788) was a German Classical period musician and composer, the fifth child and second (surviving) son of Johann Sebastian Bach and Maria Barbara Bach. His second name was given in honor of his godfather Georg Philipp Telemann, a friend of Johann Sebastian Bach.C. P. E. Bach was an influential composer working at a time of transition between his...
+*Document 2:*
+- Title: Classical guitar
+- Text: The classical guitar (also called the Spanish guitar, or less specifically, the nylon-string guitar) is the member of the guitar family used in classical music....
+*Document 3:*
+- Title: Classical period (music)
+- Text: The dates of the Classical period in Western music are generally accepted as being between about 1750 and 1820. However, the term classical music is used in a colloquial sense as a synonym for Western art music, which describes a variety of Western musical styles from the ninth century to the present, and especially from the sixteenth or seventeenth to the nineteenth....
+---
+### Examples by Split and Score
+#### DEV Split
+**Scores in dev: [0, 1, 2]**
+##### Score 0 Examples (dev)
+**Example 1:**
+- Query: Eiffel
+- Doc Title: Lightning
+- Doc Text: Lightning is a sudden electrostatic discharge during an electrical storm between electrically charged regions of a cloud (called intra-cloud lightning or IC), between that cloud and another cloud (CC lightning), or between a cloud and the ground (CG lightning). The charged regions in the atmosphere ...
+**Example 2:**
+- Query: europe solar power facility
+- Doc Title: Germany
+- Doc Text: Germany (/ˈdʒɜrməni/; German: Deutschland [ˈdɔʏtʃlant]), officially the Federal Republic of Germany (German: Bundesrepublik Deutschland, About this sound listen ), is a federal parliamentary republic in western-central Europe. It includes 16 constituent states and covers an area of 357,021 square ki...
+**Example 3:**
+- Query: vietnam war facts
+- Doc Title: Bảo Đại
+- Doc Text: In the Vietnamese name below, Nguyễn Phúc is the surname.Bảo Đại (lit. "keeper of greatness", 22 October 1913 – 30 July 1997), born Nguyễn Phúc Vĩnh Thụy, was the 13th and final   emperor of the Nguyễn Dynasty, which was the last dynasty of Vietnam. From 1926 to 1945, he was king of Annam. During th...
+**Example 4:**
+- Query: vietnam travel airports
+- Doc Title: Charles de Gaulle Airport
+- Doc Text: Paris Charles de Gaulle Airport (French: Aéroport de Paris-Charles-de-Gaulle, IATA: CDG, ICAO: LFPG), also known as Roissy Airport (or just Roissy in French), is one of the world's principal aviation centres, as well as France's largest international airport. It is named after Charles de Gaulle (189...
+**Example 5:**
+- Query: guitar chord minor
+- Doc Title: Index of music articles
+- Doc Text: This page is a list articles related to music....
+**Example 6:**
+- Query: guitar classical bach
+- Doc Title: Carl Philipp Emanuel Bach
+- Doc Text: Carl Philipp Emanuel Bach (8 March 1714 – 14 December 1788) was a German Classical period musician and composer, the fifth child and second (surviving) son of Johann Sebastian Bach and Maria Barbara Bach. His second name was given in honor of his godfather Georg Philipp Telemann, a friend of Johann ...
+**Example 7:**
+- Query: tango music instruments
+- Doc Title: Accordion
+- Doc Text: Accordions (from 19th-century German Akkordeon, from Akkord - "musical chord, concord of sounds") are a family of box-shaped musical instruments of the bellows-driven free-reed aerophone type, colloquially referred to as a squeezebox. A person who plays the accordion is called an accordionist....
+**Example 8:**
+- Query: bicycle benefits environment
+- Doc Title: Bicycle
+- Doc Text: A bicycle, often called a bike or cycle, is a human-powered, pedal-driven, single-track vehicle, having two wheels attached to a frame, one behind the other. A bicycle rider is called a cyclist, or bicyclist.Bicycles were introduced in the 19th century in Europe and, as of 2003, more than a billion ...
+**Example 9:**
+- Query: July, 1850  president died Millard Fillmore sworn following day
+- Doc Title: 1850
+- Doc Text: Year 1850 (MDCCCL) was a common year starting on Tuesday (link will display the full calendar) of the Gregorian calendar and a common year starting on Sunday of the 12-day-slower Julian calendar....
+**Example 10:**
+- Query: 1906 territory Papua island Australian
+- Doc Title: Australian dollar
+- Doc Text: The Australian dollar (sign: $; code: AUD) is the currency of the Commonwealth of Australia, including Christmas Island, Cocos (Keeling) Islands, and Norfolk Island, as well as the independent Pacific Island states of Kiribati, Nauru and Tuvalu. Within Australia it is almost always abbreviated with ...
+##### Score 1 Examples (dev)
+**Example 1:**
+- Query: Eiffel
+- Doc Title: Eiffel (programming language)
+- Doc Text: Eiffel is an ISO-standardized, object-oriented programming language designed by Bertrand Meyer (an object-orientation proponent and author of Object-Oriented Software Construction) and Eiffel Software. The design of the language is closely connected with the Eiffel programming method....
+**Example 2:**
+- Query: europe solar power facility
+- Doc Title: Solar energy
+- Doc Text: Solar energy is radiant light and heat from the Sun harnessed using a range of ever-evolving technologies such as solar heating, photovoltaics, solar thermal energy, solar architecture and artificial photosynthesis.It is an important source of renewable energy and its technologies are broadly charac...
+**Example 3:**
+- Query: vietnam war facts
+- Doc Title: Agent Orange
+- Doc Text: Agent Orange—or Herbicide Orange (HO)—is one of the herbicides and defoliants used by the U.S. military as part of its herbicidal warfare program, Operation Ranch Hand, during the Vietnam War from 1961 to 1971. It was a mixture of equal parts of two herbicides, 2,4,5-T and 2,4-D.During the late 1940...
+**Example 4:**
+- Query: guitar chord minor
+- Doc Title: Flamenco
+- Doc Text: Flamenco (Spanish pronunciation: [flaˈmeŋko]) is a genre of music and dance native to the southern Spanish regions of Andalusia, Extremadura and Murcia. It includes cante (singing), toque (guitar playing), baile (dance) and jaleo, which refers to the vocalizations and rhythmic sounds of palmas (hand...
+**Example 5:**
+- Query: guitar classical bach
+- Doc Title: Andrés Segovia
+- Doc Text: Andrés Segovia Torres, 1st Marquis of Salobreña (Spanish: [anˈdɾes seˈɣoβja ˈtores]) (21 February 1893 – 2 June 1987), known as Andrés Segovia, was a virtuoso Spanish classical guitarist from Linares, Spain. Regarded as one of the greatest guitarists of all time, he is seen as the grandfather of the...
+**Example 6:**
+- Query: tango music instruments
+- Doc Title: Brass instrument
+- Doc Text: A brass instrument is a musical instrument that produces sound by sympathetic vibration of air in a tubular resonator in sympathy with the vibration of the player's lips. Brass instruments are also called labrosones, literally meaning "lip-vibrated instruments".There are several factors involved in ...
+**Example 7:**
+- Query: bicycle benefits environment
+- Doc Title: Cycling
+- Doc Text: Cycling, also called bicycling or biking, is the use of bicycles for transport, recreation, exercise or sport. Persons engaged in cycling are referred to as "cyclists", "bikers", or less commonly, as "bicyclists". Apart from two-wheeled bicycles, "cycling" also includes the riding of unicycles, tric...
+**Example 8:**
+- Query: 1906 territory Papua island Australian
+- Doc Title: History of Papua New Guinea
+- Doc Text: The prehistory of Papua New Guinea can be traced back to about 60,000 years ago when people first migrated towards the Australian continent. The written history began when European navigators first sighted New Guinea in the early part of the 16th century....
+**Example 9:**
+- Query: seat Florida country Dade
+- Doc Title: Dade City, Florida
+- Doc Text: Dade City is a city in Pasco County, Florida. It is popular with tourists for its antique stores, restaurants and historic architecture including the Pasco County Courthouse, Hugh Embry Library, and Edwinola. The annual Kumquat Festival is hosted downtown and the surrounding area is a large producer...
+**Example 10:**
+- Query: animals lay eggs mammals
+- Doc Title: Mammal
+- Doc Text: Mammals (class Mammalia /məˈmeɪli.ə/ from Latin mamma "breast") are any members of a clade of endothermic amniotes distinguished from reptiles and birds by the possession of hair, three middle ear bones, mammary glands, and a neocortex (a region of the brain)....
+##### Score 2 Examples (dev)
+**Example 1:**
+- Query: Eiffel
+- Doc Title: Eiffel Tower
+- Doc Text: The Eiffel Tower (/ˈaɪfəl ˈtaʊər/ EYE-fəl TOWR; French: tour Eiffel [tuʁ‿ɛfɛl] About this sound listen) is an iron lattice tower located on the Champ de Mars in Paris, France. It was named after the engineer Alexandre Gustave Eiffel, whose company designed and built the tower....
+**Example 2:**
+- Query: vietnam war facts
+- Doc Title: Gulf of Tonkin incident
+- Doc Text: The Gulf of Tonkin incident (Vietnamese: Sự kiện Vịnh Bắc Bộ), also known as the USS Maddox incident, is the name given to what were originally claimed to be two separate confrontations involving North Vietnam and the United States in the waters of the Gulf of Tonkin. The original American report bl...
+**Example 3:**
+- Query: tango music instruments
+- Doc Title: Concertina
+- Doc Text: A concertina is a free-reed musical instrument, like the various accordions and the harmonica. It has a bellows, and buttons typically on both ends of it. When pressed, the buttons travel in the same direction as the bellows, unlike accordion buttons, which travel perpendicularly to the bellows.The ...
+**Example 4:**
+- Query: animals lay eggs mammals
+- Doc Title: Echidna
+- Doc Text: Echidnas /ɨˈkɪdnə/, sometimes known as spiny anteaters, belong to the family Tachyglossidae in the monotreme order of egg-laying mammals. The four extant species, together with the platypus, are the only surviving members of that order and are the only extant mammals that lay eggs. Their diet consis...
+**Example 5:**
+- Query: Operating systems to which Steve Jobs related
+- Doc Title: Apple Lisa
+- Doc Text: The Lisa is a personal computer designed by Apple Computer, Inc. during the early 1980s. It was the first personal computer to offer a graphical user interface in a machine aimed at individual business users. Development of the Lisa began in 1978. The Lisa sold poorly, with only 100,000 units sold.I...
+**Example 6:**
+- Query: countries which have won the FIFA world cup
+- Doc Title: Brazil
+- Doc Text: Brazil (/brəˈzɪl/; Portuguese: Brasil [bɾaˈziw] ), officially the Federative Republic of Brazil (Portuguese: República Federativa do Brasil, About this sound listen  ), is the largest country in both South America and the Latin American region. It is the world's fifth-largest country, both by geogra...
+**Example 7:**
+- Query: Compilers that can compile both C and C++
+- Doc Title: Cfront
+- Doc Text: Cfront was the original compiler for C++ (then known as "C with Classes") from around 1983, which converted C++ to C; developed by Bjarne Stroustrup. The preprocessor did not understand all of the language and much of the code was written via translations. Cfront had a complete parser, built symbol ...
+**Example 8:**
+- Query: What is the longest river?
+- Doc Title: Missouri River
+- Doc Text: The Missouri River is the longest river in North America. Rising in the Rocky Mountains of western Montana, the Missouri flows east and south for 2,341 miles (3,767 km) before entering the Mississippi River north of St. Louis, Missouri. The river takes drainage from a sparsely populated, semi-arid w...
+**Example 9:**
+- Query: Who founded Intel?
+- Doc Title: Intel
+- Doc Text: Intel Corporation (commonly referred to as Intel) is an American multinational technology company headquartered in Santa Clara, California. Intel is one of the world's largest and highest valued semiconductor chip makers, based on revenue. It is the inventor of the x86 series of microprocessors, the...
+**Example 10:**
+- Query: What is the largest city in Australia?
+- Doc Title: Sydney
+- Doc Text: Sydney /ˈsɪdni/ is the state capital of New South Wales and the most populous city in Australia and Oceania. Located on Australia's east coast, the metropolis surrounds the world's largest natural harbour, and sprawls towards the Blue Mountains to the west. Residents of Sydney are known as \Sydneysi...
+#### TEST Split
+**Scores in test: [0, 1, 2]**
+##### Score 0 Examples (test)
+**Example 1:**
+- Query: Szechwan dish food cuisine
+- Doc Title: Black cardamom
+- Doc Text: Black cardamom, also known as hill cardamom, Bengal cardamom, greater cardamom, Indian cardamom, Nepal cardamom, winged cardamom, or brown cardamom, comes from either of two species in the family Zingiberaceae. Its seed pods have a strong camphor-like flavor, with a smoky character derived from the ...
+**Example 2:**
+- Query: roman architecture
+- Doc Title: Antoninus Pius
+- Doc Text: Antoninus Pius (Latin: Titus Fulvus Aelius Hadrianus Antoninus Augustus Pius; born 19 September, 86 AD –  died 7 March, 161 AD), also known as Antoninus, was Roman Emperor from 138 to 161. He was one of the Five Good Emperors in the Nerva–Antonine dynasty and the Aurelii.He acquired the name Pius af...
+**Example 3:**
+- Query: finland car industry manufacturer saab sisu
+- Doc Title: DKW
+- Doc Text: DKW (Dampf-Kraft-Wagen, English: steam-driven car) is a defunct German car and motorcycle marque.In 1916, Danish engineer Jørgen Skafte Rasmussen founded a factory in Zschopau, Saxony, Germany, to produce steam fittings. That year he attempted to produce a steam-driven car, called the DKW. Although ...
+**Example 4:**
+- Query: france second world war normandy
+- Doc Title: Normandy
+- Doc Text: Normandy (/ˈnɔrməndi/; French: Normandie, pronounced [nɔʁ.mɑ̃.di], Norman: Normaundie, from Old French Normanz, plural of Normant, originally from the word for "northman" in several Scandinavian languages) is a geographical region of France corresponding to the former Duchy of Normandy.Since 1956, N...
+**Example 5:**
+- Query: social network group selection
+- Doc Title: Evolutionary psychology
+- Doc Text: Evolutionary psychology (EP) is a theoretical approach in the social and natural sciences that examines psychological structure from a modern evolutionary perspective. It seeks to identify which human psychological traits are evolved adaptations – that is, the functional products of natural selectio...
+**Example 6:**
+- Query: D-Day normandy invasion
+- Doc Title: Bedford, Virginia
+- Doc Text: Bedford is an incorporated town located within Bedford County in the U.S. state of Virginia. It serves as the county seat of Bedford County. As of the 2010 census, the population was 6,622....
+**Example 7:**
+- Query: web ranking scoring algorithm
+- Doc Title: Algorithm
+- Doc Text: In mathematics and computer science, an algorithm (/ˈælɡərɪðəm/ AL-gə-ri-dhəm) is a self-contained step-by-step set of operations to be performed.  Algorithms exist that perform calculation, data processing, and automated reasoning.An algorithm is an effective method that can be expressed within a f...
+**Example 8:**
+- Query: virtual museums
+- Doc Title: List of museums by country
+- Doc Text: This list of museums is defined for this context as institutions (including nonprofit organizations, government entities, and private businesses) that collect and care for objects of cultural, artistic, scientific, or historical interest and make their collections or related exhibits available for p...
+**Example 9:**
+- Query: Indian food
+- Doc Title: Rice
+- Doc Text: Rice is the seed of the grass species Oryza sativa (Asian rice) or Oryza glaberrima (African rice). As a cereal grain, it is the most widely consumed staple food for a large part of the world's human population, especially in Asia....
+**Example 10:**
+- Query: composer museum
+- Doc Title: Béla Bartók
+- Doc Text: Béla Viktor János Bartók (/ˈbɑrtɒk/; Hungarian pronunciation: [ˈbeːlɒ ˈbɒrtoːk]; March 25, 1881 – September 26, 1945) was a Hungarian composer and pianist. He is considered one of the most important composers of the 20th century; he and Liszt are regarded as Hungary's greatest composers (Gillies 200...
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: Szechwan dish food cuisine
+- Doc Title: American Chinese cuisine
+- Doc Text: American Chinese cuisine, known in the United States as simply Chinese cuisine, is a style of food developed by Americans of Chinese descent and served in many North American Chinese restaurants. The dishes typically served in restaurants cater to American tastes and differ significantly from Chines...
+**Example 2:**
+- Query: roman architecture
+- Doc Title: Arch
+- Doc Text: An arch is a curved structure that spans a space and may or may not support weight above it. Arch may be synonymous with vault but vault may be distinguished as a continuous arch forming a roof. Arches appeared as early as the 2nd millennium BC in Mesopotamian brick architecture  and their systemati...
+**Example 3:**
+- Query: finland car industry manufacturer saab sisu
+- Doc Title: Valmet
+- Doc Text: The Valmet Corporation is a Finnish company, a leading global developer and supplier of services and technologies for the pulp, paper and energy industries. It was reborn through the demerger of the pulp, paper and power businesses from Metso Group in December 2013. Valmet is organized around three ...
+**Example 4:**
+- Query: france second world war normandy
+- Doc Title: Invasion of Normandy
+- Doc Text: The Invasion of Normandy was the invasion by and establishment of Western Allied forces in Normandy, during Operation Overlord in 1944 during World War II; the largest amphibious invasion to ever take place.D-Day, the day of the initial assaults, was Tuesday 6 June 1944. Allied land forces that saw ...
+**Example 5:**
+- Query: social network group selection
+- Doc Title: Community
+- Doc Text: A community is a social unit of any size that shares common values. Although embodied or face-to-face communities are usually small, larger or more extended communities such as a national community, international community and virtual community are also studied....
+**Example 6:**
+- Query: D-Day normandy invasion
+- Doc Title: 1944
+- Doc Text: 1944 (MCMXLIV)was a leap year starting on Saturday of the Gregorian calendar (dominical letter BA), the 1944th year of the Common Era (CE) and Anno Domini (AD) designations, the 944th  year of the 2nd millennium, the 44th  year of the 20th century, and the  5th   year of the 1940s decade....
+**Example 7:**
+- Query: web ranking scoring algorithm
+- Doc Title: Google Search
+- Doc Text: Google Search, commonly referred to as Google Web Search or just Google, is a web search engine owned by Google Inc. It is the most-used search engine on the World Wide Web, handling more than three billion searches each day. As of February 2015 it is the most used search engine in the US with 64.5%...
+**Example 8:**
+- Query: virtual museums
+- Doc Title: ArtCyclopedia
+- Doc Text: Artcyclopedia is an online database of museum-quality fine art founded by Canadian John Malyon....
+**Example 9:**
+- Query: Indian food
+- Doc Title: Cardamom
+- Doc Text: Cardamom (/ˈkɑrdəməm/), sometimes called cardamon (mostly in the UK), is a spice made from the seeds of several plants in the genera Elettaria and Amomum in the family Zingiberaceae. Both genera are native to Bangladesh, Bhutan, India, Indonesia, Nepal, and Pakistan; they are recognised by their sma...
+**Example 10:**
+- Query: composer museum
+- Doc Title: Edward Elgar
+- Doc Text: Sir Edward William Elgar, 1st Baronet OM GCVO (2 June 1857 – 23 February 1934) was an English composer, many of whose works have entered the British and international classical concert repertoire. Among his best-known compositions are orchestral works including the Enigma Variations, the Pomp and Ci...
+##### Score 2 Examples (test)
+**Example 1:**
+- Query: Szechwan dish food cuisine
+- Doc Title: Chinese cuisine
+- Doc Text: Chinese cuisine includes styles originating from the diverse regions of China, as well as from Chinese people in other parts of the world. The history of Chinese cuisine in China stretches back for thousands of years and has changed from period to period and in each region according to climate, impe...
+**Example 2:**
+- Query: roman architecture
+- Doc Title: Aedicula
+- Doc Text: In ancient Roman religion, an aedicula (plural aediculae) is a small shrine. The word aedicula is the diminutive of the Latin aedes, a temple building or house.Many aediculae were household shrines that held small altars or statues of the Lares and Penates. The Lares were Roman deities protecting th...
+**Example 3:**
+- Query: D-Day normandy invasion
+- Doc Title: Juno Beach
+- Doc Text: Juno or Juno Beach was one of five beaches of the Allied invasion of German occupied France in the Normandy landings on 6 June 1944, during the Second World War. The beach spanned from Courseulles-sur-Mer, a village just east of the British beach Gold, to Saint-Aubin-sur-Mer, just west of the Britis...
+**Example 4:**
+- Query: Indian food
+- Doc Title: Curry
+- Doc Text: Curry (/ˈkʌri/, plural curries) is a dish originating in the cuisine of the Indian Subcontinent and Southeast Asia. The common feature is the incorporation of complex combinations of spices or herbs, usually including fresh or dried hot chillies .  Some limit the use of the term curry to dishes prep...
+**Example 5:**
+- Query: List of films from the surrealist category
+- Doc Title: Blue Velvet (film)
+- Doc Text: Blue Velvet is a 1986 American neo-noir mystery film, written and directed by David Lynch. Blending psychological horror with film noir, the film stars Kyle MacLachlan, Isabella Rossellini, Dennis Hopper and Laura Dern. The title is taken from Bobby Vinton's 1963 song of the same name. Although init...
+**Example 6:**
+- Query: Einstein Relativity theory
+- Doc Title: Spacetime
+- Doc Text: In physics, spacetime (also space–time, space time or space–time continuum) is any mathematical model that combines space and time into a single interwoven continuum. The spacetime of our universe is usually interpreted from a Euclidean space perspective, which regards space as consisting of three d...
+**Example 7:**
+- Query: summer flowers
+- Doc Title: Achillea millefolium
+- Doc Text: Achillea millefolium, known commonly as yarrow /ˈjæroʊ/ or common yarrow, is a flowering plant in the family Asteraceae. It is native to temperate regions of the Northern Hemisphere in Asia, Europe, and North America. It has been introduced as a feed for live stock in places like New Zealand and Aus...
+**Example 8:**
+- Query: vietnam war movie
+- Doc Title: The Deer Hunter
+- Doc Text: The Deer Hunter is a 1978 American epic  war drama film co-written and directed by Michael Cimino about a trio of Russian American steelworkers and their service in the Vietnam War. The film stars Robert De Niro, Christopher Walken, John Savage, John Cazale, Meryl Streep, and George Dzundza....
+**Example 9:**
+- Query: guitar origin blues
+- Doc Title: Blind Blake
+- Doc Text: Arthur "Blind" Blake (1896 – December 1, 1934) was an American blues and ragtime singer and guitarist. He is known for his series of recordings for Paramount Records between 1926 and 1932 and the mystery surrounding his life....
+**Example 10:**
+- Query: tango music  composers
+- Doc Title: Astor Piazzolla
+- Doc Text: Ástor Pantaleón Piazzolla (Spanish pronunciation: [piasola], Italian pronunciation: [pjattsɔlla]; March 11, 1921 – July 4, 1992) was an Argentine tango composer, bandoneon player and arranger. His oeuvre revolutionized the traditional tango into a new style termed nuevo tango, incorporating elements...
+## CORE
+**Has Scores 0, 1, 2**
+**Score 2 = RELEVANT** ✓
+**Score 1 = NOT RELEVANT** ✗
+## Special Notes
+- **Original Purpose**: Entity retrieval - find relevant entities for queries
+- **What Scores Actually Mean**:
+  - Score 2: Truly relevant entities (e.g., "Eiffel" → Eiffel Tower)
+  - Score 1: Keyword matches only (e.g., "Eiffel" → Eiffel programming language, which shares the keyword but not the query intent)
+  - Score 0: Not relevant
+- **Major Problem**: Score 1 is just keyword matching, not intent matching!
+## Training Verdict
+**BAD FOR TRAINING** ✗ (as labeled; usable only if Score 1 is treated as negative)
+- Score 1 is NOT relevant - just keyword overlap
+- Only Score 2 is truly relevant
+- For binary classification: use ONLY Score 2 as positive, and Scores 0 and 1 as negative
+- Using Score 1 as positive will teach the model keyword matching instead of relevance!
+## Observations

test_scores/dataset_reports/fever_analysis.md ADDED Viewed

	@@ -0,0 +1,281 @@

+# FEVER Dataset Analysis
+## Overview
+- Total queries available: 123142
+- Total corpus documents: 5416568
+- Corpus documents loaded for examples: 50000
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | 109810 | 140085 | 1:140085 |
+| dev | 6666 | 8079 | 1:8079 |
+| test | 6666 | 7937 | 1:7937 |
+## Score Analysis
+**Only one score level (1) found in this dataset.**
+- This means all labeled documents are considered equally relevant
+- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
+## Examples
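+### Score Comparison Examples
+_Not applicable: only one score level (1) exists in this dataset, so there are no cross-score comparisons._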
+### Examples by Split and Score
+#### TRAIN Split
+**Scores in train: [1]**
+##### Score 1 Examples (train)
+**Example 1:**
+- Query: 10 Cloverfield Lane stars the actor John Gallagher, Jr.
+- Doc Title: 10 Cloverfield Lane
+- Doc Text: 10 Cloverfield Lane is a 2016 American science fiction psychological thriller film directed by Dan Trachtenberg , produced by J. J. Abrams and Lindsey Weber and written by Josh Campbell , Matthew Stucken and Damien Chazelle . The film stars John Goodman , Mary Elizabeth Winstead , and John Gallagher...
+**Example 2:**
+- Query: The 1991 NBA Finals had the Lakers in it.
+- Doc Title: 1991 NBA Finals
+- Doc Text: The 1991 NBA Finals was the championship round of the 1990 -- 91 NBA season . It was also the first NBA Finals broadcast by NBC after 17 years with CBS .   The documentary `` Learning to Fly , '' narrated by Jeff Kaye , recaps Chicago 's successful first championship season . The theme song is `` Le...
+**Example 3:**
+- Query: 100 Greatest of All Time had timed episodes.
+- Doc Title: 100 Greatest of All Time
+- Doc Text: 100 Greatest of All Time was a television series of five one-hour episodes , produced and first aired by Tennis Channel in March 2012 . It presented a list of the `` 100 greatest tennis players of all time '' , both men and women . The series was hosted by Jack Nicklaus , Jerry Rice , Wayne Gretzky ...
+**Example 4:**
+- Query: In 1992, the 1992 Los Angeles riots occurred.
+- Doc Title: 1992 Los Angeles riots
+- Doc Text: The 1992 Los Angeles riots , also known as the Rodney King riots , the South Central riots , the 1992 Los Angeles civil disturbance , the 1992 Los Angeles civil unrest , and the Los Angeles uprising , were a series of riots , lootings , arsons , and civil disturbances that occurred in Los Angeles Co...
+**Example 5:**
+- Query: The 1992 Los Angeles riots were a series of events of civil disorder.
+- Doc Title: 1992 Los Angeles riots
+- Doc Text: The 1992 Los Angeles riots , also known as the Rodney King riots , the South Central riots , the 1992 Los Angeles civil disturbance , the 1992 Los Angeles civil unrest , and the Los Angeles uprising , were a series of riots , lootings , arsons , and civil disturbances that occurred in Los Angeles Co...
+**Example 6:**
+- Query: Melbourne is in Russia.
+- Doc Title: 1956 Summer Olympics
+- Doc Text: The 1956 Summer Olympics , officially known as the Games of the XVI Olympiad , were an international multi-sport event which was held in Melbourne , Victoria , Australia , in 1956 , apart from the equestrian events , which were held five months earlier in Stockholm . The 1956 Games were the first to...
+**Example 7:**
+- Query: R. Kelly is incapable of being a musician.
+- Doc Title: 12 Play
+- Doc Text: 12 Play is the debut studio album by American R&B and soul musician R. Kelly ; it was released on November 9 , 1993 , by Jive Records . The album follows his tenure with R&B group Public Announcement , with whom he released one album , Born into the 90 's ( 1992 ) . It went on to top the R&B albums ...
+**Example 8:**
+- Query: Brad Pitt produced at least two Best Picture Oscar winners.
+- Doc Title: 12 Years a Slave (film)
+- Doc Text: 12 Years a Slave is a 2013 period drama film and an adaptation of the 1853 slave narrative memoir Twelve Years a Slave by Solomon Northup , a New York State-born free African-American man who was kidnapped in Washington , D.C. , in 1841 and sold into slavery . Northup worked on plantations in the st...
+**Example 9:**
+- Query: 19 Kids and Counting was cancelled in 2001.
+- Doc Title: 19 Kids and Counting
+- Doc Text: 19 Kids and Counting ( formerly 17 Kids and Counting and 18 Kids and Counting ) was an American reality television show that aired on the cable channel TLC for seven years , until its cancellation in 2015 . The show featured the Duggar family : parents Jim Bob and Michelle Duggar and their nineteen ...
+**Example 10:**
+- Query: 19 Kids and Counting was cancelled.
+- Doc Title: 19 Kids and Counting
+- Doc Text: 19 Kids and Counting ( formerly 17 Kids and Counting and 18 Kids and Counting ) was an American reality television show that aired on the cable channel TLC for seven years , until its cancellation in 2015 . The show featured the Duggar family : parents Jim Bob and Michelle Duggar and their nineteen ...
+#### DEV Split
+**Scores in dev: [1]**
+##### Score 1 Examples (dev)
+**Example 1:**
+- Query: Damon Albarn has released something.
+- Doc Title: 13 (Blur album)
+- Doc Text: 13 is the sixth studio album by English alternative rock band Blur , released on 15 March 1999 . Jettisoning the Britpop sound of Blur 's early career , 13 explores experimental , psychedelic and electronic music .   Recording took place from June to October 1998 in London and Reykjavík . The album ...
+**Example 2:**
+- Query: The New York Knicks are in the Prince of Wales Conference of the National Hockey League.
+- Doc Title: 1969–70 NBA season
+- Doc Text: The 1969 -- 70 NBA Season was the 24th season of the National Basketball Association . The season ended with the New York Knicks winning the NBA Championship , beating the Los Angeles Lakers 4 games to 3 in the NBA Finals ....
+**Example 3:**
+- Query: Aleister Crowley was European.
+- Doc Title: 100 Greatest Britons
+- Doc Text: 100 Greatest Britons was broadcast in 2002 by the BBC . The programme was based on a television poll conducted to determine whom the United Kingdom public considered the greatest British people in history . The series , Great Britons , included individual programmes featuring the individuals who fea...
+**Example 4:**
+- Query: Aleister Crowley was French.
+- Doc Title: 100 Greatest Britons
+- Doc Text: 100 Greatest Britons was broadcast in 2002 by the BBC . The programme was based on a television poll conducted to determine whom the United Kingdom public considered the greatest British people in history . The series , Great Britons , included individual programmes featuring the individuals who fea...
+**Example 5:**
+- Query: Nestor Carbonell played Mayor Anthony Garcia in The Dark Knight and The Dark Knight Rises during the 19th century.
+- Doc Title: 19th century
+- Doc Text: The 19th century ( 1 January 1801 -- 31 December 1900 ) was the century marked by the collapse of the Spanish , Napoleonic , Holy Roman and Mughal empires . This paved the way for the growing influence of the British Empire , the Russian Empire , the United States , the German Empire , the French co...
+#### TEST Split
+**Scores in test: [1]**
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: John Goodman is not an actor.
+- Doc Title: 10 Cloverfield Lane
+- Doc Text: 10 Cloverfield Lane is a 2016 American science fiction psychological thriller film directed by Dan Trachtenberg , produced by J. J. Abrams and Lindsey Weber and written by Josh Campbell , Matthew Stucken and Damien Chazelle . The film stars John Goodman , Mary Elizabeth Winstead , and John Gallagher...
+**Example 2:**
+- Query: The Africa Cup of Nations is a friendly global soccer exhibition.
+- Doc Title: 1957 African Cup of Nations
+- Doc Text: The 1957 African Cup of Nations was the first edition of the Africa Cup of Nations , the football championship of Africa ( CAF ) . It was hosted by Sudan , and three teams participated : Egypt , Sudan , and Ethiopia . The tournament has the fewest goals to be scored , as well as the fewest games to ...
+**Example 3:**
+- Query: John Goodman starred in 10 Cloverfield Lane.
+- Doc Title: 10 Cloverfield Lane
+- Doc Text: 10 Cloverfield Lane is a 2016 American science fiction psychological thriller film directed by Dan Trachtenberg , produced by J. J. Abrams and Lindsey Weber and written by Josh Campbell , Matthew Stucken and Damien Chazelle . The film stars John Goodman , Mary Elizabeth Winstead , and John Gallagher...
+**Example 4:**
+- Query: Jimi Hendrix received training for air assault operations.
+- Doc Title: 101st Airborne Division
+- Doc Text: The 101st Airborne Division ( `` Screaming Eagles '' ) is an elite modular specialized light infantry division of the United States Army trained for air assault operations . It is the most potent and tactically mobile of the U.S. Army 's divisions , and has been referred to as `` the tip of the spea...
+**Example 5:**
+- Query: Jimi Hendrix was not trained for air assault operations.
+- Doc Title: 101st Airborne Division
+- Doc Text: The 101st Airborne Division ( `` Screaming Eagles '' ) is an elite modular specialized light infantry division of the United States Army trained for air assault operations . It is the most potent and tactically mobile of the U.S. Army 's divisions , and has been referred to as `` the tip of the spea...
+**Example 6:**
+- Query: B. R. Ambedkar was born in the 20th century.
+- Doc Title: 19th century
+- Doc Text: The 19th century ( 1 January 1801 -- 31 December 1900 ) was the century marked by the collapse of the Spanish , Napoleonic , Holy Roman and Mughal empires . This paved the way for the growing influence of the British Empire , the Russian Empire , the United States , the German Empire , the French co...
+## CORE
+**Only Score 1 exists**
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: Fact verification - find evidence that supports OR refutes claims
+- **What Score 1 Really Means**: Documents containing evidence to verify claims (support or refute)
+- **Dataset Type**: Fact verification
+- **Key Characteristic**: Documents that refute claims ARE relevant (e.g., "Melbourne is in Russia" → doc proving it's in Australia)
+## Training Verdict
+**GOOD FOR TRAINING** ✓
+- Score 1 documents provide actual evidence for/against claims
+- True relevance matching (not just keyword matching)
+- Can use for binary classification: Score 1 = positive class
+## Observations
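+⚠️ **Training data only has Score 1:**
+- All labeled training documents are marked as equally relevant
+- The model cannot learn graded relevance levels from these labels
+- Negatives must be drawn from unlabeled documents, which are implicitly score 0
+- May indicate collapsed/merged relevance levels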

test_scores/dataset_reports/fiqa_analysis.md ADDED Viewed

	@@ -0,0 +1,302 @@

+# FIQA Dataset Analysis
+## Overview
+- Total queries available: 6648
+- Total corpus documents: 57638
+- Corpus documents loaded for examples: 50000
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | 5500 | 14166 | 1:14166 |
+| dev | 500 | 1238 | 1:1238 |
+| test | 648 | 1706 | 1:1706 |
+## Score Analysis
+**Only one score level (1) found in this dataset.**
+- This means all labeled documents are considered equally relevant
+- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
+## Examples
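+### Score Comparison Examples
+_Not applicable: only one score level (1) exists in this dataset, so there are no cross-score comparisons._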
+### Examples by Split and Score
+#### TRAIN Split
+**Scores in train: [1]**
+##### Score 1 Examples (train)
+**Example 1:**
+- Query: What is considered a business expense on a business trip?
+- Doc Text: The IRS Guidance pertaining to the subject.  In general the best I can say is your business expense may be deductible.  But it depends on the circumstances and what it is you want to deduct. Travel Taxpayers who travel away from home on business may deduct related   expenses, including the cost of r...
+**Example 2:**
+- Query: Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip
+- Doc Text: As a general rule, you must choose between a mileage deduction or an actual expenses deduction.  The idea is that the mileage deduction is supposed to cover all costs of using the car.  Exceptions include parking fees and tolls, which can be deducted separately under either method.  You explicitly c...
+**Example 3:**
+- Query: Starting a new online business
+- Doc Text: Most US states have rules that go something like this: You will almost certainly have to pay some registration fees, as noted above.  Depending on how you organize, you may or may not need to file a separate tax return for the business. (If you're sole proprietor for tax purposes, then you file on S...
+**Example 4:**
+- Query: “Business day” and “due date” for bills
+- Doc Text: You definitely have an argument for getting them to reverse the late fee, especially if it hasn't happened very often. (If you are late every month they may be less likely to forgive.) As for why this happens, it's not actually about business days, but instead it's based on when they know that you p...
+**Example 5:**
+- Query: New business owner - How do taxes work for the business vs individual?
+- Doc Text: Through your question and then clarification through the comments, it looks like you have a U.S. LLC with at least two members.  If you did not elect some other tax treatment, your LLC will be treated as a partnership by the IRS.  The partnership should file a tax return on Form 1065.  Then each par...
+**Example 6:**
+- Query: Hobby vs. Business
+- Doc Text: Miscellaneous income -- same category used for hobbies....
+**Example 7:**
+- Query: Does U.S. tax code call for small business owners to count business purchases as personal income?
+- Doc Text: "I am going to keep things very simple and explain the common-sense reason why the accountant is right: Also, my sister in law owns a small restaurant, where they claim their accountant informed them of the same thing, where a portion of their business purchases had to be counted as taxable personal...
+**Example 8:**
+- Query: How can I register a UK business without providing a business address?
+- Doc Text: You don't have to provide your personal home address per se. You can provide a legal address where Companies house can send across paper correspondence to. Companies house legally requires an address because directors are liable to their shareholders(even if you are the only shareholder) and to stop...
+**Example 9:**
+- Query: What are 'business fundamentals'?
+- Doc Text: From http://financial-dictionary.thefreedictionary.com/Business+Fundamentals The facts that affect a company's   underlying value. Examples of business   fundamentals include debt, cash flow,   supply of and demand for the company's   products, and so forth. For instance,   if a company does not hav...
+**Example 10:**
+- Query: Business Investment Loss from prior year
+- Doc Text: You need to give specific dates! In the United States, you have three years to file an amended tax return. https://www.irs.gov/uac/Newsroom/Ten-Facts-about-Amended-Tax-Returns Did the restaurant fail in 2012? If so, that's probably the year to take the loss. If you need to amend your 2012 return, wh...
+#### DEV Split
+**Scores in dev: [1]**
+##### Score 1 Examples (dev)
+**Example 1:**
+- Query: Claiming business expenses for a business with no income
+- Doc Text: Yes you can claim your business deductions if you are not making any income yet. But first you should decide what structure you want to have for your business. Either a Company structure or a Sole Trader or Partnership. Company Structure If you choose a Company Structure (which is more expensive to ...
+**Example 2:**
+- Query: Transferring money from One business checking to another business checking
+- Doc Text: "You should have separate files for each of the two businesses.  The business that transfers money out should ""write check"" in its QB file.  The business that receives money should ""make deposit"" in its QB file. (In QB you ""write check"" even when you make the payment by some other means like A...
+**Example 3:**
+- Query: Having a separate bank account for business/investing, but not a “business account?”
+- Doc Text: "Having a separate checking account for the business makes sense. It simplifies documenting your income/expenses. You can ""explain"" every dollar entering and exiting the account without having to remember that some of them were for non-business items. My credit union allowed me to have a 2nd check...
+**Example 4:**
+- Query: Income tax exemptions for small business?
+- Doc Text: Yes, you should be able to deduct at least some of these expenses.  For expense incurred before you started the business: What Are Deductible Startup Costs? The IRS defines “startup costs” as deductible capital expenses that are used to pay for: 1) The cost of “investigating the creation or acquisit...
+**Example 5:**
+- Query: Do I need a business credit card?
+- Doc Text: It can certainly help build a credit score, but remember that businesses gain credit differently from individuals.  Depending on the country, there isn't usually a national register of business credit ratings the way there is for individuals.  The credit record you'd be gaining is with your own bank...
+**Example 6:**
+- Query: Personal credit card for business expenses
+- Doc Text: Early on, one might not be able to get credit for their business. For convenience, and the card perks, it makes sense to use the personal card. But for sake of a clean paper trail, I'd choose 1 card and use it exclusively, 100% for the business. Not one card here, one card there....
+**Example 7:**
+- Query: Borrow money to invest in a business venture with equity?
+- Doc Text: It's clearly a risk, but is it any different than investing in your own business? Yes, it is different.  If you own a business, you determine the path of the business.  You determine how much risk the business takes.  You can put in extra effort to try to make the business work.  You can choose to l...
+**Example 8:**
+- Query: What's “wrong” with taking money from your own business?
+- Doc Text: "I'm no expert on this, but I would say that, if you own the business entirely yourself, there is nothing terribly wrong with using it for your own purposes as you would any other asset that you own.  What is wrong is not keeping accurate records that distinguish between your money and the business'...
+**Example 9:**
+- Query: Can Health-Releated Services be a Business Expense?
+- Doc Text: Chris, since you own your own company, nobody can stop you from charging your personal expenses to your business account. IRS is not a huge fan of mixing business and personal expenses and this practice might indicate to them that you are not treating your business seriously, and it should classify ...
+**Example 10:**
+- Query: What forms do I need to fill out for a super basic LLC closing?
+- Doc Text: If it is a sole proprietorship and  you didn't make another mistake by explicitly asking the IRS to treat it as a corporation - there are no IRS forms to fill. You'll need to dissolve the LLC with your State, though, check the State's department of State/Corporations (depending on the State, the nam...
+#### TEST Split
+**Scores in test: [1]**
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: How to deposit a cheque issued to an associate in my business into my business account?
+- Doc Text: Just have the associate sign the back and then deposit it.  It's called a third party cheque and is perfectly legal.  I wouldn't be surprised if it has a longer hold period and, as always, you don't get the money if the cheque doesn't clear. Now, you may have problems if it's a large amount or you'r...
+**Example 2:**
+- Query: Can I send a money order from USPS as a business?
+- Doc Text: Sure you can.  You can fill in whatever you want in the From section of a money order, so your business name and address would be fine. The price only includes the money order itself.  You can hand deliver it yourself if you want, but if you want to mail it, you'll have to provide an envelope and a ...
+**Example 3:**
+- Query: 1 EIN doing business under multiple business names
+- Doc Text: You're confusing a lot of things here. Company B LLC will have it's sales run under Company A LLC, and cease operating as a separate entity These two are contradicting each other. If B LLC ceases to exist - it is not going to have it's sales run under A LLC, since there will be no sales to run for a...
+**Example 4:**
+- Query: Applying for and receiving business credit
+- Doc Text: "I'm afraid the great myth of limited liability companies is that all such vehicles have instant access to credit.  Limited liability on a company with few physical assets to underwrite the loan, or with insufficient revenue, will usually mean that the owners (or others) will be asked to stand suret...
+**Example 5:**
+- Query: What are the ins/outs of writing equipment purchases off as business expenses in a home based business?
+- Doc Text: Most items used in business have to be depreciated; you get to deduct a small fraction of the cost each year depending on the lifetime of the item as per IRS rules. That is, you cannot assume a one-year life for an electronic item  even if it will be obsolete in three months.  Some items can be expe...
+**Example 6:**
+- Query: Intentions of Deductible Amount for Small Business
+- Doc Text: "If your sole proprietorship losses exceed all other sources of taxable income, then you have what's called a Net Operating Loss (NOL). You will have the option to ""carry back"" and amend a return you filed in the last 2 years where you owed tax, or you can ""carry forward"" the losses and decrease...
+**Example 7:**
+- Query: How can I deposit a check made out to my business into my personal account?
+- Doc Text: You should have a separate business account. Mixing business and personal funds is a bad practice. Shop around, you should be able to find a bank that will let you open a free checking account, especially if you are going to have minimal activity (e.g. less than 20 of checks per month) and perhaps m...
+**Example 8:**
+- Query: Filing personal with 1099s versus business s-corp?
+- Doc Text: Depends whom the 1099 was issued to. If it was issued to your corporation - then its your corporation's income, not yours. Why would it go to your tax return? Your corporation and you are two separate legal entities. You will have to file the 1120S, whether you have corporate income or not, it has t...
+**Example 9:**
+- Query: Using credit card points to pay for tax deductible business expenses
+- Doc Text: "For simplicity, let's start by just considering cash back. In general, cash back from credit cards for personal use is not taxable, but for business use it is taxable (sort of, I'll explain later). The reason is most personal purchases are made with after tax dollars; you typically aren't deducting...
+**Example 10:**
+- Query: What approaches are there for pricing a small business?
+- Doc Text: I don't have any experience in this, but this is my academic understanding of business pricing.  The LOWEST amount a seller would accept is the liquidation value. For a B&B, what would the value of the land, the house, the furnishings, accounts payable, etc. be if it had to be sold today, minus any ...
+## CORE
+**Only Score 1 exists**
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: Financial opinion-based question answering
+- **What Score 1 Really Means**: Documents that answer financial questions
+- **Dataset Type**: Domain-specific Q&A (financial)
+- **Key Characteristic**: Includes opinionated answers about financial topics
+## Training Verdict
+**GOOD FOR TRAINING** ✓
+- Score 1 documents genuinely answer the queries
+- True relevance matching in financial domain
+- Can use for binary classification: Score 1 = positive class
+## Observations
+⚠️ **Training data only has Score 1:**
+- All training documents are marked as equally relevant
+- Model cannot learn to distinguish relevance levels
+- May indicate collapsed/merged relevance levels

test_scores/dataset_reports/hotpotqa_analysis.md ADDED Viewed

	@@ -0,0 +1,362 @@

+# HOTPOTQA Dataset Analysis
+## Overview
+- Total queries available: 97852
+- Total corpus documents: 5233329
+- Corpus documents loaded for examples: 50000
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | 85000 | 170000 | 1:170000 |
+| dev | 5447 | 10894 | 1:10894 |
+| test | 7405 | 14810 | 1:14810 |
+## Score Analysis
+**Only one score level (1) found in this dataset.**
+- This means all labeled documents are considered equally relevant
+- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
+## Examples
+### Score Comparison Examples
+*No comparison examples available: this dataset has only one score level.*
+### Examples by Split and Score
+#### TRAIN Split
+**Scores in train: [1]**
+##### Score 1 Examples (train)
+**Example 1:**
+- Query: What name was given to the son of the man who defeated the usurper Allectus ?
+- Doc Title: Constantius Chlorus
+- Doc Text: Constantius I (Latin: "Marcus Flavius Valerius Constantius Herculius Augustus" ; 31 March  250 25 July 306), commonly known as Constantius Chlorus (Greek: Κωνστάντιος Χλωρός , "Kōnstantios Khlōrós", literally "Constantius the Pale"), was "Caesar", a form of Roman co-emperor, from 293 to 306. He was ...
+**Example 2:**
+- Query: What profession does Lewis Milestone and All Quiet on the Western Front have in common?
+- Doc Title: Lewis Milestone
+- Doc Text: Lewis Milestone (born Leib Milstein; September 30, 1895 – September 25, 1980) was a Moldovan-born American motion picture director. He is known for directing "Two Arabian Knights" (1927) and "All Quiet on the Western Front" (1930), both of which received Academy Awards for Best Director. He also dir...
+**Example 3:**
+- Query: University of Alabama in Huntsville is a college located in what county?
+- Doc Title: Huntsville, Alabama
+- Doc Text: Huntsville is a city located primarily in Madison County in the Appalachian region of northern Alabama. Huntsville is the county seat of Madison County. The city extends west into neighboring Limestone County. Huntsville's population was 180,105 as of the 2010 census. Huntsville is the third-largest...
+**Example 4:**
+- Query: What 1937 magazine did "Bringing Up Baby" film star and one of classic Hollywood's definitive leading men appear in?
+- Doc Title: Bringing Up Baby
+- Doc Text: Bringing Up Baby is a 1938 American screwball comedy film directed by Howard Hawks, starring Katharine Hepburn and Cary Grant, and released by RKO Radio Pictures. The film tells the story of a paleontologist in a number of predicaments involving a scatterbrained woman and a leopard named Baby. The s...
+**Example 5:**
+- Query: Who founded the organization whose Boston branch excluded the Christian discussion group later housed in the Boston Young Men's Christian Union?
+- Doc Title: YMCA
+- Doc Text: The Young Men's Christian Association, commonly known as the YMCA or simply the Y, is a worldwide organization based in Geneva, Switzerland, with more than 2 billion beneficiaries from 125 national associations. It was founded on 6 June 1844 by George Williams in London and aims to put Christian pri...
+**Example 6:**
+- Query: What language family is the language of the tribe of the man who instructed Jeff Ball in?
+- Doc Title: Choctaw
+- Doc Text: The Choctaw (In the Choctaw language, Chahta) are a Native American people originally occupying what is now the Southeastern United States (modern-day Alabama, Florida, Mississippi, and Louisiana). Their Choctaw language belongs to the Muskogean language family group....
+**Example 7:**
+- Query: Dayton, Newark is part of the county in New Jersey having what population as of 2016?
+- Doc Title: Essex County, New Jersey
+- Doc Text: Essex County is a county in the northeastern part of the U.S. state of New Jersey. As of the 2016 Census estimate, the county's population was 796,914, making it the state's third-most populous county, an increase of 1.7% from the 2010 United States Census, when its population was enumerated at 783,...
+**Example 8:**
+- Query: What album produced by George Martin was supposed to contain a song that ended up unreleased until 1996?
+- Doc Title: Help! (album)
+- Doc Text: Help! is the fifth studio album by English rock band the Beatles, the soundtrack from their film "Help!", and released on 6 August 1965. Produced by George Martin, it was the fifth UK album release by the band, and contains fourteen songs in its original British form. Seven of these, including the s...
+**Example 9:**
+- Query: Which of the following is acclaimed for his "lyrical flow of his statements": Nâzım Hikmet or Arthur Miller?
+- Doc Title: Arthur Miller
+- Doc Text: Arthur Asher Miller (October 17, 1915 – February 10, 2005) was an American playwright, essayist, and figure in twentieth-century American theater. Among his most popular plays are "All My Sons" (1947), "Death of a Salesman" (1949), "The Crucible" (1953) and "A View from the Bridge" (1955, revised 19...
+**Example 10:**
+- Query: Michel Wachenheim a French ambassador and permanent representative of France of what specialized agency of the United Nations?
+- Doc Title: International Civil Aviation Organization
+- Doc Text: The International Civil Aviation Organization (ICAO) (pronounced ; French: "Organisation de l'aviation civile internationale" , OACI), is a specialized agency of the United Nations. It codifies the principles and techniques of international air navigation and fosters the planning and development of ...
+#### DEV Split
+**Scores in dev: [1]**
+##### Score 1 Examples (dev)
+**Example 1:**
+- Query: Daniel Márcio Fernandes plays for a club founded in which year ?
+- Doc Title: Lillestrøm SK
+- Doc Text: Lillestrøm Sportsklubb is a Norwegian football club from the city of Lillestrøm. It was founded in 1917, after the merger of two local football clubs. Their home ground is Åråsen Stadion, which has a capacity of 12,250 people, while the principal training ground is Lillestrøm stadion, or the indoor ...
+**Example 2:**
+- Query: How many members actually compete in the conference in which the Northwestern Wildcats compete?
+- Doc Title: Big Ten Conference
+- Doc Text: The Big Ten Conference (B1G), formerly Western Conference and Big Nine Conference, is the oldest Division I collegiate athletic conference in the United States. Despite its name, the conference consists of 14 members (as of 2016). They compete in the NCAA Division I; its football teams compete in th...
+**Example 3:**
+- Query: The composer of The String Quartet No. 3 in E-flat minor was awarded a lifetime pension by what emperor?
+- Doc Title: Pyotr Ilyich Tchaikovsky
+- Doc Text: Pyotr Ilyich Tchaikovsky ( ; Russian: Пётр Ильи́ч Чайко́вский ; 25 April/7 May 1840 – 25 October/6 November 1893), often anglicized as Peter Ilich Tchaikovsky, was a Russian composer of the romantic period, some of whose works are among the most popular music in the classical repertoire. He was the ...
+**Example 4:**
+- Query: Which Russian linguist was a member of the Kazan School?
+- Doc Title: Nikolai Trubetzkoy
+- Doc Text: Prince Nikolai Sergeyevich Trubetzkoy (also Troubetskoy; Russian: Никола́й Серге́евич Трубецко́й; Moscow, April 16, 1890 – Vienna, June 25, 1938) was a Russian linguist and historian whose teachings formed a nucleus of the Prague School of structural linguistics. He is widely considered to be the fo...
+**Example 5:**
+- Query: The election of Donald rump to the position that encompasses the duty of commander-in-chief of the United States Armed Forces was an expression of what political movement?
+- Doc Title: President of the United States
+- Doc Text: The President of the United States (informally referred to as "POTUS") is the head of state and head of government of the United States. The president directs the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces....
+**Example 6:**
+- Query: In Star Wars: A New Hope, what planet is the leader of the rebel alliance from?
+- Doc Title: Star Wars (film)
+- Doc Text: Star Wars (later retitled Star Wars: Episode IV – A New Hope) is a 1977 American epic space opera film written and directed by George Lucas. It is the first film in the original "Star Wars" trilogy, the first "Star Wars" movie in general, and the beginning of the "Star Wars" franchise. Starring Mark...
+**Example 7:**
+- Query: What primarily competed with Nintendo 64 and Sega Staurn has a horror-themed adventure game published by Jaleco in 1999?
+- Doc Title: PlayStation (console)
+- Doc Text: The PlayStation (officially abbreviated to PS, and commonly known as the PS1 or PSX) is a home video game console developed and marketed by Sony Computer Entertainment. The console was released on 3 December 1994 in Japan, 9 September 1995 in North America, 29 September 1995 in Europe, and for 15 No...
+**Example 8:**
+- Query: Franklin Armstrong is a comic strip character create by a man that died in what year?
+- Doc Title: Charles M. Schulz
+- Doc Text: Charles Monroe Schulz (November 26, 1922 – February 12, 2000), nicknamed Sparky, was an American cartoonist best known for the comic strip "Peanuts" (which featured the characters Charlie Brown and Snoopy, among others). He is widely regarded as one of the most influential cartoonists of all time, c...
+**Example 9:**
+- Query: Did Fred Savage and James Tinling both work during the Silent Era?
+- Doc Title: Fred Savage
+- Doc Text: Frederick Aaron "Fred" Savage (born July 9, 1976) is an American actor, television director, and producer. He is best known for his role as Kevin Arnold in the American television series "The Wonder Years", which ran from 1988 to 1993. He has earned several awards and nominations, such as People's C...
+**Example 10:**
+- Query: Cliff Clavin was a character on the show that was produced by what company?
+- Doc Title: Cheers
+- Doc Text: Cheers is an American sitcom that ran on NBC from September 30, 1982, to May 20, 1993, with a total of 275 half-hour episodes spanning over eleven seasons. The show was produced by Charles/Burrows/Charles Productions in association with Paramount Network Television. The show was created by the team ...
+#### TEST Split
+**Scores in test: [1]**
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: Were Scott Derrickson and Ed Wood of the same nationality?
+- Doc Title: Ed Wood
+- Doc Text: Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director....
+**Example 2:**
+- Query: What screenwriter with credits for "Evolution" co-wrote a film starring Nicolas Cage and Téa Leoni?
+- Doc Title: The Family Man
+- Doc Text: The Family Man is a 2000 American romantic comedy-drama film directed by Brett Ratner, written by David Diamond and David Weissman, and starring Nicolas Cage and Téa Leoni. Cage's production company, Saturn Films, helped produce the film. The film centers on a man who sees what could have been had h...
+**Example 3:**
+- Query: The football manager who recruited David Beckham managed Manchester United during what timeframe?
+- Doc Title: Alex Ferguson
+- Doc Text: Sir Alexander Chapman Ferguson, CBE (born 31 December 1941) is a Scottish former football manager and player who managed Manchester United from 1986 to 2013. He is regarded by many players, managers and analysts to be one of the greatest and most successful managers of all time....
+**Example 4:**
+- Query: Brown State Fishing Lake is in a country that has a population of how many inhabitants ?
+- Doc Title: Brown County, Kansas
+- Doc Text: Brown County (county code BR) is a county located in the northeast portion of the U.S. state of Kansas. As of the 2010 census, the county population was 9,984. Its county seat and most populous city is Hiawatha. Brown County is the location of the Kickapoo Indian Reservation of Kansas, the majority ...
+**Example 5:**
+- Query: Are Giuseppe Verdi and Ambroise Thomas both Opera composers ?
+- Doc Title: Giuseppe Verdi
+- Doc Text: Giuseppe Fortunino Francesco Verdi (] ; 9 or 10 October 1813 – 27 January 1901) was an Italian opera composer....
+**Example 6:**
+- Query: Roger O. Egeberg was Assistant Secretary for Health and Scientific Affairs during the administration of a president that served during what years?
+- Doc Title: Richard Nixon
+- Doc Text: Richard Milhous Nixon (January 9, 1913 – April 22, 1994) was the 37th President of the United States from 1969 until 1974, when he resigned from office, the only U.S. president to do so. He had previously served as the 36th Vice President of the United States from 1953 to 1961, and prior to that as ...
+**Example 7:**
+- Query: What is the name for the adventure in "Tunnels and Trolls", a game designed by Ken St. Andre?
+- Doc Title: Tunnels & Trolls
+- Doc Text: Tunnels & Trolls (abbreviated "T&T") is a fantasy role-playing game designed by Ken St. Andre and first published in 1975 by Flying Buffalo. The second modern role-playing game published, it was written by Ken St. Andre to be a more accessible alternative to "Dungeons & Dragons" and is suitable for ...
+**Example 8:**
+- Query: Alexander Kerensky was defeated and destroyed by the Bolsheviks in the course of a civil war that ended when ?
+- Doc Title: Russian Civil War
+- Doc Text: The Russian Civil War (Russian: Гражда́нская война́ в Росси́и , "Grazhdanskaya voyna v Rossiyi" ; November 1917 – October 1922) was a multi-party war in the former Russian Empire immediately after the Russian Revolutions of 1917, as many factions vied to determine Russia's political future. The two ...
+**Example 9:**
+- Query: Ralph Hefferline was a psychology professor at a university that is located in what city?
+- Doc Title: Columbia University
+- Doc Text: Columbia University (Columbia; officially Columbia University in the City of New York), established in 1754, is a private Ivy League research university in Upper Manhattan, New York City, often cited as one of the world's most prestigious universities....
+**Example 10:**
+- Query: The battle in which Giuseppe Arimondi lost his life secured what for Ethiopia?
+- Doc Title: Battle of Adwa
+- Doc Text: The Battle of Adwa (Amharic: አድዋ; Amharic translated: Adowa, or sometimes by the Italian name "Adua") was fought on 2 March 1896/Yekatit 23, 1889 according to Ethiopian calendar between the Ethiopian Empire and the Kingdom of Italy near the town of Adwa, Ethiopia, in Tigray. This climactic battle of...
+## CORE
+**Only Score 1 exists**
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: Multi-hop question answering requiring reasoning over multiple documents
+- **What Score 1 Really Means**: Documents containing facts needed for multi-hop reasoning
+- **Dataset Type**: Multi-hop Q&A
+- **Key Characteristic**: Each query is labeled with exactly 2 supporting Wikipedia articles (total pairs = 2 × queries in every split)
+## Training Verdict
+**GOOD FOR TRAINING** ✓
+- Score 1 documents provide necessary facts for answering
+- True relevance matching for complex reasoning
+- Can use for binary classification: Score 1 = positive class
+## Observations
+⚠️ **Training data only has Score 1:**
+- All training documents are marked as equally relevant
+- Model cannot learn graded relevance levels from these labels; negatives must come from unlabeled documents (implicit score 0)
+- May indicate collapsed/merged relevance levels

test_scores/dataset_reports/nfcorpus_analysis.md ADDED Viewed

	@@ -0,0 +1,865 @@

+# NFCORPUS Dataset Analysis
+## Overview
+- Total queries available: 3237
+- Total corpus documents: 3633
+- Corpus documents loaded for examples: 3633
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | 2590 | 110575 | 1:110575 |
+| dev | 324 | 11385 | 1:10864, 2:521 |
+| test | 323 | 12334 | 1:11758, 2:576 |
+## Score Analysis
+**Multiple score levels found: [1, 2]**
+- Higher scores indicate higher relevance
+- Score 0 typically means not relevant
+- Score 1 typically means relevant
+- Score 2 typically means highly relevant
+## Examples
+### Score Comparison Examples
+#### Comparison Example 1
+**Query:** Why Deep Fried Foods May Cause Cancer
+**From:** dev split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Birth Weight, Head Circumference, and Prenatal Exposure to Acrylamide from Maternal Diet: The European Prospective Mother–Child Study (NewGeneris)
+- Text: Background: Acrylamide is a common dietary exposure that crosses the human placenta. It is classified as a probable human carcinogen, and developmental toxicity has been observed in rodents. Objectives: We examined the associations between prenatal exposure to acrylamide and birth outcomes in a prospective European mother–child study. Methods: Hemoglobin (Hb) adducts of acrylamide and its metaboli...
+*Document 2:*
+- Title: A statistical regression model for the estimation of acrylamide concentrations in French fries for excess lifetime cancer risk assessment.
+- Text: Human exposure to acrylamide (AA) through consumption of French fries and other foods has been recognized as a potential health concern. Here, we used a statistical non-linear regression model, based on the two most influential factors, cooking temperature and time, to estimate AA concentrations in French fries. The R(2) of the predictive model is 0.83, suggesting the developed model was significa...
+*Document 3:*
+- Title: Chronic intake of potato chips in humans increases the production of reactive oxygen radicals by leukocytes and increases plasma C-reactive protein...
+- Text: BACKGROUND: Relatively high concentrations of acrylamide in commonly ingested food products, such as French fries, potato chips, or cereals, may constitute a potential risk to human health. OBJECTIVE: The objective of this pilot study was to investigate the possible connection between chronic ingestion of acrylamide-containing potato chips and oxidative stress or inflammation. DESIGN: Fourteen hea...
+**Score 1 Documents:**
+*Document 1:*
+- Title: Review of epidemiologic studies of dietary acrylamide intake and the risk of cancer.
+- Text: Conjectured associations between dietary acrylamide intake and cancer have been evaluated in more than 15 epidemiologic studies examining almost every major cancer site. We have critically reviewed the epidemiologic studies of estimated dietary acrylamide exposure and cancer. As substantially greater acrylamide exposure occurs through tobacco smoke than dietary exposure, we present the results sep...
+*Document 2:*
+- Title: Intake of fried meat and risk of cancer: a follow-up study in Finland.
+- Text: It has been suggested that mutagens in fried meat may be involved in the cancer process. Therefore the relationships between intake of fried meat and subsequent risk of cancers at different sites were studied among 9,990 Finnish men and women, 15-99 years of age and initially free of cancer. The baseline study was carried out in 1966-1972, and cases of cancer were identified through data linkage w...
+*Document 3:*
+- Title: Dietary acrylamide exposure of the French population: results of the second French Total Diet Study.
+- Text: Acrylamide is a heat-induced carcinogen compound that is found in some foods consequently to cooking or other thermal processes. In the second French Total Diet Study (TDS), acrylamide was analysed in 192 food samples collected in mainland France to be representative of the population diet and prepared "as consumed". Highest mean concentrations were found in potato chips/crisps (954 μg/kg), French...
+---
+#### Comparison Example 2
+**Query:** Living Longer by Reducing Leucine Intake
+**From:** dev split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Insights into the beneficial effect of caloric/ dietary restriction for a healthy and prolonged life
+- Text: Over the last several years, new evidence has kept pouring in about the remarkable effect of caloric restriction (CR) on the conspicuous bedfellows- aging and cancer. Through the use of various animal models, it is now well established that by reducing calorie intake one can not only increase life span but, also, lower the risk of various age related diseases such as cancer. Cancer cells are belie...
+*Document 2:*
+- Title: Macronutrient balance and lifespan
+- Text: Dietary restriction (DR) without malnutrition is widely regarded to be a universal mechanism for prolonging lifespan. It is generally believed that the benefits of DR arise from eating fewer calories (termed caloric restriction, CR). Here we argue that, rather than calories, the key determinant of the relationship between diet and longevity is the balance of protein to non-protein energy ingested....
+*Document 3:*
+- Title: Dietary Restriction, Growth Factors and Aging: from yeast to humans
+- Text: Dietary restriction (DR) and reduced growth factor signaling both elevate resistance to oxidative stress, reduce macromolecular damage, and increase lifespan in model organisms. In rodents, both DR and decreased growth factor signaling reduce the incidence of tumors and slow down cognitive decline and aging. DR reduces cancer and cardiovascular disease and mortality in monkeys, and reduces metabol...
+**Score 1 Documents:**
+*Document 1:*
+- Title: mTOR is a key modulator of ageing and age-related disease
+- Text: Many experts in the biology of ageing believe that pharmacological interventions to slow ageing are a matter of ‘when’ rather than ‘if’. A leading target for such interventions is the nutrient response pathway defined by the mechanistic target of rapamycin (mTOR). Inhibition of this pathway extends lifespan in model organisms and confers protection against a growing list of age-related pathologies...
+*Document 2:*
+- Title: Amino acid sensing and regulation of mTORC1.
+- Text: Amino acids play fundamental roles in the cell both as the building blocks of new proteins and as metabolic precursors. To adapt to their limitation during periods of protein starvation, multiple adaptive mechanisms have evolved, including a rapid cessation of new protein synthesis, an increase in amino acid biosynthesis and transport, and autophagy. Here, we discuss what we currently know about h...
+*Document 3:*
+- Title: Caloric restriction, the traditional Okinawan diet, and healthy aging: the diet of the world's longest-lived people and its potential impact on mor...
+- Text: Long-term caloric restriction (CR) is a robust means of reducing age-related diseases and extending life span in multiple species, but the effects in humans are unknown. The low caloric intake, long life expectancy, and the high prevalence of centenarians in Okinawa have been used as an argument to support the CR hypothesis in humans. However, no long-term, epidemiologic analysis has been conducte...
+---
+#### Comparison Example 3
+**Query:** Why are Cancer Rates so Low in India?
+**From:** dev split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Curcumin and cancer: an "old-age" disease with an "age-old" solution.
+- Text: Cancer is primarily a disease of old age, and that life style plays a major role in the development of most cancers is now well recognized. While plant-based formulations have been used to treat cancer for centuries, current treatments usually involve poisonous mustard gas, chemotherapy, radiation, and targeted therapies. While traditional plant-derived medicines are safe, what are the active prin...
+*Document 2:*
+- Title: Curcumin, a component of turmeric: from farm to pharmacy.
+- Text: Curcumin, an active polyphenol of the golden spice turmeric, is a highly pleiotropic molecule with the potential to modulate the biological activity of a number of signaling molecules. Traditionally, this polyphenol has been used in Asian countries to treat such human ailments as acne, psoriasis, dermatitis, and rash. Recent studies have indicated that curcumin can target newly identified signalin...
+*Document 3:*
+- Title: Dietary turmeric potentially reduces the risk of cancer.
+- Text: Turmeric, a plant rhizome that is often dried, ground and used as a cooking spice, has also been used medicinally for several thousand years. Curcumin, the phytochemical that gives turmeric its golden color, is responsible for most of the therapeutic effects of turmeric. In recent years curcumin has been studied for its effects on chronic diseases such as diabetes, Alzheimer's, and cancer. Though ...
+**Score 1 Documents:**
+*Document 1:*
+- Title: Apoptosis induced by death receptors.
+- Text: Death receptors belong to the TNF receptor family and are characterised by an intracellular death domain that serves to recruit adapter proteins such as TRADD and FADD and cysteine proteases such as Caspase-8. Activation of Caspase-8 on the aggregated receptor leads to apoptosis. Triggering of death receptors is mediated through the binding of specific ligands of the TNF family, which are homotrim...
+*Document 2:*
+- Title: Curcumin sensitizes tumor necrosis factor-related apoptosis-inducing ligand (TRAIL)-induced apoptosis through reactive oxygen species-mediated upre...
+- Text: Curcumin exhibits anti-inflammatory and antitumor activities. Although its functional mechanism has not been elucidated so far, numerous studies have shown that curcumin induces apoptosis in cancer cells. In the present study, we show that subtoxic concentrations of curcumin sensitize human renal cancer cells to the tumor necrosis factor-related apoptosis inducing ligand (TRAIL)-mediated apoptosis...
+*Document 3:*
+- Title: New perspectives of curcumin in cancer prevention
+- Text: Numerous natural compounds have been extensively investigated for their potential for cancer prevention over decades. Curcumin, from Curcuma longa, is a highly promising natural compound that can be potentially used for chemoprevention of multiple cancers. Curcumin modulates multiple molecular pathways involved in the lengthy carcinogenesis process to exert its chemopreventive effects through seve...
+---
+#### Comparison Example 4
+**Query:** Organic Milk and Prostate Cancer
+**From:** dev split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Milk Intake in Early Life and Risk of Advanced Prostate Cancer
+- Text: The authors investigated whether early-life residency in certain areas of Iceland marked by distinct differences in milk intake was associated with risk of prostate cancer in a population-based cohort of 8,894 men born between 1907 and 1935. Through linkage to cancer and mortality registers, the men were followed for prostate cancer diagnosis and mortality from study entry (in waves from 1967 to 1...
+*Document 2:*
+- Title: The experience of Japan as a clue to the etiology of breast and ovarian cancers: relationship between death from both malignancies and dietary prac...
+- Text: Although breast and ovarian cancers are rare in Japan compared with other developed countries, the death rates for both are increasing. In Japan, dramatic lifestyle changes occurred after World War II. Over the past 50 years (1947-1997), the age-standardized death rates of breast and ovarian cancers increased about 2- and 4-fold, respectively, and the respective intake of milk, meat, and eggs incr...
+*Document 3:*
+- Title: Milk consumption is a risk factor for prostate cancer in Western countries: evidence from cohort studies.
+- Text: We have previously found a positive association between milk consumption and prostate cancer risk using meta-analysis to analyze published case-control studies. In the present study, further meta-analysis was conducted to estimate the summary relative risk (RR) between the consumption of milk and dairy products and prostate cancer from cohort studies published between 1966- 2006. We found 18 relev...
+**Score 1 Documents:**
+*Document 1:*
+- Title: The experience of Japan as a clue to the etiology of testicular and prostatic cancers.
+- Text: In Japan dramatic lifestyle changes occurred after World War 2. To examine the experience of Japan as a clue to the etiology, trends in the mortality rates of testicular and prostatic cancers from 1947 to 1998 were related to changes in dietary practices. The male population born before 1945 had a peak in death from testicular cancer in their thirties or forties, whereas those born after 1946 had ...
+*Document 2:*
+- Title: Effect of diet and exercise on serum insulin, IGF-I, and IGFBP-1 levels and growth of LNCaP cells in vitro (United States).
+- Text: OBJECTIVE: Accumulating evidence indicates that prostate cancer is associated with high levels of serum IGF-I. This study was conducted to determine whether a low-fat diet and exercise (DE) intervention may modulate the IGF axis and reduce prostate cancer cell growth in vitro. METHODS: Fasting serum was obtained from 14 men (age 60 +/- 3 years) participating in an 11-day DE program and from eight ...
+*Document 3:*
+- Title: Intensive lifestyle changes may affect the progression of prostate cancer.
+- Text: PURPOSE: Men with prostate cancer are often advised to make changes in diet and lifestyle, although the impact of these changes has not been well documented. Therefore, we evaluated the effects of comprehensive lifestyle changes on prostate specific antigen (PSA), treatment trends and serum stimulated LNCaP cell growth in men with early, biopsy proven prostate cancer after 1 year. MATERIALS AND ME...
+---
+#### Comparison Example 5
+**Query:** Foods for Macular Degeneration
+**From:** dev split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Blueberry anthocyanins: protection against ageing and light-induced damage in retinal pigment epithelial cells.
+- Text: Retinal pigment epithelium (RPE) cells are vital for retinal health. However, they are susceptible to injury with ageing and exposure to excessive light, including UV (100-380 nm) and visible (380-760 nm) radiation. To evaluate the protective effect of blueberry anthocyanins on RPE cells, in vitro cell models of replicative senescent and light-induced damage were established in the present study. ...
+*Document 2:*
+- Title: Macular pigment and risk for age-related macular degeneration in subjects from a Northern European population.
+- Text: PURPOSE: Age and advanced disease in the fellow eye are the two most important risk factors for age-related macular degeneration (AMD). In this study, the authors investigated the relationship between these variables and the optical density of macular pigment (MP) in a group of subjects from a northern European population. METHODS: The optical density of MP was measured psychophysically in 46 subj...
+*Document 3:*
+- Title: A 12-wk egg intervention increases serum zeaxanthin and macular pigment optical density in women.
+- Text: Two carotenoids found in egg yolk, lutein and zeaxanthin, accumulate in the macular retina where they may reduce photostress. Increases in serum lutein and zeaxanthin were observed in previous egg interventions, but no study measured macular carotenoids. The objective of this project was to determine whether increased consumption of eggs would increase retinal lutein and zeaxanthin, or macular pig...
+**Score 1 Documents:**
+*Document 1:*
+- Title: Iris color and macular pigment optical density.
+- Text: The present study was designed to assess the relationship between iris color and macular pigment optical density. Both melanin and carotenoids (responsible for iris color and macular pigment composition, respectively) appear to protect the retina through similar mechanisms and higher concentrations may reduce the incidence of retinal degenerations. To evaluate this relationship, 95 subjects were e...
+*Document 2:*
+- Title: Nearwork-induced transient myopia (NITM) and permanent myopia--is there a link?
+- Text: Myopia is a worldwide public health problem. However, its understanding is incomplete, and many of its preventative and therapeutic aspects remain controversial. Nearwork is a primary, environmentally based factor in the aetiology of permanent myopia (PM), with nearwork-induced transient myopia (NITM) being a possible contributory component. A relationship between PM and NITM has been suggested, b...
+*Document 3:*
+- Title: Variations in product choices of frequently purchased herbs: caveat emptor.
+- Text: BACKGROUND: Patients who report use of herbs to their physicians may not be able to accurately describe the ingredients or recommended dosage because the products for the same herb may differ. The purpose of this study was to describe variations in label information of products for each of the 10 most commonly purchased herbs. METHODS: Products for each of 10 herbs were surveyed in a convenience s...
+---
+### Examples by Split and Score
+#### TRAIN Split
+**Scores in train: [1]**
+##### Score 1 Examples (train)
+**Example 1:**
+- Query: Breast Cancer Cells Feed on Cholesterol
+- Doc Title: Content of low density lipoprotein receptors in breast cancer tissue related to survival of patients.
+- Doc Text: The content of low density lipoprotein (LDL) receptors in tissue from primary breast cancers was determined and its prognostic information compared with that of variables of established prognostic importance. Frozen tumour specimens were selected, and tissue from 72 patients (32 of whom had died) we...
+**Example 2:**
+- Query: Using Diet to Treat Asthma and Eczema
+- Doc Title: Vegetarian diet ameliorates symptoms of atopic dermatitis through reduction of the number of peripheral eosinophils and of PGE2 synthesis by monocy...
+- Doc Text: Many patients with atopic dermatitis are dissatisfied with conventional treatments based on topical steroids and have experienced some traditional remedies and alternative therapies. However, most of such therapies have not been evaluated scientifically and clinically by specialists. This study was ...
+**Example 3:**
+- Query: Treating Asthma With Plants vs. Pills
+- Doc Title: Dietary polyphenols in the prevention and treatment of allergic diseases.
+- Doc Text: Allergic disorders encompass skin, food and respiratory allergies. Sensitization to a normally harmless allergen results in the immune system being biased to a predominant T-helper type 2 response. Re-exposure to the same allergen leads to a robust secretion of allergy-related mediators that eventua...
+**Example 4:**
+- Query: How Fruits and Vegetables Can Treat Asthma
+- Doc Title: Diet and asthma, allergic rhinoconjunctivitis and atopic eczema symptom prevalence: an ecological analysis of the International Study of Asthma and...
+- Doc Text: Several studies have suggested that the increasing prevalence of symptoms of asthma, rhinitis and eczema, could be associated with dietary factors. In the present paper, a global analysis of prevalence rates of wheeze, allergic rhinoconjunctivitis and atopic eczema was performed in relation to diet,...
+**Example 5:**
+- Query: How Fruits and Vegetables Can Prevent Asthma
+- Doc Title: The association of diet with respiratory symptoms and asthma in schoolchildren in Taipei, Taiwan.
+- Doc Text: This study aimed to evaluate the association of diet with respiratory symptoms and asthma in schoolchildren in Taipei, Taiwan. An in-class interview survey elicited experiences of asthma and respiratory symptoms and consumption frequencies of the major food categories in 2290 fifth graders. Respirat...
+**Example 6:**
+- Query: Our Tax Dollars Subsidize Unhealthy Foods
+- Doc Title: Food prices and blood cholesterol.
+- Doc Text: Cardiovascular diseases (CVD) cost Americans billions of dollars per year. High cholesterol levels, which are closely related to dietary habits, are a major contributor to CVD. In this article, we study whether changes in food prices are related to cholesterol levels and whether taxes or subsidies o...
+**Example 7:**
+- Query: Reducing Arsenic in Chicken and Rice
+- Doc Title: Arsenic and lead in juice: apple, citrus, and apple-base.
+- Doc Text: Exposure limits for arsenic and lead in drinking water have long been established by the U.S. Environmental Protection Agency and new regulations regarding the presence of these contaminants in bottled water went into effect in California in 2009. No comparable exposure limits or regulations are ava...
+**Example 8:**
+- Query: How Contaminated Are Our Children?
+- Doc Title: Cancer and non-cancer health effects from food contaminant exposures for children and adults in California: a risk assessment
+- Doc Text: Background In the absence of current cumulative dietary exposure assessments, this analysis was conducted to estimate exposure to multiple dietary contaminants for children, who are more vulnerable to toxic exposure than adults. Methods We estimated exposure to multiple food contaminants based on di...
+**Example 9:**
+- Query: Blood Type Diet Perceived as "Crass Fraud"
+- Doc Title: Blood type diets lack supporting evidence: a systematic review.
+- Doc Text: BACKGROUND: Diets that are based on the ABO blood group system have been promoted over the past decade and claim to improve health and decrease risk of disease. To our knowledge, the evidence to support the effectiveness of blood type diets has not previously been assessed in the scientific literatu...
+**Example 10:**
+- Query: Why Do Heart Doctors Favor Surgery and Drugs Over Diet?
+- Doc Title: Cross-analysis of dietary prescriptions and adherence in 356 hypercholesterolaemic patients.
+- Doc Text: BACKGROUND: One of the major issues in controlling serum cholesterol through dietetic intervention appears to be the need to improve patient adherence. AIMS: To explore the many questions regarding barriers to, and motivators for, cholesterol-lowering diet adherence. METHODS: We surveyed French gene...
+#### DEV Split
+**Scores in dev: [1, 2]**
+##### Score 1 Examples (dev)
+**Example 1:**
+- Query: Why Deep Fried Foods May Cause Cancer
+- Doc Title: Review of epidemiologic studies of dietary acrylamide intake and the risk of cancer.
+- Doc Text: Conjectured associations between dietary acrylamide intake and cancer have been evaluated in more than 15 epidemiologic studies examining almost every major cancer site. We have critically reviewed the epidemiologic studies of estimated dietary acrylamide exposure and cancer. As substantially greate...
+**Example 2:**
+- Query: Living Longer by Reducing Leucine Intake
+- Doc Title: mTOR is a key modulator of ageing and age-related disease
+- Doc Text: Many experts in the biology of ageing believe that pharmacological interventions to slow ageing are a matter of ‘when’ rather than ‘if’. A leading target for such interventions is the nutrient response pathway defined by the mechanistic target of rapamycin (mTOR). Inhibition of this pathway extends ...
+**Example 3:**
+- Query: Why are Cancer Rates so Low in India?
+- Doc Title: Apoptosis induced by death receptors.
+- Doc Text: Death receptors belong to the TNF receptor family and are characterised by an intracellular death domain that serves to recruit adapter proteins such as TRADD and FADD and cysteine proteases such as Caspase-8. Activation of Caspase-8 on the aggregated receptor leads to apoptosis. Triggering of death...
+**Example 4:**
+- Query: Peeks Behind the Egg Industry Curtain
+- Doc Title: Characterization of bacteria, clostridia and Bacteroides in faeces of vegetarians using qPCR and PCR-DGGE fingerprinting.
+- Doc Text: BACKGROUND/AIMS: This study aimed to investigate the quantitative and qualitative changes of bacteria, Bacteroides, Bifidobacterium and Clostridium cluster IV in faecal microbiota associated with a vegetarian diet. METHODS: Bacterial abundances were measured in faecal samples of 15 vegetarians and 1...
+**Example 5:**
+- Query: Organic Milk and Prostate Cancer
+- Doc Title: The experience of Japan as a clue to the etiology of testicular and prostatic cancers.
+- Doc Text: In Japan dramatic lifestyle changes occurred after World War 2. To examine the experience of Japan as a clue to the etiology, trends in the mortality rates of testicular and prostatic cancers from 1947 to 1998 were related to changes in dietary practices. The male population born before 1945 had a p...
+**Example 6:**
+- Query: Foods for Macular Degeneration
+- Doc Title: Iris color and macular pigment optical density.
+- Doc Text: The present study was designed to assess the relationship between iris color and macular pigment optical density. Both melanin and carotenoids (responsible for iris color and macular pigment composition, respectively) appear to protect the retina through similar mechanisms and higher concentrations ...
+**Example 7:**
+- Query: How Beans Help Our Bones
+- Doc Title: Neuroprotective effect of the natural iron chelator, phytic acid in a cell culture model of Parkinson's disease.
+- Doc Text: Disrupted iron metabolism and excess iron accumulation has been reported in the brains of Parkinson's disease (PD) patients. Because excessive iron can induce oxidative stress subsequently causing degradation of nigral dopaminergic neurons in PD, we determined the protective effect of a naturally oc...
+**Example 8:**
+- Query: Phosphate Additives in Chicken Banned Elsewhere
+- Doc Title: The Prevalence of Phosphorus Containing Food Additives in Top Selling Foods in Grocery Stores
+- Doc Text: Objective To determine the prevalence of phosphorus-containing food additives in best selling processed grocery products and to compare the phosphorus content of a subset of top selling foods with and without phosphorus additives. Design The labels of 2394 best selling branded grocery products in no...
+**Example 9:**
+- Query: How to Boost the Benefits of Exercise
+- Doc Title: Antigenotoxic activity of watercress extract in an in vitro mammalian system using comet assay.
+- Doc Text: Watercress (Cruciferae), an integral part of Mediterranean diets, is a nutritive food which is used in the treatment of several diseases. Oxidative DNA damage seems to play a crucial role in chronic, aging-related diseases and it is considered an important and probably carcinogenic factor. The aim o...
+**Example 10:**
+- Query: How to Treat Multiple Sclerosis With Diet
+- Doc Title: The molecular basis of nutritional intervention in multiple sclerosis: a narrative review.
+- Doc Text: It is commonly accepted that nutrition is one of the possible environmental factors involved in the pathogenesis of multiple sclerosis (MS), but its role as complementary MS treatment is unclear and largely disregarded. At present, MS therapy is not associated to a particular diet, probably due to l...
+##### Score 2 Examples (dev)
+**Example 1:**
+- Query: Why Deep Fried Foods May Cause Cancer
+- Doc Title: Birth Weight, Head Circumference, and Prenatal Exposure to Acrylamide from Maternal Diet: The European Prospective Mother–Child Study (NewGeneris)
+- Doc Text: Background: Acrylamide is a common dietary exposure that crosses the human placenta. It is classified as a probable human carcinogen, and developmental toxicity has been observed in rodents. Objectives: We examined the associations between prenatal exposure to acrylamide and birth outcomes in a pros...
+**Example 2:**
+- Query: Living Longer by Reducing Leucine Intake
+- Doc Title: Insights into the beneficial effect of caloric/ dietary restriction for a healthy and prolonged life
+- Doc Text: Over the last several years, new evidence has kept pouring in about the remarkable effect of caloric restriction (CR) on the conspicuous bedfellows- aging and cancer. Through the use of various animal models, it is now well established that by reducing calorie intake one can not only increase life s...
+**Example 3:**
+- Query: Why are Cancer Rates so Low in India?
+- Doc Title: Curcumin and cancer: an "old-age" disease with an "age-old" solution.
+- Doc Text: Cancer is primarily a disease of old age, and that life style plays a major role in the development of most cancers is now well recognized. While plant-based formulations have been used to treat cancer for centuries, current treatments usually involve poisonous mustard gas, chemotherapy, radiation, ...
+**Example 4:**
+- Query: Organic Milk and Prostate Cancer
+- Doc Title: Milk Intake in Early Life and Risk of Advanced Prostate Cancer
+- Doc Text: The authors investigated whether early-life residency in certain areas of Iceland marked by distinct differences in milk intake was associated with risk of prostate cancer in a population-based cohort of 8,894 men born between 1907 and 1935. Through linkage to cancer and mortality registers, the men...
+**Example 5:**
+- Query: Foods for Macular Degeneration
+- Doc Title: Blueberry anthocyanins: protection against ageing and light-induced damage in retinal pigment epithelial cells.
+- Doc Text: Retinal pigment epithelium (RPE) cells are vital for retinal health. However, they are susceptible to injury with ageing and exposure to excessive light, including UV (100-380 nm) and visible (380-760 nm) radiation. To evaluate the protective effect of blueberry anthocyanins on RPE cells, in vitro c...
+**Example 6:**
+- Query: How Beans Help Our Bones
+- Doc Title: The role of phytic acid in legumes: antinutrient or beneficial function?
+- Doc Text: This review describes the present state of knowledge about phytic acid (phytate), which is often present in legume seeds. The antinutritional effects of phytic acid primarily relate to the strong chelating associated with its six reactive phosphate groups. Its ability to complex with proteins and pa...
+**Example 7:**
+- Query: Phosphate Additives in Chicken Banned Elsewhere
+- Doc Title: Effects of Polyphosphate Additives on Campylobacter Survival in Processed Chicken Exudates
+- Doc Text: Campylobacter spp. are responsible for a large number of the bacterial food poisoning cases worldwide. Despite being sensitive to oxygen and nutritionally fastidious, Campylobacter spp. are able to survive in food processing environments and reach consumers in sufficient numbers to cause disease. To...
+**Example 8:**
+- Query: How to Boost the Benefits of Exercise
+- Doc Title: Endurance exercise results in DNA damage as detected by the comet assay.
+- Doc Text: To determine if 6 weeks of supplementation with antioxidants could alleviate exercise-induced DNA damage, we studied 21 runners during a 50 km ultramarathon. Subjects were randomly assigned to one of two groups: (1) placebos (PL) or (2) antioxidants (AO) (1000 mg vitamin C and 400 IU RRR-alpha-tocop...
+**Example 9:**
+- Query: How to Treat Multiple Sclerosis With Diet
+- Doc Title: Improvement of endothelium-dependent coronary vasodilation after a single LDL apheresis in patients with hypercholesterolemia.
+- Doc Text: The purpose of this study was to determine whether a single LDL apheresis would improve impaired endothelium-dependent dilation of the coronary artery in patients with hypercholesterolemia. Hypercholesterolemia is associated with impaired endothelial function, and human studies using cholesterol-low...
+**Example 10:**
+- Query: How to Get Kids to Eat Their Vegetables
+- Doc Title: Attractive names sustain increased vegetable intake in schools.
+- Doc Text: OBJECTIVE: This study will determine if the selective use of attractive names can be a sustainable, scalable means to increase the selection of vegetables in school lunchrooms. METHODS: Study 1 paired an attractive name with carrots in five elementary schools (n=147) and measured selection and consu...
+#### TEST Split
+**Scores in test: [1, 2]**
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: Do Cholesterol Statin Drugs Cause Breast Cancer?
+- Doc Title: Women and statin use: a women's health advocacy perspective.
+- Doc Text: This paper is based on a longer report on the benefits, safety and modalities of information representation with regard to women and statin use, situated within the historical context of Women's Health Movement which has advocated for unbiased, appropriate medical research and prescribing for women ...
+**Example 2:**
+- Query: Exploiting Autophagy to Live Longer
+- Doc Title: Why human lifespan is rapidly increasing: solving "longevity riddle" with "revealed-slow-aging" hypothesis
+- Doc Text: Healthy life span is rapidly increasing and human aging seems to be postponed. As recently exclaimed in Nature, these findings are so perplexing that they can be dubbed the 'longevity riddle'. To explain current increase in longevity, I discuss that certain genetic variants such as hyper-active mTOR...
+**Example 3:**
+- Query: How to Reduce Exposure to Alkylphenols Through Your Diet
+- Doc Title: Maternal meat and fat consumption during pregnancy and suspected atopic eczema in Japanese infants aged 3-4 months: the Osaka Maternal and Child He...
+- Doc Text: Interest has increased in the possibility that maternal dietary intake during pregnancy might influence the development of allergic disorders in children. The present prospective study examined the association of maternal intake of selected foods high in fatty acids and specific types of fatty acids...
+**Example 4:**
+- Query: What’s Driving America’s Obesity Problem?
+- Doc Title: Prevalence of physical activity and obesity in US counties, 2001–2011: a road map for action
+- Doc Text: Background Obesity and physical inactivity are associated with several chronic conditions, increased medical care costs, and premature death. Methods We used the Behavioral Risk Factor Surveillance System (BRFSS), a state-based random-digit telephone survey that covers the majority of United States ...
+**Example 5:**
+- Query: Who Should be Careful About Curcumin?
+- Doc Title: Curcumin: a new paradigm and therapeutic opportunity for the treatment of osteoarthritis: curcumin for osteoarthritis management
+- Doc Text: The management of osteoarthritis represents a real challenge. This complex and multi-factorial disease evolves over decades and requires not only the alleviation of symptoms, i.e. pain and joint function but also the preservation of articular structure without side effects. Nutraceuticals are good c...
+**Example 6:**
+- Query: Foods for Glaucoma
+- Doc Title: Blueberry anthocyanins: protection against ageing and light-induced damage in retinal pigment epithelial cells.
+- Doc Text: Retinal pigment epithelium (RPE) cells are vital for retinal health. However, they are susceptible to injury with ageing and exposure to excessive light, including UV (100-380 nm) and visible (380-760 nm) radiation. To evaluate the protective effect of blueberry anthocyanins on RPE cells, in vitro c...
+**Example 7:**
+- Query: What is Actually in Chicken Nuggets?
+- Doc Title: The Prevalence of Phosphorus Containing Food Additives in Top Selling Foods in Grocery Stores
+- Doc Text: Objective To determine the prevalence of phosphorus-containing food additives in best selling processed grocery products and to compare the phosphorus content of a subset of top selling foods with and without phosphorus additives. Design The labels of 2394 best selling branded grocery products in no...
+**Example 8:**
+- Query: What Do Meat Purge and Cola Have in Common?
+- Doc Title: The Prevalence of Phosphorus Containing Food Additives in Top Selling Foods in Grocery Stores
+- Doc Text: Objective To determine the prevalence of phosphorus-containing food additives in best selling processed grocery products and to compare the phosphorus content of a subset of top selling foods with and without phosphorus additives. Design The labels of 2394 best selling branded grocery products in no...
+**Example 9:**
+- Query: Chronic Headaches and Pork Parasites
+- Doc Title: Cognitive Changes and Quality of Life in Neurocysticercosis: A Longitudinal Study
+- Doc Text: Background Few studies have focused on the cognitive morbidity of neurocysticercosis (NCC), one of the most common parasitic infections of the central nervous system. We longitudinally assessed the cognitive status and quality of life (QoL) of patients with incident symptomatic NCC cases and matched...
+**Example 10:**
+- Query: Stopping Heart Disease in Childhood
+- Doc Title: Endothelial function in healthy 11-year-old children after dietary intervention with onset in infancy: the Special Turku Coronary Risk Factor Inter...
+- Doc Text: BACKGROUND: Early childhood introduction of nutritional habits aimed at atherosclerosis prevention reduces children's serum total cholesterol concentration, but its effect on vascular endothelial function is unknown. METHODS AND RESULTS: Between 1990 and 1992, we randomized healthy 7-month-old infan...
+##### Score 2 Examples (test)
+**Example 1:**
+- Query: Do Cholesterol Statin Drugs Cause Breast Cancer?
+- Doc Title: Elevated Levels of Cholesterol-Rich Lipid Rafts in Cancer Cells Are Correlated with Apoptosis Sensitivity Induced by Cholesterol-Depleting Agents
+- Doc Text: Lipid rafts/caveolae are membrane platforms for signaling molecules that regulate various cellular functions, including cell survival. To better understand the role of rafts in tumor progression and therapeutics, we investigated the effect of raft disruption on cell viability and compared raft level...
+**Example 2:**
+- Query: Exploiting Autophagy to Live Longer
+- Doc Title: Insights into the beneficial effect of caloric/ dietary restriction for a healthy and prolonged life
+- Doc Text: Over the last several years, new evidence has kept pouring in about the remarkable effect of caloric restriction (CR) on the conspicuous bedfellows- aging and cancer. Through the use of various animal models, it is now well established that by reducing calorie intake one can not only increase life s...
+**Example 3:**
+- Query: How to Reduce Exposure to Alkylphenols Through Your Diet
+- Doc Title: p-Nonyl-phenol: an estrogenic xenobiotic released from "modified" polystyrene.
+- Doc Text: Alkylphenols are widely used as plastic additives and surfactants. We report the identification of an alkylphenol, nonylphenol, as an estrogenic substance released from plastic centrifuge tubes. This compound was extracted with methanol, purified by flash chromatography and reverse-phase high perfor...
+**Example 4:**
+- Query: What’s Driving America’s Obesity Problem?
+- Doc Title: The role of exercise in the treatment of obesity.
+- Doc Text: The United States is in the midst of a significant public health problem that relates to obesity and inactivity. This epidemic has far-ranging consequences for our workforce and our children and shows no signs of slowing in the near future. Significant research has been performed on the effects of e...
+**Example 5:**
+- Query: Who Should be Careful About Curcumin?
+- Doc Title: Total and soluble oxalate content of some Indian spices.
+- Doc Text: Spices, such as cinnamon, cloves, cardamom, garlic, ginger, cumin, coriander and turmeric are used all over the world as flavouring and colouring ingredients in Indian foods. Previous studies have shown that spices contain variable amounts of total oxalates but there are few reports of soluble oxala...
+**Example 6:**
+- Query: Foods for Glaucoma
+- Doc Title: Antioxidant intake and primary open-angle glaucoma: a prospective study.
+- Doc Text: The relation between dietary antioxidant intake and primary open-angle glaucoma risk was examined in participants aged over 40 years in the Nurses' Health Study (n = 76,200) and the Health Professionals Follow-up Study (n = 40,284). They were followed biennially from 1980 and 1986, respectively, to ...
+**Example 7:**
+- Query: What is Actually in Chicken Nuggets?
+- Doc Title: The autopsy of chicken nuggets reads "chicken little".
+- Doc Text: PURPOSE: To determine the contents of chicken nuggets from 2 national food chains. BACKGROUND: Chicken nuggets have become a major component of the American diet. We sought to determine the current composition of this highly processed food. METHODS: Randomly selected nuggets from 2 different nationa...
+**Example 8:**
+- Query: What Do Meat Purge and Cola Have in Common?
+- Doc Title: Public health impact of dietary phosphorus excess on bone and cardiovascular health in the general population.
+- Doc Text: This review explores the potential adverse impact of the increasing phosphorus content in the American diet on renal, cardiovascular, and bone health of the general population. Increasingly, studies show that phosphorus intakes in excess of the nutrient needs of a healthy population may significantl...
+**Example 9:**
+- Query: Chronic Headaches and Pork Parasites
+- Doc Title: Clinical manifestations, diagnosis, and treatment of neurocysticercosis.
+- Doc Text: Neurocysticercosis (NCC) is the most frequent parasitic disease of the human brain. Modern imaging studies, CT and MRI, have defined the diagnosis and characterization of the disease. Through these studies the therapeutic approach for each case may be individualized with the aid of antihelmintics, s...
+**Example 10:**
+- Query: Stopping Heart Disease in Childhood
+- Doc Title: Relation of serum lipoprotein levels and systolic blood pressure to early atherosclerosis. The Bogalusa Heart Study.
+- Doc Text: We assessed the relation of risk factors for cardiovascular disease to early atherosclerotic lesions in the aorta and coronary arteries in 35 persons (mean age at death, 18 years). Aortic involvement with fatty streaks was greater in blacks than in whites (37 vs. 17 percent, P less than 0.01). Howev...
+## CORE
+**Has Scores 1, 2 (in dev/test) but only 1 in train**
+**Score 2 = HIGHLY RELEVANT** ✓
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: Medical IR bridging layman queries to technical PubMed docs
+- **What Scores Really Mean**:
+  - Score 2: Direct, highly relevant medical papers (best answers)
+  - Score 1: Related papers providing context/mechanisms (still relevant!)
+  - Example: "Reducing Leucine" query → Score 2: caloric restriction papers, Score 1: mTOR/amino acid papers (leucine activates mTOR!)
+- **Dataset Type**: Medical domain Q&A (NutritionFacts.org)
+- **CRITICAL ISSUE**: Train has collapsed scores (all 1s), test/dev have proper 1/2 distinction
+## Training Verdict
+**GOOD FOR TRAINING** ✓ (with caveats)
+- Both Score 1 and 2 are truly relevant
+- Score 1 provides mechanistic/related info
+- BUT: Must handle train/test mismatch (train lacks Score 2)
+- Recommendation: Use as binary (1+2 = positive) OR use only dev/test splits
+## Observations
+⚠️ **Score mismatch between train and test:**
+- Train has scores: [1]
+- Test has scores: [1, 2]
+- This could cause issues when training models: the Score 1 vs. Score 2 distinction present in test never appears in train
+⚠️ **Training data only has Score 1:**
+- All training documents are marked as equally relevant
+- Model cannot learn to distinguish relevance levels
+- May indicate collapsed/merged relevance levels

test_scores/dataset_reports/quora_analysis.md ADDED Viewed

File without changes

test_scores/dataset_reports/scidocs_analysis.md ADDED Viewed

	@@ -0,0 +1,555 @@

+# SCIDOCS Dataset Analysis
+## Overview
+- Total queries available: 1000
+- Total corpus documents: 25657
+- Corpus documents loaded for examples: 25657
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | N/A | N/A | N/A |
+| dev | N/A | N/A | N/A |
+| test | 1000 | 29928 | 0:25000, 1:4928 |
+## Score Analysis
+**Multiple score levels found: [0, 1]**
+- Higher scores indicate higher relevance
+- Score 0 typically means not relevant
+- Score 1 typically means relevant
+- Score 2 would typically mean highly relevant, but it does not occur in this dataset
+## Examples
+### Score Comparison Examples
+#### Comparison Example 1
+**Query:** A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect
+**From:** test split
+**Score 1 Documents:**
+*Document 1:*
+- Title: A hybrid of genetic algorithm and particle swarm optimization for recurrent network design
+- Text: An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is thus called HGAPSO. In HGAPSO, individuals in a new generation are created, not only by crossover a...
+*Document 2:*
+- Title: A Hybrid EP and SQP for Dynamic Economic Dispatch with Nonsmooth Fuel Cost Function
+- Text: Dynamic economic dispatch (DED) is one of the main functions of power generation operation and control. It determines the optimal settings of generator units with predicted load demand over a certain period of time. The objective is to operate an electric power system most economically while the system is operating within its security limits. This paper proposes a new hybrid methodology for solvin...
+*Document 3:*
+- Title: Genetic Fuzzy Systems - Evolutionary Tuning and Learning of Fuzzy Knowledge Bases
+- Text: It's not surprisingly when entering this site to get the book. One of the popular books now is the genetic fuzzy systems evolutionary tuning and learning of fuzzy knowledge bases. You may be confused because you can't find the book in the book store around your city. Commonly, the popular book will be sold quickly. And when you have found the store to buy the book, it will be so hurt when you run ...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Separate face and body selectivity on the fusiform gyrus.
+- Text: Recent reports of a high response to bodies in the fusiform face area (FFA) challenge the idea that the FFA is exclusively selective for face stimuli. We examined this claim by conducting a functional magnetic resonance imaging experiment at both standard (3.125 x 3.125 x 4.0 mm) and high resolution (1.4 x 1.4 x 2.0 mm). In both experiments, regions of interest (ROIs) were defined using data from ...
+*Document 2:*
+- Title: Scheduling for Reduced CPU Energy
+- Text: The energy usage of computer systems is becoming more important, especially for battery operated systems. Displays, disks, and cpus, in that order, use the most energy. Reducing the energy used by displays and disks has been studied elsewhere; this paper considers a new method for reducing the energy used by the cpu. We introduce a new metric for cpu energy performance, millions-of-instructions-pe...
+*Document 3:*
+- Title: A data mining approach for location prediction in mobile environments
+- Text: Mobility prediction is one of the most essential issues that need to be explored for mobility management in mobile computing systems. In this paper, we propose a new algorithm for predicting the next inter-cell movement of a mobile user in a Personal Communication Systems network. In the first phase of our threephase algorithm, user mobility patterns are mined from the history of mobile user traje...
+---
+#### Comparison Example 2
+**Query:** Bearish-Bullish Sentiment Analysis on Financial Microblogs
+**From:** test split
+**Score 1 Documents:**
+*Document 1:*
+- Title: SemEval-2015 Task 11: Sentiment Analysis of Figurative Language in Twitter
+- Text: This report summarizes the objectives and evaluation of the SemEval 2015 task on the sentiment analysis of figurative language on Twitter (Task 11). This is the first sentiment analysis task wholly dedicated to analyzing figurative language on Twitter. Specifically, three broad classes of figurative language are considered: irony, sarcasm and metaphor. Gold standard sets of 8000 training tweets an...
+*Document 2:*
+- Title: Text mining of news-headlines for FOREX market prediction: A Multi-layer Dimension Reduction Algorithm with semantics and sentiment
+- Text: In this paper a novel approach is proposed to predict intraday directional-movements of a currency-pair in the foreign exchange market based on the text of breaking financial news-headlines. The motivation behind this work is twofold: First, although market-prediction through text-mining is shown to be a promising area of work in the literature, the text-mining approaches utilized in it at this st...
+*Document 3:*
+- Title: BabelNet: The automatic construction, evaluation and application of a wide-coverage multilingual semantic network
+- Text: a r t i c l e i n f o a b s t r a c t We present an automatic approach to the construction of BabelNet, a very large, wide-coverage multilingual semantic network. Key to our approach is the integration of lexicographic and encyclopedic knowledge from WordNet and Wikipedia. In addition, Machine Translation is applied to enrich the resource with lexical information for all languages. We first conduc...
+**Score 0 Documents:**
+*Document 1:*
+- Title: A survey of software testing practices in alberta
+- Text: Software organizations have typically de-emphasized the importance of software testing. In this paper, the results of a regional survey of software testing and software quality assurance techniques are described. Researchers conducted the study during the summer and fall of 2002 by surveying software organizations in the Province of Alberta. Results indicate that Alberta-based organizations tend t...
+*Document 2:*
+- Title: Analysis and Design of Average Current Mode Control Using a Describing-Function-Based Equivalent Circuit Model
+- Text: This paper proposes a small-signal model for average current mode control based on an equivalent circuit. The model uses a three-terminal equivalent circuit model based on a linearized describing function method to include the feedback effect of the sideband frequency components of the inductor current. The model extends the results obtained in peak current mode control to average current mode con...
+*Document 3:*
+- Title: Using deep learning for short text understanding
+- Text: Classifying short texts to one category or clustering semantically related texts is challenging, and the importance of both is growing due to the rise of microblogging platforms, digital news feeds, and the like. We can accomplish this classifying and clustering with the help of a deep neural network which produces compact binary representations of a short text, and can assign the same category to...
+---
+#### Comparison Example 3
+**Query:** Predicting defects in SAP Java code: An experience report
+**From:** test split
+**Score 1 Documents:**
+*Document 1:*
+- Title: Hipikat: a project memory for software development
+- Text: Sociological and technical difficulties, such as a lack of informal encounters, can make it difficult for new members of noncollocated software development teams to learn from their more experienced colleagues. To address this situation, we have developed a tool, named Hipikat that provides developers with efficient and effective access to the group memory for a software development project that i...
+*Document 2:*
+- Title: A Metrics Suite for Object Oriented Design
+- Text: Given the central role that software development plays in the delivery and application of information technology, managers are increasingly focusing on process improvement in the software development area. This demand has spurred the provision of a number of new and/or improved approaches to software development, with perhaps the most prominent being object-orientation (OO). In addition, the focus...
+*Document 3:*
+- Title: Two case studies of open source software development: Apache and Mozilla
+- Text: According to its proponents, open source style software development has the capacity to compete successfully, and perhaps in many cases displace, traditional commercial development methods. In order to begin investigating such claims, we examine data from two major open source projects, the Apache web server and the Mozilla browser. By using email archives of source code change history and problem...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Object Detection Featuring 3D Audio Localization for Microsoft HoloLens - A Deep Learning based Sensor Substitution Approach for the Blind
+- Text: Finding basic objects on a daily basis is a difficult but common task for blind people. This paper demonstrates the implementation of a wearable, deep learning backed, object detection approach in the context of visual impairment or blindness. The prototype aims to substitute the impaired eye of the user and replace it with technical sensors. By scanning its surroundings, the prototype provides a ...
+*Document 2:*
+- Title: English as a Formal Specification Language
+- Text: PENG is a computer-processable controlled natural language designed for writing unambiguous and precise specifications. PENG covers a strict subset of standard English and is precisely defined by a controlled grammar and a controlled lexicon. In contrast to other controlled languages, the author does not need to know the grammatical restrictions explicitly. ECOLE, a look-ahead text editor, indicat...
+*Document 3:*
+- Title: Automatic Retraction and Full-Cycle Operation for a Class of Airborne Wind Energy Generators
+- Text: Airborne wind energy systems aim to harvest the power of winds blowing at altitudes higher than what conventional wind turbines reach. They employ a tethered flying structure, usually a wing, and exploit the aerodynamic lift to produce electrical power. In the case of ground-based systems, where the traction force on the tether is used to drive a generator on the ground, a two-phase power cycle is...
+---
+#### Comparison Example 4
+**Query:** Active-Metric Learning for Classification of Remotely Sensed Hyperspectral Images
+**From:** test split
+**Score 1 Documents:**
+*Document 1:*
+- Title: Query by Committee
+- Text: We propose an algorithm called query by commitee, in which a committee of students is trained on the same data set. The next query is chosen according to the principle of maximal disagreement. The algorithm is studied for two toy models: the high-low game and perceptron learning of another perceptron. As the number of queries goes to infinity, the committee algorithm yields asymptotically finite i...
+*Document 2:*
+- Title: Adaptive Manifold Learning
+- Text: Manifold learning algorithms seek to find a low-dimensional parameterization of high-dimensional data. They heavily rely on the notion of what can be considered as local, how accurately the manifold can be approximated locally, and, last but not least, how the local structures can be patched together to produce the global parameterization. In this paper, we develop algorithms that address two key ...
+*Document 3:*
+- Title: Manifold-Learning-Based Feature Extraction for Classification of Hyperspectral Data: A Review of Advances in Manifold Learning
+- Text: Advances in hyperspectral sensing provide new capability for characterizing spectral signatures in a wide range of physical and biological systems, while inspiring new methods for extracting information from these data. HSI data often lie on sparse, nonlinear manifolds whose geometric and topological structures can be exploited via manifold-learning techniques. In this article, we focused on demon...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Leveraging graph topology and semantic context for pharmacovigilance through twitter-streams
+- Text: Adverse drug events (ADEs) constitute one of the leading causes of post-therapeutic death and their identification constitutes an important challenge of modern precision medicine. Unfortunately, the onset and effects of ADEs are often underreported complicating timely intervention. At over 500 million posts per day, Twitter is a commonly used social media platform. The ubiquity of day-to-day perso...
+*Document 2:*
+- Title: IMPLICIT SHAPE MODELS FOR OBJECT DETECTION IN 3D POINT CLOUDS
+- Text: We present a method for automatic object localization and recognition in 3D point clouds representing outdoor urban scenes. The method is based on the implicit shape models (ISM) framework, which recognizes objects by voting for their center locations. It requires only few training examples per class, which is an important property for practical use. We also introduce and evaluate an improved vers...
+*Document 3:*
+- Title: Multi-task Learning for Maritime Traffic Surveillance from AIS Data Streams
+- Text: In a world of global trading, maritime safety, security and efficiency are crucial issues. We propose a multi-task deep learning framework for vessel monitoring using Automatic Identification System (AIS) data streams. We combine recurrent neural networks with latent variable modeling and an embedding of AIS messages to a new representation space to jointly address key issues to be dealt with when...
+---
+#### Comparison Example 5
+**Query:** Ad Hoc Retrieval Experiments Using WordNet and Automatically Constructed Thesauri
+**From:** test split
+**Score 1 Documents:**
+*Document 1:*
+- Title: Concept Based Query Expansion
+- Text: Query expansion methods have been studied for a long time - with debatable success in many instances. In this paper we present a probabilistic query expansion model based on a similarity thesaurus which was constructed automatically. A similarity thesaurus reflects domain knowledge about the particular collection from which it is constructed. We address the two important issues with query expansio...
+*Document 2:*
+- Title: An Association Thesaurus for Information Retrieval
+- Text: Although commonly used in both commercial and experimental information retrieval systems, thesauri have not demonstrated consistent beneets for retrieval performance, and it is diicult to construct a thesaurus automatically for large text databases. In this paper, an approach, called PhraseFinder, is proposed to construct collection-dependent association thesauri automatically using large full-tex...
+*Document 3:*
+- Title: Experiments in Automatic Statistical Thesaurus Construction
+- Text: A well constructed thesaurus has long been recognized as a valuable tool in the effective operation of an information retrieval system. This paper reports the results of experiments designed to determine the validity of an approach to the automatic construction of global thesauri (described originally by Crouch in [1] and [2] based on a clustering of the document collection. The authors validate t...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Build-to-order supply chain management : a literature review and framework for development
+- Text: The build-to-order supply chain management (BOSC) strategy has recently attracted the attention of both researchers and practitioners, given its successful implementation in many companies including Dell computers, Compaq, and BMW. The growing number of articles on BOSC in the literature is an indication of the importance of the strategy and of its role in improving the competitiveness of an organ...
+*Document 2:*
+- Title: Team MIT Urban Challenge Technical Report
+- Text: This technical report describes Team MIT's approach to the DARPA Urban Challenge. We have developed a novel strategy for using many inexpensive sensors, mounted on the vehicle periphery, and calibrated with a new crossmodal calibration technique. Lidar, camera, and radar data streams are processed using an innovative, locally smooth state representation that provides robust perception for real t...
+*Document 3:*
+- Title: Platforms in healthcare innovation ecosystems: The lens of an innovation intermediary
+- Text: Healthcare innovation has made progressive strides. Innovative solutions now tend to incorporate device integration, data collection and data analysis linked across a diverse range of actors building platform-centric healthcare ecosystems. The interconnectedness and inter-disciplinarity of the ecosystems bring with it a number of vital issues around how to strategically manage such a complex syste...
+---
+### Examples by Split and Score
+#### TEST Split
+**Scores in test: [0, 1]**
+##### Score 0 Examples (test)
+**Example 1:**
+- Query: A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect
+- Doc Title: Separate face and body selectivity on the fusiform gyrus.
+- Doc Text: Recent reports of a high response to bodies in the fusiform face area (FFA) challenge the idea that the FFA is exclusively selective for face stimuli. We examined this claim by conducting a functional magnetic resonance imaging experiment at both standard (3.125 x 3.125 x 4.0 mm) and high resolution...
+**Example 2:**
+- Query: Bearish-Bullish Sentiment Analysis on Financial Microblogs
+- Doc Title: A survey of software testing practices in alberta
+- Doc Text: Software organizations have typically de-emphasized the importance of software testing. In this paper, the results of a regional survey of software testing and software quality assurance techniques are described. Researchers conducted the study during the summer and fall of 2002 by surveying softwar...
+**Example 3:**
+- Query: Predicting defects in SAP Java code: An experience report
+- Doc Title: Object Detection Featuring 3D Audio Localization for Microsoft HoloLens - A Deep Learning based Sensor Substitution Approach for the Blind
+- Doc Text: Finding basic objects on a daily basis is a difficult but common task for blind people. This paper demonstrates the implementation of a wearable, deep learning backed, object detection approach in the context of visual impairment or blindness. The prototype aims to substitute the impaired eye of the...
+**Example 4:**
+- Query: Active-Metric Learning for Classification of Remotely Sensed Hyperspectral Images
+- Doc Title: Leveraging graph topology and semantic context for pharmacovigilance through twitter-streams
+- Doc Text: Adverse drug events (ADEs) constitute one of the leading causes of post-therapeutic death and their identification constitutes an important challenge of modern precision medicine. Unfortunately, the onset and effects of ADEs are often underreported complicating timely intervention. At over 500 milli...
+**Example 5:**
+- Query: Ad Hoc Retrieval Experiments Using WordNet and Automatically Constructed Thesauri
+- Doc Title: Build-to-order supply chain management : a literature review and framework for development
+- Doc Text: The build-to-order supply chain management (BOSC) strategy has recently attracted the attention of both researchers and practitioners, given its successful implementation in many companies including Dell computers, Compaq, and BMW. The growing number of articles on BOSC in the literature is an indic...
+**Example 6:**
+- Query: Underwater Acoustic Target Tracking: A Review
+- Doc Title: Performance of Store Brands: A Cross-Country Analysis of Consumer Store Brand Preferences, Perceptions, and Risk
+- Doc Text: This paper empirically studies consumer choice behavior in regard to store brands in the US, UK and Spain. Store brand market shares differ by country and they are usually much higher in Europe than in the US. However, there is surprisingly little work in marketing that empirically studies the reaso...
+**Example 7:**
+- Query: Unsupervised Diverse Colorization via Generative Adversarial Networks
+- Doc Title: Active learning for on-road vehicle detection: a comparative study
+- Doc Text: In recent years, active learning has emerged as a powerful tool in building robust systems for object detection using computer vision. Indeed, active learning approaches to on-road vehicle detection have achieved impressive results. While active learning approaches for object detection have been exp...
+**Example 8:**
+- Query: Lane Detection ( Part I ) : Mono-Vision Based Method
+- Doc Title: The Ontological Foundation of REA Enterprise Information Systems
+- Doc Text: Philosophers have studied ontologies for centuries in their search for a systematic explanation of existence: “What kind of things exist?” Recently, ontologies have emerged as a major research topic in the fields of artificial intelligence and knowledge management where they address the content issu...
+**Example 9:**
+- Query: Detection of distributed denial of service attacks using machine learning algorithms in software defined networks
+- Doc Title: Sensing spatial distribution of urban land use by integrating points-of-interest and Google Word2Vec model
+- Doc Text: Urban land use information plays an essential role in a wide variety of urban planning and environmental monitoring processes. During the past few decades, with the rapid technological development of remote sensing (RS), geographic information systems (GIS) and geospatial big data, numerous methods ...
+**Example 10:**
+- Query: Distributed Privacy-Preserving Collaborative Intrusion Detection Systems for VANETs
+- Doc Title: The role of transparency in recommender systems
+- Doc Text: Recommender Systems act as a personalized decision guides, aiding users in decisions on matters related to personal taste. Most previous research on Recommender Systems has focused on the statistical accuracy of the algorithms driving the systems, with little emphasis on interface issues and the use...
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: A Direct Search Method to solve Economic Dispatch Problem with Valve-Point Effect
+- Doc Title: A hybrid of genetic algorithm and particle swarm optimization for recurrent network design
+- Doc Text: An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is...
+**Example 2:**
+- Query: Bearish-Bullish Sentiment Analysis on Financial Microblogs
+- Doc Title: SemEval-2015 Task 11: Sentiment Analysis of Figurative Language in Twitter
+- Doc Text: This report summarizes the objectives and evaluation of the SemEval 2015 task on the sentiment analysis of figurative language on Twitter (Task 11). This is the first sentiment analysis task wholly dedicated to analyzing figurative language on Twitter. Specifically, three broad classes of figurative...
+**Example 3:**
+- Query: Predicting defects in SAP Java code: An experience report
+- Doc Title: Hipikat: a project memory for software development
+- Doc Text: Sociological and technical difficulties, such as a lack of informal encounters, can make it difficult for new members of noncollocated software development teams to learn from their more experienced colleagues. To address this situation, we have developed a tool, named Hipikat that provides develope...
+**Example 4:**
+- Query: Active-Metric Learning for Classification of Remotely Sensed Hyperspectral Images
+- Doc Title: Query by Committee
+- Doc Text: We propose an algorithm called query by commitee, in which a committee of students is trained on the same data set. The next query is chosen according to the principle of maximal disagreement. The algorithm is studied for two toy models: the high-low game and perceptron learning of another perceptro...
+**Example 5:**
+- Query: Ad Hoc Retrieval Experiments Using WordNet and Automatically Constructed Thesauri
+- Doc Title: Concept Based Query Expansion
+- Doc Text: Query expansion methods have been studied for a long time - with debatable success in many instances. In this paper we present a probabilistic query expansion model based on a similarity thesaurus which was constructed automatically. A similarity thesaurus reflects domain knowledge about the particu...
+**Example 6:**
+- Query: Underwater Acoustic Target Tracking: A Review
+- Doc Title: The challenges of building mobile underwater wireless networks for aquatic applications
+- Doc Text: The large-scale mobile underwater wireless sensor network (UWSN) is a novel networking paradigm to explore aqueous environments. However, the characteristics of mobile UWSNs, such as low communication bandwidth, large propagation delay, floating node mobility, and high error probability, are signifi...
+**Example 7:**
+- Query: Unsupervised Diverse Colorization via Generative Adversarial Networks
+- Doc Title: Infrared Colorization Using Deep Convolutional Neural Networks
+- Doc Text: This paper proposes a method for transferring the RGB color spectrum to near-infrared (NIR) images using deep multi-scale convolutional neural networks. A direct and integrated transfer between NIR and RGB pixels is trained. The trained model does not require any user guidance or a reference image d...
+**Example 8:**
+- Query: Lane Detection ( Part I ) : Mono-Vision Based Method
+- Doc Title: Real time detection of lane markers in urban streets
+- Doc Text: We present a robust and real time approach to lane marker detection in urban streets. It is based on generating a top view of the road, filtering using selective oriented Gaussian filters, using RANSAC line fitting to give initial guesses to a new and fast RANSAC algorithm for fitting Bezier Splines...
+**Example 9:**
+- Query: Detection of distributed denial of service attacks using machine learning algorithms in software defined networks
+- Doc Title: Intrusion detection by machine learning: A review
+- Doc Text: The popularity of using Internet contains some risks of network attacks. Intrusion detection is one major research problem in network security, whose aim is to identify unusual access or attacks to secure internal networks. In literature, intrusion detection systems have been approached by various m...
+**Example 10:**
+- Query: Distributed Privacy-Preserving Collaborative Intrusion Detection Systems for VANETs
+- Doc Title: Private Empirical Risk Minimization: Efficient Algorithms and Tight Error Bounds
+- Doc Text: Convex empirical risk minimization is a basic tool in machine learning and statistics. We provide new algorithms and matching lower bounds for differentially private convex empirical risk minimization assuming only that each data point's contribution to the loss function is Lipschitz and that the do...
+## CORE
+**Has Scores 0, 1**
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: Citation prediction - find papers to cite for a given paper
+- **What Score 1 Really Means**: Papers that should be cited (topically related research)
+- **Dataset Type**: Scientific citation recommendation
+- **Key Characteristic**: Helps find relevant prior work for academic papers
+## Training Verdict
+**GOOD FOR TRAINING** ✓
+- Score 1 documents are citation-worthy papers
+- True relevance for academic IR
+- Can use for binary classification: Score 1 = positive, Score 0 = negative
+## Observations

test_scores/dataset_reports/scifact_analysis.md ADDED Viewed

	@@ -0,0 +1,264 @@

+# SCIFACT Dataset Analysis
+## Overview
+- Total queries available: 1109
+- Total corpus documents: 5183
+- Corpus documents loaded for examples: 5183
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | 809 | 919 | 1:919 |
+| dev | N/A | N/A | N/A |
+| test | 300 | 339 | 1:339 |
+## Score Analysis
+**Only one score level (1) found in this dataset.**
+- This means all labeled documents are considered equally relevant
+- Unlabeled documents (not in qrels) are implicitly score 0 (not relevant)
+## Examples
+### Score Comparison Examples
+### Examples by Split and Score
+#### TRAIN Split
+**Scores in train: [1]**
+##### Score 1 Examples (train)
+**Example 1:**
+- Query: 0-dimensional biomaterials lack inductive properties.
+- Doc Title: New opportunities: the use of nanotechnologies to manipulate and track stem cells.
+- Doc Text: Nanotechnologies are emerging platforms that could be useful in measuring, understanding, and manipulating stem cells. Examples include magnetic nanoparticles and quantum dots for stem cell labeling and in vivo tracking; nanoparticles, carbon nanotubes, and polyplexes for the intracellular delivery ...
+**Example 2:**
+- Query: 1 in 5 million in UK have abnormal PrP positivity.
+- Doc Title: Prevalent abnormal prion protein in human appendixes after bovine spongiform encephalopathy epizootic: large scale survey
+- Doc Text: OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, an...
+**Example 3:**
+- Query: 1-1% of colorectal cancer patients are diagnosed with regional or distant metastases.
+- Doc Title: Relation between Medicare screening reimbursement and stage at diagnosis for older patients with colon cancer.
+- Doc Text: CONTEXT Medicare's reimbursement policy was changed in 1998 to provide coverage for screening colonoscopies for patients with increased colon cancer risk, and expanded further in 2001 to cover screening colonoscopies for all individuals. OBJECTIVE To determine whether the Medicare reimbursement poli...
+**Example 4:**
+- Query: 10% of sudden infant death syndrome (SIDS) deaths happen in newborns aged less than 6 months.
+- Doc Title: Sudden infant death syndrome.
+- Doc Text: Despite declines in prevalence during the past two decades, sudden infant death syndrome (SIDS) continues to be the leading cause of death for infants aged between 1 month and 1 year in developed countries. Behavioural risk factors identified in epidemiological studies include prone and side positio...
+**Example 5:**
+- Query: 32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.
+- Doc Title: Liver transplantation and opioid dependence.
+- Doc Text: ContextChronic hepatitis C is the leading cause for liver transplantation in the United States. Intravenous drug use, the major risk factor, accounts for approximately 60% of hepatitis C virus transmission. Information from the United Network of Organ Sharing (UNOS) does not address substance use am...
+**Example 6:**
+- Query: 4-PBA treatment decreases endoplasmic reticulum stress in response to general endoplasmic reticulum stress markers.
+- Doc Title: Wolfram syndrome 1 and adenylyl cyclase 8 interact at the plasma membrane to regulate insulin production and secretion
+- Doc Text: Endoplasmic reticulum (ER) stress causes pancreatic β-cell dysfunction and contributes to β-cell loss and the progression of type 2 diabetes. Wolfram syndrome 1 (WFS1) has been shown to be an important regulator of the ER stress signalling pathway; however, its role in β-cell function remains unclea...
+**Example 7:**
+- Query: 4-PBA treatment raises endoplasmic reticulum stress in response to general endoplasmic reticulum stress markers.
+- Doc Title: Wolfram syndrome 1 and adenylyl cyclase 8 interact at the plasma membrane to regulate insulin production and secretion
+- Doc Text: Endoplasmic reticulum (ER) stress causes pancreatic β-cell dysfunction and contributes to β-cell loss and the progression of type 2 diabetes. Wolfram syndrome 1 (WFS1) has been shown to be an important regulator of the ER stress signalling pathway; however, its role in β-cell function remains unclea...
+**Example 8:**
+- Query: 40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.
+- Doc Title: Effect of homocysteine lowering on mortality and vascular disease in advanced chronic kidney disease and end-stage renal disease: a randomized control
+- Doc Text: CONTEXT High plasma homocysteine levels are a risk factor for mortality and vascular disease in observational studies of patients with chronic kidney disease. Folic acid and B vitamins decrease homocysteine levels in this population but whether they lower mortality is unknown. OBJECTIVE To determine...
+**Example 9:**
+- Query: 5'-nucleotidase metabolizes 6MP.
+- Doc Title: Relapse specific mutations in NT5C2 in childhood acute lymphoblastic leukemia
+- Doc Text: Relapsed childhood acute lymphoblastic leukemia (ALL) carries a poor prognosis, despite intensive retreatment, owing to intrinsic drug resistance. The biological pathways that mediate resistance are unknown. Here, we report the transcriptome profiles of matched diagnosis and relapse bone marrow spec...
+**Example 10:**
+- Query: 50% of patients exposed to radiation have activated markers of mesenchymal stem cells.
+- Doc Title: KLF4-dependent phenotypic modulation of smooth muscle cells has a key role in atherosclerotic plaque pathogenesis
+- Doc Text: Previous studies investigating the role of smooth muscle cells (SMCs) and macrophages in the pathogenesis of atherosclerosis have provided controversial results owing to the use of unreliable methods for clearly identifying each of these cell types. Here, using Myh11-CreERT2 ROSA floxed STOP eYFP Ap...
+#### TEST Split
+**Scores in test: [1]**
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: 0-dimensional biomaterials show inductive properties.
+- Doc Title: New opportunities: the use of nanotechnologies to manipulate and track stem cells.
+- Doc Text: Nanotechnologies are emerging platforms that could be useful in measuring, understanding, and manipulating stem cells. Examples include magnetic nanoparticles and quantum dots for stem cell labeling and in vivo tracking; nanoparticles, carbon nanotubes, and polyplexes for the intracellular delivery ...
+**Example 2:**
+- Query: 1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.
+- Doc Title: Rare Variants Create Synthetic Genome-Wide Associations
+- Doc Text: Genome-wide association studies (GWAS) have now identified at least 2,000 common variants that appear associated with common diseases or related traits (http://www.genome.gov/gwastudies), hundreds of which have been convincingly replicated. It is generally thought that the associated markers reflect...
+**Example 3:**
+- Query: 1/2000 in UK have abnormal PrP positivity.
+- Doc Title: Prevalent abnormal prion protein in human appendixes after bovine spongiform encephalopathy epizootic: large scale survey
+- Doc Text: OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, an...
+**Example 4:**
+- Query: 5% of perinatal mortality is due to low birth weight.
+- Doc Title: Estimates of global prevalence of childhood underweight in 1990 and 2015.
+- Doc Text: CONTEXT One key target of the United Nations Millennium Development goals is to reduce the prevalence of underweight among children younger than 5 years by half between 1990 and 2015. OBJECTIVE To estimate trends in childhood underweight by geographic regions of the world. DESIGN, SETTING, AND PARTI...
+**Example 5:**
+- Query: A deficiency of vitamin B12 increases blood levels of homocysteine.
+- Doc Title: Folic acid improves endothelial function in coronary artery disease via mechanisms largely independent of homocysteine lowering.
+- Doc Text: BACKGROUND Homocysteine is a risk factor for coronary artery disease (CAD), although a causal relation remains to be proven. The importance of determining direct causality rests in the fact that plasma homocysteine can be safely and inexpensively reduced by 25% with folic acid. This reduction is max...
+**Example 6:**
+- Query: A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.
+- Doc Title: Increased Microerythrocyte Count in Homozygous α+-Thalassaemia Contributes to Protection against Severe Malarial Anaemia
+- Doc Text: BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia hom...
+**Example 7:**
+- Query: A total of 1,000 people in the UK are asymptomatic carriers of vCJD infection.
+- Doc Title: Prevalent abnormal prion protein in human appendixes after bovine spongiform encephalopathy epizootic: large scale survey
+- Doc Text: OBJECTIVES To carry out a further survey of archived appendix samples to understand better the differences between existing estimates of the prevalence of subclinical infection with prions after the bovine spongiform encephalopathy epizootic and to see whether a broader birth cohort was affected, an...
+**Example 8:**
+- Query: ADAR1 binds to Dicer to cleave pre-miRNA.
+- Doc Title: ADAR1 Forms a Complex with Dicer to Promote MicroRNA Processing and RNA-Induced Gene Silencing
+- Doc Text: Adenosine deaminases acting on RNA (ADARs) are involved in RNA editing that converts adenosine residues to inosine specifically in double-stranded RNAs. In this study, we investigated the interaction of the RNA editing mechanism with the RNA interference (RNAi) machinery and found that ADAR1 forms a...
+**Example 9:**
+- Query: AIRE is expressed in some skin tumors.
+- Doc Title: Keratin-dependent regulation of Aire and gene expression in skin tumor keratinocytes
+- Doc Text: Expression of the intermediate filament protein keratin 17 (K17) is robustly upregulated in inflammatory skin diseases and in many tumors originating in stratified and pseudostratified epithelia. We report that autoimmune regulator (Aire), a transcriptional regulator, is inducibly expressed in human...
+**Example 10:**
+- Query: ALDH1 expression is associated with better breast cancer outcomes.
+- Doc Title: ALDH1 is a marker of normal and malignant human mammary stem cells and a predictor of poor clinical outcome.
+- Doc Text: Application of stem cell biology to breast cancer research has been limited by the lack of simple methods for identification and isolation of normal and malignant stem cells. Utilizing in vitro and in vivo experimental systems, we show that normal and cancer human mammary epithelial cells with incre...
+## CORE
+**Only Score 1 exists**
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: Scientific claim verification - find evidence to support/refute claims
+- **What Score 1 Really Means**: Papers containing evidence for verifying scientific claims
+- **Dataset Type**: Scientific fact checking
+- **Key Characteristic**: Claims paired with abstracts that can verify them
+## Training Verdict
+**GOOD FOR TRAINING** ✓
+- Score 1 documents provide verification evidence
+- True scientific relevance matching
+- Can use for binary classification: Score 1 = positive class
+## Observations
+⚠️ **Training data only has Score 1:**
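+- All labeled training pairs are marked as equally relevant (labels are binary; the test split also contains only Score 1)
+- The model cannot learn to distinguish graded relevance levels from this dataset alone
+- This may indicate collapsed/merged relevance levels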

test_scores/dataset_reports/training_config.json ADDED Viewed

	@@ -0,0 +1,164 @@

+{
+  "beir_training_config": {
+    "version": "2.0",
+    "created": "2024-01-09",
+    "purpose": "Three-category training (positive, hard_negative, easy_negative) with within-dataset sampling priority",
+    "datasets": {
+      "arguana": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["test"],
+        "score_to_category": {
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Counter-arguments are relevant. Only test split (1406 queries)."
+      },
+      "climate-fever": {
+        "use": false,
+        "provides_hard_negatives": false,
+        "splits_available": ["test"],
+        "score_to_category": {
+          "1": "skip"
+        },
+        "sampling": "skip",
+        "notes": "Score 1 labels are questionable, so DO NOT USE this dataset's relevance labels for training. Its documents can still provide easy negatives for other datasets."
+      },
+      "dbpedia-entity": {
+        "use": true,
+        "provides_hard_negatives": true,
+        "splits_available": ["dev", "test"],
+        "score_to_category": {
+          "2": "positive",
+          "1": "hard_negative",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "GOLD MINE! Score 1 = keyword match without intent = perfect hard negatives"
+      },
+      "fever": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["train", "dev", "test"],
+        "score_to_category": {
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Evidence that supports OR refutes claims. Large dataset (140K train)."
+      },
+      "fiqa": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["train", "dev", "test"],
+        "score_to_category": {
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Financial Q&A. 5500 train queries."
+      },
+      "hotpotqa": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["train", "dev", "test"],
+        "score_to_category": {
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Multi-hop reasoning. 85K train queries."
+      },
+      "nfcorpus": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["train", "dev", "test"],
+        "score_to_category": {
+          "1": "positive",
+          "2": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Medical IR. Both 1&2 are relevant. WARNING: train has only 1s, test/dev have 1&2."
+      },
+      "quora": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["dev", "test"],
+        "score_to_category": {
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Duplicate questions. 5K dev, 10K test queries."
+      },
+      "scidocs": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["test"],
+        "score_to_category": {
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Citation prediction. 1000 test queries."
+      },
+      "scifact": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["train", "test"],
+        "score_to_category": {
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "Scientific claim verification. 920 train, 300 test."
+      },
+      "trec-covid": {
+        "use": true,
+        "provides_hard_negatives": false,
+        "splits_available": ["test"],
+        "score_to_category": {
+          "2": "positive",
+          "1": "positive",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "COVID-19 IR. Both 1&2 are relevant. Only 50 test queries."
+      },
+      "webis-touche2020": {
+        "use": true,
+        "provides_hard_negatives": true,
+        "splits_available": ["test"],
+        "score_to_category": {
+          "2": "positive",
+          "1": "hard_negative",
+          "0": "easy_negative",
+          "unlabeled": "easy_negative"
+        },
+        "sampling": "within_dataset",
+        "notes": "GOLD MINE! Score 1 = wrong topic = perfect hard negatives. Only 49 test queries."
+      }
+    },
+    "global_settings": {
+      "sampling_factor": 10,
+      "min_easy_negatives": 5,
+      "max_easy_negatives": 100,
+      "sampling_priority": "within_dataset_first",
+      "cross_dataset_fallback": true,
+      "random_seed": 42,
+      "hard_negative_sources": ["dbpedia-entity", "webis-touche2020"]
+    }
+  }
+}

test_scores/dataset_reports/trec-covid_analysis.md ADDED Viewed

	@@ -0,0 +1,782 @@

+# TREC-COVID Dataset Analysis
+## Overview
+- Total queries available: 50
+- Total corpus documents: 171332
+- Corpus documents loaded for examples: 50000
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | N/A | N/A | N/A |
+| dev | N/A | N/A | N/A |
+| test | 50 | 66334 | 0:41661, 1:10456, 2:14217 |
+## Score Analysis
+**Multiple score levels found: [0, 1, 2]**
+- Higher scores indicate higher relevance
+- Score 0 typically means not relevant
+- Score 1 typically means relevant
+- Score 2 typically means highly relevant
+## Examples
+### Score Comparison Examples
+#### Comparison Example 1
+**Query:** what is the origin of COVID-19
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Intraspecies diversity of SARS-like coronaviruses in Rhinolophus sinicus and its implications for the origin of SARS coronaviruses in humans.
+- Text: The Chinese rufous horseshoe bat (Rhinolophus sinicus) has been suggested to carry the direct ancestor of severe acute respiratory syndrome (SARS) coronavirus (SCoV), and the diversity of SARS-like CoVs (SLCoV) within this Rhinolophus species is therefore worth investigating. Here, we demonstrate the remarkable diversity of SLCoVs in R. sinicus and identify a strain with the same pattern of phylog...
+*Document 2:*
+- Title: Virus–Host Coevolution with a Focus on Animal and Human DNA Viruses
+- Text: Viruses have been infecting their host cells since the dawn of life, and this extremely long-term coevolution gave rise to some surprising consequences for the entire tree of life. It is hypothesised that viruses might have contributed to the formation of the first cellular life form, or that even the eukaryotic cell nucleus originates from an infection by a coated virus. The continuous struggle b...
+*Document 3:*
+- Title: Animal source of the coronavirus continues to elude scientists.
+- Text: ...
+**Score 1 Documents:**
+*Document 1:*
+- Title: Deadly virus effortlessly hops species
+- Text: Genetic engineering helps reveal origin of deadly 'flu...
+*Document 2:*
+- Title: Emerging infectious diseases
+- Text: The spectrum of human pathogens and the infectious diseases they cause is continuously changing through evolution and changes in the way human populations interact with their environment and each other. New human pathogens most often emerge from an animal reservoir, emphasizing the central role that non-human reservoirs play in human infectious diseases. Pathogens may also re-emerge with new chara...
+*Document 3:*
+- Title: Latest SARS evidence
+- Text: ...
+**Score 0 Documents:**
+*Document 1:*
+- Title: The epidemiology of MERS-CoV
+- Text: ...
+*Document 2:*
+- Title: Anesthesia and COVID-19: What We Should Know and What We Should Do.
+- Text: Coronavirus disease 2019 (COVID-19), caused by Severe Acute Respiratory Syndrome-Coronavirus-2 (SARS-CoV-2), was first reported in Wuhan, Hubei, China, and has spread to more than 200 other countries around the world. COVID-19 is a highly contagious disease with continuous human-to-human transmission. The origin of the virus is unknown. Airway manipulations and intubations, which are common during...
+*Document 3:*
+- Title: Diseases of the Liver and Hepatobiliary System
+- Text: ...
+---
+#### Comparison Example 2
+**Query:** how does the coronavirus respond to changes in the weather
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Seasonality of Respiratory Viral Infections.
+- Text: The seasonal cycle of respiratory viral diseases has been widely recognized for thousands of years, as annual epidemics of the common cold and influenza disease hit the human population like clockwork in the winter season in temperate regions. Moreover, epidemics caused by viruses such as severe acute respiratory syndrome coronavirus (SARS-CoV) and the newly emerging SARS-CoV-2 occur during the wi...
+*Document 2:*
+- Title: Effect modification of environmental factors on influenza-associated mortality: a time-series study in two Chinese cities
+- Text: BACKGROUND: Environmental factors have been associated with transmission and survival of influenza viruses but no studies have ever explored the role of environmental factors on severity of influenza infection. METHODS: We applied a Poisson regression model to the mortality data of two Chinese metropolitan cities located within the subtropical zone, to calculate the influenza associated excess mor...
+*Document 3:*
+- Title: Meteorological impacts on the incidence of COVID-19 in the U.S.
+- Text: Since the World Health Organization has declared the current outbreak of the novel coronavirus (COVID-19) a global pandemic, some have been anticipating that the mitigation could happen in the summer like seasonal influenza, while medical solutions are still in a slow progress. Experimental studies have revealed a few evidences that coronavirus decayed quickly under the exposure of heat and humidi...
+**Score 1 Documents:**
+*Document 1:*
+- Title: The association between the seasonality of pediatric pandemic influenza virus outbreak and ambient meteorological factors in Shanghai
+- Text: BACKGROUND AND OBJECTIVES: The number of pediatric patients diagnosed with influenza types A and B is increasing annually, especially in temperate regions such as Shanghai (China). The onset of pandemic influenza viruses might be attributed to various ambient meteorological factors including temperature, relative humidity (Rh), and PM(1) concentrations, etc. The study aims to explore the correlati...
+*Document 2:*
+- Title: A relationship between acute respiratory illnesses and weather.
+- Text: Weekly data from 7 years (2004-2010) of primary-care counts of acute respiratory illnesses (ARIs) and local weather readings were used to adjust a multivariate time-series vector error correction model with covariates (VECMX). Weather variables were included through a partial least squares index that consisted of weekly minimum temperature (coefficient = - 0·26), weekly median of relative humidity...
+*Document 3:*
+- Title: Evaluating the impact of the weather conditions on the influenza propagation
+- Text: BACKGROUND: Predicting the details of how an epidemic evolves is highly valuable as health institutions need to better plan towards limiting the infection propagation effects and optimizing their prediction and response capabilities. Simulation is a cost- and time-effective way of predicting the evolution of the infection as the joint influence of many different factors: interaction patterns, pers...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Bayesian Analysis for Inference of an Emerging Epidemic: Citrus Canker in Urban Landscapes
+- Text: Outbreaks of infectious diseases require a rapid response from policy makers. The choice of an adequate level of response relies upon available knowledge of the spatial and temporal parameters governing pathogen spread, affecting, amongst others, the predicted severity of the epidemic. Yet, when a new pathogen is introduced into an alien environment, such information is often lacking or of no use,...
+*Document 2:*
+- Title: Scientific Opinion on the welfare of cattle kept for beef production and the welfare in intensive calf farming systems
+- Text: Information given in previous Opinions “Welfare of cattle kept for beef production” (SCAHAW, 2001) and “The risks of poor welfare in intensive calf farming systems” (EFSA, 2006) is updated and recent scientific evidence on the topics reviewed. Risks of poor welfare are identified using a structured analysis, and issues not identified in the SCAHAW (2001) beef Opinion, especially effects of housing...
+*Document 3:*
+- Title: Emergency Preparedness—Planning and Management
+- Text: In general, emergencies can be divided into internal and external situations. Regardless of the nature of the emergency, the organizational response will be quite different when the emergency condition exists within the facility from when it occurs at some distant location. The actual emergency response programs of healthcare facilities indicate that much more effort has gone into a prepared respo...
+---
+#### Comparison Example 3
+**Query:** will SARS-CoV2 infected people develop immunity? Is cross protection possible?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Serological differentiation between COVID-19 and SARS infections.
+- Text: In response to the coronavirus disease 2019 (COVID-19) outbreak, caused by the SARS-CoV-2 virus, multiple diagnostic tests are required globally for acute disease diagnosis, contact tracing, monitoring of asymptomatic infection rates and assessing herd immunity. While PCR remains the frontline test of choice in the acute diagnostic setting, serological tests are urgently needed to fulfil the other...
+*Document 2:*
+- Title: The laboratory tests and host immunity of COVID-19 patients with different severity of illness.
+- Text: BACKGROUND The Coronavirus Disease-2019 (COVID-19), infected by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), has caused a severe outbreak in China. The host immunity of COVID-19 patients is unknown. METHODS The routine laboratory tests and host immunity in COVID-19 patients with different severity of illness were compared after patient admission. RESULTS A total of 65 SARS-CoV-2-p...
+*Document 3:*
+- Title: Host Immune Response and Immunobiology of Human SARS-CoV-2 Infection
+- Text: One of the most serious viral outbreaks of the decade, infecting humans, originated from the city of Wuhan, China, by the end of December 2019, has left the world shaken up. It is the successor infection of severe acute respiratory syndrome coronavirus (SARS-CoV) named as SARS-CoV-2 causing a disease called as COVID-19 (Coronavirus disease-19). Being one of the most severe diseases in terms of tra...
+**Score 1 Documents:**
+*Document 1:*
+- Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mouse-adapted virus lacking E protein.
+- Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously showed that recombinant SARS coronavirus (SARS-CoV) (Urbani strain based) lacking envelope (E) protein ...
+*Document 2:*
+- Title: Temporal Dynamics of Co-circulating Lineages of Porcine Reproductive and Respiratory Syndrome Virus
+- Text: Porcine Reproductive and Respiratory Syndrome Virus (PRRSV) is the most important endemic pathogen in the U.S. swine industry. Despite control efforts involving improved biosecurity and different vaccination protocols, the virus continues to circulate and evolve. One of the foremost challenges in its control is high levels of genetic and antigenic diversity. Here, we quantify the co-circulation, e...
+*Document 3:*
+- Title: Proposal for vaccination against SARS coronavirus using avian infectious bronchitis virus strain H from The Netherlands
+- Text: ...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Updated Vaccination Recommendations for Carnivores
+- Text: ...
+*Document 2:*
+- Title: Deep sequencing identifies viral and wasp genes with potential roles in replication of Microplitis demolitor Bracovirus.
+- Text: Viruses in the genus Bracovirus (BV) (Polydnaviridae) are symbionts of parasitoid wasps that specifically replicate in the ovaries of females. Recent analysis of expressed sequence tags from two wasp species, Cotesia congregata and Chelonus inanitus, identified transcripts related to 24 different nudivirus genes. These results together with other data strongly indicate that BVs evolved from a nudi...
+*Document 3:*
+- Title: BCG vaccine protection from severe coronavirus disease 2019 (COVID-19).
+- Text: A series of epidemiological explorations has suggested a negative association between national bacillus Calmette-Guérin (BCG) vaccination policy and the prevalence and mortality of coronavirus disease 2019 (COVID-19). However, these comparisons are difficult to validate due to broad differences between countries such as socioeconomic status, demographic structure, rural vs. urban settings, time of...
+---
+#### Comparison Example 4
+**Query:** what causes death from Covid-19?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Covid-19: Death rate in England and Wales reaches record high because of covid-19.
+- Text: ...
+*Document 2:*
+- Title: Clinical Characteristics and Risk Factors for Mortality of COVID-19 Patients With Diabetes in Wuhan, China: A Two-Center, Retrospective Study.
+- Text: OBJECTIVE Diabetes is common in COVID-19 patients and associated with unfavorable outcomes. We aimed to describe the characteristics and outcomes and to analyze the risk factors for in-hospital mortality of COVID-19 patients with diabetes. RESEARCH DESIGN AND METHODS This two-center retrospective study was performed at two tertiary hospitals in Wuhan, China. Confirmed COVID-19 patients with diabet...
+*Document 3:*
+- Title: Sudden Death from Pulmonary Causes
+- Text: This chapter seeks to survey many of the common pathological entities identified in the lungs at autopsy and the potential role of pulmonary disease in formulating an opinion regarding the cause of death. Appreciation of pulmonary pathology in the medicolegal context is important as it frequently contributes to the immediate or underlying mechanisms of death. The primacy of the lungs in breathing ...
+**Score 1 Documents:**
+*Document 1:*
+- Title: The kidney in COVID-19: protagonist or figurant?
+- Text: The etiology of injury in COVID-19 patients is diverse and multifactorial. Autopsy and biopsy studies reveal, alongside podocyte and tubular cell anomalies, the presence of virion within the cells. Evidence suggests that, in addition to the direct cytopathic effect of SARS- CoV2 on the glomeruli and renal tubules, there is also the indirect effect of cell-mediated immunity, the cytokines storm and...
+*Document 2:*
+- Title: Why are more BAME people dying from COVID-19?
+- Text: ...
+*Document 3:*
+- Title: Symptom burden and clinical profile of COVID-19 deaths: a rapid systematic review and evidence summary.
+- Text: The spread of pandemic COVID-19 has created unprecedented need for information. The pandemic is the cause of significant mortality and with this the need for rapidly disseminated information for palliative care professionals regarding the prevalence of symptoms, their intensity, their resistance or susceptibility to symptom control and the mode of death for patients. METHODS We undertook a systema...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Herausforderungen in der Seuchenkontrolle und der jetzigen Pandemie durch verzerrte Verteilungen
+- Text: BACKGROUND: The pandemic phase 3—mitigation—by the SARS-Coronavirus‑2 is currently taking on speed in Germany. Skewed distributions of key epidemiological parameters of the virus and patient care are a challenge for the control of the outbreak as well as keeping the system functional. METHODS: The skewed parameters—pre-patency period, incubation period, duration of viral shedding and time to admis...
+*Document 2:*
+- Title: Article 14 Prevention of Spread of Disease
+- Text: Each contracting State agrees to take effective measures to prevent the spread by means of air navigation of cholera, typhus (epidemic), smallpox, yellow fever, plague, and such other communicable diseases as the contracting States shall from time to time decide to designate, and to that end contracting States will keep in close consultation with the agencies concerned with international regulatio...
+*Document 3:*
+- Title: Diseases of the Liver and Hepatobiliary System
+- Text: ...
+---
+#### Comparison Example 5
+**Query:** what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mouse-adapted virus lacking E protein.
+- Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously showed that recombinant SARS coronavirus (SARS-CoV) (Urbani strain based) lacking envelope (E) protein ...
+*Document 2:*
+- Title: Protease Inhibitors: Candidate Drugs to Inhibit Severe Acute Respiratory Syndrome Coronavirus 2 Replication.
+- Text: The number of patients infected with severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) has rapidly increased, although the WHO declared a pandemic. However, drugs that function against SARS-CoV-2 have not been established. SARS-CoV-2 has been suggested to bind angiotensin-converting enzyme 2, the receptor of the SARS coronavirus. SARS coronavirus and coronavirus 229E, the cause of the c...
+*Document 3:*
+- Title: Immunogenicity and protective efficacy in mice and hamsters of a β-propiolactone inactivated whole virus SARS-CoV vaccine.
+- Text: The immunogenicity and efficacy of β-propiolactone (BPL) inactivated whole virion SARS-CoV (WI-SARS) vaccine was evaluated in BALB/c mice and golden Syrian hamsters. The vaccine preparation was tested with or without adjuvants. Adjuvant Systems AS01(B) and AS03(A) were selected and tested for their capacity to elicit high humoral and cellular immune responses to WI-SARS vaccine. We evaluated the e...
+**Score 1 Documents:**
+*Document 1:*
+- Title: Comparison of SARS-CoV-2 spike protein binding to ACE2 receptors from human, pets, farm animals, and putative intermediate hosts.
+- Text: The emergence of a novel coronavirus, SARS-CoV-2, resulted in a pandemic. Here, we used X-ray structures of human ACE2 bound to the receptor-binding domain (RBD) of the spike protein (S) from SARS-CoV-2 to predict its binding to ACE2 proteins from different animals, including pets, farm animals, and putative intermediate hosts of SARS-CoV-2. Comparing the interaction sites of ACE2 proteins known t...
+*Document 2:*
+- Title: Replication of SARS-CoV-2 in human respiratory epithelium.
+- Text: Currently, there are four seasonal coronaviruses associated with relatively mild respiratory tract disease in humans. However, there are also a plethora of animal coronaviruses, which have the potential to cross the species border. This regularly results in the emergence of new viruses in humans. In 2002 SARS-CoV emerged, to rapidly disappear in May 2003. In 2012 MERS-CoV was identified as a possi...
+*Document 3:*
+- Title: Cholesterol-modifying drugs in COVID-19
+- Text: Infection with SARS-CoV-2 is more likely to lead to poor outcomes in the elderly and those with cardiovascular disease, obesity or metabolic syndrome. Here we consider mechanisms by which dyslipidemia and the use of cholesterol-modifying drugs could influence the virus-host relationship. Cholesterol is essential for the assembly, replication and infectivity of enveloped virus particles; we highlig...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Extensive Literature Search on the ‘Effects of Copper intake levels in the gut microbiota profile of target animals, in particular piglets’
+- Text: ...
+*Document 2:*
+- Title: Big Data and Biodefense: Prospects and Pitfalls
+- Text: This chapter will provide an overview of how “big data” and “big data” analytics can be brought to bear on the pressing biodefense challenges of: (1) threat awareness; and (2) surveillance and detection. The chapter will also discuss potential problems that can arise by relying exclusively on “big data” approaches, which have properties and limitations inherent in their composition that may not be...
+*Document 3:*
+- Title: Types of Recombinant Vaccines
+- Text: The original scientific strategy behind vaccinology has historically been to “isolate, inactivate, and inject,” first invoked by Louis Pasteur....
+---
+### Examples by Split and Score
+#### TEST Split
+**Scores in test: [-1, 0, 1, 2]**
+##### Score -1 Examples (test)
+No examples found in loaded corpus.
+##### Score 0 Examples (test)
+**Example 1:**
+- Query: what is the origin of COVID-19
+- Doc Title: The epidemiology of MERS-CoV
+- Doc Text: ...
+**Example 2:**
+- Query: how does the coronavirus respond to changes in the weather
+- Doc Title: Bayesian Analysis for Inference of an Emerging Epidemic: Citrus Canker in Urban Landscapes
+- Doc Text: Outbreaks of infectious diseases require a rapid response from policy makers. The choice of an adequate level of response relies upon available knowledge of the spatial and temporal parameters governing pathogen spread, affecting, amongst others, the predicted severity of the epidemic. Yet, when a n...
+**Example 3:**
+- Query: will SARS-CoV2 infected people develop immunity? Is cross protection possible?
+- Doc Title: Updated Vaccination Recommendations for Carnivores
+- Doc Text: ...
+**Example 4:**
+- Query: what causes death from Covid-19?
+- Doc Title: Herausforderungen in der Seuchenkontrolle und der jetzigen Pandemie durch verzerrte Verteilungen
+- Doc Text: BACKGROUND: The pandemic phase 3—mitigation—by the SARS-Coronavirus‑2 is currently taking on speed in Germany. Skewed distributions of key epidemiological parameters of the virus and patient care are a challenge for the control of the outbreak as well as keeping the system functional. METHODS: The s...
+**Example 5:**
+- Query: what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
+- Doc Title: Extensive Literature Search on the ‘Effects of Copper intake levels in the gut microbiota profile of target animals, in particular piglets’
+- Doc Text: ...
+**Example 6:**
+- Query: what types of rapid testing for Covid-19 have been developed?
+- Doc Title: Clinical Features and Chest CT Manifestations of Coronavirus Disease 2019 (COVID-19) in a Single-Center Study in Shanghai, China.
+- Doc Text: OBJECTIVE. Confronting the new coronavirus infection known as coronavirus disease 2019 (COVID-19) is challenging and requires excluding patients with suspected COVID-19 who actually have other diseases. The purpose of this study was to assess the clinical features and CT manifestations of COVID-19 b...
+**Example 7:**
+- Query: are there serological tests that detect antibodies to coronavirus?
+- Doc Title: A review of enzyme immunoassay for detection of antibody to Brucella abortus in cattle
+- Doc Text: Enzyme immunoassay has gained wide acceptance for serological diagnosis of bovine brucellosis because of its ability to detect antibody of all isotypes unlike the conventional tests. The indirect enzyme immunoassay, however, presents several parameters that require careful analysis. These parameters...
+**Example 8:**
+- Query: how has lack of testing availability led to underreporting of true incidence of Covid-19?
+- Doc Title: Implications of SARSr-CoV 2 infection in thalassemias: Do patients fall into the "high clinical risk" category?
+- Doc Text: We're all flying blind regarding coronavirus, but it's fair to think if thalassemic patients are particularly vulnerable to SARS-COV-2 infection or are at potential higher risk of complications from COVID-19 than normal population, specially when they become older. The frustrating thing is that, rig...
+**Example 9:**
+- Query: how has COVID-19 affected Canada
+- Doc Title: [Diagnostic steps and estimation of disease prognosis in COVID-19-infected patients].
+- Doc Text: In December 2019, a new coronavirus outbreak hit his head in Wuhan, China. The identified pathogen is a new coronavirus named "severe acute respiratory syndrome coronavirus 2," or SARS-CoV-2, and the respiratory syndrome it elicits was named "coronavirus disease 2019," or COVID-19. The World Health ...
+**Example 10:**
+- Query: has social distancing had an impact on slowing the spread of COVID-19?
+- Doc Title: Global trends in research related to social media in psychology: mapping and bibliometric analysis
+- Doc Text: BACKGROUND: Social media, defined as interactive Web applications, have been on the rise globally, particularly among adults. The objective of this study was to investigate the trend of the literature related to the most used social network worldwide (i.e. Facebook, Twitter, LinkedIn, Snapchat, and ...
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: what is the origin of COVID-19
+- Doc Title: Deadly virus effortlessly hops species
+- Doc Text: Genetic engineering helps reveal origin of deadly 'flu...
+**Example 2:**
+- Query: how does the coronavirus respond to changes in the weather
+- Doc Title: The association between the seasonality of pediatric pandemic influenza virus outbreak and ambient meteorological factors in Shanghai
+- Doc Text: BACKGROUND AND OBJECTIVES: The number of pediatric patients diagnosed with influenza types A and B is increasing annually, especially in temperate regions such as Shanghai (China). The onset of pandemic influenza viruses might be attributed to various ambient meteorological factors including tempera...
+**Example 3:**
+- Query: will SARS-CoV2 infected people develop immunity? Is cross protection possible?
+- Doc Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mous
+- Doc Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously sho...
+**Example 4:**
+- Query: what causes death from Covid-19?
+- Doc Title: The kidney in COVID-19: protagonist or figurant?
+- Doc Text: The etiology of injury in COVID-19 patients is diverse and multifactorial. Autopsy and biopsy studies reveal, alongside podocyte and tubular cell anomalies, the presence of virion within the cells. Evidence suggests that, in addition to the direct cytopathic effect of SARS- CoV2 on the glomeruli and...
+**Example 5:**
+- Query: what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
+- Doc Title: Comparison of SARS-CoV-2 spike protein binding to ACE2 receptors from human, pets, farm animals, and putative intermediate hosts.
+- Doc Text: The emergence of a novel coronavirus, SARS-CoV-2, resulted in a pandemic. Here, we used X-ray structures of human ACE2 bound to the receptor-binding domain (RBD) of the spike protein (S) from SARS-CoV-2 to predict its binding to ACE2 proteins from different animals, including pets, farm animals, and...
+**Example 6:**
+- Query: what types of rapid testing for Covid-19 have been developed?
+- Doc Title: Understanding, verifying and implementing Emergency Use Authorization molecular diagnostics for the detection of SARS-CoV-2 RNA.
+- Doc Text: The SARS-CoV-2 pandemic has brought a new wave of challenges to health care, particularly in the area of rapid diagnostic test development and implementation. Acute diagnosis of COVID-19 infection is critically dependent on detection of SARS-CoV-2 RNA from clinical specimens (e.g. nasopharyngeal swa...
+**Example 7:**
+- Query: are there serological tests that detect antibodies to coronavirus?
+- Doc Title: Self-assembly into virus–like particles of the recombinant capsid protein of porcine circovirus type 3 and its application on antibodies detection
+- Doc Text: PCV3 capsid protein (Cap) is an important antigen for diagnosis and vaccine development. To achieve high-level expression of recombinant PCV3 Cap in Escherichia coli (E. coli), the gene of wild-type entire Cap (wt-eCap) was amplified from clinical samples, and three optimized entire Cap (opti-eCap) ...
+**Example 8:**
+- Query: how has lack of testing availability led to underreporting of true incidence of Covid-19?
+- Doc Title: Quarantine alone or in combination with other public health measures to control COVID-19: a rapid review.
+- Doc Text: BACKGROUND Coronavirus disease 2019 (COVID-19) is a rapidly emerging disease that has been classified a pandemic by the World Health Organization (WHO). To support WHO with their recommendations on quarantine, we conducted a rapid review on the effectiveness of quarantine during severe coronavirus o...
+**Example 9:**
+- Query: how has COVID-19 affected Canada
+- Doc Title: Health-related concerns and precautions during the COVID-19 pandemic: A comparison of Canadians with and without underlying health conditions.
+- Doc Text: BACKGROUND The risk of experiencing adverse outcomes from the coronavirus disease 2019 (COVID-19), such as hospitalization, admission to intensive care units and death, is elevated for older individuals and those with certain underlying health conditions including diabetes, chronic conditions affect...
+**Example 10:**
+- Query: has social distancing had an impact on slowing the spread of COVID-19?
+- Doc Title: Understanding the Impact of Face Mask Usage Through Epidemic Simulation of Large Social Networks
+- Doc Text: Evidence from the 2003 SARS epidemic and 2009 H1N1 pandemic shows that face masks can be an effective non-pharmaceutical intervention in minimizing the spread of airborne viruses. Recent studies have shown that using face masks is correlated to an individual’s age and gender, where females and older...
+##### Score 2 Examples (test)
+**Example 1:**
+- Query: what is the origin of COVID-19
+- Doc Title: Intraspecies diversity of SARS-like coronaviruses in Rhinolophus sinicus and its implications for the origin of SARS coronaviruses in humans.
+- Doc Text: The Chinese rufous horseshoe bat (Rhinolophus sinicus) has been suggested to carry the direct ancestor of severe acute respiratory syndrome (SARS) coronavirus (SCoV), and the diversity of SARS-like CoVs (SLCoV) within this Rhinolophus species is therefore worth investigating. Here, we demonstrate th...
+**Example 2:**
+- Query: how does the coronavirus respond to changes in the weather
+- Doc Title: Seasonality of Respiratory Viral Infections.
+- Doc Text: The seasonal cycle of respiratory viral diseases has been widely recognized for thousands of years, as annual epidemics of the common cold and influenza disease hit the human population like clockwork in the winter season in temperate regions. Moreover, epidemics caused by viruses such as severe acu...
+**Example 3:**
+- Query: will SARS-CoV2 infected people develop immunity? Is cross protection possible?
+- Doc Title: Serological differentiation between COVID-19 and SARS infections.
+- Doc Text: In response to the coronavirus disease 2019 (COVID-19) outbreak, caused by the SARS-CoV-2 virus, multiple diagnostic tests are required globally for acute disease diagnosis, contact tracing, monitoring of asymptomatic infection rates and assessing herd immunity. While PCR remains the frontline test ...
+**Example 4:**
+- Query: what causes death from Covid-19?
+- Doc Title: Covid-19: Death rate in England and Wales reaches record high because of covid-19.
+- Doc Text: ...
+**Example 5:**
+- Query: what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
+- Doc Title: Complete protection against severe acute respiratory syndrome coronavirus-mediated lethal respiratory disease in aged mice by immunization with a mous
+- Doc Text: Zoonotic coronaviruses, including the one that caused severe acute respiratory syndrome (SARS), cause significant morbidity and mortality in humans. No specific therapy for any human coronavirus is available, making vaccine development critical for protection against these viruses. We previously sho...
+**Example 6:**
+- Query: what types of rapid testing for Covid-19 have been developed?
+- Doc Title: Hunting coronavirus severe acute respiratory syndrome coronavirus 2 (2019 novel coronavirus): From laboratory testing back to basic research
+- Doc Text: The rapid spread of coronavirus disease 2019 (COVID-19) in many countries causes citizens of daily inconvenience and even life-threat for elderly population. The invasion of the main pathogen, severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2; 2019 novel coronavirus [2019-nCoV]), into huma...
+**Example 7:**
+- Query: are there serological tests that detect antibodies to coronavirus?
+- Doc Title: SARS-CoV-2 serology: Test, test, test, but interpret with caution!
+- Doc Text: SARS-CoV-2 serological tests are a subject of intense interest and have the potential to significantly enhance the diagnostic capability of healthcare services in the current pandemic. However, as with all novel assays, significant validation is required to understand the clinical relevance of resul...
+**Example 8:**
+- Query: how has lack of testing availability led to underreporting of true incidence of Covid-19?
+- Doc Title: I Just Can’t Get Enough (of Experts): The Numbers of COVID-19 and the Need for a European Approach to Testing
+- Doc Text: This article offers a reflection on the testing strategies deployed in the generation of epidemiological data in the European Union (EU). I will argue that, while in the early days of the pandemic, Member States proceeded to testing in a rather scattered way, the shortage of resources seems to have ...
+**Example 9:**
+- Query: how has COVID-19 affected Canada
+- Doc Title: Communication, transparency key as Canada faces new coronavirus threat.
+- Doc Text: ...
+**Example 10:**
+- Query: has social distancing had an impact on slowing the spread of COVID-19?
+- Doc Title: Timing of Community Mitigation and Changes in Reported COVID-19 and Community Mobility - Four U.S. Metropolitan Areas, February 26-April 1, 2020.
+- Doc Text: Community mitigation activities (also referred to as nonpharmaceutical interventions) are actions that persons and communities can take to slow the spread of infectious diseases. Mitigation strategies include personal protective measures (e.g., handwashing, cough etiquette, and face coverings) that ...
+## CORE
+**Has scores 0, 1, 2** (the 2 erroneous score -1 annotations in the test split were removed)
+**Score 2 = HIGHLY RELEVANT** ✓
+**Score 1 = RELEVANT** ✓
+## Special Notes
+- **Original Purpose**: COVID-19 biomedical information retrieval
+- **What Scores Really Mean**:
+  - Score 2: Direct COVID-19 papers (origins, immunity, mortality)
+  - Score 1: Related viral/respiratory papers (influenza, SARS, general respiratory)
+  - Score 0: Unrelated topics (cattle welfare, citrus disease, liver disease)
+- **Dataset Type**: Pandemic IR with graded relevance
+- **CLEANED**: Removed the 2 annotations with score -1 (they were annotation errors)
+## Training Verdict
+**GOOD FOR TRAINING** ✓
+- Both Score 1 and 2 are truly relevant
+- Clear graded relevance levels
+- For binary labels: treat scores 1 and 2 as positive and score 0 as negative
+## Observations

test_scores/dataset_reports/webis-touche2020_analysis.md ADDED Viewed

	@@ -0,0 +1,624 @@

+# WEBIS-TOUCHE2020 Dataset Analysis
+## Overview
+- Total queries available: 49
+- Total corpus documents: 382545
+- Corpus documents loaded for examples: 50000
+## Split Statistics
+| Split | Queries | Total Pairs | Score Distribution |
+|-------|---------|-------------|-------------------|
+| train | N/A | N/A | N/A |
+| dev | N/A | N/A | N/A |
+| test | 49 | 2214 | 0:1282, 1:296, 2:636 |
+## Score Analysis
+**Multiple score levels found: [0, 1, 2]**
+## Examples
+### Score Comparison Examples
+#### Comparison Example 1
+**Query:** Should teachers get tenure?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: There should not be a teacher tenure.
+- Text: Reason 1 - Teacher tenure creates complacency because teachers know they are unlikely to lose their jobs: If teachers know that they reached the period where they get special defence from most accusations - it would send the message to them that they can then do whatever they want to do in the classroom and really slack with their teaching duties. Reason 2 - Tenure makes it difficult to remove und...
+*Document 2:*
+- Title: There should not be a teacher tenure.
+- Text: Quotes used in my debate are all included here. (I know this is not exactly allowed and it's really messy and confusing but 10,000 characters (approx. 1500 words) really isn’t enough for 10 rebuttals.) Rebuttal of ‘Reason 1’: ‘a’ (this is where Quote a. from the picture should go) Pro is presupposing that teachers will become complacent if they know they are unlikely to lose their jobs. However, 1...
+*Document 3:*
+- Title: There should not be a teacher tenure.
+- Text: Thank you sara_ann_dee for her argument. In this debate, I will argue that ‘there should be a teacher tenure.’ Whether this tenure is reformed, or reduced to only applicable to some teachers, does not matter as long as I am able to prove that ‘teacher tenure’ should, in some form, exist. Voters should vote for me if I am able to do this. Although rebuttals are reserved for the next round, I urge t...
+**Score 0 Documents:**
+*Document 1:*
+- Title: should students have to wear school uniforms
+- Text: What do u think should students wear school uniforms...
+*Document 2:*
+- Title: There should not be a teacher tenure.
+- Text: Hello and thank you for joining my debate I will be arguing to get rid of the teacher tenure....
+---
+#### Comparison Example 2
+**Query:** Is vaping with e-cigarettes safe?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Bloomberg's Ban on E-Cigs
+- Text: Electronic cigarettes comes with different cartridges including 6-18mg of nicotine and sometimes 0mg. This is to say that electronic cigarettes are safer to smoke than traditional cigarettes. Electronic cigarettes do not cause tar because of the fact that it does not contain tobacco and leave behind no tar. As a result, the main components of carcinogen are not present to create a problem that tra...
+*Document 2:*
+- Title: Bloomberg's Ban on E-Cigs
+- Text: Whether smoking a cigarette or e-cig there is still nicotine In both and nicotine is highly addictive. E cigs are not a safer alternative to cigarettes because they are just as addictive. E-cigs may only be vapor but it is not undetectable. That wretched nicotine smell will linger on your clothes and in your hair. The smoke and vapor is bound to offend someone and I would not like to be sitting ou...
+*Document 3:*
+- Title: Should E-cigs and vapes be regulated
+- Text: I've never claimed that I'm okay with "kids" acquiring an addiction to electronic cigarettes. I simply do not care. It's not only nicotine but other harmful substances that are present. You provided examples of why they're bad, Without any sources. I'm just saying that there shouldn't be any regulations because it will be utterly pointless and time consuming. Juul for example is a good way to star...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Mining is relatively safe.
+- Text: Your assertion of the debate is that mining is a relatively safe occupation, but relative to what? It's not safe relative to toilet paper testers. In fact, I can actually bring statistics to show that it is actually far more dangerous. How many miners die per year? What percentage of miners get asbestos or other mining related illnesses? This is from an independent Indonesian website: Pada tahun p...
+*Document 2:*
+- Title: Mining is relatively safe.
+- Text: Welcome to the debateFirst round is for defining and the last round is for conclusions. I define mining as a quest that involved digging at a very large scale to seek for precious stones or metals. I define relatively as 'compared to other occupations' and safe as 'compared to other occupations' as well.Thank you....
+*Document 3:*
+- Title: The legalization of Marijuana in the United States would benefit all Americans.
+- Text: This drug should stay illegal for many reasons concerning the health and safety of Americans today. Saying no one ever died from smoking marijuana is like saying no one ever died from smoking tobacco. Today research shows that nearly 50 percent of teenagers try marijuana before they graduate from high school. Most young teens and children start using marijuana for many reasons, they are curious, o...
+---
+#### Comparison Example 3
+**Query:** Should corporal punishment be used in schools?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Spanking In Schools
+- Text: I will now summarize this debate and make a closing statement. Con has presented arguments that are self-contradicting: Should spanking "be banned from everywhere," including children's own homes, or is it "the parents [sic] choice" whether or not to spank their children? I have shown that spanking should not be banned universally, and I have also demonstrated that if parents are allowed to spank ...
+*Document 2:*
+- Title: Corporal punishment should be banned from schools
+- Text: http://abcnews.go.com... This article shows the ineffectiveness of corporal punishment in schools. In closing, corporal punishment should be banned because it is ineffective and lowers students' IQ's. Other forms of punishment should be explored and tested. Good luck....
+**Score 1 Documents:**
+*Document 1:*
+- Title: Corporal punishment in the families!!!
+- Text: Spanking Has Its Place I believe many would argue that spanking a child is a form of corporal punishment and it should be regarded as child abuse, but there is also a whole other group that supports spankings and they say it works. I've tried to use spanking with my child and I found it didn't work and I didn't prefer that method, but I don't see it as child abuse either. Of course, when parents o...
+**Score 0 Documents:**
+*Document 1:*
+- Title: corporal punishment
+- Text: Sir, you are the one who instigated the debate. You have to present your position and I refute it. Not the other way around....
+*Document 2:*
+- Title: should schools teach sword fighting
+- Text: i say schools should teach sword fighting because it would be self defense and would help students defend the school if someone broke in while classes were going on...
+---
+#### Comparison Example 4
+**Query:** Should social security be privatized?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: Abolish Social Security
+- Text: Unpromising, chaotic, and fraud. Social security fits this description exactly; therefore social security must be abolished. I take an affirmative stance to abolish Social security for the following contentions; Contention 1; Social security has no prospect for America, Contention 2; America now and in the future will not be able to sustain the needs of the Social security system, and Contention 3...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Get rid of parole
+- Text: When someone is convicted of murder should they get rid of parole and automatically send them to death row?...
+*Document 2:*
+- Title: Abolish Social Security
+- Text: Due to the fact my opponent's account has been closed, this debate is null. I please ask you not to vote....
+*Document 3:*
+- Title: Obama Care is a disaster
+- Text: “Unemployment insurance or subsidized coverage run by the government is destined to fail. It encourages laziness, has no accountability, and wins the voters' vote who is receiving it. […] If you don't work and don't plan on working, you shouldn't be allowed to have it.” Keep in mind unemployment insurance only goes to folks who are actively looking for work, a mom who needs help feeding her kids w...
+---
+#### Comparison Example 5
+**Query:** Is a college education worth it?
+**From:** test split
+**Score 2 Documents:**
+*Document 1:*
+- Title: College education is not necessary to become successful
+- Text: College education is a waste of money and time. Most people who attend college do not get what they pay for. An associate degree or a bachelors degree cannot really get you a high paying job or "career". In order to get a "career" you must continue your education to receive a masters degree, a PhD or any other further education Many people without a college career have been extremely successful; F...
+**Score 1 Documents:**
+*Document 1:*
+- Title: Going to college is worth
+- Text: Consider a smart person who is determined to be a lawyer. This person has a full ride scholarship, so education is free, and it is necessary for this person's life dreams and happiness to be a lawyer. A degree is required to practice law [1]. You cannot become a lawyer without being able to practice law, and so, since classes cost no money, it is totally worth it for this person to go to college. ...
+**Score 0 Documents:**
+*Document 1:*
+- Title: Should colleges be free
+- Text: Colleges should not be free because of the following reasons: 1) How will the colleges pay for the teachers and professors that teach the students the course and material without having the students pay for their education fee? 2)Do you really you think with the capitalist society America is rooted on that anything at the most part will be free? Sure there's public education. You still have to pay...
+*Document 2:*
+- Title: Going to college is worth
+- Text: I accept, and will be arguing that college is worth it for some people....
+*Document 3:*
+- Title: It's not worth having children
+- Text: True, true... but there is one thing that redeems reproduction... In order for us humans to evolve, we are required by the laws of nature to knock each other up and get preggerz so we can continue the human race's existence. Unless, like, we gave birth to robots that ruin the woman's "anatomy" forever during birth. And so what if they cause grief? I'm sure there's plenty of masochistic parents out...
+---
+### Examples by Split and Score
+#### TEST Split
+**Scores in test: [0, 1, 2]**
+##### Score 0 Examples (test)
+**Example 1:**
+- Query: Should teachers get tenure?
+- Doc Title: should students have to wear school uniforms
+- Doc Text: What do u think should students wear school uniforms...
+**Example 2:**
+- Query: Is vaping with e-cigarettes safe?
+- Doc Title: Mining is relatively safe.
+- Doc Text: Your assertion of the debate is that mining is a relatively safe occupation, but relative to what? It's not safe relative to toilet paper testers. In fact, I can actually bring statistics to show that it is actually far more dangerous. How many miners die per year? What percentage of miners get asbestos or other mining related illnesses? This is from an independent Indonesian website: Pada tahun pertama saja kita melihat bahwa 19% dari semua penguji kertas toilet akan kontrak penyakit mematikan. Selama rentang 10 tahun kita telah melihat bahwa hanya di bawah setengah (48,2%) dari semua penguji kertas toilet mati [1.] Translation: "In the first year alone we see that 19% of all toilet paper testers will contract a lethal disease. Over a 10 year span we have seen that just under half (48.2%) of all toilet paper testers are dead from job-related illnesses." [1.] This was from an independent Chinese website where many toilet paper testers are employed. Upon doing the research of how many miners die over a 10 year span from job related illnesses you will that 48.2% figure is far larger than the amount of miners that die from job related illnesses over a 10 year period. www.npr.org " News " Health Apr 16, 2010 - "Mindful of the grief and anger surrounding the death of the 29 miners at the Upper Big ... in 2006, Attfield found that 4 percent of 1,000 miners tested had advanced pneumoconiosis." This is just one example in one particular case, but it gives you perspective of the 4% of miners battling job-related illnesses vs. the 19% of job-related illnesses that toilet paper testers suffer. [1.] http://m.inilah.com......
+**Example 3:**
+- Query: Should corporal punishment be used in schools?
+- Doc Title: corporal punishment
+- Doc Text: Sir, you are the one who instigated the debate. You have to present your position and I refute it. Not the other way around....
+**Example 4:**
+- Query: Should social security be privatized?
+- Doc Title: Get rid of parole
+- Doc Text: When someone is convicted of murder should they get rid of parole and automatically send them to death row?...
+**Example 5:**
+- Query: Is a college education worth it?
+- Doc Title: Should colleges be free
+- Doc Text: Colleges should not be free because of the following reasons: 1) How will the colleges pay for the teachers and professors that teach the students the course and material without having the students pay for their education fee? 2)Do you really you think with the capitalist society America is rooted on that anything at the most part will be free? Sure there's public education. You still have to pay taxes for that. Virtually, colleges cannot be free. This leads to my third point. 3)There's this thing called community college. It's cheap and inexpensive. Most people that go there either need to save money or just extend their studies before they transfer to a better college. 4)Scholarships help a lot with loans. Most students can get a scholarship. Not the big flashy "Bill Gate Scholarship", but a small one that pays for a part of your tuition. All in all, the way colleges run is that they provide education in return for money. Also, public colleges are funded by the state. Private colleges aren't. This is why they are usually more expensive and in return you get better education. And that's pretty much it for my side of the debate....
+**Example 6:**
+- Query: Should felons who have completed their sentence be allowed to vote?
+- Doc Title: In a democratic society, felons out to retain the right to vote.
+- Doc Text: I regret that my opponent has forfeited this round as I enjoyed the debate....
+**Example 7:**
+- Query: Should abortion be legal?
+- Doc Title: Legal Abortion
+- Doc Text: My argument is this: you are attempting to save innocent lives. Saving lives is heteronormative and normalises the queer - and makes them "other". Otherness creates conflict and arises dictatorships.Your case makes dictatorships, yes. As for the "no statistical support" - I provided quality sources for you and I quoted them (they are behind paywalls so I will not provide the full text because I doubt the authors want that) - just saying that my arguments are unsubstantiated is fallacious.In the end my opponent just says my argument doesn't make sense - never attacking. He also provides no reasons to vote in negation, just that doing so saves lives - I AGREE but saving lives is INHERENTLY flawed!Easily vote in affirmation because of the dropped arguments and a lack of constructive arguments on my opponent's side. Thank you ~...
+**Example 8:**
+- Query: Should students have to wear school uniforms?
+- Doc Title: should students have to wear school uniforms
+- Doc Text: What do u think should students wear school uniforms...
+**Example 9:**
+- Query: Should any vaccines be required for children?
+- Doc Title: Vaccines do not cause autism
+- Doc Text: To begin I would like to reiterate the framework of this debate. Namely, as the resolution specifies that vaccines do not cause autism, I simple need demonstrate some pathway by which autism can come about as a result of a vaccination. The negative is not required to demonstrate a direct link by which a vaccination directly leads to autism. Secondly, the negative need only show one instance in which a vaccination could bring about autism as the affirmative has already agreed with this proposition. High fevers in pregnant women can lead to autism In a recent landmark study "A team of UC Davis researchers has found that mothers who had a fever during pregnancy were more than twice as likely to have a child with autism " than mothers who did not have a fever or who took medication to counter its effect" [1] With this in mind let's look at some of the side effects associated with several vaccines. According to the center of Disease control[2] vaccines which have a possible side effect of high fever include but are not limited to, the vaccine for Haemophilus influenza type b, the vaccine for, Diphtheria, Tetanus, and acellular Pertussis and Gardasil the vaccine used to prevent HPV. Responses: My opponent makes that claim that the redacted 1998 Andrew Wakefield study implemented a poor methodology and fabricated data. I completely agree, this study was poorly conducted. However, this does not negate the fact the high fevers in women who are pregnant can result in autism and that vaccination can result in such fevers. Conclusion: I have demonstrated that high fever, a side effect of three different vaccinations can bring about autism as specified in the resolution. I have thus fulfilled my burden of proof and ask that you negate this resolution. 1) http://www.ucdmc.ucdavis.edu... 2) http://www.cdc.gov......
+**Example 10:**
+- Query: Should performance-enhancing drugs be accepted in sports?
+- Doc Title: Anybody who used performance enhancing drugs in the MLB shouldn't be elected to the Hall of fame
+- Doc Text: No, they cheated therefore they shouldn't make the hall of fame. It doesn't matter if a lot of players did it. They cheated therefore they should be penalized. The people who make the Hall of Fame should be in there because of their honor to the game, not for cheating and deceiving people so they can have a physical advantage. You said that there are many "incredible players" in the Mitchell Report. Don't you think maybe they were so incredible because they used steriods and performance enhancing drugs? If they didn't use them they wouldn't be as good as the stats show. I understand players are out there to get theirs and get as much money as they can, but what about the true players of the game who aren't using these performing enhancing drugs. They did the right thing and won't make the hall of fame because they might have decent numbers over their career but the ones who used performance enhancing drugs had better stats than them because they cheated....
+##### Score 1 Examples (test)
+**Example 1:**
+- Query: Should corporal punishment be used in schools?
+- Doc Title: Corporal punishment in the families!!!
+- Doc Text: Spanking Has Its Place I believe many would argue that spanking a child is a form of corporal punishment and it should be regarded as child abuse, but there is also a whole other group that supports spankings and they say it works. I've tried to use spanking with my child and I found it didn't work and I didn't prefer that method, but I don't see it as child abuse either. Of course, when parents over do it, or spank particularly hard, or hit children in inappropriate ways, then of course it is child abuse....
+**Example 2:**
+- Query: Is a college education worth it?
+- Doc Title: Going to college is worth
+- Doc Text: Consider a smart person who is determined to be a lawyer. This person has a full ride scholarship, so education is free, and it is necessary for this person's life dreams and happiness to be a lawyer. A degree is required to practice law [1]. You cannot become a lawyer without being able to practice law, and so, since classes cost no money, it is totally worth it for this person to go to college. 1. http://study.com......
+**Example 3:**
+- Query: Should abortion be legal?
+- Doc Title: Legal Abortion
+- Doc Text: This debate is "should abortion be legal?" Short answer no... Long answer noooooooo... Lol, but seriously, Abortion is murder any way that you look at it. I am excited for an intelligent debate...
+**Example 4:**
+- Query: Should students have to wear school uniforms?
+- Doc Title: Should we be aloud too wear hats in school as in appropriate hats no GANG HATS OKKK
+- Doc Text: I think we should be able to but also a little we shouldn't because you could get distracted and not do your work instead just play with your hat but you would get a chance to express yourself...
+**Example 5:**
+- Query: Should performance-enhancing drugs be accepted in sports?
+- Doc Title: All drugs should be legalised, change my mind.
+- Doc Text: I believe that all drugs should be legalised, whether it is dangerous or not its irrelevantI challenge anyone to change my mind.The only downside I see with the legalisation of drugs is that there might be a 1-2% more drug addicts....
+**Example 6:**
+- Query: Should birth control pills be available over the counter?
+- Doc Title: birth control in high school
+- Doc Text: i think birth control is very high in high school they are all too young and they are making the wrong choses and they should really think about there choses because if they have a babie in high school then they have to drop out and then they have to get a crappy job to take care of that babie so there fore there should not be any birth control in high school...
+**Example 7:**
+- Query: Is sexual orientation determined at birth?
+- Doc Title: We are who we are based on our enviorment. Nature vs. Nurture Debate
+- Doc Text: I agree that there is a combination at play but I believe that nurture plays a greater role. A child only learns from what he/she experiences. A child would not understand manners if a parent doesn't suggest and teach them as a child develops. The examples we observe help us distinguish right from wrong. This is the basis in which we create our own set values and discover who we are as an individual....
+**Example 8:**
+- Query: Should animals be used for scientific or commercial testing?
+- Doc Title: Monkeys should be considered persons in the eyes of the law
+- Doc Text: Amina Amjed The Case for Animal Rights Round 1 1.The use of animal for science, for commercial purposes, for agriculture purposes and for the purposes of sport hunting and trapping should be abolished. 2.We should regard animals just as we regard those who are unable to defend and protect themselves such as children, the disabled and the elderly. 3.The abuse of animals for science, for sports, for agriculture and for any other human entertainment or attainment of needs for humans is fundamentally wrong. 4.We, as humans, must stop treating animals as things and treating them as beings that are capable of feeling pain and suffering. 5.It is true, that the process of change regarding animals and the rights of animals is complicated and requires extensive amount of time and finance to bring about this change but even so it is much needed. 6.The idea of animal rights has reason if not just emotion. 7.We have a direct duty to ensure safety of animals from brutality, as we do to our children and to those humans around us who are incapable of helping themselves. 8.We should not overlook animals because they do not have the characteristics and components of a human being. 9.Using animals for scientific experiments and/or for sports" purposes is brutal and against morality. 10.Pain is pain wherever it occurs. 11.Some would try to justify the use, therefore abuse, of animals by referring to what is known as contractarianism. 12.Contractariansim is the belief, by some, that morality consists of a set of rules that individuals abide to voluntarily just as we do when we agree to a contract. 13.Animals cannot volunteer or provide consent to be or not be a part of a contract therefore they cannot be moral and not considered persons. 14.However, this is true for children as well. Children cannot consent to a contract without the permission and supervision of their parent and/or guardian. 
Just so, animals should be protected by their guardian(s), if one is present, with the same moral perspective as another human. 15.Theories such as the cruelty-kindness view and utilitarianism have been presented to justify the protection of animals but have not proved to be entirely effective. 16.Each view is flawed is some way to segregate animal and the rights that should be given to them. 17.By taking the inherent value approach, it is evidently clear that each individual is as valuable as the next and not just what that individual can do for others. Your value as an individual would not be regarded by what you can do for me and in return what I can do for you. 18.In that same respect, an animal is considered an individual who should be treated with the same rights as me and you regardless of what they can do for us. 19.We should not use and/or abuse animals just because by doing so, it provides for the good of others. By hurting or undermining one individual we inherently subject ourselves to condone acts of cruelty. 20.The rights view of moral theory rationally explains the domain of human morality. 21.It is true, that animals lack many attributes of a human such as reading, writing, building book cases or baking a cake. 22.However, it is also true that there are some humans who are incapable of doing some of the things that I mentioned above but we do not devalue them as an individual or say that they are not human. 23.As an individual, be it a human or a monkey, we feel things, want things, have certain expectations, we feel pain and excitement, we feel frustration and we also suffer from untimely death. 24.All who have inherent value have it equally regardless of them being human animals or not. 25.The fight for animal rights is analogous to that of equality for women and other minorities. 
26.In regards to animals that are used in the field of science, there is proof of devaluing these animals by testing on them routinely as if their value is reducible by their usefulness to others. These animals are treated without any respect and in doing so the rights of animals are violated. 27.In the same sense, farm animals are kept in stressful close confinements or in isolation which causes them pain and suffering. This treatment, rather mistreatment of farm animals is rooted in the view which lacks the acknowledgement of animals as individuals with independent value. Instead they are viewed as resource for "humans". 28.Killing one human for selfish means does not satisfy our moral values than why should that be true of killing or torturing animals. 29.A right, properly comprehended, is a claim that one party may exercise against another. The victim of this claim can potentially be a person, a community, or even all human kind. 30.It should than be understood that rights in general are in every case claims or potential claims within a community or moral agents. 31.Rights can be defended or claimed by those beings that can make moral claims against one another. 32.Human can make these moral choices whereas animals cannot. Animals are not beings that are capable of practicing or responding to moral claims. 33.But rights cannot simply depend on the presence of moral capacity. If that were true then we would have to agree that humans who are brain damaged or comatose lack the ability to respond to or exercise moral claims therefore they have no rights. 34.Non-human mammals have the same fundamental rights as a normal mammal to not be harmed or killed. 35.Those that have a subject of life, like normal mammals and non-human mammals, have inherent value. 36.Animals have the ability to feel pain, satisfaction, need, pleasure, disease and death, just as any human being does. 
37.Therefore, I argue, that animals, such as monkeys should be considered persons in the eyes of the law. 38.Rights of animals should be respected. 39.The use of animals in science; commercial animal agriculture; commercial and sport hunting and trapping should be indefinitely terminated....
+**Example 9:**
+- Query: Should gay marriage be legal?
+- Doc Title: Gay marriage should be legal
+- Doc Text: I think gay marriage should be legal because every body deserves to marry who they love....
+**Example 10:**
+- Query: Is drinking milk healthy for humans?
+- Doc Title: Eating Meat is not Inherently Unhealthy or Immoral
+- Doc Text: I apologize for the mishap in citation, upon inspection you'll notice 3 black periods after each source "..." this cut off the actual link, leading to the general website. This is undoubtedly a mistake caused by having to copy and paste my arguments after your FF. Actual citation is as follows; please feel free to verify: [1] http://authoritynutrition.com... [2] http://www.mayoclinic.org... [3] http://www.hsph.harvard.edu... [4] http://www.peta.org... [5] http://well.blogs.nytimes.com... [6] http://modernfarmer.com... For the purpose of my rebuttal I will refer to my previous 6 sources as numerically cited and future sources will be cited as 7 & up. In citing your sources they will be referenced as [Con #] Given that my arguments have been legitimated, your comment "Since there are no proper sources we can ignore the statistics, for they were wrong anyways." can be disregarded. I understand that this is a bit unfair to you given that you decided to arrogantly brush off my arguments and supply your own. You will now have the chance to refute my Round 2 argument. I recommend you actually read it this time, because there is clear evidence in your rebuttal you did not. Problematic Citation and Source Material: I apologize for the mishap in my Round 2 source material; however you'll find that everything is properly cited and credible. However in your entire argument you fail to properly cite anything you said. I have no idea which comment corresponds to which source; if this were an academic setting you would be charged with plagiarism. Your source material is also very questionable and contradictory, and some are irrelevant to the debate entirely. Because you did not openly disclose which arguments come from which sources I have to spend the majority of my rebuttal debunking your sources. I ask that you re-read my Round 1 post. Source [Con 1] enforces my position quote "The high level of meat and saturated fat consumption in the USA ... 
exceeds nutritional needs and contributes to high rates of chronic diseases such as cardiovascular disease, diabetes mellitus and some cancers." [Con 1] is describing SAD, in my position statement I clearly acknowledge that SAD is not relevant to the debate because over-consumption does not equal inherent unhealthiness. If over-consumption was a true indicator of inherent negative health affects the 8% of obese vegetarians you mention would show that over consuming vegetables causes the negative health effects associated with obesity. [Con 2] is problematic; the author is simply unqualified, her brief bio gives no indication of having a degree in anything, she is extremely biased and only provides one citation for her entire 15 point article. More importantly all of her points are contingent on the immorality of factory farming, which as I've stated in Round 2 does not divert from meat being inherently unhealthy or immoral because there are healthy and moral ways of raising cattle [2][5][6][Con 8]. The findings of [Con 3] can also be refuted, Point 4 Source [1] quote "It is true that processed meat is associated with an increased risk of cancer, especially colon cancer ... Two review studies, one that looked at data from 35 studies and the other from 25 studies, found that the effect for unprocessed red meat was very weak for men and nonexistent for women. " This implies it is the process, not the meat that causes cancer. [Con 4] is 42 pages long, I only read the first page and found that it, like [Con 1], was critical of SAD for its health effects. This does not prove meat is inherently bad, fish for example is low in fat and has positive health effects [3]. Also [Con 4] says Chimps eat monkeys, if sentience is such an issue then why don't chimps respect the sentience of monkeys? (I'll address this in more detail later). [Con 5] is from PETA an organization well known for its bias against eating meat. 
Like [Con 2] the arguments are almost entirely based on factory farming, which again is irrelevant in the face of free range farming [5][6][Con 8]. [Con 6] is PETA India, it has the same bias and credibility as [Con 2]. Point 4 for example says that meat is bad because if you under cook it then you could get bird flu. Ok, by this logic vegetables are bad because if you don't clean them you could get E. Coli. As I said in the beginning of Round 2 "Like all food the health factor is not necessarily the food itself but how it is prepared and produced." [Con 7] is my personal favorite. By citing this source you show how truly desperate you are to grasp at straws. This source made me understand why you did not disclose them in your debate, as it has no scientific value whatsoever, it simply bullet points propaganda without any evidence. Let's read some of the findings of SupremeMasterTV.com. Eating meat causes "Blue tongue disease", what is this? Oh right it's a disease that only affects livestock and not humans [7]. The other diseases are food borne illnesses from improper cooking. In your rebuttal you state "Studies have shown that an Ovo-Lacto Vegetarian diet is the most healthy diet of all." Meaning that those who drink milk and eat eggs along with vegetables have the best diet of all (this is not a vegan diet, in Round 1 you said you'd argue as a vegan). But Lo! [Con 7] says drinking milk is bad! It causes "Breast, prostate and testicular cancer from hormones present in milk; Listeria and Crohn's disease; Hormones and saturated fat lead to osteoporosis, obesity, diabetes and heart disease; Linked to higher incidences of multiple sclerosis." Still think drinking milk is cool? [Con 8] is written by the same author as [Con 4] and as such contains the exact same arguments. Also I don't think you actually read this article because the author lays out a plan for rational meat eating in the section "Toward Rational Meat Eating" so yea ... you're helping me with this one. 
[Con 9] I admit this article does seem legitimate and has sources. However this argument doesn't help you in the way you might think, it focuses on factory farming and it acknowledges meat can be environmentally friendly "Chicken is probably the best land animal to eat, certainly in terms of climate change impact. Fish have a low greenhouse gas impact but are being eaten in such large quantities that many are at risk of extinction." "Even limiting one's meat consumption to chicken yields major environmental benefits--not to mention health and financial benefits." So eating chicken yields health benefits ... Interesting. [Con 10] was about jacana birds being influenced by human scientists to commit infanticide. I don't see how this proves anything more than humans encouraged behavior that only humans would consider immoral. If anything this goes to show killing animals is not an issue of morality. For the sake of science (as suppose to survival) humans encouraged these birds kill each others babies. Are these scientists immoral for encouraging nature? Eating Meat Can be Healthy: For the bulk of this section please read Round 2. This section is for counter arguments. Given that none of your statistics are properly cited they can be seen as fiction. You clearly dramatize the negative health effects of meat, if meat was so bad then why would doctors recommend it as part of a balanced diet [2][3][8]? Quoting Con "Aquamarine animals can often cause toxins, for all those fish which are high in Omega 3 are also extremely high in mercury. Those which are not like Salmon may cause many other diseases, and may also create bowel issues." Ah yes, Harvard disagrees quote "Environmental Protection Agency " calculated that if 100,000 people ate farmed salmon twice a week for 70 years, the extra PCB intake could potentially cause 24 extra deaths from cancer"but would prevent at least 7,000 deaths from heart disease. 
Second, levels of PCBs and dioxins in fish are very low, similar to levels in meats, dairy products, and eggs. Third, more than 90 percent of the PCBs and dioxins in the U.S. food supply come from such non-seafood sources, including meats, dairy, eggs, and vegetables." [3] Morality of Meat: Appeal to Nature In Round 2 I specifically acknowledge the possible flaw in this argument "However just because an animal does something does not give us the right to copy them blindly. It is our advanced intelligence and our appeal to morality that sets us apart from the lesser animals." It"s important to note humans are animals, and as such comparison to other animals is legitimate (it's kind of how biology and anthropology work). You yourself make this a moral underpinning in Round 1 "" it is wrong to kill [animals], as it is wrong to kill humans." But why? What makes them like us? You argue mammals are sentient like us, they feel emotion like we do, so they should be spared. You think that just because chimps can love each as humans can love each other this somehow negates the fact that chimps are equally enthusiastic about killing lesser yet still sentient primates like monkeys [Con 4], humans eat monkeys too ya know. Because we can feel love somehow all mammals are entitled to our good graces. Even if I agreed with your pan-mammalism philosophy fish and chicken aren't mammals what claim to familiarity do they have? When it comes to rape and infanticide there are obvious pro-human arguments against it, if your only argument against eating mammals is 'well you wouldn't eat a human!' then I'm afraid the moral connection falls flat. New Citation: [7] https://www.princeton.edu... [8] http://www.mayoclinic.org......
+##### Score 2 Examples (test)
+**Example 1:**
+- Query: Should teachers get tenure?
+- Doc Title: There should not be a teacher tenure.
+- Doc Text: Reason 1 - Teacher tenure creates complacency because teachers know they are unlikely to lose their jobs: If teachers know that they reached the period where they get special defence from most accusations - it would send the message to them that they can then do whatever they want to do in the classroom and really slack with their teaching duties. Reason 2 - Tenure makes it difficult to remove under-performing teachers because the process involves months of legal wrangling by the principal, the school board, the union, and the courts: Most schools stop trying to fire a certain teacher because the proccess is just too difficult. " A June 1, 2009 study by the New Teacher Project found that 81% of school administrators knew a poorly performing tenured teacher at their school; however, 86% of administrators said they do not always pursue dismissal of teachers because of the costly and time consuming process. It can take up to 335 days to remove a tenured teacher in Michigan before the courts get involved. " (. http://teachertenure.procon.org...) (Patrick McGuinn, "Ringing the Bell for K-12 Teacher Tenure Reform," www. americanprogress. org). This quote means that 86 OUT OF 100 SCHOOL ADMINISTRATORS WANT A TEACHER TO BE FIRED - but will not do so because the proccess is to draining. But what does that leave our learning and growing generation with? Many teachers who do not care, teach well, or put effort in their work? That is certaintly what this is going to result into if we do not abolish it quickly. Also check out this statistic of who is in favor (people in general) "An Apr. -May 2011 survey of 2,600 Americans found that 49% oppose teacher tenure while 20% support it. Among teachers, 53% support tenure while 32% oppose it. According to a Sep. 2010 report by the Thomas B. Fordham Institute, 86% of education professors favor "making it easier to terminate unmotivated or incompetent teachers - even if they are tenured. 
” Of course you cannot expect most teachers to be against it sinse that it their profession and it effects them - but for bystanders with accurate and unbiased opinions, look how many people are against it. Also, "56% of school board presidents disagreed with the statement that teacher tenure ensures academic freedom. " (M. J. Stephey, "A Brief History of Tenure," www. time. com). Reason 3 - Most people are against teature tenure: "In an Oct. 1, 2006 survey, 91% of school board presidents either agreed or strongly agreed that tenure impedes the dismissal of under-performing teachers. 60% also believed that tenure does not promote fair evaluations. " (. http://teachertenure.procon.org...) This means that most teachers OF SUCH A LARGE PERCENTAGE are not in favor of the teacher tenure. Reason 4 - Teacher tenure does nothing to promote the education of children: "Former DC Schools Chancellor Michelle Rhee said in 2008, "Tenure is the holy grail of teacher unions, but it has no educational value for kids; it only benefits adults. ”("Rhee-Forming D. C. Schools," www. wsj. com). This piece of evidence means that the only people actually benefiting from this tenure are the teachers who are employed - not any students. Isint education suppost to be focused on the younger generation and their best interest? Since when did school become all about the teachers - this tenure undermines what it means to actually be a teacher. If anything, it is only a BAD THING for students - and why would we keep something in our school systems that MAKES THE GENERATIONS' LEARNING LESS VALUEABLE? It does not make any sense. Reason 5 - Tenure at the K-12 level is not earned, but given to nearly everyone: "To receive tenure at the university level, professors must show contributions to their fields by publishing research. At the K-12 level, teachers only need to "stick around” for a short period of time to receive tenure. 
A June 1, 2009 study by the New Teacher Project found that less than 1% of evaluated teachers were rated unsatisfactory. " (Marcus A. Winters, "Challenging Tenure in D. C. ," www. manhattan-institute. org). This statistic is absolutely upsetting and degrating. Basically, this quote is explaning how 99% of teachers have free protection handed to them if they just stay in that profession for a certain amount of time. What if that teacher was already slacking in many areas? Now we are going to award them for poor effort and teaching abilities? It is not fair to the students involved with these teachers and it is not fair that they do not actually have to WORK to recieve a benefit of protection unlike most other professions that require some form of acomplishment to recieve that/those benefits in question. Because "with most states granting tenure after three years, teachers have not had the opportunity to "show their worth, or their ineptitude. " (Rose Garrett, "What Is Teacher Tenure? ," www. education. com), (. http://teachertenure.procon.org...).Reason 6 - Tenure makes it costly for schools to remove a teacher with poor performance or who is guilty of wrongdoing: "It costs an average of $250,000 to fire a teacher in New York City. New York spent an estimated $30 million a year paying tenured teachers accused of incompetence and wrongdoing to report to reassignment centers (sometimes called "rubber rooms”) where they were paid to sit idly. Those rooms were shut down on June 28, 2010. " ("Rhee-Forming D. C. Schools," www. wsj. com), (Steven Brill, "The Rubber Room," New Yorker). This is just sad, now it even costs the school boards money for teachers not doing their job? Should'nt that be the opposite? Reason 7 - Tenure is not needed to recruit teachers: "Sacramento Charter High School, which does not offer tenure, had 900 teachers apply for 80 job openings. " (Nanette Asimov, "Teacher Job Security Fuels Prop. 74 Battle," San Francisco Chronicle). 
This quote further proves why tenure is pretty much useless and unfair because teachers DO NOT NEED TENURE to continue their job as a teacher at their shchool, past school, future school, or school they are applying for. Reason 8 - With job protections granted through court rulings, collective bargaining, and state and federal laws, teachers today no longer need tenure to protect them from dismissal: "For this reason, few other professions offer tenure because employees are adequately protected with existing laws. " (Tenure Reforms and NJSBA Policy: Report of the NJSBA Tenure Task Force," New Jersey School Boards Association website, www. njsba. org), (Scott McLeod, JD, PhD, "Does Teacher Tenure Have a Future? ," www. dangerouslyirrelevant. org). This is the most important fact out of all these because it shows how the WHOLE REASON teacher tenure is here in the first place is NOT NEEDED not have the protections that teachers have without tenure. The teacher tenure is not benefitial for anyone except teachers - they get unfair advantages in MANY ways, some I have just listed. Why should we let this continue if unnessisary? Citations: . http://teachertenure.procon.org...http://teachertenure.procon.org...http://teachertenure.procon.org...Wanda Marie Thibodeaux, "Pro & Cons of Teacher Tenure," www. ehow. comPatrick McGuinn, "Ringing the Bell for K-12 Teacher Tenure Reform," www. americanprogress. org. http://teachertenure.procon.org... "Rhee-Forming D. C. Schools," www. wsj. comMarcus A. Winters, "Challenging Tenure in D. C. ," www. manhattan-institute. orgM. J. Stephey, "A Brief History of Tenure," www. time. comRose Garrett, "What Is Teacher Tenure? ," www. education. com. http://teachertenure.procon.org... "Rhee-Forming D. C. Schools," www. wsj. comSteven Brill, "The Rubber Room," New YorkerTenure Reforms and NJSBA Policy: Report of the NJSBA Tenure Task Force," New Jersey School Boards Association website, www. njsba. 
orgScott McLeod, JD, PhD, "Does Teacher Tenure Have a Future? ," www. dangerouslyirrelevant. orgNanette Asimov, "Teacher Job Security Fuels Prop. 74 Battle," San Francisco Chronicle...
+**Example 2:**
+- Query: Is vaping with e-cigarettes safe?
+- Doc Title: Bloomberg's Ban on E-Cigs
+- Doc Text: Electronic cigarettes comes with different cartridges including 6-18mg of nicotine and sometimes 0mg. This is to say that electronic cigarettes are safer to smoke than traditional cigarettes. Electronic cigarettes do not cause tar because of the fact that it does not contain tobacco and leave behind no tar. As a result, the main components of carcinogen are not present to create a problem that traditional cigarettes that contain various chemicals, additives and smokes. Vapor is just vapor. It does not include any smell or lingering odor. It is far from affecting people around you while smoking electronic cigarette. Electronic cigarettes should not be banned because it does not pose any harm to its users and help people from quitting cigar....
+**Example 3:**
+- Query: Should corporal punishment be used in schools?
+- Doc Title: Spanking In Schools
+- Doc Text: I will now summarize this debate and make a closing statement. Con has presented arguments that are self-contradicting: Should spanking "be banned from everywhere," including children's own homes, or is it "the parents [sic] choice" whether or not to spank their children? I have shown that spanking should not be banned universally, and I have also demonstrated that if parents are allowed to spank their children, it is also their right to choose a school that will do the same in an appropriate manner as well. My opponent also mentions studies that have shown that spanking "causes problems later down the road," but he has failed to present any of the studies in his sources. On the other hand, I have cited sources that show that corporal punishment is very effective, and I have also presented a testimonial from a teacher who was grateful for the ability to apply it in class due to its effective nature. Con has thrown claims around willy-nilly throughout the debate, but has not been able to back up any of them. The one piece of evidence my opponent has presented, i. e. the case of a child who went home to her mother with bruises on her bottom, has been shown to be unconvincing at best. The mother of the child was upset that the school was not implementing corporal punishment correctly, not that the school was enacting corporal punishment. The child was most definitely at fault, and faced a few bruises on her bottom as opposed to something that might appear in her permanent record. If anything, spanking was the correct decision in this case; it just should have been implemented differently, which only shows that supporting corporal punishment is better than abolishing it, as it can then be refined and implemented better. 
In addition to successfully refuting all of Con's points, I have done the following:-I have shown that spanking is similar in many ways to other forms of punishment that society imposes; to oppose spanking in schools (or in general) would be to argue a much greater point about punishment in general. -I have shown that spanking provides teachers with another tool to keep their classes progressing smoothly. -I have shown that spanking has benefits, largely due to its immediacy and proven ability to keep children from misbehaving. -I have shown that spanking is actually a great alternative to other forms of punishment imposed in schools, in that it does not potentially ruin a child's future. In closing, from looking at the exchange between Con and me, one must conclude that corporal punishment in schools is appropriate if carried out correctly, just like any other form of punishment society must use at various levels of misbehavior, whether the punishment is to send a criminal to prison or even to punish a dog to keep it from becoming unruly. Spanking should not be eliminated from schools everywhere, and if anything, it should become a more widespread practice....
+**Example 4:**
+- Query: Should social security be privatized?
+- Doc Title: Abolish Social Security
+- Doc Text: Unpromising, chaotic, and fraud. Social security fits this description exactly; therefore social security must be abolished. I take an affirmative stance to abolish Social security for the following contentions; Contention 1; Social security has no prospect for America, Contention 2; America now and in the future will not be able to sustain the needs of the Social security system, and Contention 3; Individual workers should be able to invest their own retirement money. Under Social Security, lower and middle class individuals are forced to pay a significant portion of their income, approximately 12 percent for the alleged purpose of securing their retirement. That money is not saved or invested, but transferred directly to the program's current beneficiaries with the "promise" that when current taxpayers get old, the income of future taxpayers will be transferred to them. Since this scheme creates no wealth, any benefits one person receives in excess of his payments necessarily come at the expense of others. Under Social Security, whether an individual gets twice as much from others as was taken from them, or half as much, or nothing at all, is entirely at the discretion of politicians. They cannot count on Social Security for anything-except a massive drain on his income. Therefore, there is absolutely no way that the system can even guarantee future retirees the equal amount that they had previously contributed to Social security, making this system inequitable. Fixing the Social security system is essentially impossible. The government has increased the payroll tax 17 times since 1935 yet, the system is still crippling. Proving my point further, that Social security should be abolished immediately. In 2002, there were 186 million workers in America and 190 million retired people. This was the beginning of the end of the Social security system. The workforce can no longer acquire the money necessary to give to the retirement population. 
The evidence continues to mount. According to newyorktimes.com, by 2010, while 41 million new workers enter the workforce, a staggering 76 million workers will enter retirement. This is an unfathomable amount and impossible for the social security system to reach a solution for how these retired people are going to get money. How much, when, and in what form one should provide for retirement is highly individual-and is properly left to the individual's free judgment and action. Social Security deprives the individual of this freedom, and thus makes them less able to plan for the future, less able to provide for their retirements, less able to enjoy their most vital years, and less able to invest in themselves. If Social Security did not exist, individual workers could be free to use that 12 percent of their income as they choose making their ability to better their future incomparably greater. They could save for their retirement with a diversified, long-term, productive investment in stocks or bonds. Or they could reasonably choose not to devote all 12 percent to retirement. They might choose to work far past the age of 65 or choose to invest in their own productivity through additional education or starting a business. So the future of this individual's life is up to no one but themselves. This would encourage many Americans to work to earn money for a better future. To conclude my remarks, I urge my fellow debators to vote in affirmation....
+**Example 5:**
+- Query: Is a college education worth it?
+- Doc Title: College education is not necessary to become successful
+- Doc Text: College education is a waste of money and time. Most people who attend college do not get what they pay for. An associate degree or a bachelors degree cannot really get you a high paying job or "career". In order to get a "career" you must continue your education to receive a masters degree, a PhD or any other further education Many people without a college career have been extremely successful; For example, , Bill Gates , Andrew Carnegie, Federica Henry Royce and Abraham Lincoln ....
+**Example 6:**
+- Query: Should abortion be legal?
+- Doc Title: Abortion should be illegal world wide
+- Doc Text: Abortion is rang no one should murder there unborn child. If a women has a right to the choice of the abortion ,,well the baby has a choice and should have the right to live. Its Evil to kill a baby .there should be no argument because any rebuttal means your stance is kill a child infant you created. it makes no sense religious wise, morally makes no sense at all, and it shouldn't make legal sense because murders illegal....
+**Example 7:**
+- Query: Should students have to wear school uniforms?
+- Doc Title: Students should wear school uniform.
+- Doc Text: I think students should wear school uniforms. It is because in schools where students are allowed to wear whatever they want, there are school bullies that make fun of other students who may not have a good taste in fashion. But if they all wear the same school uniforms, they can't make fun of one anothers' fashion senses which prevents this type of bullying from happening. And also, if students can wear whatever they want, they might take a longer time to leave their houses for school due to taking too long to choose what they want to wear which may cause them to be late for school....
+**Example 8:**
+- Query: Should any vaccines be required for children?
+- Doc Title: Resolved: State mandated administration of childhood vaccinations is justified.
+- Doc Text: Not a full case yet.. Just some little points I put together... Governments should not have the right to intervene in the health decisions parents make for their children. 31% of parents believe they should have the right to refuse mandated school entry vaccinations for their children, according to a 2010 survey by the University of Michigan. Many parents hold religious beliefs against vaccination. Forcing such parents to vaccinate their children would violate the 1st Amendment which guarantees citizens the right to the free exercise of their religion. Vaccines are often unnecessary in many cases where the threat of death from disease is small. During the early nineteenth century, mortality for the childhood diseases whooping cough, measles, and scarlet fever fell drastically before immunization became available. This decreased mortality has been attributed to improved personal hygiene, water purification, effective sewage disposal, and better food hygiene and nutrition. Vaccines interfere with natural law and God's plan for humanity. Disease is a natural occurrence, and humans should not interfere with its trajectory. Common childhood vaccinations may cause rare yet serious reactions including anaphylactic shock, paralysis, and sudden death. This risk is not worth taking, especially considering most diseases vaccinated against are not necessarily life threatening. Vaccines can trigger auto-immune disorders such as arthritis, multiple sclerosis, lupus, Guillain-Barrï¿½ Syndrome (GBS), and other disorders. Vaccines can cause brain inflammation (encephalopathy) which can lead to death or permanent brain damage and disorders such as autism, ADD/ADHD, and other developmental problems. In addition, the vaccine additive thimerosal (found in most pre-1999 vaccines) has been associated specifically with the development of autism and is still found in certain meningococcal, tetanus, and flu vaccines such as the H1N1 vaccine. 
Vaccines clog and disrupt the lymphatic system with large foreign protein molecules (the active ingredients contained within vaccines) which may lead to lymphatic cancers such as leukemia and lymphoma. All vaccines cause immune system suppression, and can permanently damage the natural immune system. Unvaccinated children build and strengthen their immune systems through fighting off infection and developing natural immunity to diseases like measles and chickenpox. Artificial immunity, generated through vaccination, weakens the immune system and leaves children more vulnerable to all other diseases and infections. Children should not be required to receive the DPT (diphtheria, pertussis, tetanus) vaccine. Some studies have shown that children who receive the DPT vaccine exhibit shallow breathing which has been associated with sleep apnea and may be a causal factor in sudden infant death syndrome (SIDS). Studies of infants whose deaths were recorded as SIDS show a temporal relationship with DPT vaccination (these infants tended to die at similar time intervals in relation to when they were vaccinated). Children should not receive the hepatitis B vaccine. Hepatitis B is a blood-born disease and is primarily spread by sexual intercourse and intravenous drug use. Children are not at great risk of contracting the disease. In addition, researchers have found that immunization with the hepatitis B vaccine is associated with an increased risk of developing multiple sclerosis. Young girls should not receive mandatory vaccination for HPV (human papilloma virus). The vaccine was approved in 2006 and the long-term effects are unknown. Since approval, adverse side effects such as severe allergic reactions, Guillain-Barrï¿½ syndrome, spinal cord inflammation and pancreatitis have been reported to the US Vaccine Adverse Events Reporting System. 
Although these adverse reactions may be rare, they are not worth the risk since the vaccine only protects against two of the 15 strains of HPV that may cause cancer of the cervix (20-40 years after an individual is infected). Vaccines are promoted primarily to generate profits for manufacturers and financial donations for medical organizations that endorse vaccines. In 2003, a House Committee on Government Reform report revealed that the CDC Advisory Committee on Immunization Practices had members with significant financial ties to vaccine companies. The American Academy of Pediatrics, a leading pro-vaccination organization, receives millions of dollars from vaccine companies....
+**Example 9:**
+- Query: Should performance-enhancing drugs be accepted in sports?
+- Doc Title: Performance Enhancing Drugs (PEDs) should be permitted in professional sports.
+- Doc Text: I affirm that PEDs should be permitted in sports. Legalizing PEDs would serve to level the playing field. According to the New York Times, more than a third of the top Tour de France finishers since 1998 have been confirmed to be doping while racing, despite efforts to stop them (1). So, the statistics clearly indicate that those who obey the rules and do not dope have a relatively low chance of performing well. If those who did not not use Performance Enhancing Drugs in the Tour de France were instead allowed to dope, those racers would have had the same advantages as those who currently dope illegally, and the playing field would have been more even. One of the primary reasons people enjoy watching professional sports is because they showcase the highest level of human achievement. Allowing PEDs to be used would raise the level of competition and skill in professional sports. This would thus make achievements in sports seem even more unthinkable and impressive, which would thus make more people attend these sporting events. As described in Forbes, "A huge part of watching sports is witnessing the very peak of human athletic ability, and legalizing performance enhancing drugs would help athletes climb even higher (2)." This shows that Performance Enhancing Drugs can raise the level of sports, which can make sports even more of a spectacle. For example, during the 1990's, when Mark McGwire and Sammy Sosa used steroids, their respective games were usually sold out with people who wanted to see their heightened abilities (2). I will now refute my opponent's arguments. My opponent argues that PEDs provide with a large risk for athletes. However, Athletes who are using PEDs are already in a dangerous industry, and frequently become injured regardless of whether or not they use drugs. 
In the year 2013 alone, the NFL reported at least 152 concussions, and was forced to spend up to 765 million dollars to settle claims of traumatic head injury brought by former players (3). So, PEDs do not provide with a large injury risk compared to the inherent risk of professional sports. My opponent also argues that allowing PEDs takes away from true human achievement. However, being an athlete requires a high amount of skill regardless of whether or not you use PEDs. Hitting a home run, for example requires excellent timing and a perfect swing. Using PEDs may increase the distance of this home run, but hitting a it is still a spectacle of your own human, not chemical precision. So, performing in sports requires intricate skill, no matter what. It is thus clear that PEDs do not take away from human achievement. As such, I affirm. 1- http://www.nytimes.com... 2- http://www.forbes.com... 3- http://www.pbs.org......
+**Example 10:**
+- Query: Should birth control pills be available over the counter?
+- Doc Title: Birth Control
+- Doc Text: I believe that birth control should be allowed to be obtained without parental consent. When a girl is requesting to use birth control, she is being mature and responsible and it is her own right, not her parents, to make the decision to have sex or not. If someone is making the decision to have sex, they should be mature enough to make the decision to be safe about it. Further more, most teens would have sex if they want to regardless if they have protection or not, so it is better to be safe than sorry....
+## CORE
+**Has Scores 0, 1, 2**
+**Score 2 = HIGHLY RELEVANT** ✓
+**Score 1 = NOT RELEVANT** ✗ (should be merged with Score 0!)
+## Special Notes
+- **Original Purpose**: Argument retrieval for controversial topics and debates
+- **What Scores Really Mean**:
+  - Score 2: Strong argumentative content directly addressing the query
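+  - Score 1: Keyword overlap only - the document is about a different topic or context (e.g. a query about school uniforms matched to a document about hats, or a query about schools matched to a document about families)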
+  - Score 0: Completely off-topic
+- **Dataset Type**: Argument retrieval from debate portals (Debate.org, IDebate, etc.)
+- **CRITICAL FLAW**: Score 1 is just keyword matching without intent matching!
+## Training Verdict
+**BAD FOR TRAINING** ✗
+- Score 1 should be treated as Score 0 (not relevant)
+- Only Score 2 is actually relevant
+- Similar issue to DBPedia-Entity - keyword matching ≠ relevance
+- For binary labels: treat Score 2 as positive and Scores 0 and 1 as negative
+## Training Recommendations
+- If must use: Merge Score 1 into Score 0
+- Only use Score 2 as positive examples
+- Better to skip this dataset due to bad Score 1 annotations
+## Observations

test_scores/examine_more_scores.py ADDED Viewed

	@@ -0,0 +1,157 @@

+"""
+Examine more examples of score 1 vs score 2 documents to understand the pattern
+"""
+import json
+from pathlib import Path
+from collections import defaultdict
+def load_queries(dataset_path):
+    """Load queries"""
+    queries = {}
+    queries_file = dataset_path / "queries.jsonl"
+    if queries_file.exists():
+        with open(queries_file, 'r') as f:
+            for line in f:
+                q = json.loads(line)
+                queries[q['_id']] = q['text']
+    return queries
+def load_corpus(dataset_path):
+    """Load corpus documents"""
+    corpus = {}
+    corpus_file = dataset_path / "corpus.jsonl"
+    if corpus_file.exists():
+        with open(corpus_file, 'r') as f:
+            for line in f:
+                doc = json.loads(line)
+                corpus[doc['_id']] = {
+                    'title': doc.get('title', ''),
+                    'text': doc.get('text', '')
+                }
+    return corpus
+def examine_more_nfcorpus():
+    """Examine more NFCorpus examples"""
+    dataset_path = Path("../beir_data/nfcorpus")
+    # Load data
+    queries = load_queries(dataset_path)
+    corpus = load_corpus(dataset_path)
+    # Load test qrels (has both 1 and 2 scores)
+    qrels = defaultdict(dict)
+    qrels_file = dataset_path / "qrels" / "test.tsv"
+    with open(qrels_file, 'r') as f:
+        next(f)  # Skip header
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 3:
+                query_id, doc_id, score = parts
+                qrels[query_id][doc_id] = int(score)
+    # Find queries that have both score 1 and score 2 documents
+    mixed_queries = []
+    for query_id, docs in qrels.items():
+        scores = set(docs.values())
+        if 1 in scores and 2 in scores:
+            mixed_queries.append(query_id)
+    print(f"Examining 5 more examples from NFCorpus\n")
+    # Skip first 3, examine next 5
+    for query_id in mixed_queries[3:8]:
+        if query_id not in queries:
+            continue
+        print("="*80)
+        print(f"QUERY: {queries[query_id]}")
+        print("-"*80)
+        # Get score 2 and 1 documents
+        score_2_docs = [doc_id for doc_id, score in qrels[query_id].items() if score == 2]
+        score_1_docs = [doc_id for doc_id, score in qrels[query_id].items() if score == 1]
+        print(f"\n[SCORE 2 - HIGHLY RELEVANT] (showing 1 of {len(score_2_docs)})")
+        for doc_id in score_2_docs[:1]:
+            if doc_id in corpus:
+                doc = corpus[doc_id]
+                print(f"Title: {doc['title'][:150]}")
+                print(f"Text snippet: {doc['text'][:250]}...")
+        print(f"\n[SCORE 1 - SUPPOSEDLY RELEVANT] (showing 2 of {len(score_1_docs)})")
+        for doc_id in score_1_docs[:2]:
+            if doc_id in corpus:
+                doc = corpus[doc_id]
+                print(f"\nTitle: {doc['title'][:150]}")
+                print(f"Text snippet: {doc['text'][:250]}...")
+        print("\n")
+def examine_webis_touche():
+    """Examine Webis-Touche2020 which also has 0, 1, 2"""
+    dataset_path = Path("beir_data/webis-touche2020")
+    # Load data
+    queries = load_queries(dataset_path)
+    corpus = load_corpus(dataset_path)
+    # Load test qrels
+    qrels = defaultdict(dict)
+    qrels_file = dataset_path / "qrels" / "test.tsv"
+    with open(qrels_file, 'r') as f:
+        next(f)  # Skip header
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 3:
+                query_id, doc_id, score = parts
+                qrels[query_id][doc_id] = int(score)
+    print("\n" + "="*80)
+    print("WEBIS-TOUCHE2020 EXAMPLES (has 0, 1, 2 scores)")
+    print("="*80)
+    # Find queries with all three scores
+    count = 0
+    for query_id, docs in qrels.items():
+        scores = set(docs.values())
+        if 0 in scores and 1 in scores and 2 in scores:
+            if query_id not in queries or count >= 3:
+                continue
+            count += 1
+            print(f"\nQUERY: {queries[query_id]}")
+            print("-"*80)
+            # Get documents by score
+            score_2_docs = [doc_id for doc_id, score in docs.items() if score == 2]
+            score_1_docs = [doc_id for doc_id, score in docs.items() if score == 1]
+            score_0_docs = [doc_id for doc_id, score in docs.items() if score == 0]
+            print(f"\n[SCORE 2] (showing 1 of {len(score_2_docs)})")
+            for doc_id in score_2_docs[:1]:
+                if doc_id in corpus:
+                    doc = corpus[doc_id]
+                    print(f"Text: {doc['text'][:300]}...")
+            print(f"\n[SCORE 1] (showing 1 of {len(score_1_docs)})")
+            for doc_id in score_1_docs[:1]:
+                if doc_id in corpus:
+                    doc = corpus[doc_id]
+                    print(f"Text: {doc['text'][:300]}...")
+            print(f"\n[SCORE 0] (showing 1 of {len(score_0_docs)})")
+            for doc_id in score_0_docs[:1]:
+                if doc_id in corpus:
+                    doc = corpus[doc_id]
+                    print(f"Text: {doc['text'][:300]}...")
+# Script entry point: run the NFCorpus inspection, then the Webis-Touche2020 one.
+if __name__ == "__main__":
+    examine_more_nfcorpus()
+    examine_webis_touche()

test_scores/test_fiqa_relevance.py ADDED Viewed

	@@ -0,0 +1,146 @@

+"""
+Test FiQA score 1 documents to see how relevant they really are
+"""
+import json
+from pathlib import Path
+from collections import defaultdict
+import random
+def load_fiqa_data():
+    """Load FiQA queries, corpus, and qrels"""
+    base_path = Path("../beir_data/fiqa")
+    # Load queries
+    queries = {}
+    with open(base_path / "queries.jsonl", 'r') as f:
+        for line in f:
+            q = json.loads(line)
+            queries[q['_id']] = q['text']
+    # Load corpus
+    corpus = {}
+    with open(base_path / "corpus.jsonl", 'r') as f:
+        for line in f:
+            doc = json.loads(line)
+            corpus[doc['_id']] = {
+                'title': doc.get('title', ''),
+                'text': doc.get('text', '')
+            }
+    # Load train qrels (all score 1)
+    train_qrels = defaultdict(list)
+    with open(base_path / "qrels/train.tsv", 'r') as f:
+        next(f)  # Skip header
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 3:
+                query_id, doc_id, score = parts
+                train_qrels[query_id].append(doc_id)
+    return queries, corpus, train_qrels
+def examine_fiqa_examples():
+    """Look at actual FiQA examples to judge relevance"""
+    queries, corpus, train_qrels = load_fiqa_data()
+    print("="*80)
+    print("FiQA Training Data Analysis")
+    print("="*80)
+    print(f"Total queries in train: {len(train_qrels)}")
+    print(f"Total corpus size: {len(corpus)}")
+    # Get distribution of docs per query
+    docs_per_query = [len(docs) for docs in train_qrels.values()]
+    print(f"Docs per query: min={min(docs_per_query)}, max={max(docs_per_query)}, avg={sum(docs_per_query)/len(docs_per_query):.1f}")
+    print("\n" + "="*80)
+    print("EXAMINING SPECIFIC EXAMPLES")
+    print("="*80)
+    # Take first 5 queries with their documents
+    sample_queries = list(train_qrels.keys())[:5]
+    for i, query_id in enumerate(sample_queries, 1):
+        if query_id not in queries:
+            continue
+        print(f"\n[EXAMPLE {i}]")
+        print(f"QUERY: {queries[query_id]}")
+        print("-"*80)
+        docs = train_qrels[query_id]
+        print(f"Number of 'relevant' docs: {len(docs)}")
+        # Show first 2 docs for this query
+        for j, doc_id in enumerate(docs[:2], 1):
+            if doc_id in corpus:
+                doc = corpus[doc_id]
+                print(f"\nDoc {j} (ID: {doc_id}):")
+                print(f"Title: {doc['title'][:100]}" if doc['title'] else "[No title]")
+                print(f"Text: {doc['text'][:300]}...")
+        print("\n" + "="*80)
+    # Now let's check some random negative examples (not in qrels)
+    print("\nCOMPARING WITH RANDOM (UNLABELED) DOCUMENTS")
+    print("="*80)
+    # Pick a query and show both labeled and unlabeled docs
+    test_query_id = sample_queries[0]
+    if test_query_id in queries:
+        print(f"\nQUERY: {queries[test_query_id]}")
+        print("-"*80)
+        # Get labeled docs
+        labeled_docs = set(train_qrels[test_query_id])
+        # Get some random unlabeled docs
+        all_doc_ids = list(corpus.keys())
+        unlabeled_docs = [d for d in all_doc_ids if d not in labeled_docs]
+        random_unlabeled = random.sample(unlabeled_docs, min(2, len(unlabeled_docs)))
+        print("\n[LABELED AS RELEVANT (Score 1)]:")
+        for doc_id in list(labeled_docs)[:1]:
+            if doc_id in corpus:
+                doc = corpus[doc_id]
+                print(f"Text: {doc['text'][:250]}...")
+        print("\n[UNLABELED (Implicit Score 0)]:")
+        for doc_id in random_unlabeled[:1]:
+            if doc_id in corpus:
+                doc = corpus[doc_id]
+                print(f"Text: {doc['text'][:250]}...")
+def analyze_relevance_quality():
+    """Analyze if Score 1 docs are truly highly relevant or mixed"""
+    queries, corpus, train_qrels = load_fiqa_data()
+    print("\n" + "="*80)
+    print("RELEVANCE QUALITY ANALYSIS")
+    print("="*80)
+    # Find queries with multiple relevant docs to see if they're all equally relevant
+    multi_doc_queries = [(q, docs) for q, docs in train_qrels.items() if len(docs) > 2]
+    if multi_doc_queries:
+        query_id, doc_ids = multi_doc_queries[0]
+        if query_id in queries:
+            print(f"\nQuery with {len(doc_ids)} 'relevant' docs:")
+            print(f"QUERY: {queries[query_id]}")
+            print("-"*80)
+            # Show all relevant docs to see if they're equally relevant
+            for i, doc_id in enumerate(doc_ids[:4], 1):
+                if doc_id in corpus:
+                    doc = corpus[doc_id]
+                    print(f"\nRelevant Doc {i}:")
+                    print(f"Text snippet: {doc['text'][:200]}...")
+            print("\n[QUESTION]: Do all these docs seem EQUALLY relevant to the query?")
+            print("If some are more relevant than others, then Score 1 is mixing different relevance levels!")
+# Script entry point: print the FiQA examples first, then the relevance-quality check.
+if __name__ == "__main__":
+    examine_fiqa_examples()
+    analyze_relevance_quality()

test_scores/test_nfcorpus_train_relevance.py ADDED Viewed

	@@ -0,0 +1,176 @@

+"""
+Test NFCorpus train score 1 documents to see if they're highly relevant or mixed
+"""
+import json
+from pathlib import Path
+from collections import defaultdict
+import random
+def load_nfcorpus_data():
+    """Load NFCorpus queries, corpus, and qrels"""
+    base_path = Path("../beir_data/nfcorpus")
+    # Load queries
+    queries = {}
+    with open(base_path / "queries.jsonl", 'r') as f:
+        for line in f:
+            q = json.loads(line)
+            queries[q['_id']] = q['text']
+    # Load corpus
+    corpus = {}
+    with open(base_path / "corpus.jsonl", 'r') as f:
+        for i, line in enumerate(f):
+            if i > 5000:  # Limit for speed
+                break
+            doc = json.loads(line)
+            corpus[doc['_id']] = {
+                'title': doc.get('title', ''),
+                'text': doc.get('text', '')
+            }
+    # Load train qrels (all score 1)
+    train_qrels = defaultdict(list)
+    with open(base_path / "qrels/train.tsv", 'r') as f:
+        next(f)  # Skip header
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 3:
+                query_id, doc_id, score = parts
+                train_qrels[query_id].append(doc_id)
+    # Also load test qrels to compare
+    test_qrels = defaultdict(dict)
+    with open(base_path / "qrels/test.tsv", 'r') as f:
+        next(f)  # Skip header
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 3:
+                query_id, doc_id, score = parts
+                test_qrels[query_id][doc_id] = int(score)
+    return queries, corpus, train_qrels, test_qrels
+def _print_doc_preview(doc, title_chars=100, text_chars=250):
+    """Print one corpus document as a truncated title (or a placeholder) plus truncated text."""
+    print(f"Title: {doc['title'][:title_chars]}" if doc['title'] else "[No title]")
+    print(f"Text: {doc['text'][:text_chars]}...")
+def examine_train_examples():
+    """Print sample NFCorpus train judgments next to one graded test example.
+    Loads the data through load_nfcorpus_data(), prints size statistics for the
+    train qrels, shows up to three judged documents for each of up to five
+    randomly chosen train queries, and then prints one score-2 and one score-1
+    document for the first test query that carries both grades.
+    Returns:
+        None. All output goes to stdout.
+    """
+    queries, corpus, train_qrels, test_qrels = load_nfcorpus_data()
+    print("="*80)
+    print("NFCorpus TRAIN Data Analysis")
+    print("="*80)
+    print(f"Total queries in train: {len(train_qrels)}")
+    print(f"Total corpus loaded: {len(corpus)}")
+    # Get distribution of docs per query
+    docs_per_query = [len(docs) for docs in train_qrels.values()]
+    if docs_per_query:
+        print(f"Docs per query: min={min(docs_per_query)}, max={max(docs_per_query)}, avg={sum(docs_per_query)/len(docs_per_query):.1f}")
+    else:
+        # min()/max() and the average are undefined when no train qrels were loaded.
+        print("Docs per query: n/a (no train qrels loaded)")
+    print("\n" + "="*80)
+    print("EXAMINING TRAIN EXAMPLES (all Score 1)")
+    print("="*80)
+    # Take 5 random queries from train
+    sample_queries = random.sample(list(train_qrels.keys()), min(5, len(train_qrels)))
+    for i, query_id in enumerate(sample_queries, 1):
+        # A judged query may be absent from the loaded query texts; skip it.
+        if query_id not in queries:
+            continue
+        print(f"\n[TRAIN EXAMPLE {i}]")
+        print(f"QUERY: {queries[query_id]}")
+        print("-"*80)
+        docs = train_qrels[query_id]
+        print(f"Number of Score 1 docs in TRAIN: {len(docs)}")
+        # Show first 3 docs to see variation in relevance
+        for j, doc_id in enumerate(docs[:3], 1):
+            if doc_id in corpus:
+                print(f"\nTRAIN Doc {j} (Score 1):")
+                _print_doc_preview(corpus[doc_id])
+    print("\n" + "="*80)
+    print("COMPARING WITH TEST SET (has Score 1 and 2)")
+    print("="*80)
+    # Find a test query with both score 1 and 2
+    for query_id in test_qrels:
+        if query_id not in queries:
+            continue
+        docs = test_qrels[query_id]
+        scores = set(docs.values())
+        if 1 in scores and 2 in scores:
+            score_1_docs = [d for d, s in docs.items() if s == 1]
+            score_2_docs = [d for d, s in docs.items() if s == 2]
+            print(f"\n[TEST EXAMPLE]")
+            print(f"QUERY: {queries[query_id]}")
+            print("-"*80)
+            # One document per grade is enough for a side-by-side look.
+            for heading, doc_ids in (
+                ("\nTEST Score 2 (Highly Relevant):", score_2_docs),
+                ("\nTEST Score 1 (Relevant):", score_1_docs),
+            ):
+                print(heading)
+                for doc_id in doc_ids[:1]:
+                    if doc_id in corpus:
+                        _print_doc_preview(corpus[doc_id])
+            break
+    print("\n" + "="*80)
+    print("QUESTION: Are TRAIN Score 1 docs more like TEST Score 1 or Score 2?")
+    print("="*80)
+def check_quality_variation():
+    """Print a handful of train documents for the most heavily judged query.
+    The query with the most judged train documents (and more than 10 of them)
+    is selected, and up to five of its documents are printed so a reader can
+    judge whether they look equally relevant. Nothing is printed beyond the
+    banner when no such query exists or its text was not loaded.
+    """
+    queries, corpus, train_qrels, test_qrels = load_nfcorpus_data()
+    print("\n" + "="*80)
+    print("CHECKING VARIATION IN TRAIN SCORE 1 DOCUMENTS")
+    print("="*80)
+    # Only queries with more than 10 judged docs are candidates.
+    candidates = [(qid, judged) for qid, judged in train_qrels.items() if len(judged) > 10]
+    if not candidates:
+        return
+    # max() yields the first of several equally long entries, the same element a
+    # stable descending sort by length would place first.
+    query_id, doc_ids = max(candidates, key=lambda pair: len(pair[1]))
+    if query_id not in queries:
+        return
+    print(f"\nQuery with {len(doc_ids)} Score 1 docs in TRAIN:")
+    print(f"QUERY: {queries[query_id]}")
+    print("-"*80)
+    # Show a sample of docs to see if they vary in relevance
+    print("\nShowing 5 random Score 1 docs from TRAIN for this query:")
+    picked = random.sample(doc_ids, min(5, len(doc_ids)))
+    for position, doc_id in enumerate(picked, 1):
+        if doc_id not in corpus:
+            continue
+        doc = corpus[doc_id]
+        print(f"\nDoc {position}:")
+        print(f"Title: {doc['title'][:80]}" if doc['title'] else "[No title]")
+        print(f"Text: {doc['text'][:200]}...")
+    print("\n[ANALYSIS]: Do these all seem EQUALLY relevant?")
+    print("If some are clearly more relevant than others, then TRAIN Score 1 is mixing relevance levels!")
+if __name__ == "__main__":
+    examine_train_examples()
+    check_quality_variation()

train_datasets_creation/create_training_data_ids_batched.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""Create training data - IDS ONLY with BATCHING (best of both worlds)."""
+import json
+import sys
+import random
+from pathlib import Path
+from collections import defaultdict
+import time
+def _split_docs_by_category(docs, score_map, both_scores_positive):
+    """Sort one query's judged doc IDs into (easy_pos, hard_pos, hard_neg, easy_neg) lists."""
+    easy_positive_ids, hard_positive_ids = [], []
+    hard_negative_ids, easy_negative_ids = [], []
+    for doc_id, score in docs.items():
+        # Scores that the config does not mention count as easy negatives.
+        category = score_map.get(str(score), 'easy_negative')
+        if category == 'positive':
+            if not both_scores_positive:
+                # Only one score is positive, treat all as easy positives
+                easy_positive_ids.append(doc_id)
+            elif score == 2:
+                easy_positive_ids.append(doc_id)
+            elif score == 1:
+                hard_positive_ids.append(doc_id)
+            # NOTE(review): when both 1 and 2 are positive, any other score mapped
+            # to 'positive' is dropped here - confirm that this is intended.
+        elif category == 'hard_negative':
+            hard_negative_ids.append(doc_id)
+        elif category == 'easy_negative':
+            easy_negative_ids.append(doc_id)
+    return easy_positive_ids, hard_positive_ids, hard_negative_ids, easy_negative_ids
+def _sample_negatives_from_batch(qid, batch_qids, qrels, judged_ids, need_more):
+    """Pick up to need_more distinct doc IDs judged for other batch queries but not for qid."""
+    other_batch_qids = [q for q in batch_qids if q != qid]
+    random.shuffle(other_batch_qids)
+    # Starts as the docs judged for this query and grows with every pick, so a doc
+    # shared by several other queries is sampled at most once.
+    excluded = set(judged_ids)
+    sampled = []
+    for other_qid in other_batch_qids:
+        for doc_id in qrels[other_qid]:
+            if len(sampled) >= need_more:
+                return sampled
+            if doc_id not in excluded:
+                excluded.add(doc_id)
+                sampled.append(doc_id)
+    return sampled
+def process_dataset(dataset_name, config, negatives_ratio=10, batch_size=5000):
+    """Process dataset in batches, storing only IDs.
+    Reads ../beir_data/<dataset_name>/qrels/merged.tsv and queries.jsonl, maps
+    each judged document to a category via the dataset's 'score_to_category'
+    config entry, tops up negatives with documents judged for other queries of
+    the same batch, and streams one JSON line per query to
+    datasets/<dataset_name>/training_ids.jsonl.
+    Args:
+        dataset_name: Directory name of the dataset under ../beir_data.
+        config: Dict holding config['datasets'][dataset_name]['score_to_category'].
+        negatives_ratio: Negatives wanted per positive document of a query.
+        batch_size: Kept for interface compatibility; the value passed in is
+            replaced below by a size derived from the number of queries.
+    Returns:
+        None. Returns early (after a warning) when merged.tsv is missing.
+    """
+    print(f"\n{'='*50}\nProcessing: {dataset_name}")
+    # Load qrels
+    qrels_path = Path(f'../beir_data/{dataset_name}/qrels/merged.tsv')
+    if not qrels_path.exists():
+        print(f"  ⚠️ No merged.tsv found")
+        return
+    print(f"  Loading qrels...")
+    qrels = defaultdict(dict)
+    with open(qrels_path, 'r', encoding='utf-8') as f:
+        next(f)  # Skip header
+        for line in f:
+            parts = line.strip().split('\t')
+            if len(parts) == 3:
+                qrels[parts[0]][parts[1]] = int(parts[2])
+    # Count scores for info
+    score_counts = defaultdict(int)
+    for docs in qrels.values():
+        for score in docs.values():
+            score_counts[score] += 1
+    print(f"  Loaded {len(qrels):,} queries, scores: {dict(score_counts)}")
+    # Load queries
+    print(f"  Loading queries...")
+    queries = {}
+    with open(f'../beir_data/{dataset_name}/queries.jsonl', 'r', encoding='utf-8') as f:
+        for line in f:
+            q = json.loads(line)
+            queries[q['_id']] = q['text']
+    # Get score mapping from config
+    score_map = config['datasets'][dataset_name]['score_to_category']
+    # When both 1 and 2 are positive, score 2 becomes "easy" and score 1 "hard".
+    both_scores_positive = score_map.get('2') == 'positive' and score_map.get('1') == 'positive'
+    all_qids = list(qrels.keys())
+    # Fixed seed so the shuffles used for negative sampling are repeatable.
+    random.seed(42)
+    # Create output file
+    output_dir = Path(f'datasets/{dataset_name}')
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_file = output_dir / 'training_ids.jsonl'
+    # Adjust batch size for dataset size (this overrides the argument)
+    if len(all_qids) > 50000:
+        batch_size = 5000
+    elif len(all_qids) > 10000:
+        batch_size = 2000
+    else:
+        batch_size = len(all_qids)  # Process small datasets in one batch
+    print(f"  Processing {len(all_qids):,} queries in batches of {batch_size:,}...")
+    # Stats
+    total_examples = 0
+    total_easy_pos = 0
+    total_hard_pos = 0
+    total_hard_neg = 0
+    total_easy_neg = 0
+    with open(output_file, 'w', encoding='utf-8') as out_f:
+        for batch_start in range(0, len(all_qids), batch_size):
+            batch_end = min(batch_start + batch_size, len(all_qids))
+            batch_qids = all_qids[batch_start:batch_end]
+            if len(all_qids) > 10000:  # Only show progress for large datasets
+                print(f"    Processing batch: queries {batch_start:,}-{batch_end:,}")
+            for qid in batch_qids:
+                if qid not in queries:
+                    continue
+                docs = qrels[qid]
+                # Categorize documents by score (IDs only!)
+                (easy_positive_ids, hard_positive_ids,
+                 hard_negative_ids, easy_negative_ids) = _split_docs_by_category(
+                    docs, score_map, both_scores_positive)
+                total_positives = len(easy_positive_ids) + len(hard_positive_ids)
+                if not total_positives:
+                    continue
+                total_negatives_have = len(hard_negative_ids) + len(easy_negative_ids)
+                total_negatives_needed = total_positives * negatives_ratio
+                # Only sample from other queries IN THIS BATCH if we need more
+                if total_negatives_have < total_negatives_needed:
+                    easy_negative_ids.extend(_sample_negatives_from_batch(
+                        qid, batch_qids, qrels, docs,
+                        total_negatives_needed - total_negatives_have))
+                # Hard negatives are always kept; easy negatives fill what is left of
+                # the budget. Clamp at 0: a negative slice bound would keep a prefix
+                # of the list instead of nothing.
+                easy_budget = max(0, total_negatives_needed - len(hard_negative_ids))
+                # Write example directly to file (streaming)
+                example = {
+                    'query_id': qid,
+                    'query_text': queries[qid],
+                    'source_dataset': dataset_name,
+                    'easy_positive_ids': easy_positive_ids,
+                    'hard_positive_ids': hard_positive_ids,
+                    'hard_negative_ids': hard_negative_ids,
+                    'easy_negative_ids': easy_negative_ids[:easy_budget]
+                }
+                out_f.write(json.dumps(example) + '\n')
+                # Update stats
+                total_examples += 1
+                total_easy_pos += len(example['easy_positive_ids'])
+                total_hard_pos += len(example['hard_positive_ids'])
+                total_hard_neg += len(example['hard_negative_ids'])
+                total_easy_neg += len(example['easy_negative_ids'])
+    # Print stats
+    print(f"  ✓ Created {total_examples:,} examples")
+    print(f"    Easy positives: {total_easy_pos:,}")
+    print(f"    Hard positives: {total_hard_pos:,}")
+    print(f"    Hard negatives: {total_hard_neg:,}")
+    print(f"    Easy negatives: {total_easy_neg:,}")
+def main():
+    """Create training_ids.jsonl for every enabled dataset, or only the one named in argv[1].
+    Reads ../test_scores/dataset_reports/training_config.json, keeps the
+    datasets whose 'use' entry is truthy, and runs process_dataset() on each
+    while printing per-dataset and total wall-clock time. An unknown or
+    disabled dataset name on the command line prints the available names and
+    returns without processing anything.
+    """
+    banner = "="*50
+    print(banner)
+    print("TRAINING DATA CREATION - IDS + BATCHING")
+    print(banner)
+    print("Best of both worlds: IDs only (small files) + Batching (fast)")
+    # Load config
+    with open('../test_scores/dataset_reports/training_config.json', 'r', encoding='utf-8') as f:
+        config = json.load(f)['beir_training_config']
+    # Get datasets to process
+    datasets = []
+    for name, cfg in config['datasets'].items():
+        if cfg['use']:
+            datasets.append(name)
+    # Check if specific dataset requested
+    requested = sys.argv[1] if len(sys.argv) > 1 else None
+    if requested is not None:
+        if requested not in datasets:
+            print(f"❌ Dataset '{requested}' not found or disabled")
+            print(f"Available: {', '.join(datasets)}")
+            return
+        datasets = [requested]
+        print(f"Processing only: {requested}")
+    dataset_count = len(datasets)
+    print(f"Will process {dataset_count} datasets")
+    total_start = time.time()
+    # Process each dataset
+    for idx, dataset_name in enumerate(datasets, 1):
+        print(f"\n[{idx}/{dataset_count}] {dataset_name}")
+        dataset_start = time.time()
+        process_dataset(dataset_name, config)
+        elapsed = time.time() - dataset_start
+        print(f"  Dataset completed in {elapsed:.2f} seconds")
+    total_time = time.time() - total_start
+    print(f"\n✅ Complete! Total time: {total_time:.2f} seconds")
+    print("\n📝 Output: datasets/<dataset>/training_ids.jsonl (IDs only)")
+    print("💾 File sizes: ~100x smaller than full text")
+    print("⚡ Speed: As fast as efficient version")
+if __name__ == "__main__":
+    main()

train_datasets_creation/make_dataset_for_train.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import json,pandas as pd,os
+# Flattens each dataset's training_ids.jsonl into (query, doc, label) rows joined
+# with the corpus title/text, and writes the result to full_train_dataset.csv.
+# ID columns present in every training_ids.jsonl record; the label stored in the
+# CSV is the column name without its "_ids" suffix.
+LABEL_COLUMNS = ['easy_positive_ids', 'hard_positive_ids', 'hard_negative_ids', 'easy_negative_ids']
+ID_SUFFIX = '_ids'
+# Upper bound on the number of queries taken from one dataset.
+MAX_QUERIES_PER_DATASET = 1000
+def read_jsonl(path):
+    """Parse a JSON-lines file into a list of dicts, closing the file afterwards."""
+    with open(path, encoding='utf-8') as f:
+        return [json.loads(line) for line in f if line.strip()]
+dfs = []
+for fold in os.listdir('./datasets'):
+    if fold == 'fiqa':
+        continue
+    print(fold)
+    corpus = pd.DataFrame(read_jsonl(f"../beir_data/{fold}/corpus.jsonl"))
+    ids_df = pd.DataFrame(read_jsonl(f"./datasets/{fold}/training_ids.jsonl"))
+    # len() counts queries. DataFrame.size is rows * columns, so it triggered the
+    # cap far too early, and sampling with replacement then duplicated queries.
+    if len(ids_df) > MAX_QUERIES_PER_DATASET:
+        ids_df = ids_df.sample(MAX_QUERIES_PER_DATASET)
+    rows = []
+    for _, row in ids_df.iterrows():
+        query_id = row['query_id']
+        query_text = row['query_text']
+        for label in LABEL_COLUMNS:
+            for did in row[label]:
+                # Slice the suffix off: str.rstrip('_ids') strips any trailing run of
+                # the characters "_", "i", "d", "s", not the literal suffix.
+                rows.append({"query_id":query_id,"query_text":query_text,'doc_id':did,'label':label[:-len(ID_SUFFIX)]})
+    # Built once after the loop; explicit columns keep the merge valid even when
+    # no rows were produced.
+    df = pd.DataFrame(rows, columns=['query_id', 'query_text', 'doc_id', 'label'])
+    # Inner join: doc IDs without a corpus entry are dropped.
+    df = pd.merge(df,corpus[['_id','title','text']],left_on='doc_id',right_on='_id')
+    df['dataset'] = fold
+    dfs.append(df)
+    # Rewritten after every dataset, so the file on disk always holds everything
+    # processed so far.
+    pd.concat(dfs).to_csv('full_train_dataset.csv')

train_datasets_creation/merge_all_splits.py ADDED Viewed

	@@ -0,0 +1,102 @@

+"""
+Merge all qrels splits (train/dev/test) into one merged.tsv file for each dataset.
+We don't care about splits - we want ALL the data!
+"""
+import os
+from pathlib import Path
+def merge_qrels_for_dataset(dataset_path):
+    """Merge all qrels files in a dataset into merged.tsv.
+    Every ``*.tsv`` file under ``<dataset_path>/qrels`` except ``merged.tsv``
+    itself is read. Duplicate (query_id, doc_id) pairs keep the highest score.
+    The result is written sorted by query ID, then doc ID.
+    Args:
+        dataset_path: pathlib.Path of one dataset directory.
+    Returns:
+        The number of unique (query_id, doc_id) entries written, or None when
+        there is no qrels directory or no split file to merge. In the latter
+        case an existing merged.tsv is left untouched.
+    Raises:
+        ValueError: If a score column cannot be parsed as an integer.
+    """
+    qrels_dir = dataset_path / 'qrels'
+    if not qrels_dir.exists():
+        print(f"  ⚠️  No qrels directory found")
+        return
+    # Sorted so the header source and the log order do not depend on the order in
+    # which the filesystem lists the files.
+    tsv_files = sorted(qrels_dir.glob('*.tsv'))
+    if not tsv_files:
+        print(f"  ⚠️  No TSV files found")
+        return
+    # A previous output is never an input. Bail out before opening merged.tsv for
+    # writing, otherwise it would be truncated with nothing to put into it.
+    split_files = [p for p in tsv_files if p.name != 'merged.tsv']
+    if not split_files:
+        print(f"  ⚠️  Only merged.tsv found, nothing to merge")
+        return
+    # Collect all unique entries (query_id, doc_id, score)
+    all_entries = {}  # (qid, doc_id) -> score
+    header = None
+    for tsv_file in split_files:
+        print(f"  Reading {tsv_file.name}...")
+        with open(tsv_file, 'r', encoding='utf-8') as f:
+            first_line = f.readline()
+            # Get header from the first non-empty file
+            if header is None and first_line:
+                header = first_line.strip()
+            # Remaining lines are data rows
+            for line in f:
+                if not line.strip():
+                    continue
+                parts = line.strip().split('\t')
+                if len(parts) >= 3:
+                    key = (parts[0], parts[1])
+                    score = int(parts[2])
+                    # Store or update if higher score
+                    if key not in all_entries or score > all_entries[key]:
+                        all_entries[key] = score
+    if header is None:
+        # Every split file was empty; fall back to the column names BEIR qrels use.
+        header = 'query-id\tcorpus-id\tscore'
+    # Write merged file
+    merged_file = qrels_dir / 'merged.tsv'
+    with open(merged_file, 'w', encoding='utf-8') as f:
+        f.write(header + '\n')
+        # Write all entries sorted by query_id then doc_id
+        for (qid, doc_id), score in sorted(all_entries.items()):
+            f.write(f"{qid}\t{doc_id}\t{score}\n")
+    print(f"  ✓ Merged {len(all_entries)} unique entries into merged.tsv")
+    print(f"    From splits: {', '.join(p.stem for p in split_files)}")
+    return len(all_entries)
+def main():
+    """Run the qrels merge for each dataset directory under ../beir_data and print the grand total."""
+    beir_data_dir = Path('../beir_data')
+    if not beir_data_dir.exists():
+        print(f"Error: {beir_data_dir} not found!")
+        return
+    # Dataset directories in sorted order; plain files are ignored.
+    dataset_dirs = sorted(entry for entry in beir_data_dir.iterdir() if entry.is_dir())
+    separator = "="*60
+    print(f"Found {len(dataset_dirs)} datasets in beir_data")
+    print(separator)
+    total_entries = 0
+    for dataset_dir in dataset_dirs:
+        print(f"\nProcessing {dataset_dir.name}...")
+        merged_count = merge_qrels_for_dataset(dataset_dir)
+        # The merge returns None when it had nothing to do; count that as zero.
+        total_entries += merged_count or 0
+    print("\n" + separator)
+    print(f"DONE! Merged {total_entries} total qrel entries across all datasets")
+    print("All datasets now have a 'merged.tsv' file combining all splits")
+if __name__ == "__main__":
+    main()

train_dpo_hf.py DELETED Viewed

@@ -1,268 +0,0 @@
-"""
-DPO Training Script for Phi-3 Mini on Hugging Face
-Optimized for A10G GPU with checkpoint saving and custom validation
-"""
-import os
-import json
-import torch
-import pandas as pd
-from transformers import (
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    TrainingArguments,
-    TrainerCallback,
-    TrainerState,
-    TrainerControl
-)
-from trl import DPOTrainer, DPOConfig
-from datasets import Dataset, load_dataset
-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-from typing import Dict, List
-import logging
-from datetime import datetime
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Configuration
-MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
-HF_USERNAME = os.environ.get("HF_USERNAME", "your-username")  # Set via environment
-HF_TOKEN = os.environ.get("HF_TOKEN", None)  # Set via environment
-PROJECT_NAME = "phi3-dpo-beir"
-OUTPUT_DIR = f"./{PROJECT_NAME}-checkpoints"
-class ValidationCallback(TrainerCallback):
-    """Trainer callback that spot-checks generations on a few validation samples.
-    Every ``eval_freq`` steps it generates a response for up to five shuffled
-    validation prompts and logs the share of responses that contain the
-    sample's ``chosen`` text.
-    """
-    def __init__(self, tokenizer, val_dataset, eval_freq=500):
-        self.tokenizer = tokenizer
-        self.val_dataset = val_dataset
-        self.eval_freq = eval_freq
-    def format_prompt_for_inference(self, query, document):
-        """Format for inference matching evaluate.py style"""
-        prompt = f"""You are an AI content analyst.
-Task:
-1. Given the following content and a user query, decide if the content is relevant.
-2. If it is relevant:
-   - Extract the top 2-3 key sentences
-   - Suggest 3-5 relevant tags
-   - Provide a short explanation or content extension (~2-3 sentences)
-Format your response in JSON with:
-{{
-  "relevant": true or false,
-  "key_sentences": [...],
-  "tags": [...],
-  "expansion": "..."
-}}
-User Query:
-{query}
-Content:
-{document}
-Response:"""
-        return prompt
-    @staticmethod
-    def _split_prompt(prompt_text):
-        """Return (query, document) from a training prompt, or None if a marker line is missing."""
-        lines = prompt_text.split("\n")
-        try:
-            query_idx = lines.index("Query:") + 1
-            doc_idx = lines.index("Document:") + 1
-        except ValueError:
-            return None
-        if query_idx >= len(lines):
-            return None
-        query = lines[query_idx].strip()
-        document = "\n".join(lines[doc_idx:]).strip()
-        return query, document
-    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
-        """Run validation every N steps"""
-        if state.global_step <= 0 or state.global_step % self.eval_freq != 0:
-            return control
-        logger.info(f"\n🔍 Running custom validation at step {state.global_step}")
-        sample_size = min(5, len(self.val_dataset))
-        if sample_size == 0:
-            # Nothing to score; avoids dividing by zero below.
-            logger.info("Validation set is empty, skipping custom validation")
-            return control
-        model = kwargs["model"]
-        # Remember the mode so training resumes with dropout etc. as before.
-        was_training = model.training
-        model.eval()
-        try:
-            # Sample a few validation examples
-            samples = self.val_dataset.shuffle().select(range(sample_size))
-            correct = 0
-            scored = 0
-            for sample in samples:
-                # Extract query and document from prompt
-                parsed = self._split_prompt(sample["prompt"])
-                if parsed is None:
-                    # A prompt without the marker lines must not abort training.
-                    logger.warning("Validation prompt lacks 'Query:'/'Document:' lines, skipping sample")
-                    continue
-                query, document = parsed
-                # Generate response
-                inference_prompt = self.format_prompt_for_inference(query, document)
-                inputs = self.tokenizer(inference_prompt, return_tensors="pt", truncation=True)
-                inputs = {k: v.to(model.device) for k, v in inputs.items()}
-                with torch.no_grad():
-                    outputs = model.generate(
-                        **inputs,
-                        max_new_tokens=256,
-                        temperature=0.1,
-                        do_sample=True,
-                        pad_token_id=self.tokenizer.eos_token_id
-                    )
-                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-                # Drop the echoed prompt so only the continuation is checked.
-                response = response[len(inference_prompt):].strip()
-                # Simple accuracy check
-                expected = sample["chosen"].lower()
-                if expected in response.lower():
-                    correct += 1
-                scored += 1
-                logger.info(f"Expected: {expected}, Got: {response[:100]}...")
-            if scored:
-                accuracy = correct / scored
-                logger.info(f"✅ Validation accuracy: {accuracy:.2%}")
-        finally:
-            model.train(was_training)
-        return control
-def prepare_datasets():
-    """Read train.csv (required) and val.csv / test.csv (optional) into HF datasets.
-    Returns:
-        (train_dataset, val_dataset, test_dataset); the last two are None when
-        the corresponding CSV is absent from the working directory.
-    """
-    logger.info("📊 Loading datasets...")
-    # Load CSVs; val/test stay None when their file does not exist
-    frames = {"train": pd.read_csv("train.csv")}
-    for split in ("val", "test"):
-        csv_name = f"{split}.csv"
-        frames[split] = pd.read_csv(csv_name) if os.path.exists(csv_name) else None
-    # Convert to HF datasets
-    converted = {
-        split: (Dataset.from_pandas(frame) if frame is not None else None)
-        for split, frame in frames.items()
-    }
-    train_dataset = converted["train"]
-    val_dataset = converted["val"]
-    test_dataset = converted["test"]
-    logger.info(f"✅ Loaded {len(train_dataset)} training examples")
-    if val_dataset:
-        logger.info(f"✅ Loaded {len(val_dataset)} validation examples")
-    return train_dataset, val_dataset, test_dataset
-def get_model_and_tokenizer():
-    """Load model and tokenizer with 4-bit quantization for A10G
-    Returns:
-        (model, tokenizer): MODEL_ID loaded with ``load_in_4bit=True`` and passed
-        through ``prepare_model_for_kbit_training``, plus its tokenizer with the
-        EOS token reused as pad token and left-side padding.
-    """
-    logger.info(f"🤖 Loading model: {MODEL_ID}")
-    # Tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    # Reuse EOS as the padding token.
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"  # Important for DPO
-    # Model with 4-bit quantization
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
-        load_in_4bit=True,
-        torch_dtype=torch.float16,
-        device_map="auto",
-        trust_remote_code=True
-    )
-    # Prepare for k-bit training
-    model = prepare_model_for_kbit_training(model)
-    return model, tokenizer
-def get_peft_config():
-    """Build the LoRA configuration (rank 16, alpha 32, dropout 0.1) for a causal LM."""
-    # Names of the sub-modules that receive LoRA adapters.
-    # NOTE(review): confirm these names exist in the MODEL_ID architecture.
-    target_modules = ["q_proj", "v_proj", "k_proj", "o_proj"]
-    target_modules += ["gate_proj", "up_proj", "down_proj"]
-    lora_settings = {
-        "r": 16,
-        "lora_alpha": 32,
-        "target_modules": target_modules,
-        "lora_dropout": 0.1,
-        "bias": "none",
-        "task_type": "CAUSAL_LM",
-    }
-    return LoraConfig(**lora_settings)
-def main():
-    """Run the full DPO fine-tuning pipeline.
-    Loads the CSV datasets, the 4-bit model and tokenizer and the LoRA config,
-    builds a DPOTrainer, trains, saves the final model under OUTPUT_DIR/final
-    and, when HF_TOKEN is set, pushes it to the Hub.
-    """
-    # Load datasets
-    train_dataset, val_dataset, test_dataset = prepare_datasets()
-    # Load model and tokenizer
-    model, tokenizer = get_model_and_tokenizer()
-    # LoRA config
-    peft_config = get_peft_config()
-    # Training arguments optimized for A10G
-    # NOTE(review): load_best_model_at_end is enabled without an explicit
-    # evaluation strategy, and save_steps (100) differs from eval_steps (500) -
-    # confirm the installed transformers version accepts this combination.
-    training_args = DPOConfig(
-        output_dir=OUTPUT_DIR,
-        num_train_epochs=3,
-        per_device_train_batch_size=2,  # A10G can handle this
-        per_device_eval_batch_size=2,
-        gradient_accumulation_steps=4,  # Effective batch size = 8
-        gradient_checkpointing=True,
-        learning_rate=5e-5,
-        lr_scheduler_type="cosine",
-        warmup_ratio=0.1,
-        logging_steps=10,
-        save_steps=100,  # Save every 100 steps
-        eval_steps=500,
-        save_total_limit=5,  # Keep last 5 checkpoints
-        load_best_model_at_end=True,
-        metric_for_best_model="loss",
-        greater_is_better=False,
-        # DPO specific
-        beta=0.1,  # DPO regularization
-        max_prompt_length=512,
-        max_length=768,
-        # Optimization
-        optim="paged_adamw_8bit",
-        fp16=True,
-        # Logging
-        # Weights & Biases is used only when WANDB_API_KEY is set.
-        report_to="wandb" if os.environ.get("WANDB_API_KEY") else "none",
-        run_name=f"{PROJECT_NAME}-{datetime.now().strftime('%Y%m%d-%H%M')}",
-        # Hub integration
-        # Pushing is enabled only when HF_TOKEN is set.
-        push_to_hub=True if HF_TOKEN else False,
-        hub_model_id=f"{HF_USERNAME}/{PROJECT_NAME}" if HF_TOKEN else None,
-        hub_strategy="checkpoint",  # Push every checkpoint
-        hub_token=HF_TOKEN,
-    )
-    # Initialize trainer
-    dpo_trainer = DPOTrainer(
-        model=model,
-        ref_model=None,  # Will create a reference model copy
-        args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=val_dataset,
-        processing_class=tokenizer,  # Changed from tokenizer to processing_class
-        peft_config=peft_config,
-    )
-    # Add custom validation callback
-    # Skipped when no validation CSV was found (val_dataset is None).
-    if val_dataset:
-        val_callback = ValidationCallback(tokenizer, val_dataset)
-        dpo_trainer.add_callback(val_callback)
-    # Start training
-    logger.info("🚀 Starting DPO training...")
-    logger.info(f"💾 Checkpoints will be saved to: {OUTPUT_DIR}")
-    if HF_TOKEN:
-        logger.info(f"🤗 Model will be pushed to: https://huggingface.co/{HF_USERNAME}/{PROJECT_NAME}")
-    dpo_trainer.train()
-    # Save final model
-    logger.info("💾 Saving final model...")
-    dpo_trainer.save_model(f"{OUTPUT_DIR}/final")
-    # Push to hub
-    if HF_TOKEN:
-        logger.info("🤗 Pushing final model to Hub...")
-        dpo_trainer.push_to_hub()
-    logger.info("✅ Training complete!")
-if __name__ == "__main__":
-    main()

train_dpo_hf_fixed.py DELETED Viewed

@@ -1,366 +0,0 @@
-"""
-DPO Training Script for Phi-3 Mini - Fixed version
-Handles dependency issues and provides cleaner error handling
-"""
-import os
-import sys
-import json
-import warnings
-warnings.filterwarnings("ignore")
-# Set environment variables to avoid TensorFlow issues
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
-try:
-    import torch
-    import pandas as pd
-    from transformers import (
-        AutoTokenizer,
-        AutoModelForCausalLM,
-        TrainingArguments,
-        TrainerCallback,
-        TrainerState,
-        TrainerControl
-    )
-    from trl import DPOTrainer
-    from trl.trainer.dpo_config import DPOConfig
-    from datasets import Dataset
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from datetime import datetime
-    import logging
-except ImportError as e:
-    print(f"Missing dependency: {e}")
-    print("\nPlease install required packages:")
-    print("pip install torch transformers trl peft datasets accelerate bitsandbytes pandas")
-    print("\nIf you get Keras errors, also run:")
-    print("pip install tf-keras")
-    sys.exit(1)
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Configuration
-MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
-HF_USERNAME = os.environ.get("HF_USERNAME", "your-username")
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
-PROJECT_NAME = "phi3-dpo-beir"
-OUTPUT_DIR = f"./{PROJECT_NAME}-checkpoints"
-class ValidationCallback(TrainerCallback):
-    """Trainer callback that spot-checks generations on a few validation samples.
-    Every ``eval_freq`` steps it generates a response for up to five validation
-    prompts (shuffled with a fixed seed) and logs the share of evaluated
-    responses that contain the sample's ``chosen`` text.
-    """
-    def __init__(self, tokenizer, val_dataset, eval_freq=500):
-        self.tokenizer = tokenizer
-        self.val_dataset = val_dataset
-        self.eval_freq = eval_freq
-    def format_prompt_for_inference(self, query, document):
-        """Format for inference matching evaluate.py style"""
-        prompt = f"""You are an AI content analyst.
-Task:
-1. Given the following content and a user query, decide if the content is relevant.
-2. If it is relevant:
-   - Extract the top 2-3 key sentences
-   - Suggest 3-5 relevant tags
-   - Provide a short explanation or content extension (~2-3 sentences)
-Format your response in JSON with:
-{{
-  "relevant": true or false,
-  "key_sentences": [...],
-  "tags": [...],
-  "expansion": "..."
-}}
-User Query:
-{query}
-Content:
-{document}
-Response:"""
-        return prompt
-    @staticmethod
-    def _split_prompt(prompt_text):
-        """Return (query, document) from a training prompt, or None if a marker line is missing."""
-        lines = prompt_text.split("\n")
-        # Find query and document sections (the last matching marker wins)
-        query_idx = -1
-        doc_idx = -1
-        for i, line in enumerate(lines):
-            if line.strip() == "Query:":
-                query_idx = i + 1
-            elif line.strip() == "Document:":
-                doc_idx = i + 1
-        if query_idx == -1 or doc_idx == -1:
-            return None
-        query = lines[query_idx].strip()
-        document = "\n".join(lines[doc_idx:]).strip()
-        return query, document
-    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
-        """Run validation every N steps"""
-        if state.global_step <= 0 or state.global_step % self.eval_freq != 0:
-            return control
-        logger.info(f"\n🔍 Running custom validation at step {state.global_step}")
-        model = kwargs["model"]
-        # Remember the mode so training resumes with dropout etc. as before.
-        was_training = model.training
-        model.eval()
-        try:
-            # Sample validation examples
-            sample_size = min(5, len(self.val_dataset))
-            samples = self.val_dataset.shuffle(seed=42).select(range(sample_size))
-            correct = 0
-            evaluated = 0
-            for sample in samples:
-                # Best effort: one bad sample is logged and skipped, not fatal.
-                try:
-                    parsed = self._split_prompt(sample["prompt"])
-                    if parsed is None:
-                        continue
-                    query, document = parsed
-                    # Generate response
-                    inference_prompt = self.format_prompt_for_inference(query, document)
-                    inputs = self.tokenizer(
-                        inference_prompt,
-                        return_tensors="pt",
-                        truncation=True,
-                        max_length=512
-                    )
-                    inputs = {k: v.to(model.device) for k, v in inputs.items()}
-                    with torch.no_grad():
-                        outputs = model.generate(
-                            **inputs,
-                            max_new_tokens=256,
-                            temperature=0.1,
-                            do_sample=True,
-                            pad_token_id=self.tokenizer.eos_token_id
-                        )
-                    response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-                    # Drop the echoed prompt so only the continuation is checked.
-                    response = response[len(inference_prompt):].strip()
-                    # Simple accuracy check
-                    expected = sample["chosen"].lower()
-                    if expected in response.lower():
-                        correct += 1
-                    evaluated += 1
-                    logger.info(f"Expected: {expected}, Got: {response[:100]}...")
-                except Exception as e:
-                    logger.error(f"Validation error: {e}")
-                    continue
-            # Divide by the samples actually scored; skipped or failed samples
-            # would otherwise be counted as wrong answers.
-            if evaluated > 0:
-                accuracy = correct / evaluated
-                logger.info(f"✅ Validation accuracy: {accuracy:.2%}")
-        finally:
-            model.train(was_training)
-        return control
-def prepare_datasets():
-    """Read train.csv (required) and val.csv / test.csv (optional) into HF datasets.
-    Returns:
-        (train_dataset, val_dataset, test_dataset). All three are None when
-        train.csv is missing; val/test are None when their CSV is absent.
-    """
-    logger.info("📊 Loading datasets...")
-    # Check if data files exist
-    train_csv_present = os.path.exists("train.csv")
-    if not train_csv_present:
-        logger.error("train.csv not found!")
-        logger.info("Please run download_beir_datasets.py first or use generate_sample_data.py")
-        return None, None, None
-    # Load CSVs; val/test stay None when their file does not exist
-    frames = {"train": pd.read_csv("train.csv")}
-    for split in ("val", "test"):
-        csv_name = f"{split}.csv"
-        frames[split] = pd.read_csv(csv_name) if os.path.exists(csv_name) else None
-    # Convert to HF datasets
-    converted = {
-        split: (Dataset.from_pandas(frame) if frame is not None else None)
-        for split, frame in frames.items()
-    }
-    train_dataset = converted["train"]
-    val_dataset = converted["val"]
-    test_dataset = converted["test"]
-    logger.info(f"✅ Loaded {len(train_dataset)} training examples")
-    if val_dataset:
-        logger.info(f"✅ Loaded {len(val_dataset)} validation examples")
-    return train_dataset, val_dataset, test_dataset
-def get_model_and_tokenizer():
-    """Load model and tokenizer with 4-bit quantization for A10G
-    Three loading paths are tried: float32 on CPU when CUDA is unavailable,
-    4-bit on GPU otherwise, and plain float16 on GPU when the 4-bit load raises.
-    Only the 4-bit path goes through ``prepare_model_for_kbit_training``.
-    Returns:
-        (model, tokenizer) with EOS reused as pad token and left-side padding.
-    """
-    logger.info(f"🤖 Loading model: {MODEL_ID}")
-    # Tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-    # Reuse EOS as the padding token.
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"  # Important for DPO
-    # Check if CUDA is available
-    if not torch.cuda.is_available():
-        logger.warning("⚠️ CUDA not available. Loading model in CPU mode (will be slow!)")
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            torch_dtype=torch.float32,
-            device_map="cpu",
-            trust_remote_code=True
-        )
-    else:
-        # Model with 4-bit quantization
-        try:
-            # NOTE(review): the bnb_4bit_* options are passed straight to
-            # from_pretrained - confirm the installed transformers version
-            # still accepts them outside a BitsAndBytesConfig.
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_ID,
-                load_in_4bit=True,
-                torch_dtype=torch.float16,
-                device_map="auto",
-                trust_remote_code=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_quant_type="nf4"
-            )
-            model = prepare_model_for_kbit_training(model)
-        except Exception as e:
-            logger.error(f"Failed to load model in 4-bit: {e}")
-            # The fallback below loads float16 weights without quantization.
-            logger.info("Falling back to full precision...")
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_ID,
-                torch_dtype=torch.float16,
-                device_map="auto",
-                trust_remote_code=True
-            )
-    return model, tokenizer
-def get_peft_config():
-    """Build the LoRA adapter configuration for causal-LM fine-tuning.
-
-    Returns:
-        A ``LoraConfig`` with rank 16, alpha 32, dropout 0.1 and no bias
-        training, applied to the seven projection modules listed below.
-    """
-    # Modules are selected by name.
-    lora_targets = ["q_proj", "v_proj", "k_proj", "o_proj"]
-    lora_targets += ["gate_proj", "up_proj", "down_proj"]
-    lora_settings = {
-        "r": 16,
-        "lora_alpha": 32,
-        "target_modules": lora_targets,
-        "lora_dropout": 0.1,
-        "bias": "none",
-        "task_type": "CAUSAL_LM",
-    }
-    return LoraConfig(**lora_settings)
-def main():
-    """Run the DPO fine-tuning pipeline end to end.
-
-    Loads the CSV datasets, the base model and tokenizer, builds the LoRA and
-    DPO configurations, trains, saves the result under ``OUTPUT_DIR/final``
-    and, when ``HF_TOKEN`` is set, pushes it to the Hub. Missing training
-    data, model-loading errors, trainer-construction errors and training
-    errors are logged and end the run early. A keyboard interrupt during
-    training is logged and the partially trained model is still saved.
-    """
-    logger.info("="*60)
-    logger.info("🚀 Starting DPO Training for Phi-3 Mini")
-    logger.info("="*60)
-    # Load datasets
-    train_dataset, val_dataset, test_dataset = prepare_datasets()
-    if train_dataset is None:
-        return
-    # Evaluation-dependent options are enabled only when there is data to
-    # evaluate on; val.csv is optional.
-    has_eval = val_dataset is not None and len(val_dataset) > 0
-    # Load model and tokenizer
-    try:
-        model, tokenizer = get_model_and_tokenizer()
-    except Exception as e:
-        logger.error(f"Failed to load model: {e}")
-        return
-    # LoRA config
-    peft_config = get_peft_config()
-    # Training arguments optimized for A10G
-    training_args = DPOConfig(
-        output_dir=OUTPUT_DIR,
-        num_train_epochs=3,
-        per_device_train_batch_size=2,  # A10G can handle this
-        per_device_eval_batch_size=2,
-        gradient_accumulation_steps=4,  # Effective batch size = 8
-        gradient_checkpointing=True,
-        learning_rate=5e-5,
-        lr_scheduler_type="cosine",
-        warmup_ratio=0.1,
-        logging_steps=10,
-        save_steps=100,  # Save every 100 steps
-        # load_best_model_at_end requires the eval strategy to match the save
-        # strategy ("steps") and save_steps to be a multiple of eval_steps,
-        # so evaluation runs on the same 100-step cadence as saving.
-        eval_strategy="steps" if has_eval else "no",
-        eval_steps=100 if has_eval else None,
-        save_total_limit=5,  # Keep last 5 checkpoints
-        load_best_model_at_end=has_eval,
-        metric_for_best_model="loss",
-        greater_is_better=False,
-        # DPO specific
-        beta=0.1,  # DPO regularization
-        max_prompt_length=512,
-        max_length=768,
-        # Optimization
-        optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",
-        fp16=torch.cuda.is_available(),
-        # Logging
-        report_to="none",  # Disable wandb for simplicity
-        run_name=f"{PROJECT_NAME}-{datetime.now().strftime('%Y%m%d-%H%M')}",
-        # Hub integration
-        push_to_hub=bool(HF_TOKEN),
-        hub_model_id=f"{HF_USERNAME}/{PROJECT_NAME}" if HF_TOKEN else None,
-        # "checkpoint" pushes on every save and also keeps the latest
-        # checkpoint in a "last-checkpoint" subfolder for resuming.
-        hub_strategy="checkpoint",
-        hub_token=HF_TOKEN,
-    )
-    # Initialize trainer
-    try:
-        dpo_trainer = DPOTrainer(
-            model=model,
-            # No separate reference model: with a peft_config the trainer is
-            # documented to use the base model with the adapter disabled.
-            ref_model=None,
-            args=training_args,
-            train_dataset=train_dataset,
-            eval_dataset=val_dataset if has_eval else None,
-            processing_class=tokenizer,  # Changed from tokenizer to processing_class
-            peft_config=peft_config,
-        )
-    except Exception as e:
-        logger.error(f"Failed to initialize trainer: {e}")
-        return
-    # Add custom validation callback
-    if has_eval:
-        val_callback = ValidationCallback(tokenizer, val_dataset)
-        dpo_trainer.add_callback(val_callback)
-    # Start training
-    logger.info("🚀 Starting DPO training...")
-    logger.info(f"💾 Checkpoints will be saved to: {OUTPUT_DIR}")
-    if HF_TOKEN:
-        logger.info(f"🤗 Model will be pushed to: https://huggingface.co/{HF_USERNAME}/{PROJECT_NAME}")
-    # Print some info about the data
-    logger.info("\n📊 Data Statistics:")
-    logger.info(f"Training samples: {len(train_dataset)}")
-    if has_eval:
-        logger.info(f"Validation samples: {len(val_dataset)}")
-    # Show a sample; guarded so an empty train.csv does not raise IndexError
-    if len(train_dataset) > 0:
-        logger.info("\n📝 Sample training data:")
-        sample = train_dataset[0]
-        logger.info(f"Prompt (first 200 chars): {sample['prompt'][:200]}...")
-        logger.info(f"Chosen: {sample['chosen']}")
-        logger.info(f"Rejected: {sample['rejected']}")
-    try:
-        dpo_trainer.train()
-    except KeyboardInterrupt:
-        # Fall through on purpose so the partially trained model is saved.
-        logger.info("\n⚠️ Training interrupted by user")
-    except Exception as e:
-        logger.error(f"\n❌ Training failed: {e}")
-        return
-    # Save final model
-    logger.info("💾 Saving final model...")
-    dpo_trainer.save_model(f"{OUTPUT_DIR}/final")
-    # Push to hub
-    if HF_TOKEN:
-        logger.info("🤗 Pushing final model to Hub...")
-        try:
-            dpo_trainer.push_to_hub()
-        except Exception as e:
-            # Best-effort: the model is already saved locally.
-            logger.error(f"Failed to push to hub: {e}")
-    logger.info("✅ Training complete!")
-    logger.info(f"📁 Model saved to: {OUTPUT_DIR}/final")
-if __name__ == "__main__":
-    main()