codewithdark committed on
Commit 183d0e9 · verified · 1 Parent(s): a3e502c

Delete Gemma-Finetune

Gemma-Finetune/.gitignore DELETED
@@ -1,192 +0,0 @@
- # Python
- __pycache__/
- *.py[cod]
- *$py.class
- *.so
- .Python
- build/
- develop-eggs/
- dist/
- downloads/
- eggs/
- .eggs/
- lib/
- lib64/
- parts/
- sdist/
- var/
- wheels/
- *.egg-info/
- .installed.cfg
- *.egg
-
- # Virtual Environment
- venv/
- env/
- ENV/
-
- # Model files and datasets
- models/
- sample_datasets/
- *.pt
- *.pth
- *.bin
- *.gguf
- *.onnx
-
- # IDE
- .idea/
- .vscode/
- *.swp
- *.swo
-
- # Logs and databases
- *.log
- *.sqlite
- wandb/
-
- # OS generated files
- .DS_Store
- .DS_Store?
- ._*
- .Spotlight-V100
- .Trashes
- ehthumbs.db
- Thumbs.db
-
- # Installer logs
- pip-log.txt
- pip-delete-this-directory.txt
-
- # Unit test / coverage reports
- htmlcov/
- .tox/
- .nox/
- .coverage
- .coverage.*
- .cache
- nosetests.xml
- coverage.xml
- *.cover
- *.py,cover
- .hypothesis/
- .pytest_cache/
- cover/
-
- # Translations
- *.mo
- *.pot
-
- # Django stuff:
- local_settings.py
- db.sqlite3
- db.sqlite3-journal
-
- # Flask stuff:
- instance/
- .webassets-cache
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- .pybuilder/
- target/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # IPython
- profile_default/
- ipython_config.py
-
- # pyenv
- # For a library or package, you might want to ignore these files since the code is
- # intended to run in multiple environments; otherwise, check them in:
- # .python-version
-
- # pipenv
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
- # install all needed dependencies.
- #Pipfile.lock
-
- # UV
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
- # This is especially recommended for binary packages to ensure reproducibility, and is more
- # commonly ignored for libraries.
- #uv.lock
-
- # poetry
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
- # This is especially recommended for binary packages to ensure reproducibility, and is more
- # commonly ignored for libraries.
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
- #poetry.lock
-
- # pdm
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
- #pdm.lock
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
- # in version control.
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
- .pdm.toml
- .pdm-python
- .pdm-build/
-
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
- __pypackages__/
-
- # Celery stuff
- celerybeat-schedule
- celerybeat.pid
-
- # SageMath parsed files
- *.sage.py
-
- # Environments
- .env
- .venv
- env.bak/
- venv.bak/
-
- # Spyder project settings
- .spyderproject
- .spyproject
-
- # Rope project settings
- .ropeproject
-
- # mkdocs documentation
- /site
-
- # mypy
- .mypy_cache/
- .dmypy.json
- dmypy.json
-
- # Pyre type checker
- .pyre/
-
- # pytype static type analyzer
- .pytype/
-
- # Cython debug symbols
- cython_debug/
-
- # PyCharm
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
- # and can be added to the global gitignore or merged into this file. For a more nuclear
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
- #.idea/
-
- # Ruff stuff:
- .ruff_cache/
-
- # PyPI configuration file
- .pypirc
 
Gemma-Finetune/Gemma3_(4B).ipynb DELETED
The diff for this file is too large to render. See raw diff
 
Gemma-Finetune/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2025 Dark Coder
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
Gemma-Finetune/README.md DELETED
@@ -1,41 +0,0 @@
- # Gemma Fine-tuning UI
-
- A user-friendly interface for fine-tuning Google's Gemma models using Unsloth optimizations.
-
- ## Features
-
- - Easy-to-use web interface for model fine-tuning
- - Support for multiple data formats (CSV, JSONL, TEXT)
- - Parameter-efficient fine-tuning with LoRA
- - Real-time training progress visualization
- - Model export in multiple formats
- - Integrated text generation testing
-
- ## Installation
-
- ```bash
- git clone https://github.com/codewithdark-git/Gemma-Finetune.git
- cd Gemma-Finetune
- pip install -r requirements.txt
- ```
-
- ## Usage
-
- 1. Run the application:
- ```bash
- python main.py
- ```
-
- 2. Follow the UI steps:
-    - Upload your dataset
-    - Configure model parameters
-    - Start training
-    - Test and export your model
-
- ## Requirements
-
- See requirements.txt for detailed dependencies.
-
- ## License
-
- MIT License
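
For reference, the Usage steps in this deleted README assumed a dataset in one of the formats the app recognized (instruction/response, input/output, prompt/completion, or a plain text column). A minimal sketch of preparing such a file, assuming only the column names used throughout the deleted code (the file name here is arbitrary):

```python
# Build a tiny instruction/response CSV of the kind the deleted UI accepted.
import pandas as pd

rows = [
    {"instruction": "What is LoRA?",
     "response": "LoRA is a parameter-efficient fine-tuning method that trains small low-rank update matrices."},
    {"instruction": "Why use a validation split?",
     "response": "A held-out split lets you monitor overfitting during fine-tuning."},
]
pd.DataFrame(rows).to_csv("my_dataset.csv", index=False)  # upload this in step 1
```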
 
Gemma-Finetune/main.py DELETED
@@ -1,290 +0,0 @@
- import gradio as gr
- from utils.check_dataset import validate_dataset, generate_dataset_report
- from utils.sample_dataset import generate_sample_datasets
- from utils.model import GemmaFineTuning
-
- class GemmaUI:
-     def __init__(self):
-         self.model_instance = GemmaFineTuning()
-         self.default_params = self.model_instance.default_params
-
-     def create_ui(self):
-         """Create the Gradio interface"""
-         with gr.Blocks(title="Gemma Fine-tuning UI") as app:
-             gr.Markdown("# Gemma Model Fine-tuning Interface")
-             gr.Markdown("Upload your dataset, configure parameters, and fine-tune a Gemma model")
-
-             with gr.Tabs():
-                 with gr.TabItem("1. Data Upload & Preprocessing"):
-                     with gr.Row():
-                         with gr.Column():
-                             file_upload = gr.File(label="Upload Dataset")
-                             file_format = gr.Radio(
-                                 ["csv", "jsonl", "text"],
-                                 label="File Format",
-                                 value="csv"
-                             )
-                             preprocess_button = gr.Button("Preprocess Dataset")
-                             dataset_info = gr.TextArea(label="Dataset Information", interactive=False)
-
-                 with gr.TabItem("2. Model & Hyperparameters"):
-                     with gr.Row():
-                         with gr.Column():
-                             model_name = gr.Dropdown(
-                                 choices=[
-                                     "google/gemma-2b",
-                                     "google/gemma-7b",
-                                     "google/gemma-2b-it",
-                                     "google/gemma-7b-it"
-                                 ],
-                                 value=self.default_params["model_name"],
-                                 label="Model Name",
-                                 info="Select a Gemma model to fine-tune"
-                             )
-                             learning_rate = gr.Slider(
-                                 minimum=1e-6,
-                                 maximum=5e-4,
-                                 value=self.default_params["learning_rate"],
-                                 label="Learning Rate",
-                                 info="Learning rate for the optimizer"
-                             )
-                             batch_size = gr.Slider(
-                                 minimum=1,
-                                 maximum=32,
-                                 step=1,
-                                 value=self.default_params["batch_size"],
-                                 label="Batch Size",
-                                 info="Number of samples in each training batch"
-                             )
-                             epochs = gr.Slider(
-                                 minimum=1,
-                                 maximum=10,
-                                 step=1,
-                                 value=self.default_params["epochs"],
-                                 label="Epochs",
-                                 info="Number of training epochs"
-                             )
-
-                         with gr.Column():
-                             max_length = gr.Slider(
-                                 minimum=128,
-                                 maximum=2048,
-                                 step=16,
-                                 value=self.default_params["max_length"],
-                                 label="Max Sequence Length",
-                                 info="Maximum token length for inputs"
-                             )
-                             use_lora = gr.Checkbox(
-                                 value=self.default_params["use_lora"],
-                                 label="Use LoRA for Parameter-Efficient Fine-tuning",
-                                 info="Recommended for faster training and lower memory usage"
-                             )
-                             lora_r = gr.Slider(
-                                 minimum=4,
-                                 maximum=64,
-                                 step=4,
-                                 value=self.default_params["lora_r"],
-                                 label="LoRA Rank (r)",
-                                 info="Rank of the LoRA update matrices",
-                                 visible=lambda: use_lora.value
-                             )
-                             lora_alpha = gr.Slider(
-                                 minimum=4,
-                                 maximum=64,
-                                 step=4,
-                                 value=self.default_params["lora_alpha"],
-                                 label="LoRA Alpha",
-                                 info="Scaling factor for LoRA updates",
-                                 visible=lambda: use_lora.value
-                             )
-                             eval_ratio = gr.Slider(
-                                 minimum=0.05,
-                                 maximum=0.3,
-                                 step=0.05,
-                                 value=self.default_params["eval_ratio"],
-                                 label="Validation Split Ratio",
-                                 info="Portion of data to use for validation"
-                             )
-
-                 with gr.TabItem("3. Training"):
-                     with gr.Row():
-                         with gr.Column():
-                             start_training_button = gr.Button("Start Fine-tuning")
-                             stop_training_button = gr.Button("Stop Training", variant="stop")
-                             training_status = gr.Textbox(label="Training Status", interactive=False)
-
-                         with gr.Column():
-                             progress_plot = gr.Plot(label="Training Progress")
-                             refresh_plot_button = gr.Button("Refresh Plot")
-
-                 with gr.TabItem("4. Evaluation & Export"):
-                     with gr.Row():
-                         with gr.Column():
-                             test_prompt = gr.Textbox(
-                                 label="Test Prompt",
-                                 placeholder="Enter a prompt to test the model...",
-                                 lines=3
-                             )
-                             max_gen_length = gr.Slider(
-                                 minimum=10,
-                                 maximum=500,
-                                 step=10,
-                                 value=100,
-                                 label="Max Generation Length"
-                             )
-                             generate_button = gr.Button("Generate Text")
-                             generated_output = gr.Textbox(label="Generated Output", lines=10, interactive=False)
-
-                         with gr.Column():
-                             export_format = gr.Radio(
-                                 ["pytorch", "tensorflow", "gguf"],
-                                 label="Export Format",
-                                 value="pytorch"
-                             )
-                             export_button = gr.Button("Export Model")
-                             export_status = gr.Textbox(label="Export Status", interactive=False)
-
-             # Functionality
-             def preprocess_data(file, format_type):
-                 try:
-                     if file is None:
-                         return "Please upload a file first."
-
-                     # Process the uploaded file
-                     dataset = self.model_instance.prepare_dataset(file.name, format_type)
-                     self.model_instance.dataset = dataset
-
-                     # Create a summary of the dataset
-                     num_samples = len(dataset["train"])
-
-                     # Sample a few examples
-                     examples = dataset["train"].select(range(min(3, num_samples)))
-                     sample_text = []
-                     for ex in examples:
-                         text_key = list(ex.keys())[0] if "text" not in ex else "text"
-                         sample = ex[text_key]
-                         if isinstance(sample, str):
-                             sample_text.append(sample[:100] + "..." if len(sample) > 100 else sample)
-
-                     info = f"Dataset loaded successfully!\n"
-                     info += f"Number of training examples: {num_samples}\n"
-                     info += f"Sample data:\n" + "\n---\n".join(sample_text)
-
-                     return info
-                 except Exception as e:
-                     return f"Error preprocessing data: {str(e)}"
-
-             def start_training(
-                 model_name, learning_rate, batch_size, epochs, max_length,
-                 use_lora, lora_r, lora_alpha, eval_ratio
-             ):
-                 try:
-                     if self.model_instance.dataset is None:
-                         return "Please preprocess a dataset first."
-
-                     # Validate parameters
-                     if not model_name:
-                         return "Please select a model."
-
-                     # Prepare training parameters with proper type conversion
-                     training_params = {
-                         "model_name": str(model_name),
-                         "learning_rate": float(learning_rate),
-                         "batch_size": int(batch_size),
-                         "epochs": int(epochs),
-                         "max_length": int(max_length),
-                         "use_lora": bool(use_lora),
-                         "lora_r": int(lora_r) if use_lora else None,
-                         "lora_alpha": int(lora_alpha) if use_lora else None,
-                         "eval_ratio": float(eval_ratio),
-                         "weight_decay": float(self.default_params["weight_decay"]),
-                         "warmup_ratio": float(self.default_params["warmup_ratio"]),
-                         "lora_dropout": float(self.default_params["lora_dropout"])
-                     }
-
-                     # Start training in a separate thread
-                     import threading
-                     def train_thread():
-                         status = self.model_instance.train(training_params)
-                         return status
-
-                     thread = threading.Thread(target=train_thread)
-                     thread.start()
-
-                     return "Training started! Monitor the progress in the Training tab."
-                 except Exception as e:
-                     return f"Error starting training: {str(e)}"
-
-             def stop_training():
-                 if self.model_instance.trainer is not None:
-                     # Attempt to stop the trainer
-                     self.model_instance.trainer.stop_training = True
-                     return "Training stop signal sent. It may take a moment to complete the current step."
-                 return "No active training to stop."
-
-             def update_progress_plot():
-                 try:
-                     return self.model_instance.plot_training_progress()
-                 except Exception as e:
-                     return None
-
-             def run_text_generation(prompt, max_length):
-                 try:
-                     if self.model_instance.model is None:
-                         return "Please fine-tune a model first."
-
-                     return self.model_instance.generate_text(prompt, int(max_length))
-                 except Exception as e:
-                     return f"Error generating text: {str(e)}"
-
-             def export_model_fn(format_type):
-                 try:
-                     if self.model_instance.model is None:
-                         return "Please fine-tune a model first."
-
-                     return self.model_instance.export_model(format_type)
-                 except Exception as e:
-                     return f"Error exporting model: {str(e)}"
-
-             # Connect UI components to functions
-             preprocess_button.click(
-                 preprocess_data,
-                 inputs=[file_upload, file_format],
-                 outputs=dataset_info
-             )
-
-             start_training_button.click(
-                 start_training,
-                 inputs=[
-                     model_name, learning_rate, batch_size, epochs, max_length,
-                     use_lora, lora_r, lora_alpha, eval_ratio
-                 ],
-                 outputs=training_status
-             )
-
-             stop_training_button.click(
-                 stop_training,
-                 inputs=[],
-                 outputs=training_status
-             )
-
-             refresh_plot_button.click(
-                 update_progress_plot,
-                 inputs=[],
-                 outputs=progress_plot
-             )
-
-             generate_button.click(
-                 run_text_generation,
-                 inputs=[test_prompt, max_gen_length],
-                 outputs=generated_output
-             )
-
-             export_button.click(
-                 export_model_fn,
-                 inputs=[export_format],
-                 outputs=export_status
-             )
-
-         return app
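
As deleted, main.py builds the interface but contains no entry point, even though the README's usage step was `python main.py`. A launcher along the following lines would be required; this is a hypothetical sketch, not recovered content:

```python
# Hypothetical entry point (absent from the deleted file): instantiate the
# UI class defined above, build the Gradio Blocks app, and serve it locally.
if __name__ == "__main__":
    ui = GemmaUI()
    app = ui.create_ui()
    app.launch()
```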
 
Gemma-Finetune/requirements.txt DELETED
@@ -1,9 +0,0 @@
- torch>=2.0.0
- transformers>=4.36.0
- unsloth>=0.1.0
- gradio>=4.0.0
- pandas>=1.5.0
- numpy>=1.24.0
- matplotlib>=3.7.0
- peft>=0.7.0
- datasets>=2.14.0
 
Gemma-Finetune/utils/__pycache__/check_dataset.cpython-311.pyc DELETED
Binary file (13.3 kB)
 
Gemma-Finetune/utils/__pycache__/model.cpython-311.pyc DELETED
Binary file (28.4 kB)
 
Gemma-Finetune/utils/__pycache__/sample_dataset.cpython-311.pyc DELETED
Binary file (7.67 kB)
 
Gemma-Finetune/utils/check_dataset.py DELETED
@@ -1,272 +0,0 @@
-
-
-
- def validate_dataset(self, file_path, format_type):
-     """
-     Validate and analyze the dataset format, providing detailed feedback
-
-     Parameters:
-         file_path (str): Path to the dataset file
-         format_type (str): File format (csv, jsonl, text)
-
-     Returns:
-         dict: Validation results including format, structure, and statistics
-     """
-     import pandas as pd
-     import json
-     import os
-     import re
-
-     validation_results = {
-         "is_valid": False,
-         "format": format_type,
-         "detected_structure": None,
-         "statistics": {},
-         "issues": [],
-         "recommendations": []
-     }
-
-     try:
-         # Check if file exists
-         if not os.path.exists(file_path):
-             validation_results["issues"].append(f"File not found: {file_path}")
-             return validation_results
-
-         # Check file size
-         file_size = os.path.getsize(file_path)
-         validation_results["statistics"]["file_size_bytes"] = file_size
-         validation_results["statistics"]["file_size_mb"] = round(file_size / (1024 * 1024), 2)
-
-         if file_size == 0:
-             validation_results["issues"].append("File is empty")
-             return validation_results
-
-         if format_type == "csv":
-             # Load CSV file
-             try:
-                 df = pd.read_csv(file_path)
-                 validation_results["statistics"]["total_rows"] = len(df)
-                 validation_results["statistics"]["total_columns"] = len(df.columns)
-                 validation_results["statistics"]["column_names"] = list(df.columns)
-
-                 # Check for null values
-                 null_counts = df.isnull().sum().to_dict()
-                 validation_results["statistics"]["null_counts"] = null_counts
-
-                 if validation_results["statistics"]["total_rows"] == 0:
-                     validation_results["issues"].append("CSV file has no rows")
-                     return validation_results
-
-                 # Detect structure
-                 if "instruction" in df.columns and "response" in df.columns:
-                     validation_results["detected_structure"] = "instruction-response"
-                     validation_results["is_valid"] = True
-                 elif "input" in df.columns and "output" in df.columns:
-                     validation_results["detected_structure"] = "input-output"
-                     validation_results["is_valid"] = True
-                 elif "prompt" in df.columns and "completion" in df.columns:
-                     validation_results["detected_structure"] = "prompt-completion"
-                     validation_results["is_valid"] = True
-                 elif "text" in df.columns:
-                     validation_results["detected_structure"] = "text-only"
-                     validation_results["is_valid"] = True
-                 else:
-                     # Look for text columns
-                     text_columns = [col for col in df.columns if df[col].dtype == 'object']
-                     if text_columns:
-                         validation_results["detected_structure"] = "custom"
-                         validation_results["statistics"]["potential_text_columns"] = text_columns
-                         validation_results["is_valid"] = True
-                         validation_results["recommendations"].append(
-                             f"Consider renaming columns to match standard formats: instruction/response, input/output, prompt/completion, or text"
-                         )
-                     else:
-                         validation_results["issues"].append("No suitable text columns found in CSV")
-
-                 # Check for short text
-                 if validation_results["detected_structure"] == "instruction-response":
-                     short_instructions = (df["instruction"].str.len() < 10).sum()
-                     short_responses = (df["response"].str.len() < 10).sum()
-                     validation_results["statistics"]["short_instructions"] = short_instructions
-                     validation_results["statistics"]["short_responses"] = short_responses
-
-                     if short_instructions > 0:
-                         validation_results["issues"].append(f"Found {short_instructions} instructions shorter than 10 characters")
-                     if short_responses > 0:
-                         validation_results["issues"].append(f"Found {short_responses} responses shorter than 10 characters")
-
-             except Exception as e:
-                 validation_results["issues"].append(f"Error parsing CSV: {str(e)}")
-                 return validation_results
-
-         elif format_type == "jsonl":
-             try:
-                 # Load JSONL file
-                 data = []
-                 with open(file_path, 'r', encoding='utf-8') as f:
-                     for line_num, line in enumerate(f, 1):
-                         line = line.strip()
-                         if not line:
-                             continue
-                         try:
-                             json_obj = json.loads(line)
-                             data.append(json_obj)
-                         except json.JSONDecodeError:
-                             validation_results["issues"].append(f"Invalid JSON at line {line_num}")
-
-                 validation_results["statistics"]["total_examples"] = len(data)
-
-                 if len(data) == 0:
-                     validation_results["issues"].append("No valid JSON objects found in file")
-                     return validation_results
-
-                 # Get sample of keys from first object
-                 if data:
-                     validation_results["statistics"]["sample_keys"] = list(data[0].keys())
-
-                 # Detect structure
-                 structures = []
-                 for item in data:
-                     if "instruction" in item and "response" in item:
-                         structures.append("instruction-response")
-                     elif "input" in item and "output" in item:
-                         structures.append("input-output")
-                     elif "prompt" in item and "completion" in item:
-                         structures.append("prompt-completion")
-                     elif "text" in item:
-                         structures.append("text-only")
-                     else:
-                         structures.append("custom")
-
-                 # Count structure types
-                 from collections import Counter
-                 structure_counts = Counter(structures)
-                 validation_results["statistics"]["structure_counts"] = structure_counts
-
-                 # Set detected structure to most common
-                 if structures:
-                     most_common = structure_counts.most_common(1)[0][0]
-                     validation_results["detected_structure"] = most_common
-                     validation_results["is_valid"] = True
-
-                     # Check if mixed
-                     if len(structure_counts) > 1:
-                         validation_results["issues"].append(f"Mixed structures detected: {dict(structure_counts)}")
-                         validation_results["recommendations"].append("Consider standardizing all records to the same structure")
-
-             except Exception as e:
-                 validation_results["issues"].append(f"Error parsing JSONL: {str(e)}")
-                 return validation_results
-
-         elif format_type == "text":
-             try:
-                 # Read text file
-                 with open(file_path, 'r', encoding='utf-8') as f:
-                     content = f.read()
-
-                 # Get basic stats
-                 total_chars = len(content)
-                 total_words = len(content.split())
-                 total_lines = content.count('\n') + 1
-
-                 validation_results["statistics"]["total_characters"] = total_chars
-                 validation_results["statistics"]["total_words"] = total_words
-                 validation_results["statistics"]["total_lines"] = total_lines
-
-                 # Check if it's a single large document or multiple examples
-                 paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
-                 validation_results["statistics"]["total_paragraphs"] = len(paragraphs)
-
-                 # Try to detect structure
-                 # Look for common patterns like "Q: ... A: ...", "Input: ... Output: ..."
-                 has_qa_pattern = re.search(r"Q:.*?A:", content, re.DOTALL) is not None
-                 has_input_output = re.search(r"Input:.*?Output:", content, re.DOTALL) is not None
-                 has_prompt_completion = re.search(r"Prompt:.*?Completion:", content, re.DOTALL) is not None
-
-                 if has_qa_pattern:
-                     validation_results["detected_structure"] = "Q&A-format"
-                 elif has_input_output:
-                     validation_results["detected_structure"] = "input-output-format"
-                 elif has_prompt_completion:
-                     validation_results["detected_structure"] = "prompt-completion-format"
-                 elif len(paragraphs) > 1:
-                     validation_results["detected_structure"] = "paragraphs"
-                 else:
-                     validation_results["detected_structure"] = "continuous-text"
-
-                 validation_results["is_valid"] = True
-
-                 if validation_results["detected_structure"] == "continuous-text" and total_chars < 1000:
-                     validation_results["issues"].append("Text file is very short for fine-tuning")
-                     validation_results["recommendations"].append("Consider adding more content or examples")
-
-             except Exception as e:
-                 validation_results["issues"].append(f"Error parsing text file: {str(e)}")
-                 return validation_results
-         else:
-             validation_results["issues"].append(f"Unsupported file format: {format_type}")
-             return validation_results
-
-         # General recommendations
-         if validation_results["is_valid"]:
-             if not validation_results["issues"]:
-                 validation_results["recommendations"].append("Dataset looks good and ready for fine-tuning!")
-             else:
-                 validation_results["recommendations"].append("Address the issues above before proceeding with fine-tuning")
-
-         return validation_results
-
-     except Exception as e:
-         validation_results["issues"].append(f"Unexpected error: {str(e)}")
-         return validation_results
-
- def generate_dataset_report(validation_results):
-     """
-     Generate a user-friendly report from validation results
-
-     Parameters:
-         validation_results (dict): Results from validate_dataset
-
-     Returns:
-         str: Formatted report
-     """
-     report = []
-
-     # Add header
-     report.append("# Dataset Validation Report")
-     report.append("")
-
-     # Add validation status
-     if validation_results["is_valid"]:
-         report.append("✅ Dataset is valid and can be used for fine-tuning")
-     else:
-         report.append("❌ Dataset has issues that need to be addressed")
-     report.append("")
-
-     # Add format info
-     report.append(f"**File Format:** {validation_results['format']}")
-     report.append(f"**Detected Structure:** {validation_results['detected_structure']}")
-     report.append("")
-
-     # Add statistics
-     report.append("## Statistics")
-     for key, value in validation_results["statistics"].items():
-         # Format the key for better readability
-         formatted_key = key.replace("_", " ").title()
-         report.append(f"- {formatted_key}: {value}")
-     report.append("")
-
-     # Add issues
-     if validation_results["issues"]:
-         report.append("## Issues")
-         for issue in validation_results["issues"]:
-             report.append(f"- ⚠️ {issue}")
-         report.append("")
-
-     # Add recommendations
-     if validation_results["recommendations"]:
-         report.append("## Recommendations")
-         for recommendation in validation_results["recommendations"]:
-             report.append(f"- 💡 {recommendation}")
-
-     return "\n".join(report)
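
Note that validate_dataset was defined at module level yet kept a `self` first parameter, so any caller had to pass a placeholder for it. A minimal driver sketch for the two helpers above, under that assumption and with a hypothetical file path:

```python
# Illustrative use of the deleted validation helpers. The leading None fills
# the stray `self` parameter; the CSV path is hypothetical.
from utils.check_dataset import validate_dataset, generate_dataset_report

results = validate_dataset(None, "sample_datasets/instruction_response.csv", "csv")
print(generate_dataset_report(results))
```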
 
Gemma-Finetune/utils/model.py DELETED
@@ -1,552 +0,0 @@
- import os
- import json
- import torch
- import gradio as gr
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- from pathlib import Path
- from typing import Dict, List, Optional, Tuple, Union
- from datetime import datetime
- from torch.utils.data import Dataset, DataLoader
- from transformers import (
-     AutoTokenizer,
-     AutoModelForCausalLM,
-     TrainingArguments,
-     Trainer,
-     DataCollatorForLanguageModeling,
-     TrainerCallback
- )
- from peft import (
-     LoraConfig,
-     get_peft_model,
-     prepare_model_for_kbit_training
- )
- from datasets import load_dataset
- from unsloth import FastModel
-
-
- class GemmaFineTuning:
-     def __init__(self):
-         self.model = None
-         self.tokenizer = None
-         self.dataset = None
-         self.trainer = None
-         self.training_history = {"loss": [], "eval_loss": [], "step": []}
-         self.model_save_path = None
-         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-
-         self.fourbit_models = [
-             "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
-             "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
-             "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
-             "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
-         ]
-         # Default hyperparameters
-         self.default_params = {
-             "model_name": "google/gemma-2b",
-             "learning_rate": 2e-5,
-             "batch_size": 8,
-             "epochs": 3,
-             "max_length": 512,
-             "weight_decay": 0.01,
-             "warmup_ratio": 0.1,
-             "use_lora": True,
-             "lora_r": 16,
-             "lora_alpha": 32,
-             "lora_dropout": 0.05,
-             "eval_ratio": 0.1,
-         }
-
-     def load_model_and_tokenizer(self, model_name: str) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
-         """Load the model and tokenizer"""
-         try:
-             # Map UI model names to actual model IDs
-             model_mapping = {
-                 "google/gemma-2b": "unsloth/gemma-2b-it-unsloth-bnb-4bit",
-                 "google/gemma-7b": "unsloth/gemma-7b-it-unsloth-bnb-4bit",
-                 "google/gemma-2b-it": "unsloth/gemma-2b-it-unsloth-bnb-4bit",
-                 "google/gemma-7b-it": "unsloth/gemma-7b-it-unsloth-bnb-4bit"
-             }
-
-             actual_model_name = model_mapping.get(model_name, model_name)
-
-             model, tokenizer = FastModel.from_pretrained(
-                 model_name=actual_model_name,
-                 max_seq_length=2048,
-                 load_in_4bit=True,
-                 load_in_8bit=False,
-                 full_finetuning=False,
-             )
-
-             # Move model to device
-             model = model.to(self.device)
-             return model, tokenizer
-
-         except Exception as e:
-             raise ValueError(f"Error loading model {model_name}: {str(e)}")
-
-     def prepare_dataset(self, file_path, format_type):
-         """
-         Prepare and normalize dataset from various formats
-
-         Parameters:
-             file_path (str): Path to the dataset file
-             format_type (str): File format (csv, jsonl, text)
-
-         Returns:
-             dict: Dataset dictionary with train split
-         """
-         import pandas as pd
-         import json
-         import os
-         from datasets import Dataset, DatasetDict
-
-         try:
-             if format_type == "csv":
-                 # Load CSV file
-                 df = pd.read_csv(file_path)
-
-                 # Check if the CSV has the expected columns (looking for either instruction-response pairs or text)
-                 if "instruction" in df.columns and "response" in df.columns:
-                     # Instruction-following dataset format
-                     dataset_format = "instruction-response"
-                     # Ensure no nulls
-                     df = df.dropna(subset=["instruction", "response"])
-                     # Create formatted text by combining instruction and response
-                     df["text"] = df.apply(lambda row: f"<instruction>{row['instruction']}</instruction>\n<response>{row['response']}</response>", axis=1)
-                 elif "input" in df.columns and "output" in df.columns:
-                     # Another common format
-                     dataset_format = "input-output"
-                     df = df.dropna(subset=["input", "output"])
-                     df["text"] = df.apply(lambda row: f"<input>{row['input']}</input>\n<output>{row['output']}</output>", axis=1)
-                 elif "prompt" in df.columns and "completion" in df.columns:
-                     # OpenAI-style format
-                     dataset_format = "prompt-completion"
-                     df = df.dropna(subset=["prompt", "completion"])
-                     df["text"] = df.apply(lambda row: f"<prompt>{row['prompt']}</prompt>\n<completion>{row['completion']}</completion>", axis=1)
-                 elif "text" in df.columns:
-                     # Simple text format
-                     dataset_format = "text-only"
-                     df = df.dropna(subset=["text"])
-                 else:
-                     # Try to infer format from the first text column
-                     text_columns = [col for col in df.columns if df[col].dtype == 'object']
-                     if len(text_columns) > 0:
-                         dataset_format = "inferred"
-                         df["text"] = df[text_columns[0]]
-                         df = df.dropna(subset=["text"])
-                     else:
-                         raise ValueError("CSV file must contain either 'instruction'/'response', 'input'/'output', 'prompt'/'completion', or 'text' columns")
-
-                 # Create dataset
-                 dataset = Dataset.from_pandas(df)
-
-             elif format_type == "jsonl":
-                 # Load JSONL file
-                 with open(file_path, 'r', encoding='utf-8') as f:
-                     data = [json.loads(line) for line in f if line.strip()]
-
-                 # Check and normalize the format
-                 normalized_data = []
-                 for item in data:
-                     normalized_item = {}
-
-                     # Try to find either instruction-response pairs or text
-                     if "instruction" in item and "response" in item:
-                         normalized_item["text"] = f"<instruction>{item['instruction']}</instruction>\n<response>{item['response']}</response>"
-                         normalized_item["instruction"] = item["instruction"]
-                         normalized_item["response"] = item["response"]
-                     elif "input" in item and "output" in item:
-                         normalized_item["text"] = f"<input>{item['input']}</input>\n<output>{item['output']}</output>"
-                         normalized_item["input"] = item["input"]
-                         normalized_item["output"] = item["output"]
-                     elif "prompt" in item and "completion" in item:
-                         normalized_item["text"] = f"<prompt>{item['prompt']}</prompt>\n<completion>{item['completion']}</completion>"
-                         normalized_item["prompt"] = item["prompt"]
-                         normalized_item["completion"] = item["completion"]
-                     elif "text" in item:
-                         normalized_item["text"] = item["text"]
-                     else:
-                         # Try to infer from the first string value
-                         text_keys = [k for k, v in item.items() if isinstance(v, str) and len(v.strip()) > 0]
-                         if text_keys:
-                             normalized_item["text"] = item[text_keys[0]]
-                         else:
-                             continue  # Skip this item if no usable text found
-
-                     normalized_data.append(normalized_item)
-
-                 if not normalized_data:
-                     raise ValueError("No valid data items found in the JSONL file")
-
-                 # Create dataset
-                 dataset = Dataset.from_list(normalized_data)
-
-             elif format_type == "text":
-                 # For text files, split by newlines and create entries
-                 with open(file_path, 'r', encoding='utf-8') as f:
-                     content = f.read()
-
-                 # Check if it's a single large document or multiple examples
-                 # If file size > 10KB, try to split into paragraphs
-                 if os.path.getsize(file_path) > 10240:
-                     # Split by double newlines (paragraphs)
-                     paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
-                     # Filter out very short paragraphs (less than 20 chars)
-                     paragraphs = [p for p in paragraphs if len(p) >= 20]
-                     data = [{"text": p} for p in paragraphs]
-                 else:
-                     # Treat as a single example
-                     data = [{"text": content}]
-
-                 # Create dataset
-                 dataset = Dataset.from_list(data)
-             else:
-                 raise ValueError(f"Unsupported file format: {format_type}")
-
-             # Return as a DatasetDict with a train split
-             return DatasetDict({"train": dataset})
-
-         except Exception as e:
-             import traceback
-             error_msg = f"Error processing dataset: {str(e)}\n{traceback.format_exc()}"
-             print(error_msg)
-             raise ValueError(error_msg)
-
-     def chunk_text(self, text: str, chunk_size: int) -> List[str]:
-         """Split text into chunks of approximately chunk_size characters"""
-         words = text.split()
-         chunks = []
-         current_chunk = []
-         current_length = 0
-
-         for word in words:
-             if current_length + len(word) + 1 > chunk_size and current_chunk:
-                 chunks.append(" ".join(current_chunk))
-                 current_chunk = [word]
-                 current_length = len(word)
-             else:
-                 current_chunk.append(word)
-                 current_length += len(word) + 1  # +1 for the space
-
-         if current_chunk:
-             chunks.append(" ".join(current_chunk))
-
-         return chunks
-
-     def preprocess_dataset(self, dataset, tokenizer, max_length):
-         """
-         Tokenize and format the dataset for training
-
-         Parameters:
-             dataset (DatasetDict): Dataset dictionary with train and validation splits
-             tokenizer: HuggingFace tokenizer
-             max_length (int): Maximum sequence length
-
-         Returns:
-             DatasetDict: Tokenized dataset ready for training
-         """
-         def tokenize_function(examples):
-             # Check if the dataset has both input and target text columns
-             if "text" in examples:
-                 texts = examples["text"]
-                 inputs = tokenizer(
-                     texts,
-                     padding="max_length",
-                     truncation=True,
-                     max_length=max_length,
-                     return_tensors="pt"
-                 )
-                 inputs["labels"] = inputs["input_ids"].clone()
-                 return inputs
-             else:
-                 # Try to find text columns based on common naming patterns
-                 potential_text_cols = [col for col in examples.keys() if isinstance(examples[col], list) and
-                                        all(isinstance(item, str) for item in examples[col])]
-
-                 if not potential_text_cols:
-                     raise ValueError("No suitable text columns found in the dataset")
-
-                 # Use the first text column found
-                 text_col = potential_text_cols[0]
-                 texts = examples[text_col]
-
-                 inputs = tokenizer(
-                     texts,
-                     padding="max_length",
-                     truncation=True,
-                     max_length=max_length,
-                     return_tensors="pt"
-                 )
-                 inputs["labels"] = inputs["input_ids"].clone()
-                 return inputs
-
-         # Apply tokenization to each split
-         tokenized_dataset = {}
-         for split, ds in dataset.items():
-             tokenized_dataset[split] = ds.map(
-                 tokenize_function,
-                 batched=True,
-                 remove_columns=ds.column_names
-             )
-
-         return tokenized_dataset
-
-     def prepare_training_args(self, params: Dict) -> TrainingArguments:
-         """Set up training arguments based on hyperparameters"""
-         timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-         self.model_save_path = f"gemma-finetuned-{timestamp}"
-
-         args = TrainingArguments(
-             output_dir=self.model_save_path,
-             per_device_train_batch_size=params.get("batch_size", self.default_params["batch_size"]),
-             gradient_accumulation_steps=4,
-             per_device_eval_batch_size=params.get("batch_size", self.default_params["batch_size"]),
-             learning_rate=params.get("learning_rate", self.default_params["learning_rate"]),
-             num_train_epochs=params.get("epochs", self.default_params["epochs"]),
-             warmup_ratio=params.get("warmup_ratio", self.default_params["warmup_ratio"]),
-             weight_decay=params.get("weight_decay", self.default_params["weight_decay"]),
-             logging_steps=1,
-             evaluation_strategy="steps" if params.get("eval_ratio", 0) > 0 else "no",
-             eval_steps=100 if params.get("eval_ratio", 0) > 0 else None,
-             save_strategy="steps",
-             save_steps=100,
-             save_total_limit=2,
-             load_best_model_at_end=True if params.get("eval_ratio", 0) > 0 else False,
-             report_to="none"
-         )
-         return args
-
-     def train(self, training_params: Dict) -> str:
-         """Main training method that handles the complete training pipeline"""
-         try:
-             if self.dataset is None:
-                 return "Error: No dataset loaded. Please preprocess a dataset first."
-
-             # Reset training history
-             self.training_history = {"loss": [], "eval_loss": [], "step": []}
-
-             # Load model and tokenizer if not already loaded or if model name changed
-             current_model_name = training_params.get("model_name", self.default_params["model_name"])
-             if (self.model is None or self.tokenizer is None or
-                     getattr(self, '_current_model_name', None) != current_model_name):
-
-                 self.model, self.tokenizer = self.load_model_and_tokenizer(current_model_name)
-                 self._current_model_name = current_model_name
-
-             # Create validation split if needed
-             eval_ratio = float(training_params.get("eval_ratio", self.default_params["eval_ratio"]))
-             if eval_ratio > 0 and "validation" not in self.dataset:
-                 split_dataset = self.dataset["train"].train_test_split(test_size=eval_ratio)
-                 self.dataset = {
-                     "train": split_dataset["train"],
-                     "validation": split_dataset["test"]
-                 }
-
-             # Apply LoRA if selected
-             if training_params.get("use_lora", self.default_params["use_lora"]):
-                 self.model = self.setup_lora(self.model, {
-                     "lora_r": int(training_params.get("lora_r", self.default_params["lora_r"])),
-                     "lora_alpha": int(training_params.get("lora_alpha", self.default_params["lora_alpha"])),
-                     "lora_dropout": float(training_params.get("lora_dropout", self.default_params["lora_dropout"]))
-                 })
-
-             # Preprocess dataset
-             max_length = int(training_params.get("max_length", self.default_params["max_length"]))
-             tokenized_dataset = self.preprocess_dataset(self.dataset, self.tokenizer, max_length)
-
-             # Update training arguments with proper type conversion
-             training_args = self.prepare_training_args({
-                 "batch_size": int(training_params.get("batch_size", self.default_params["batch_size"])),
-                 "learning_rate": float(training_params.get("learning_rate", self.default_params["learning_rate"])),
-                 "epochs": int(training_params.get("epochs", self.default_params["epochs"])),
-                 "weight_decay": float(training_params.get("weight_decay", self.default_params["weight_decay"])),
-                 "warmup_ratio": float(training_params.get("warmup_ratio", self.default_params["warmup_ratio"])),
-                 "eval_ratio": eval_ratio
-             })
-
-             # Create trainer with proper callback
-             self.trainer = self.create_trainer(
-                 self.model,
-                 self.tokenizer,
-                 tokenized_dataset,
-                 training_args
-             )
-
-             # Start training
-             self.trainer.train()
-
-             # Save the model
-             save_path = f"models/gemma-finetuned-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
-             os.makedirs(save_path, exist_ok=True)
-             self.trainer.save_model(save_path)
-             self.tokenizer.save_pretrained(save_path)
-             self.model_save_path = save_path
-
-             return f"Training completed successfully! Model saved to {save_path}"
-
-         except Exception as e:
-             import traceback
-             return f"Error during training: {str(e)}\n{traceback.format_exc()}"
-
-     def setup_lora(self, model, params: Dict) -> torch.nn.Module:
-         """Configure LoRA for parameter-efficient fine-tuning"""
-         # Prepare the model for training if using 8-bit or 4-bit quantization
-         if hasattr(model, "is_quantized") and model.is_quantized:
-             model = prepare_model_for_kbit_training(model)
-
-         lora_config = LoraConfig(
-             r=params["lora_r"],
-             lora_alpha=params["lora_alpha"],
-             target_modules=["q_proj", "k_proj", "v_proj"],
-             lora_dropout=params["lora_dropout"],
-             bias="none",
-             task_type="CAUSAL_LM",
-         )
-
-         model = FastModel.get_peft_model(
-             model,
-             finetune_vision_layers=False,     # Turn off for just text!
-             finetune_language_layers=True,    # Should leave on!
-             finetune_attention_modules=True,  # Attention good for GRPO
-             finetune_mlp_modules=True,        # Should leave on always!
-
-             r=8,            # Larger = higher accuracy, but might overfit
-             lora_alpha=8,   # Recommended alpha == r at least
-             lora_dropout=0,
-             bias="none",
-             random_state=3407,
-         )
-         model.print_trainable_parameters()
-         model = model.to(self.device)
-         return model
-
-     def create_trainer(self, model, tokenizer, dataset, training_args):
-         """Set up the Trainer for model fine-tuning"""
-         # Create data collator
-         data_collator = DataCollatorForLanguageModeling(
-             tokenizer=tokenizer,
-             mlm=False
-         )
-
-         # Custom callback to store training history
-         class TrainingCallback(TrainerCallback):
-             def __init__(self, app):
-                 self.app = app
-
-             def on_log(self, args, state, control, logs=None, **kwargs):
-                 if logs:
-                     for key in ['loss', 'eval_loss']:
-                         if key in logs:
-                             self.app.training_history[key].append(logs[key])
-                     if 'step' in logs:
-                         self.app.training_history['step'].append(logs['step'])
-
-         # Create trainer
-         trainer = Trainer(
-             model=model,
-             args=training_args,
-             train_dataset=dataset["train"],
-             eval_dataset=dataset["validation"] if "validation" in dataset else None,
-             data_collator=data_collator,
-             callbacks=[TrainingCallback]
-         )
-
-         return trainer
-
-     def plot_training_progress(self):
-         """Generate a plot of the training progress"""
-         if not self.training_history["loss"]:
-             return None
-
-         plt.figure(figsize=(10, 6))
-         plt.plot(self.training_history["step"], self.training_history["loss"], label="Training Loss")
-
-         if self.training_history["eval_loss"]:
-             # Get the steps where eval happened
-             eval_steps = self.training_history["step"][:len(self.training_history["eval_loss"])]
-             plt.plot(eval_steps, self.training_history["eval_loss"], label="Validation Loss", linestyle="--")
-
-         plt.xlabel("Training Steps")
-         plt.ylabel("Loss")
-         plt.title("Training Progress")
-         plt.legend()
-         plt.grid(True)
-
-         return plt
-
-     def export_model(self, output_format: str) -> str:
-         """Export the fine-tuned model in various formats"""
-         if self.model is None or self.model_save_path is None:
-             return "No model has been trained yet."
-
-         export_path = f"{self.model_save_path}/exported_{output_format}"
-         os.makedirs(export_path, exist_ok=True)
-
-         if output_format == "pytorch":
-             # Save as PyTorch format
-             self.model.save_pretrained(export_path)
-             self.tokenizer.save_pretrained(export_path)
-             return f"Model exported in PyTorch format to {export_path}"
-
-         elif output_format == "tensorflow":
-             # Convert to TensorFlow format
-             try:
-                 from transformers.modeling_tf_utils import convert_pt_to_tf
-
-                 # First save the PyTorch model
-                 self.model.save_pretrained(export_path)
-                 self.tokenizer.save_pretrained(export_path)
-
-                 # Then convert to TF SavedModel format
-                 tf_model = convert_pt_to_tf(self.model)
-                 tf_model.save_pretrained(f"{export_path}/tf_saved_model")
-
-                 return f"Model exported in TensorFlow format to {export_path}/tf_saved_model"
-             except Exception as e:
-                 return f"Failed to export as TensorFlow model: {str(e)}"
-
-         elif output_format == "gguf":
-             # Export as GGUF format for local inference
-             try:
-                 import subprocess
-
-                 # First save the model in PyTorch format
-                 self.model.save_pretrained(export_path)
-                 self.tokenizer.save_pretrained(export_path)
-
-                 # Use llama.cpp's conversion script (must be installed)
-                 subprocess.run([
-                     "python", "-m", "llama_cpp.convert",
-                     "--outtype", "gguf",
-                     "--outfile", f"{export_path}/model.gguf",
-                     export_path
-                 ])
-
-                 return f"Model exported in GGUF format to {export_path}/model.gguf"
-             except Exception as e:
-                 return f"Failed to export as GGUF model: {str(e)}"
-
-         else:
-             return f"Unsupported export format: {output_format}"
-
-     def generate_text(self, prompt: str, max_length: int = 100) -> str:
-         """Generate text using the fine-tuned model"""
-         if self.model is None or self.tokenizer is None:
-             return "No model has been loaded or fine-tuned yet."
-
-         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
-
-         with torch.no_grad():
-             outputs = self.model.generate(
-                 **inputs,
-                 max_length=max_length + inputs.input_ids.shape[1],
-                 temperature=0.7,
-                 top_p=0.9,
-                 do_sample=True,
-                 pad_token_id=self.tokenizer.pad_token_id
-             )
-
-         generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-         return generated_text
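
For orientation, the class above was meant to be driven end to end: load a dataset, train (optionally with LoRA), then generate. A minimal sketch of that flow, assuming a local instruction/response CSV and leaving unspecified hyperparameters at the class defaults; in practice this needs a CUDA GPU and the pinned unsloth/peft stack from requirements.txt:

```python
# Illustrative end-to-end use of the deleted GemmaFineTuning class.
# "my_dataset.csv" is a hypothetical path; train() falls back to
# self.default_params for anything not supplied.
from utils.model import GemmaFineTuning

ft = GemmaFineTuning()
ft.dataset = ft.prepare_dataset("my_dataset.csv", "csv")
print(ft.train({"model_name": "google/gemma-2b", "epochs": 1}))
print(ft.generate_text("Explain LoRA in one sentence.", max_length=60))
```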
 
Gemma-Finetune/utils/sample_dataset.py DELETED
@@ -1,105 +0,0 @@
- import pandas as pd
- import json
- import os
- import random
-
- def generate_sample_datasets(output_dir="./sample_datasets"):
-     """
-     Generate sample datasets in CSV, JSONL, and text formats for testing the Gemma fine-tuning UI
-
-     Parameters:
-         output_dir (str): Directory to save the sample datasets
-     """
-     # Create output directory if it doesn't exist
-     os.makedirs(output_dir, exist_ok=True)
-
-     # Sample data for instruction-response pairs
-     instruction_data = [
-         {"instruction": "Explain the concept of machine learning in simple terms.",
-          "response": "Machine learning is when computers learn from examples rather than being explicitly programmed. It's like teaching a child by showing examples instead of giving exact rules."},
-         {"instruction": "What are the health benefits of regular exercise?",
-          "response": "Regular exercise improves cardiovascular health, strengthens muscles and bones, helps control weight, reduces stress and anxiety, improves sleep quality, and increases overall energy levels."},
-         {"instruction": "How do I make a basic pasta sauce?",
-          "response": "For a basic tomato pasta sauce, sauté minced garlic and onions in olive oil until soft. Add canned crushed tomatoes, salt, pepper, and Italian herbs. Simmer for 20-30 minutes, stirring occasionally. Add basil at the end for fresh flavor."},
-         {"instruction": "Summarize the water cycle.",
-          "response": "The water cycle is the continuous movement of water on, above, and below Earth's surface. It involves evaporation (water turning to vapor), condensation (vapor forming clouds), precipitation (rain or snow), and collection (water returning to bodies of water)."},
-         {"instruction": "What is the difference between a simile and a metaphor?",
-          "response": "A simile compares two things using 'like' or 'as' (e.g., 'as brave as a lion'). A metaphor directly states that one thing is another (e.g., 'he is a lion in battle'). Both are figurative language techniques used to create vivid descriptions."}
-     ]
-
-     # Generate more instruction-response pairs
-     topics = ["history", "science", "literature", "cooking", "technology", "health", "travel", "sports", "music", "art"]
-     question_starters = ["Explain", "Describe", "How to", "What is", "Why does", "Compare", "Summarize", "List ways to", "Define", "Analyze"]
-
-     for _ in range(20):
-         topic = random.choice(topics)
-         starter = random.choice(question_starters)
-         instruction = f"{starter} {topic.lower()} {random.choice(['concepts', 'principles', 'ideas', 'techniques', 'methods'])}"
-         response = f"This is a sample response about {topic} that would be more detailed in a real dataset. It would contain multiple sentences explaining {topic} concepts in depth."
-         instruction_data.append({"instruction": instruction, "response": response})
-
-     # Create a dictionary to store sample datasets
-     datasets = {}
-
-     # 1. Create CSV in instruction-response format
-     df_instruction = pd.DataFrame(instruction_data)
-     datasets["instruction_response.csv"] = df_instruction
-
-     # 2. Create CSV in input-output format
-     input_output_data = [{"input": item["instruction"], "output": item["response"]} for item in instruction_data]
-     df_input_output = pd.DataFrame(input_output_data)
-     datasets["input_output.csv"] = df_input_output
-
-     # 3. Create CSV in text-only format
-     text_data = [{"text": f"Q: {item['instruction']}\nA: {item['response']}"} for item in instruction_data]
-     df_text = pd.DataFrame(text_data)
-     datasets["text_only.csv"] = df_text
-
-     # 4. Create CSV with non-standard format
-     custom_data = [{"question": item["instruction"], "answer": item["response"]} for item in instruction_data]
-     df_custom = pd.DataFrame(custom_data)
-     datasets["custom_format.csv"] = df_custom
-
-     # 5. Create JSONL in instruction-response format
-     jsonl_instruction = instruction_data
-     datasets["instruction_response.jsonl"] = jsonl_instruction
-
-     # 6. Create JSONL in prompt-completion format
-     prompt_completion_data = [{"prompt": item["instruction"], "completion": item["response"]} for item in instruction_data]
-     datasets["prompt_completion.jsonl"] = prompt_completion_data
-
-     # 7. Create JSONL with non-standard format
-     jsonl_custom = [{"query": item["instruction"], "result": item["response"]} for item in instruction_data]
-     datasets["custom_format.jsonl"] = jsonl_custom
-
-     # 8. Create text format (paragraphs)
-     text_paragraphs = "\n\n".join([f"Q: {item['instruction']}\nA: {item['response']}" for item in instruction_data])
-     datasets["paragraphs.txt"] = text_paragraphs
-
-     # 9. Create text format (single examples per line)
-     text_lines = "\n".join([f"{item['instruction']} => {item['response']}" for item in instruction_data])
-     datasets["single_lines.txt"] = text_lines
-
-     # Save all datasets
-     for filename, data in datasets.items():
-         file_path = os.path.join(output_dir, filename)
-
-         if filename.endswith('.csv'):
-             data.to_csv(file_path, index=False)
-         elif filename.endswith('.jsonl'):
-             with open(file_path, 'w', encoding='utf-8') as f:
-                 for item in data:
-                     f.write(json.dumps(item) + '\n')
-         elif filename.endswith('.txt'):
-             with open(file_path, 'w', encoding='utf-8') as f:
-                 f.write(data)
-
-     print(f"Sample datasets generated in {output_dir}")
-     return list(datasets.keys())
-
- # if __name__ == "__main__":
- #     # Generate sample datasets
- #     generated_files = generate_sample_datasets()
- #     print(f"Generated {len(generated_files)} sample dataset files:")
- #     for file in generated_files:
- #         print(f" - {file}")