# bnb-my-repo
#
# Running on A10G
#
# File size: 26,242 Bytes

import gradio as gr
import torch
from transformers import AutoModel, BitsAndBytesConfig, AutoTokenizer
import tempfile
from huggingface_hub import HfApi
from huggingface_hub import list_models
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from bitsandbytes.nn import Linear4bit
import os
from huggingface_hub import snapshot_download

def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
    """Build the greeting displayed at the top of the page.

    Args:
        profile: OAuth profile of the visitor, or ``None`` when the visitor
            is not logged in.
        oauth_token: OAuth token of the visitor; accepted but not read here.

    Returns:
        A welcome message containing ``profile.name`` for logged-in visitors,
        otherwise a prompt asking the visitor to log in.
    """
    logged_in = profile is not None
    if logged_in:
        return f"Hello {profile.name} ! Welcome to BitsAndBytes Quantizer"
    return "Hello ! Please Login to your HuggingFace account to use the BitsAndBytes Quantizer!"


def check_model_exists(
    oauth_token: gr.OAuthToken | None, username, model_name, quantized_model_name, upload_to_community
):
    """Check whether the repository the quantized model would be pushed to exists.

    The target repository is ``bnb-community/<model>-bnb-4bit`` when
    ``upload_to_community`` is set, otherwise ``<username>/<quantized_model_name>``
    (or ``<username>/<model>-bnb-4bit`` when no custom name is given).

    Only the namespace that can actually contain the target repository is
    listed, so a single Hub query is issued per call.

    Args:
        oauth_token: OAuth token whose ``.token`` is used to query the Hub.
        username: Hub username of the logged-in user.
        model_name: Source model id, e.g. ``"org/model"``.
        quantized_model_name: Optional custom repository name (ignored for
            community uploads).
        upload_to_community: Whether the target is the ``bnb-community`` org.

    Returns:
        ``None`` when the repository does not exist; otherwise a human-readable
        message. Any exception (including a missing token) is reported as an
        ``"Error checking model existence: ..."`` message instead of being raised.
    """
    try:
        base_name = model_name.split("/")[-1]
        if upload_to_community:
            owner = "bnb-community"
            repo_name = f"{owner}/{base_name}-bnb-4bit"
            location = "the bnb-community organization"
        else:
            owner = username
            if quantized_model_name:
                repo_name = f"{username}/{quantized_model_name}"
            else:
                repo_name = f"{username}/{base_name}-bnb-4bit"
            location = "your repository"

        # Stop consuming the listing as soon as a match is found.
        existing_models = list_models(author=owner, token=oauth_token.token)
        if any(model.id == repo_name for model in existing_models):
            return f"Model '{repo_name}' already exists in {location}."
        return None  # Model does not exist
    except Exception as e:
        # Deliberately broad: the caller displays the message to the user.
        return f"Error checking model existence: {e}"


_BNB_TAG = "bnb-my-repo"


def _fetch_original_readme(model_name):
    """Return the text of the source model's README.md, or "" when unavailable."""
    try:
        model_path = snapshot_download(repo_id=model_name, allow_patterns=["README.md"], repo_type="model")
        readme_path = os.path.join(model_path, "README.md")
        if not os.path.exists(readme_path):
            return ""
        with open(readme_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        # Best effort: a missing or unreadable README must not block the upload.
        print(f"Error reading original README: {str(e)}")
        return ""


def _split_front_matter(content):
    """Split README text into ``(yaml_header, body)``; the header is "" when absent."""
    if content.startswith("---"):
        parts = content.split("---", 2)
        if len(parts) >= 3:
            return parts[1], parts[2]
    return "", content


def _with_bnb_tag(tags_line, following_lines):
    """Return the lines replacing a top-level ``tags:`` entry so it includes the tag."""
    value = tags_line.split(":", 1)[1].strip()
    if not value:
        # Block-style list: reuse the indentation of the first existing item so
        # the inserted entry lines up with its siblings.
        indent = ""
        for candidate in following_lines:
            if not candidate.strip():
                continue
            if candidate.lstrip().startswith("-"):
                indent = candidate[: len(candidate) - len(candidate.lstrip())]
            break
        return [tags_line, f"{indent}- {_BNB_TAG}"]
    if value.startswith("[") and value.endswith("]"):
        # Single-line flow list: prepend the tag inside the brackets.
        inner = value[1:-1].strip()
        merged = f"{_BNB_TAG}, {inner}" if inner else _BNB_TAG
        return [f"tags: [{merged}]"]
    # Unrecognised form (scalar, multi-line flow list, ...): leave it untouched
    # rather than risk emitting invalid YAML.
    return [tags_line]


def _build_yaml_header(model_name, original_yaml_header):
    """Build the new YAML front matter from the original header's fields."""
    header = ["---", "base_model:", f"- {model_name}"]
    skipping_base_model = False
    tag_handled = False
    source_lines = original_yaml_header.strip().splitlines()

    for idx, line in enumerate(source_lines):
        stripped = line.strip()

        # Drop the continuation lines of a multi-line ``base_model`` entry.
        if skipping_base_model:
            if not stripped or stripped.startswith("-") or line.startswith(" "):
                continue
            skipping_base_model = False

        # The original ``base_model`` is replaced by the one emitted above; an
        # inline value ends on the same line, an empty one continues below.
        if stripped.startswith("base_model:"):
            skipping_base_model = not stripped.split(":", 1)[1].strip()
            continue

        # Only an unindented ``tags:`` is the card's tag list; a nested key of
        # the same name is copied through unchanged.
        if not tag_handled and line.startswith("tags:"):
            header.extend(_with_bnb_tag(line, source_lines[idx + 1:]))
            tag_handled = True
            continue

        header.append(line)

    if not tag_handled:
        header.extend(["tags:", f"- {_BNB_TAG}"])
    header.append("---")
    return "\n".join(header)


def create_model_card(
    model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4
):
    """Build the README.md (model card) text for a quantized model.

    The card consists of a YAML header, a section describing the quantization
    settings and, when it can be downloaded, the body of the original model's
    README. The header always declares ``model_name`` as ``base_model`` and
    always carries the ``bnb-my-repo`` tag, including when the original model
    has no README or no YAML header; other original header fields are copied.

    Args:
        model_name: Hub id of the source model.
        quant_type_4: Value reported as ``bnb_4bit_quant_type``.
        double_quant_4: Value reported as ``bnb_4bit_use_double_quant``.
        compute_type_4: Value reported as ``bnb_4bit_compute_dtype``.
        quant_storage_4: Value reported as ``bnb_4bit_quant_storage``.

    Returns:
        The complete model card as a string.
    """
    original_yaml_header, original_readme = _split_front_matter(
        _fetch_original_readme(model_name)
    )
    yaml_header = _build_yaml_header(model_name, original_yaml_header)

    # Create the quantization info section
    quant_info = f"""
# {model_name} (Quantized)

## Description
This model is a quantized version of the original model [`{model_name}`](https://huggingface.co/{model_name}). 

It's quantized using the BitsAndBytes library to 4-bit using the [bnb-my-repo](https://huggingface.co/spaces/bnb-community/bnb-my-repo) space.

## Quantization Details
- **Quantization Type**: int4
- **bnb_4bit_quant_type**: {quant_type_4}
- **bnb_4bit_use_double_quant**: {double_quant_4}
- **bnb_4bit_compute_dtype**: {compute_type_4}
- **bnb_4bit_quant_storage**: {quant_storage_4}

"""

    model_card = yaml_header + quant_info

    # Append original README content if available
    if original_readme and not original_readme.isspace():
        model_card += "\n\n# 📄 Original Model Information\n\n" + original_readme

    return model_card


# Maps the dtype names used as keys by the quantization settings to the
# torch dtype attribute of the same name.
DTYPE_MAPPING = {
    dtype_name: getattr(torch, dtype_name)
    for dtype_name in ("int8", "uint8", "float16", "float32", "bfloat16")
}


def quantize_model(
    model_name,
    quant_type_4,
    double_quant_4,
    compute_type_4,
    quant_storage_4,
    auth_token=None,
    progress=gr.Progress(),
):
    """Load ``model_name`` with a 4-bit BitsAndBytes config and quantize it.

    Args:
        model_name: Hub id of the model to load.
        quant_type_4: Value for ``bnb_4bit_quant_type``.
        double_quant_4: The string ``"True"`` enables double quantization;
            any other value disables it.
        compute_type_4: Key of ``DTYPE_MAPPING`` used as the compute dtype.
        quant_storage_4: Key of ``DTYPE_MAPPING`` used as the storage dtype.
        auth_token: Optional object exposing a ``.token`` attribute. When
            ``None``, no explicit token is passed to ``from_pretrained``.
        progress: Gradio progress tracker.

    Returns:
        A ``(model, original_size_gb)`` tuple, where ``original_size_gb`` is
        ``get_model_size(model)`` measured before the CUDA round trip.

    Raises:
        KeyError: If a dtype name is not present in ``DTYPE_MAPPING``.
    """
    progress(0, desc="Loading model")

    # Configure quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type=quant_type_4,
        bnb_4bit_use_double_quant=double_quant_4 == "True",
        bnb_4bit_quant_storage=DTYPE_MAPPING[quant_storage_4],
        bnb_4bit_compute_dtype=DTYPE_MAPPING[compute_type_4],
    )

    # ``auth_token`` defaults to None, so do not dereference it blindly.
    token = auth_token.token if auth_token is not None else None

    # Load model (``token`` replaces the deprecated ``use_auth_token`` argument)
    model = AutoModel.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="cpu",
        token=token,
        torch_dtype="auto",
    )
    progress(0.33, desc="Quantizing")

    # Calculate the model size before the layers are moved through CUDA
    original_size_gb = get_model_size(model)

    # Quantize model
    # NOTE(review): presumably moving a Linear4bit layer to CUDA is what makes
    # bitsandbytes pack its weights — confirm against the bitsandbytes docs.
    modules = list(model.named_modules())
    module_count = len(modules)
    for idx, (_, module) in enumerate(modules):
        if isinstance(module, Linear4bit):
            module.to("cuda")
            module.to("cpu")
        progress(0.33 + (0.33 * idx / module_count), desc="Quantizing")

    progress(0.66, desc="Quantized successfully")
    return model, original_size_gb


def save_model(
    model,
    model_name,
    original_size_gb,
    quant_type_4,
    double_quant_4,
    compute_type_4,
    quant_storage_4,
    username=None,
    auth_token=None,
    quantized_model_name=None,
    public=False,
    upload_to_community=False,
    progress=gr.Progress(),
):
    """Save the quantized model, push it to the Hub and return an HTML summary.

    The model, its tokenizer and a generated model card are written to a
    temporary directory that is then uploaded to the target repository. The
    repository is ``bnb-community/<model>-bnb-4bit`` for community uploads,
    otherwise ``<username>/<quantized_model_name>`` (or
    ``<username>/<model>-bnb-4bit`` when no custom name is given).

    Args:
        model: The quantized model to save.
        model_name: Hub id of the source model.
        original_size_gb: Size reported as the original model size.
        quant_type_4, double_quant_4, compute_type_4, quant_storage_4:
            Quantization settings recorded in the model card.
        username: Hub username that owns the target repository.
        auth_token: Object exposing a ``.token`` attribute used for all Hub calls.
        quantized_model_name: Optional custom repository name.
        public: When false the repository is created private.
        upload_to_community: Push to the ``bnb-community`` organization instead.
        progress: Gradio progress tracker.

    Returns:
        An HTML string with the repository link, model sizes and architecture.

    Raises:
        ValueError: If ``auth_token`` is ``None``; pushing requires the
            user's own token.
    """
    import html

    if auth_token is None:
        # Fail with a clear message instead of an AttributeError on ``.token``.
        raise ValueError("An authentication token is required to push the model to the Hub.")
    token = auth_token.token

    progress(0.67, desc="Preparing to push")

    # Prepare repo name
    if upload_to_community:
        repo_name = f"bnb-community/{model_name.split('/')[-1]}-bnb-4bit"
    elif quantized_model_name:
        repo_name = f"{username}/{quantized_model_name}"
    else:
        repo_name = f"{username}/{model_name.split('/')[-1]}-bnb-4bit"

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Save tokenizer and model locally. Nothing is pushed by these calls,
        # so no token is passed; ``token`` replaces the deprecated
        # ``use_auth_token`` for the download.
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        tokenizer.save_pretrained(tmpdirname)
        model.save_pretrained(tmpdirname, safe_serialization=True)
        progress(0.75, desc="Preparing to push")

        model_card = create_model_card(
            model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4
        )
        # The card contains non-ASCII characters, so do not depend on the
        # platform's default encoding.
        with open(os.path.join(tmpdirname, "README.md"), "w", encoding="utf-8") as f:
            f.write(model_card)
        progress(0.80, desc="Model card created")

        # Push to Hub
        api = HfApi(token=token)
        api.create_repo(repo_name, exist_ok=True, private=not public)
        progress(0.85, desc="Pushing to Hub")

        # Upload files
        api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_name,
            repo_type="model",
        )
        progress(0.95, desc="Model pushed to Hub")

    # ``str(model)`` is the text ``print(model)`` would emit; taking it
    # directly avoids swapping the process-wide stdout.
    # Escape HTML characters and format with line breaks.
    model_architecture_str_html = html.escape(str(model)).replace("\n", "<br/>")

    # Format it for display in markdown with proper styling
    model_architecture_info = f"""
    <div class="model-architecture-container" style="margin-top: 20px; margin-bottom: 20px; background-color: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #4CAF50;">
        <h3 style="margin-top: 0; color: #2E7D32;">📋 Model Architecture</h3>
        <div class="model-architecture" style="max-height: 500px; overflow-y: auto; overflow-x: auto; background-color: #f5f5f5; padding: 5px; border-radius: 8px; font-family: monospace; white-space: pre-wrap;">
        <div style="line-height: 1.2; font-size: 0.75em;">{model_architecture_str_html}</div>
        </div>
    </div>
    """

    model_size_info = f"""
    <div class="model-size-info" style="margin-top: 20px; margin-bottom: 20px; background-color: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #4CAF50;">
        <h3 style="margin-top: 0; color: #2E7D32;">📦 Model Size</h3>
        <p>Original (bf16)≈ {original_size_gb} GB → Quantized ≈ {get_model_size(model)} GB</p>
    </div>
    """

    repo_link = f"""
    <div class="repo-link" style="margin-top: 20px; margin-bottom: 20px; background-color: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #4CAF50;">
        <h3 style="margin-top: 0; color: #2E7D32;">🔗 Repository Link</h3>
        <p>Find your repo here: <a href="https://huggingface.co/{repo_name}" target="_blank" style="text-decoration:underline">{repo_name}</a></p>
    </div>
    """
    return f'<h1>🎉 Quantization Completed</h1><br/>{repo_link}{model_size_info}{model_architecture_info}'


def quantize_and_save(
    profile: gr.OAuthProfile | None,
    oauth_token: gr.OAuthToken | None,
    model_name,
    quant_type_4,
    double_quant_4,
    compute_type_4,
    quant_storage_4,
    quantized_model_name,
    public,
    upload_to_community,
    progress=gr.Progress(),
):
    """Quantize a model and push it to the Hub, returning HTML for the UI.

    Validates the login state and the selected model, refuses to overwrite an
    existing target repository, then runs ``quantize_model`` followed by
    ``save_model``. Memory is released whether or not the run succeeds.

    Args:
        profile: OAuth profile of the logged-in user, or ``None``.
        oauth_token: OAuth token of the logged-in user, or ``None``.
        model_name: Hub id of the model to quantize.
        quant_type_4, double_quant_4, compute_type_4, quant_storage_4:
            Quantization settings forwarded to ``quantize_model``.
        quantized_model_name: Optional custom name for the target repository.
        public: Whether the target repository is public.
        upload_to_community: Whether to push to the ``bnb-community`` org.
        progress: Gradio progress tracker.

    Returns:
        An HTML string: the success summary from ``save_model``, or an
        error/warning box. Exceptions are reported in the HTML, not raised.
    """
    import gc
    import html

    if oauth_token is None or not profile:
        return """
        <div class="error-box">
            <h3>❌ Authentication Error</h3>
            <p>Please sign in to your HuggingFace account to use the quantizer.</p>
        </div>
        """
    if not model_name:
        # Without this guard the failure surfaces as a misleading
        # "Model Already Exists" warning from the existence check below.
        return """
        <div class="error-box">
            <h3>❌ Missing Model</h3>
            <p>Please select a model to quantize.</p>
        </div>
        """
    exists_message = check_model_exists(
        oauth_token, profile.username, model_name, quantized_model_name, upload_to_community
    )
    if exists_message:
        # The message may embed user-supplied names or exception text.
        safe_exists_message = html.escape(exists_message, quote=False)
        return f"""
        <div class="warning-box">
            <h3>⚠️ Model Already Exists</h3>
            <p>{safe_exists_message}</p>
        </div>
        """

    quantized_model = None
    try:
        # Download phase
        progress(0, desc="Starting quantization process")
        quantized_model, original_size_gb = quantize_model(
            model_name,
            quant_type_4,
            double_quant_4,
            compute_type_4,
            quant_storage_4,
            oauth_token,
            progress,
        )
        final_message = save_model(
            quantized_model,
            model_name,
            original_size_gb,
            quant_type_4,
            double_quant_4,
            compute_type_4,
            quant_storage_4,
            profile.username,
            oauth_token,
            quantized_model_name,
            public,
            upload_to_community,
            progress,
        )
    except Exception as e:
        # Exception text can contain "<" / ">" (e.g. object reprs); escape it
        # before turning newlines into HTML line breaks.
        error_message = html.escape(str(e), quote=False).replace("\n", "<br/>")
        return f"""
        <div class="error-box">
            <h3>❌ Error Occurred</h3>
            <p>{error_message}</p>
        </div>
        """
    finally:
        # Release the model on both the success and the failure path.
        quantized_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    progress(1.0, desc="Memory cleaned")
    return final_message
def get_model_size(model):
    """
    Report how much space a model's state dict occupies, in gigabytes.

    Args:
        model: Object exposing ``state_dict()``; each value of that mapping
            must provide ``nelement()`` and ``element_size()``.

    Returns:
        float: Summed size of all entries in GB (1 GB = 1024 ** 3 bytes),
        rounded to two decimal places.
    """
    bytes_per_gb = 1024 ** 3

    # Bytes per entry = number of elements * bytes per element.
    total_bytes = sum(
        tensor.nelement() * tensor.element_size()
        for tensor in model.state_dict().values()
    )

    return round(total_bytes / bytes_per_gb, 2)

# Stylesheet handed to `gr.Blocks(css=...)` when the UI is built. The
# `#login-button` selector matches the `elem_id` given to the login button,
# and `.option-row` / `.model-name-textbox` match `elem_classes` set on
# components in the layout.
css = """/* Custom CSS to allow scrolling */
.gradio-container {overflow-y: auto;}

/* Fix alignment for radio buttons and checkboxes */
.gradio-radio {
    display: flex !important;
    align-items: center !important;
    margin: 10px 0 !important;
}

.gradio-checkbox {
    display: flex !important;
    align-items: center !important;
    margin: 10px 0 !important;
}

/* Ensure consistent spacing and alignment */
.gradio-dropdown, .gradio-textbox, .gradio-radio, .gradio-checkbox {
    margin-bottom: 12px !important;
    width: 100% !important;
}

/* Align radio buttons and checkboxes horizontally */
.option-row {
    display: flex !important;
    justify-content: space-between !important;
    align-items: center !important;
    gap: 20px !important;
    margin-bottom: 12px !important;
}

.option-row .gradio-radio, .option-row .gradio-checkbox {
    margin: 0 !important;
    flex: 1 !important;
}

/* Horizontally align radio button options with text */
.gradio-radio label {
    display: flex !important;
    align-items: center !important;
}

.gradio-radio input[type="radio"] {
    margin-right: 5px !important;
}

/* Remove padding and margin from model name textbox for better alignment */
.model-name-textbox {
    padding-left: 0 !important;
    padding-right: 0 !important;
    margin-left: 0 !important;
    margin-right: 0 !important;
}

/* Quantize button styling with glow effect */
button[variant="primary"] {
    background: linear-gradient(135deg, #3B82F6, #10B981) !important;
    color: white !important;
    padding: 16px 32px !important;
    font-size: 1.1rem !important;
    font-weight: 700 !important;
    border: none !important;
    border-radius: 12px !important;
    box-shadow: 0 0 15px rgba(59, 130, 246, 0.5) !important;
    transition: all 0.3s cubic-bezier(0.25, 0.8, 0.25, 1) !important;
    position: relative;
    overflow: hidden;
    animation: glow 1.5s ease-in-out infinite alternate;
}

button[variant="primary"]::before {
    content: "✨ ";
}

button[variant="primary"]:hover {
    transform: translateY(-5px) scale(1.05) !important;
    box-shadow: 0 10px 25px rgba(59, 130, 246, 0.7) !important;
}

@keyframes glow {
    from {
        box-shadow: 0 0 10px rgba(59, 130, 246, 0.5);
    }
    to {
        box-shadow: 0 0 20px rgba(59, 130, 246, 0.8), 0 0 30px rgba(16, 185, 129, 0.5);
    }
}

/* Login button styling with glow effect */
#login-button {
    background: linear-gradient(135deg, #3B82F6, #10B981) !important;
    color: white !important;
    font-weight: 700 !important;
    border: none !important;
    border-radius: 12px !important;
    box-shadow: 0 0 15px rgba(59, 130, 246, 0.5) !important;
    transition: all 0.3s cubic-bezier(0.25, 0.8, 0.25, 1) !important;
    position: relative;
    overflow: hidden;
    animation: glow 1.5s ease-in-out infinite alternate;
    max-width: 300px !important;
    margin: 0 auto !important;
}

#login-button::before {
    content: "🔑 ";
    display: inline-block !important;
    vertical-align: middle !important;
    margin-right: 5px !important;
    line-height: normal !important;
}

#login-button:hover {
    transform: translateY(-3px) scale(1.03) !important;
    box-shadow: 0 10px 25px rgba(59, 130, 246, 0.7) !important;
}

#login-button::after {
    content: "";
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
    transition: 0.5s;
}

#login-button:hover::after {
    left: 100%;
}

/* Toggle instructions button styling */
#toggle-button {
    background: linear-gradient(135deg, #3B82F6, #10B981) !important;
    color: white !important;
    font-size: 0.85rem !important;
    font-weight: 600 !important;
    padding: 8px 16px !important;
    border: none !important;
    border-radius: 8px !important;
    box-shadow: 0 2px 10px rgba(59, 130, 246, 0.3) !important;
    transition: all 0.3s ease !important;
    margin: 0.5rem auto 1.5rem auto !important;
    display: block !important;
    max-width: 200px !important;
    text-align: center !important;
    position: relative;
    overflow: hidden;
}

#toggle-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 12px rgba(59, 130, 246, 0.5) !important;
}

#toggle-button::after {
    content: "";
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
    transition: 0.5s;
}

#toggle-button:hover::after {
    left: 100%;
}
/* Progress Bar Styles */
.progress-container {
    font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
    padding: 20px;
    background: white;
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.progress-stage {
    font-size: 0.9rem;
    font-weight: 600;
    color: #64748b;
}

.progress-stage .stage {
    position: relative;
    padding: 8px 12px;
    border-radius: 6px;
    background: #f1f5f9;
    transition: all 0.3s ease;
}

.progress-stage .stage.completed {
    background: #ecfdf5;
}

.progress-bar {
    box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.1);
}
.progress {
    transition: width 0.8s cubic-bezier(0.4, 0, 0.2, 1);
    box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3);
}
"""


# Page layout: title, login button, greeting, a row holding the settings
# column and the action/output column, and an "About" accordion.
with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
    gr.Markdown(
        """
        # 🤗 BitsAndBytes Quantizer : Create your own BNB Quants ! ✨


        <br/>
        <br/>
        """
    )

    gr.LoginButton(elem_id="login-button", elem_classes="center-button", min_width=250)

    # Greeting area, filled by `hello` through `demo.load`.
    m1 = gr.Markdown()
    demo.load(hello, inputs=None, outputs=m1)

    # NOTE(review): not referenced anywhere else in the visible code —
    # possibly a leftover from a removed "toggle instructions" feature.
    instructions_visible = gr.State(False)

    with gr.Row():
        # Left column: model selection and quantization/saving settings.
        with gr.Column():
            with gr.Row():
                model_name = HuggingfaceHubSearch(
                    label="🔍 Hub Model ID",
                    placeholder="Search for model id on Huggingface",
                    search_type="model",
                )
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        """
                        ### ⚙️ Model Quantization Type Settings
                        """
                    )
                    quant_type_4 = gr.Dropdown(
                        info="The quantization data type in the bnb.nn.Linear4Bit layers",
                        choices=["fp4", "nf4"],
                        value="nf4",
                        visible=True,
                        show_label=False,
                    )
                    # Choices here and below are keys of DTYPE_MAPPING.
                    compute_type_4 = gr.Dropdown(
                        info="The compute type for the model",
                        choices=["float16", "bfloat16", "float32"],
                        value="bfloat16",
                        visible=True,
                        show_label=False,
                    )
                    quant_storage_4 = gr.Dropdown(
                        info="The storage type for the model",
                        choices=["float16", "float32", "int8", "uint8", "bfloat16"],
                        value="uint8",
                        visible=True,
                        show_label=False,
                    )
                    gr.Markdown(
                        """
                        ### 🔄 Double Quantization Settings
                        """
                    )
                    with gr.Row(elem_classes="option-row"):
                        # The value is a string; `quantize_model` compares it
                        # against "True".
                        double_quant_4 = gr.Radio(
                            ["True", "False"],
                            info="Use Double Quant",
                            visible=True,
                            value="True",
                            show_label=False,
                        )
                    gr.Markdown(
                        """
                        ### 💾 Saving Settings
                        """
                    )
                    with gr.Row():
                        quantized_model_name = gr.Textbox(
                            label="✏️ Model Name",
                            info="Model Name (optional : to override default)",
                            value="",
                            interactive=True,
                            elem_classes="model-name-textbox",
                            show_label=False,
                        )

                    with gr.Row():
                        public = gr.Checkbox(
                            label="🌐 Make model public",
                            info="If checked, the model will be publicly accessible",
                            value=True,
                            interactive=True,
                            show_label=True,
                        )
                        
                    with gr.Row():
                        upload_to_community = gr.Checkbox(
                            label="🤗 Upload to bnb-community",
                            info="If checked, the model will be uploaded to the bnb-community organization \n(Give the space access to the bnb-community, if not already done revoke the token and login again)",
                            value=False,
                            interactive=True,
                            show_label=True,
                        )
                        
                    # Add event handler to disable and clear model name when uploading to community
                    def toggle_model_name(upload_to_community_checked):
                        # Checked: lock the textbox and show an explanatory message.
                        # Unchecked: unlock it and restore `quantized_model_name.value`.
                        # NOTE(review): `.value` is read from the component object, which
                        # was constructed with value="" — presumably this is the initial
                        # value, not what the user last typed; confirm against Gradio docs.
                        return gr.update(
                            interactive=not upload_to_community_checked,
                            value="Can't change model name when uploading to community" if upload_to_community_checked else quantized_model_name.value
                        )
                    
                    upload_to_community.change(
                        fn=toggle_model_name,
                        inputs=[upload_to_community],
                        outputs=quantized_model_name
                    )

        # Right column: the action button and the result area.
        with gr.Column():
            quantize_button = gr.Button(
                "🚀 Quantize and Push to the Hub", variant="primary"
            )
            output_link = gr.Markdown(
                "🔗 Quantized Model Info", container=True, min_height=200
            )

    # `quantize_and_save` also declares `profile` and `oauth_token` parameters
    # that are not listed in `inputs`; presumably Gradio supplies them based on
    # their OAuth type annotations — confirm against the Gradio OAuth docs.
    quantize_button.click(
        fn=quantize_and_save,
        inputs=[
            model_name,
            quant_type_4,
            double_quant_4,
            compute_type_4,
            quant_storage_4,
            quantized_model_name,
            public,
            upload_to_community,
        ],
        outputs=[output_link],
        show_progress="full",
    )
    # Add information section about the app options
    with gr.Accordion("📚 About this app", open=True):
        gr.Markdown(
            """
            ## 📝 Notes on Quantization Options
            
            ### Quantization Type (bnb_4bit_quant_type)
            - **fp4**: Floating-point 4-bit quantization.
            - **nf4**: Normal float 4-bit quantization.
            
            ### Double Quantization
            - **True**: Applies a second round of quantization to the quantization constants, further reducing memory usage.
            - **False**: Uses standard quantization only.
            
            ### Model Saving Options
            - **Model Name**: Custom name for your quantized model on the Hub. If left empty, a default name will be generated.
            - **Make model public**: If checked, anyone can access your quantized model. If unchecked, only you can access it.
            
            ## 🔍 How It Works
            This app uses the BitsAndBytes library to perform 4-bit quantization on Transformer models. The process:
            1. Downloads the original model
            2. Applies the selected quantization settings
            3. Uploads the quantized model to your HuggingFace account
            
            ## 📊 Memory Usage
            4-bit quantization can reduce model size by up to ≈75% compared to FP16 for big models.
            """
        )

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch(share=True)