import gradio as gr
from transformers import AutoTokenizer, pipeline
from auto_gptq import AutoGPTQForCausalLM
import torch
import os

# Model loading with memory optimization
MODEL_NAME = "TheBloke/deepseek-coder-1.3b-instruct-GPTQ"
cache_dir = "./model_cache"
os.makedirs(cache_dir, exist_ok=True)

# Load tokenizer and GPTQ-quantized (4-bit) model weights
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, cache_dir=cache_dir)
model = AutoGPTQForCausalLM.from_quantized(
    MODEL_NAME,
    model_basename="model",
    use_safetensors=True,
    quantize_config=None,
    device_map="auto",
    low_cpu_mem_usage=True,
    cache_dir=cache_dir
)

# Create generation pipeline; the model is already placed by device_map="auto",
# so no device argument is passed here
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


def generate_text(prompt, max_length=512, temperature=0.7):
    """Generate a completion for an instruction-style prompt and return only the model's output."""
    full_prompt = f"Instruct: {prompt}\nOutput:"
    with torch.inference_mode():
        response = generator(
            full_prompt,
            max_new_tokens=max_length,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )[0]["generated_text"]
    # Remove the prompt from the generated text
    return response.split("Output:")[-1].strip()


# Gradio interface with enhanced UX
with gr.Blocks(theme="soft", css=".gradio-container {max-width: 800px; margin: auto;}") as demo:
    gr.Markdown("""
    # 🧠 DeepSeek Coder 1.3B Instruct (GPTQ)
    *Text-to-Code Generation App*

    Enter a programming instruction below and adjust the parameters for optimal output.
    """)

    with gr.Row():
        prompt = gr.Textbox(
            label="Enter your instruction",
            placeholder="Write a Python function to calculate Fibonacci numbers...",
            lines=4
        )

    with gr.Row():
        max_length = gr.Slider(64, 2048, value=512, label="Max Output Length")
        temperature = gr.Slider(0.1, 1.5, value=0.7, label="Creativity (Temperature)")

    output = gr.Textbox(label="Generated Output", lines=10)
    submit = gr.Button("✨ Generate Code", variant="primary")

    submit.click(
        fn=generate_text,
        inputs=[prompt, max_length, temperature],
        outputs=output
    )

    gr.Markdown("""
    ### ℹ️ How it works
    - Uses a GPTQ-quantized model for efficient inference
    - Automatically handles context window management
    - Temperature controls randomness (0.1 = strict, 1.5 = creative)
    """)

demo.launch()
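
# Optional sanity check (a minimal sketch, not part of the app): uncomment to
# exercise generate_text() directly, without the Gradio UI. The example
# instruction below is illustrative only and assumes the model loaded above.
# print(generate_text("Write a Python function to reverse a string.", max_length=128))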