import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Initialize model and tokenizer.
# The checkpoint is already GPTQ-quantized, so its quantization settings are read
# from the model's own config; no quantization_config argument is needed here.
# Loading it through transformers requires the optimum and auto-gptq packages
# to be installed as the GPTQ backend.
MODEL_NAME = "TheBloke/deepseek-coder-1.3b-instruct-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",           # Run on CPU (note: GPTQ kernels are primarily tuned for GPU)
    torch_dtype=torch.float32,  # Better CPU compatibility
    low_cpu_mem_usage=True
)


def generate_text(prompt, max_length=100, temperature=0.7):
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,  # Limit newly generated tokens rather than total length
            do_sample=True,             # Required for temperature to have an effect
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Gradio UI
with gr.Blocks(theme="soft") as demo:
    gr.Markdown(
        "# 🧠 DeepSeek Coder 1.3B Text Generator\n"
        "Optimized for CPU execution on HuggingFace Spaces"
    )
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Input Prompt",
                placeholder="Enter your programming/code-related question...",
                lines=5
            )
            max_length = gr.Slider(50, 500, value=150, label="Max Output Length")
            temperature = gr.Slider(0.1, 1.0, value=0.7, label="Creativity Level")
            submit = gr.Button("Generate Code", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Generated Output", lines=10)

    submit.click(
        fn=generate_text,
        inputs=[prompt, max_length, temperature],
        outputs=output
    )

    gr.Examples(
        examples=[
            # Each example supplies a value for every input component.
            ["Write a Python function to calculate Fibonacci numbers", 150, 0.7],
            ["Explain the difference between lists and tuples in Python", 150, 0.7],
            ["Create a simple Flask API endpoint for user registration", 150, 0.7]
        ],
        fn=generate_text,
        inputs=[prompt, max_length, temperature],
        outputs=output,
        cache_examples=False  # Skip precomputing example outputs to save memory
    )

if __name__ == "__main__":
    demo.launch()