import os

# Set the Hugging Face cache directory before importing transformers so the
# library actually picks it up; this avoids permission issues inside Docker.
os.environ["HF_HOME"] = "/app/huggingface_cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/huggingface_cache"

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()


class TextGenerationRequest(BaseModel):
    prompt: str
    max_length: int = 100
    temperature: float = 0.7


# Load model and tokenizer (force CPU). Note: bitsandbytes 4-bit checkpoints
# like this one generally expect a CUDA GPU; on a CPU-only host, loading may
# fail depending on your bitsandbytes/transformers versions.
model_name = "unsloth/Qwen2.5-7B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # use float32 on CPU
    device_map={"": "cpu"},     # ensure CPU usage
)


@app.get("/")
def api_home():
    return {"detail": "Welcome to FastAPI TextGen API!"}


@app.post("/generate")
async def generate_text(request: TextGenerationRequest):
    try:
        inputs = tokenizer(request.prompt, return_tensors="pt").to("cpu")  # ensure CPU usage
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # avoid the missing-attention-mask warning
            max_length=request.max_length,
            temperature=request.temperature,
            do_sample=True,
        )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"generated_text": generated_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
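

# A minimal sketch for running the service directly, assuming uvicorn is
# installed and that this file is the module being executed; the host/port
# values below are assumptions, not fixed by the code above. In Docker you
# would typically invoke `uvicorn main:app` from the CMD instead.
#
# Example request once the server is up (assumed port 8000):
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Once upon a time", "max_length": 80}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)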