import os

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Disable hf_transfer
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "false"

app = FastAPI()

# Load your fine-tuned model and tokenizer
model_name = "OnlyCheeini/greesychat-turbo"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")


class OpenAIRequest(BaseModel):
    model: str
    prompt: str
    max_tokens: int = 64
    temperature: float = 0.7
    top_p: float = 0.9


class OpenAIResponse(BaseModel):
    choices: list


@app.post("/v1/completions", response_model=OpenAIResponse)
async def generate_text(request: OpenAIRequest):
    if request.model != model_name:
        raise HTTPException(status_code=400, detail="Model not found")

    inputs = tokenizer(request.prompt, return_tensors="pt").to("cuda")
    with torch.inference_mode():  # no gradients needed at inference time
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens,  # bound only the newly generated tokens
            do_sample=True,  # required for temperature/top_p to actually take effect
            temperature=request.temperature,
            top_p=request.top_p,
        )

    # Decode only the tokens generated after the prompt, mirroring the
    # OpenAI completions API, which does not echo the prompt back.
    generated_text = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    return OpenAIResponse(choices=[{"text": generated_text}])


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
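
# --- Usage sketch (assumes the server above is running locally on port 8000) ---
# The endpoint accepts OpenAI-completions-style JSON, so it can be exercised
# with a plain curl request; the "model" field must match model_name exactly
# or the handler returns 400:
#
#   curl -X POST http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "OnlyCheeini/greesychat-turbo", "prompt": "Hello", "max_tokens": 32}'
#
# Expected shape of the response body:
#   {"choices": [{"text": "..."}]}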