import os
import torch
import multiprocessing
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
# Load environment variables from a .env file (useful for local development)
load_dotenv()
# Markdown description (with the Buy Me a Coffee appeal) rendered on the Swagger docs page
APP_DESCRIPTION = """
# Llama-3.2-1B-Instruct-API

## Please Chill Out! 😎

This API takes around 5.62 minutes to process a single request due to current hardware limitations.

## Want Faster Responses? Help Me Out! 🚀

If you'd like to see this API running faster on high-performance A100 hardware, please consider buying me a coffee. ☕ Your support will go towards upgrading to Hugging Face Pro, which will allow me to run A100-powered spaces for everyone! 🙌

## Instructions to Clone and Run Locally

1. Clone the repository:

        git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi
        cd llama-3.2-1B-FastApi

2. Build and run the Docker container:

        docker build -t llama-api .
        docker run -p 7860:7860 llama-api

3. Access the API locally: open http://localhost:7860 for the interactive docs.
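
Example request (with the container running on port 7860; the endpoint and fields are defined by this app's `/generate/` route):

        curl -X POST http://localhost:7860/generate/ -H "Content-Type: application/json" -d '{"prompt": "Hello, Llama!", "max_new_tokens": 50, "temperature": 0.7}'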
"""
# FastAPI app with the support appeal and usage instructions embedded in its docs page
app = FastAPI(
    title="Llama-3.2-1B-Instruct-API",
    description=APP_DESCRIPTION,
    docs_url="/",     # URL for Swagger docs
    redoc_url="/doc"  # URL for ReDoc docs
)
# Set your Hugging Face token from environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# Auto-select CPU or GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Use all available CPU cores (this mainly helps CPU inference)
torch.set_num_threads(multiprocessing.cpu_count())
# No Accelerator object is needed: device_map="auto" below handles device placement
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    token=HF_TOKEN,
    # float16 halves memory on GPU; fall back to float32 on CPU, where half precision is poorly supported
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)
# device_map="auto" has already dispatched the model (and tokenizers are not
# prepareable objects), so accelerator.prepare() is unnecessary; just set eval mode
model.eval()
# Pydantic model for the request body
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 100
    temperature: float = 0.7
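# Example payload accepted by /generate/ (fields map to PromptRequest above;
# max_new_tokens and temperature fall back to their defaults when omitted):
#   {"prompt": "Write a haiku about the sea.", "max_new_tokens": 64}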
@app.post("/generate/")
async def generate_text(request: PromptRequest):
    # Place inputs on the same device the model was dispatched to
    inputs = tokenizer(request.prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            do_sample=True,  # Sampling must be enabled, otherwise `temperature` is ignored
            pad_token_id=tokenizer.eos_token_id
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}
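
# A minimal local entry point, as a sketch: it assumes uvicorn is installed and
# uses port 7860 to match the `docker run -p 7860:7860` mapping in the docs above.
# On Hugging Face Spaces the Dockerfile typically launches the server itself, so
# this only matters for direct `python app.py` runs.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)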