import os
import torch
import multiprocessing
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
from accelerate import Accelerator

# Load environment variables from a .env file (useful for local development)
load_dotenv()

# HTML shown on the docs landing page: Buy Me a Coffee badge plus local-run instructions
html_content = """
<!DOCTYPE html>
<html>
    <head>
        <title>Llama-3.2-1B-Instruct-API</title>
    </head>
    <body>
        <div style="text-align: center;">
            <a href="https://buymeacoffee.com/xxparthparekhxx" target="_blank">
                <img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" 
                     alt="Buy Me A Coffee" 
                     height="40px">
            </a>
            <h2>Please Chill Out! 😎</h2>
            <p>This API takes around <strong>5.62 minutes</strong> to process a single request due to current hardware limitations.</p>
            <h3>Want Faster Responses? Help Me Out! πŸš€</h3>
            <p>If you'd like to see this API running faster on high-performance <strong>A100</strong> hardware, please consider buying me a coffee. β˜• Your support will go towards upgrading to <strong>Hugging Face Pro</strong>, which will allow me to run A100-powered spaces for everyone! πŸ™Œ</p>
            <h4>Instructions to Clone and Run Locally:</h4>
            <ol>
                <li><strong>Clone the Repository:</strong>
                    <pre>
                    git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi
                    cd llama-3.2-1B-FastApi
                    </pre>
                </li>
                <li><strong>Run the Docker container:</strong>
                    <pre>
                    docker build -t llama-api .
                    docker run -p 7860:7860 llama-api
                    </pre>
                </li>
                <li><strong>Access the API locally:</strong>
                    <p>Open <a href="http://localhost:7860">http://localhost:7860</a> to access the API docs locally.</p>
                </li>
            </ol>
        </div>
    </body>
</html>
"""

# FastAPI app with embedded Buy Me a Coffee badge and instructions
app = FastAPI(
    title="Llama-3.2-1B-Instruct-API",
    description=html_content,
    docs_url="/",  # URL for Swagger docs
    redoc_url="/doc"  # URL for ReDoc docs
)
# Set your Hugging Face token from environment variable
HF_TOKEN = os.getenv("HF_TOKEN")

MODEL = "meta-llama/Llama-3.2-1B-Instruct"

# Auto-select CPU or GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Set PyTorch to use all available CPU cores if running on CPU
torch.set_num_threads(multiprocessing.cpu_count())

# Initialize Accelerator for managing device allocation
accelerator = Accelerator()

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Prepare model for multi-device setup with accelerate
model, tokenizer = accelerator.prepare(model, tokenizer)

# Pydantic model for input
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 100
    temperature: float = 0.7

@app.post("/generate/")
# Plain (non-async) handler: FastAPI runs it in a threadpool, so the
# long-running generate() call does not block the event loop.
def generate_text(request: PromptRequest):
    inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            do_sample=True,  # sampling must be enabled, otherwise the temperature parameter is silently ignored
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}