import os
import torch
import multiprocessing

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
from accelerate import Accelerator

# Load environment variables from a .env file (useful for local development)
load_dotenv()

# Buy Me a Coffee badge and usage instructions shown on the docs page
html_content = """
Llama-3.2-1B-Instruct-API
[Buy Me A Coffee badge]

Please Chill Out! 😎

This API takes around 5.62 minutes to process a single request due to current hardware limitations.

Want Faster Responses? Help Me Out! 🚀

If you'd like to see this API running faster on high-performance A100 hardware, please consider buying me a coffee. ☕ Your support will go towards upgrading to Hugging Face Pro, which will allow me to run A100-powered spaces for everyone! 🙌

Instructions to Clone and Run Locally:

  1. Clone the Repository:
                        git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi
                        cd llama-3.2-1B-FastApi
                        
  2. Build and run the Docker container:
                        docker build -t llama-api .
                        docker run -p 7860:7860 llama-api
                        
  3. Access the API locally:

    Open http://localhost:7860 in your browser to view the interactive API docs (Swagger UI).
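
  4. Send a generation request (the values below are only examples; adjust the prompt and parameters as needed):
                        curl -X POST http://localhost:7860/generate/ \
                             -H "Content-Type: application/json" \
                             -d '{"prompt": "Hello!", "max_new_tokens": 50, "temperature": 0.7}'

    The endpoint returns JSON of the form {"response": "..."}.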

""" # FastAPI app with embedded Buy Me a Coffee badge and instructions app = FastAPI( title="Llama-3.2-1B-Instruct-API", description= html_content, docs_url="/", # URL for Swagger docs redoc_url="/doc" # URL for ReDoc docs ) # Set your Hugging Face token from environment variable HF_TOKEN = os.getenv("HF_TOKEN") MODEL = "meta-llama/Llama-3.2-1B-Instruct" # Auto-select CPU or GPU device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Using device: {device}") # Set PyTorch to use all available CPU cores if running on CPU torch.set_num_threads(multiprocessing.cpu_count()) # Initialize Accelerator for managing device allocation accelerator = Accelerator() # Load model and tokenizer tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True) model = AutoModelForCausalLM.from_pretrained( MODEL, token=HF_TOKEN, torch_dtype=torch.float16, device_map="auto" ) # Prepare model for multi-device setup with accelerate model, tokenizer = accelerator.prepare(model, tokenizer) # Pydantic model for input class PromptRequest(BaseModel): prompt: str max_new_tokens: int = 100 temperature: float = 0.7 @app.post("/generate/") async def generate_text(request: PromptRequest): inputs = tokenizer(request.prompt, return_tensors="pt").to(device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=request.max_new_tokens, temperature=request.temperature, do_sample=False, pad_token_id=tokenizer.eos_token_id ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) return {"response": response}