import os
import torch
import multiprocessing
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
from accelerate import Accelerator
# Load environment variables from a .env file (useful for local development)
load_dotenv()
# HTML shown on the docs page: Buy Me a Coffee badge plus local setup instructions
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Llama-3.2-1B-Instruct-API</title>
</head>
<body>
    <div style="text-align: center;">
        <a href="https://buymeacoffee.com/xxparthparekhxx" target="_blank">
            <img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png"
                 alt="Buy Me A Coffee"
                 height="40px">
        </a>
        <h2>Please Chill Out! 😎</h2>
        <p>This API takes around <strong>5.62 minutes</strong> to process a single request due to current hardware limitations.</p>
        <h3>Want Faster Responses? Help Me Out! 🚀</h3>
        <p>If you'd like to see this API running faster on high-performance <strong>A100</strong> hardware, please consider buying me a coffee. ☕ Your support will go towards upgrading to <strong>Hugging Face Pro</strong>, which will allow me to run A100-powered spaces for everyone! 🙌</p>
        <h4>Instructions to Clone and Run Locally:</h4>
        <ol>
            <li><strong>Clone the Repository:</strong>
                <pre>
git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi
cd llama-3.2-1B-FastApi
                </pre>
            </li>
            <li><strong>Run the Docker container:</strong>
                <pre>
docker build -t llama-api .
docker run -p 7860:7860 llama-api
                </pre>
            </li>
            <li><strong>Access the API locally:</strong>
                <p>Open <a href="http://localhost:7860">http://localhost:7860</a> to access the API docs locally.</p>
            </li>
        </ol>
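        <h4>Example Request:</h4>
        <p>Once the server is running, the <code>/generate/</code> endpoint can be called directly. The JSON fields mirror the <code>PromptRequest</code> model used by this API; the prompt value below is only illustrative.</p>
        <pre>
curl -X POST http://localhost:7860/generate/ -H "Content-Type: application/json" -d '{"prompt": "Hello!", "max_new_tokens": 100, "temperature": 0.7}'
        </pre>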
    </div>
</body>
</html>
"""
# FastAPI app with embedded Buy Me a Coffee badge and instructions
app = FastAPI(
    title="Llama-3.2-1B-Instruct-API",
    description=html_content,
    docs_url="/",     # URL for Swagger docs
    redoc_url="/doc"  # URL for ReDoc docs
)
# Set your Hugging Face token from environment variable
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# Auto-select CPU or GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Use all available CPU cores for PyTorch ops (mainly matters when running on CPU)
torch.set_num_threads(multiprocessing.cpu_count())
# Initialize Accelerator for managing device allocation
accelerator = Accelerator()
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    device_map="auto"
)
# Prepare model for multi-device setup with accelerate
model, tokenizer = accelerator.prepare(model, tokenizer)
# Pydantic model for input
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 100
    temperature: float = 0.7
@app.post("/generate/")
async def generate_text(request: PromptRequest):
    # Tokenize the prompt and move the tensors to the selected device
    inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            do_sample=True,  # sampling must be enabled for `temperature` to have any effect
            pad_token_id=tokenizer.eos_token_id
        )
    # Decode the full sequence (prompt + completion) back to text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}