import os
import torch
import multiprocessing
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
from accelerate import Accelerator
from typing import List, Tuple
# Load environment variables from a .env file (useful for local development)
load_dotenv()
# HTML for the Buy Me a Coffee badge
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>Llama-3.2-1B-Instruct-API</title>
</head>
<body>
<div style="text-align: center;">
<a href="https://buymeacoffee.com/xxparthparekhxx" target="_blank">
<img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png"
alt="Buy Me A Coffee"
height="40px">
</a>
<h2>Please Chill Out!</h2>
<p>This API takes around <strong>5.62 minutes</strong> to process a single request due to current hardware limitations.</p>
<h3>Want Faster Responses? Help Me Out!</h3>
<p>If you'd like to see this API running faster on high-performance <strong>A100</strong> hardware, please consider buying me a coffee. Your support will go towards upgrading to <strong>Hugging Face Pro</strong>, which will allow me to run A100-powered spaces for everyone!</p>
<h4>Instructions to Clone and Run Locally:</h4>
<ol>
<li><strong>Clone the Repository:</strong>
<div>
<code>git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi</code>
</div>
<div>
<code>cd llama-3.2-1B-FastApi</code>
</div>
</li>
<li><strong>Run the Docker container:</strong>
<div> <code>
docker build -t llama-api . </code> </div>
<div> <code> docker run -p 7860:7860 llama-api </code> </div>
</li>
<li><strong>Access the API locally:</strong>
<p>Open <a href="http://localhost:7860">http://localhost:7860</a> to access the API docs locally.</p>
</li>
</ol>
</div>
</body>
</html>
"""
# FastAPI app with embedded Buy Me a Coffee badge and instructions
app = FastAPI(
    title="Llama-3.2-1B-Instruct-API",
    description=html_content,
    docs_url="/",     # URL for Swagger docs
    redoc_url="/doc"  # URL for ReDoc docs
)
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
torch.set_num_threads(multiprocessing.cpu_count())
accelerator = Accelerator()
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    device_map=device
)
model = accelerator.prepare(model)  # only the model needs preparation; the tokenizer is not an Accelerate-managed object
# Pydantic models for request validation
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 100
    temperature: float = 0.7

class ChatRequest(BaseModel):
    message: str
    history: List[Tuple[str, str]] = []
    max_new_tokens: int = 100
    temperature: float = 0.7
    system_prompt: str = "You are a helpful assistant."
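# Example JSON payloads matching the request models above (illustrative values only, not from the Space):
#   PromptRequest: {"prompt": "Write a haiku about the sea", "max_new_tokens": 64, "temperature": 0.7}
#   ChatRequest:   {"message": "What is FastAPI?", "history": [["Hi", "Hello! How can I help?"]],
#                   "max_new_tokens": 100, "temperature": 0.7,
#                   "system_prompt": "You are a helpful assistant."}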
# Endpoints
@app.post("/generate/")
async def generate_text(request: PromptRequest):
inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=request.max_new_tokens,
temperature=request.temperature,
do_sample=False,
pad_token_id=tokenizer.eos_token_id
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
return {"response": response}
@app.post("/chat/")
async def chat(request: ChatRequest):
conversation = [
{"role": "system", "content": request.system_prompt}
]
for human, assistant in request.history:
conversation.extend([
{"role": "user", "content": human},
{"role": "assistant", "content": assistant}
])
conversation.append({"role": "user", "content": request.message})
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model.generate(
input_ids,
max_new_tokens=request.max_new_tokens,
temperature=request.temperature,
do_sample=False,
pad_token_id=tokenizer.eos_token_id
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract only the assistant's response
assistant_response = response.split("Assistant:")[-1].strip()
return {"response": assistant_response} |