import os
import torch
import multiprocessing
from fastapi import FastAPI, Request
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
from accelerate import Accelerator
from typing import List, Tuple
# Load environment variables from a .env file (useful for local development)
load_dotenv()
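# The .env file (when present) is expected to provide HF_TOKEN, which is read
# further down and used to download the gated Llama model from the Hub.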

# HTML served as the API description: Buy Me a Coffee badge plus instructions for running the API locally
html_content = """
<!DOCTYPE html>
<html>
    <head>
        <title>Llama-3.2-1B-Instruct-API</title>
    </head>
    <body>
        <div style="text-align: center;">
            <a href="https://buymeacoffee.com/xxparthparekhxx" target="_blank">
                <img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" 
                     alt="Buy Me A Coffee" 
                     height="40px">
            </a>
            <h2>Please Chill Out! 😎</h2>
            <p>This API takes around <strong>5.62 minutes</strong> to process a single request due to current hardware limitations.</p>
            <h3>Want Faster Responses? Help Me Out! πŸš€</h3>
            <p>If you'd like to see this API running faster on high-performance <strong>A100</strong> hardware, please consider buying me a coffee. β˜• Your support will go towards upgrading to <strong>Hugging Face Pro</strong>, which will allow me to run A100-powered spaces for everyone! πŸ™Œ</p>
            <h4>Instructions to Clone and Run Locally:</h4>
            <ol>
                <li><strong>Clone the repository:</strong>
                    <div><code>git clone https://huggingface.co/spaces/xxparthparekhxx/llama-3.2-1B-FastApi</code></div>
                    <div><code>cd llama-3.2-1B-FastApi</code></div>
                </li>
                <li><strong>Run the Docker container:</strong>
                    <div><code>docker build -t llama-api .</code></div>
                    <div><code>docker run -p 7860:7860 llama-api</code></div>
                </li>
                <li><strong>Access the API locally:</strong>
                    <p>Open <a href="http://localhost:7860">http://localhost:7860</a> to access the API docs locally.</p>
                </li>
            </ol>
        </div>
    </body>
</html>
"""

# FastAPI app with embedded Buy Me a Coffee badge and instructions
app = FastAPI(
    title="Llama-3.2-1B-Instruct-API",
    description=html_content,
    docs_url="/",  # URL for Swagger docs
    redoc_url="/doc"  # URL for ReDoc docs
)

HF_TOKEN = os.getenv("HF_TOKEN")
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

torch.set_num_threads(multiprocessing.cpu_count())
accelerator = Accelerator()
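
# Model setup: the tokenizer and fp16 model below are loaded once at startup and
# handed to Accelerate. With device_map taken from the availability check above,
# the model lands on the GPU when one is present; on CPU-only hardware (likely
# the case here, given the ~5.62-minute note in the HTML) generation is slow.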

tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    device_map=device
)

model, tokenizer = accelerator.prepare(model, tokenizer)
# Pydantic models for request validation
class PromptRequest(BaseModel):
    prompt: str
    max_new_tokens: int = 100
    temperature: float = 0.7

class ChatRequest(BaseModel):
    message: str
    history: List[Tuple[str, str]] = []
    max_new_tokens: int = 100
    temperature: float = 0.7
    system_prompt: str = "You are a helpful assistant."


# Endpoints
@app.post("/generate/")
async def generate_text(request: PromptRequest):
    inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
   
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            do_sample=True,  # enable sampling so the temperature parameter takes effect
            pad_token_id=tokenizer.eos_token_id
        )
   
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}

@app.post("/chat/")
async def chat(request: ChatRequest):
    conversation = [
        {"role": "system", "content": request.system_prompt}
    ]
    for human, assistant in request.history:
        conversation.extend([
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant}
        ])
    conversation.append({"role": "user", "content": request.message})

    # add_generation_prompt appends the assistant turn header so the model
    # generates a reply rather than continuing the user's message
    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            do_sample=True,  # enable sampling so the temperature parameter takes effect
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode only the newly generated tokens so the response contains just the
    # assistant's reply (the Llama 3 chat template has no literal "Assistant:"
    # marker, so splitting on it would return the whole conversation)
    assistant_response = tokenizer.decode(
        outputs[0][input_ids.shape[-1]:], skip_special_tokens=True
    ).strip()
    
    return {"response": assistant_response}