parth parekh committed on
Commit
5efb178
·
1 Parent(s): 1ca6fd7

added server

Files changed (3)
  1. Dockerfile +18 -0
  2. main.py +52 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ # Use an official Python image as a base
+ FROM python:3.12-slim
+
+ # Set the working directory inside the container
+ WORKDIR /app
+
+ # Copy the requirements file and install dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application code
+ COPY . .
+
+ # Expose port 8000 for the FastAPI app
+ EXPOSE 8000
+
+ # Run the FastAPI app with uvicorn
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
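With this Dockerfile in place, a typical local workflow (illustrative image name; the token-passing step matches how main.py reads HF_TOKEN from the environment) is to build the image with docker build -t llama-server . and start it with docker run -p 8000:8000 -e HF_TOKEN=<your_token> llama-server. Note that main.py loads the model with device_map="auto", which generally requires the accelerate package; it is not listed in requirements.txt below, so the container may need that dependency added before the model will load.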
main.py ADDED
@@ -0,0 +1,52 @@
+ import os
+ import torch
+ from fastapi import FastAPI, Request
+ from pydantic import BaseModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from dotenv import load_dotenv
+
+ # Load environment variables from a .env file (useful for local development)
+ load_dotenv()
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Set your Hugging Face token from environment variable
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ MODEL = "meta-llama/Llama-3.2-3B-Instruct"
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # Load model and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, token=HF_TOKEN)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL,
+     token=HF_TOKEN,
+     torch_dtype=torch.float16,  # Use float16 for better GPU memory usage
+     device_map="auto"
+ )
+
+ # Pydantic model for input
+ class PromptRequest(BaseModel):
+     prompt: str
+     max_new_tokens: int = 100
+     temperature: float = 0.7
+
+ @app.post("/generate/")
+ async def generate_text(request: PromptRequest):
+     inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=request.max_new_tokens,
+             temperature=request.temperature,
+             do_sample=True,
+             pad_token_id=tokenizer.eos_token_id
+         )
+
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return {"response": response}
+
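A minimal sketch of calling the new endpoint once the container is running, assuming it is reachable at http://localhost:8000 (the port exposed in the Dockerfile); it uses only the Python standard library, so no extra client dependency is required:

import json
import urllib.request

# Request body matching the PromptRequest model defined in main.py
payload = {
    "prompt": "Explain what this server does in one sentence.",
    "max_new_tokens": 80,
    "temperature": 0.7,
}

# POST the prompt to the /generate/ route exposed by the FastAPI app
req = urllib.request.Request(
    "http://localhost:8000/generate/",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["response"])

Because the handler decodes outputs[0], which still contains the input tokens, the returned text echoes the prompt followed by the generated completion.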
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ fastapi
+ transformers
+ torch
+ uvicorn
+ python-dotenv