Gokulavelan committed
Commit 762b3c6 · 1 Parent(s): 5aaa5d5
Files changed (2)
  1. Dockerfile +11 -15
  2. app/main.py +13 -8
Dockerfile CHANGED
@@ -1,28 +1,24 @@
- # Use the official Python image
+ # Use official Python image
  FROM python:3.9-slim
 
  # Set environment variables
  ENV PYTHONUNBUFFERED=1 \
-     PYTHONDONTWRITEBYTECODE=1
-
- # Set Hugging Face cache directory to a writable location inside the container
- ENV HF_HOME="/app/huggingface_cache"
+     PYTHONDONTWRITEBYTECODE=1 \
+     HF_HOME="/app/huggingface_cache" \
+     TRANSFORMERS_CACHE="/app/huggingface_cache"
 
- # Ensure the cache directory exists and is writable
+ # Create cache directory
  RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
 
- # Set working directory inside the container
+ # Copy app files
  WORKDIR /app
+ COPY . .
 
- # Copy and install dependencies
- COPY requirements.txt .
+ # Install dependencies
  RUN pip install --no-cache-dir -r requirements.txt
 
- # Copy the entire project (including main.py in root)
- COPY . /app
-
- # Expose the FastAPI port
+ # Expose FastAPI port
  EXPOSE 8000
 
- # Run FastAPI (use `main` instead of `app.main`)
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+ # Run FastAPI
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
app/main.py CHANGED
@@ -1,7 +1,12 @@
+ import os
+ import torch
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
  from transformers import AutoModelForCausalLM, AutoTokenizer
- import torch
+
+ # Set Hugging Face cache directory (to avoid permission issues in Docker)
+ os.environ["HF_HOME"] = "/app/huggingface_cache"
+ os.environ["TRANSFORMERS_CACHE"] = "/app/huggingface_cache"
 
  app = FastAPI()
 
@@ -10,25 +15,25 @@ class TextGenerationRequest(BaseModel):
      max_length: int = 100
      temperature: float = 0.7
 
- # Load model and tokenizer (force CPU usage)
+ # Load model and tokenizer (Force CPU)
  model_name = "unsloth/Qwen2.5-7B-bnb-4bit"
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      trust_remote_code=True,
-     torch_dtype=torch.float32,  # Change to float32 for CPU
-     device_map="cpu"  # Force CPU usage
+     torch_dtype=torch.float32,  # Use float32 for CPU
+     device_map="cpu"  # Force CPU
  )
 
-
- @app.get("/", tags=["Home"])
+ @app.get("/")
  def api_home():
-     return {'detail': 'Welcome to FastAPI TextGen Tutorial!'}
+     return {"detail": "Welcome to FastAPI TextGen API!"}
 
  @app.post("/generate")
  async def generate_text(request: TextGenerationRequest):
      try:
-         inputs = tokenizer(request.prompt, return_tensors="pt").to("cpu")  # Move to CPU
+         inputs = tokenizer(request.prompt, return_tensors="pt").to("cpu")  # Ensure CPU usage
          outputs = model.generate(
              inputs.input_ids,
              max_length=request.max_length,
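Note: once the container is up, the /generate endpoint can be smoke-tested from any HTTP client. A minimal client sketch, assuming the requests package is available and that the request body fields match the TextGenerationRequest model above:

# Hypothetical client-side smoke test; not part of this commit.
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Hello, world", "max_length": 50, "temperature": 0.7},
    timeout=300,  # CPU generation with a 7B model can be very slow
)
resp.raise_for_status()
print(resp.json())

One caveat worth verifying: unsloth/Qwen2.5-7B-bnb-4bit is a bitsandbytes 4-bit checkpoint, and bitsandbytes kernels generally expect a CUDA device, so loading it with device_map="cpu" and torch_dtype=torch.float32 may fail outright on a CPU-only machine.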