YALCINKAYA committed
Commit cbe8e48 · verified · 1 Parent(s): bfe1386

Update app.py


🔹 Detects whether a GPU is available and assigns cuda, falling back to cpu otherwise.
🔹 Moves the model to the GPU when loading it (model.to(device)).
🔹 Moves input tensors to the GPU before passing them to the model (inputs.to(device)).
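
Taken together, the pattern this commit applies looks like the following minimal sketch. The model id ("gpt2"), the prompt, and the max_new_tokens value are placeholders for illustration, not the ones used by the app:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Select the GPU when one is visible to the process, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "gpt2"  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights, then move them onto the selected device
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

# Inputs must sit on the same device as the model before generate() is called
inputs = tokenizer(["Hello"], return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))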

Files changed (1)
  app.py (+12, -48)
app.py CHANGED

@@ -1,8 +1,8 @@
 import os
+import torch
 from flask import Flask, jsonify, request
-from flask_cors import CORS
-from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
-import re
+from flask_cors import CORS
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
 # Set the HF_HOME environment variable to a writable directory
 os.environ["HF_HOME"] = "/workspace/huggingface_cache" # Change this to a writable path in your space
@@ -12,6 +12,9 @@ app = Flask(__name__)
 # Enable CORS for specific origins
 CORS(app, resources={r"api/predict/*": {"origins": ["http://localhost:3000", "https://main.dbn2ikif9ou3g.amplifyapp.com"]}})
 
+# Detect GPU or fallback to CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 # Global variables for model and tokenizer
 model = None
 tokenizer = None
@@ -23,9 +26,9 @@ def get_model_and_tokenizer(model_id):
         print(f"Loading tokenizer for model_id: {model_id}")
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         tokenizer.pad_token = tokenizer.eos_token
-
-        print(f"Loading model for model_id: {model_id}")
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+
+        print(f"Loading model for model_id: {model_id} on {device}")
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(device) # Move model to GPU
         model.config.use_cache = False
     except Exception as e:
         print(f"Error loading model: {e}")
@@ -33,47 +36,12 @@ def get_model_and_tokenizer(model_id):
     else:
         print(f"Model and tokenizer for {model_id} are already loaded.")
 
-# max_new_tokens=100,
-# min_length=5,
-# do_sample=False,
-# num_beams=1,
-# pad_token_id=tokenizer.eos_token_id,
-# truncation=True
-
-#penalty_alpha=0.6,
-#do_sample = True,
-#top_k=5,
-#temperature=0.5,
-#repetition_penalty=1.2,
-#max_new_tokens=60,
-#pad_token_id=tokenizer.eos_token_id,
-#truncation=True,
-
-#penalty_alpha=0.6, # Keep this to balance exploration and exploitation
-#do_sample=True, # Keep sampling to allow for variability in responses
-#top_k=20, # Increase top_k to give more options for sampling
-#temperature=0.3, # Lower temperature to make outputs more deterministic and focused
-#repetition_penalty=1.5, # Increase repetition penalty to discourage repeated phrases
-#max_new_tokens=60, # Keep this as is, depending on your expected output length
-#pad_token_id=tokenizer.eos_token_id,
-#truncation=True, # Enable truncation for input sequences
-
-#penalty_alpha=0.6, # Maintain this for balance
-#do_sample=True, # Allow sampling for variability
-#top_k=3, # Reduce top_k to narrow down options
-#temperature=0.7, # Keep this low for more deterministic responses
-#repetition_penalty=1.2, # Keep this moderate to avoid repetitive responses
-#max_new_tokens=60, # Maintain this limit
-#pad_token_id=tokenizer.eos_token_id,
-#truncation=True, # Enable truncation for longer prompts
-#
-
 def generate_response(user_input, model_id):
     # Ensure model and tokenizer are loaded
-    get_model_and_tokenizer(model_id) # Load the model/tokenizer if not already loaded
-
+    get_model_and_tokenizer(model_id)
+
     prompt = user_input
-    inputs = tokenizer([prompt], return_tensors="pt")
+    inputs = tokenizer([prompt], return_tensors="pt").to(device) # Move inputs to GPU
 
     generation_config = GenerationConfig(
         penalty_alpha=0.6,
@@ -91,10 +59,6 @@ def generate_response(user_input, model_id):
     response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True)
     cleaned_response = response.replace("User:", "").replace("Assistant:", "").strip()
     return cleaned_response.strip().split("\n")[0] # Keep only the first line of response
-    #return response.strip().split("Assistant:")[-1].strip()
-
-def formatted_prompt(question) -> str:
-    return f"<|startoftext|>User: {question}\nAssistant:"
 
 @app.route("/", methods=["GET"])
 def handle_get_request():
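
A quick, illustrative way to confirm the placement after this change (not part of the commit) is to log the device once get_model_and_tokenizer() has run:

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Model device: {next(model.parameters()).device}")  # cuda:0 on GPU hardware, cpu otherwise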