YALCINKAYA committed
Commit cbe8e48 · verified · 1 Parent(s): bfe1386

Update app.py


🔹 Detects whether a GPU is available and assigns cuda, falling back to cpu otherwise.
🔹 Moves the model to the GPU when loading it (model.to(device)).
🔹 Moves input tensors to the GPU before passing them to the model (inputs.to(device)).
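
Taken together, the pattern this commit applies looks like the following minimal sketch. The model id ("gpt2"), the prompt, and the max_new_tokens value are placeholders for illustration, not the ones used by the app:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Select the GPU when one is visible to the process, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_id = "gpt2"  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights, then move them onto the selected device
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

# Inputs must sit on the same device as the model before generate() is called
inputs = tokenizer(["Hello"], return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))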

Files changed (1)
  app.py (+12, -48)
app.py CHANGED

@@ -1,8 +1,8 @@
 import os
+import torch
 from flask import Flask, jsonify, request
-from flask_cors import CORS
-from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
-import re
+from flask_cors import CORS
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
 # Set the HF_HOME environment variable to a writable directory
 os.environ["HF_HOME"] = "/workspace/huggingface_cache" # Change this to a writable path in your space
@@ -12,6 +12,9 @@ app = Flask(__name__)
 # Enable CORS for specific origins
 CORS(app, resources={r"api/predict/*": {"origins": ["http://localhost:3000", "https://main.dbn2ikif9ou3g.amplifyapp.com"]}})
 
+# Detect GPU or fallback to CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 # Global variables for model and tokenizer
 model = None
 tokenizer = None
@@ -23,9 +26,9 @@ def get_model_and_tokenizer(model_id):
         print(f"Loading tokenizer for model_id: {model_id}")
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         tokenizer.pad_token = tokenizer.eos_token
-
-        print(f"Loading model for model_id: {model_id}")
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+
+        print(f"Loading model for model_id: {model_id} on {device}")
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(device) # Move model to GPU
         model.config.use_cache = False
     except Exception as e:
         print(f"Error loading model: {e}")
@@ -33,47 +36,12 @@ def get_model_and_tokenizer(model_id):
     else:
         print(f"Model and tokenizer for {model_id} are already loaded.")
 
-# max_new_tokens=100,
-# min_length=5,
-# do_sample=False,
-# num_beams=1,
-# pad_token_id=tokenizer.eos_token_id,
-# truncation=True
-
-#penalty_alpha=0.6,
-#do_sample = True,
-#top_k=5,
-#temperature=0.5,
-#repetition_penalty=1.2,
-#max_new_tokens=60,
-#pad_token_id=tokenizer.eos_token_id,
-#truncation=True,
-
-#penalty_alpha=0.6, # Keep this to balance exploration and exploitation
-#do_sample=True, # Keep sampling to allow for variability in responses
-#top_k=20, # Increase top_k to give more options for sampling
-#temperature=0.3, # Lower temperature to make outputs more deterministic and focused
-#repetition_penalty=1.5, # Increase repetition penalty to discourage repeated phrases
-#max_new_tokens=60, # Keep this as is, depending on your expected output length
-#pad_token_id=tokenizer.eos_token_id,
-#truncation=True, # Enable truncation for input sequences
-
-#penalty_alpha=0.6, # Maintain this for balance
-#do_sample=True, # Allow sampling for variability
-#top_k=3, # Reduce top_k to narrow down options
-#temperature=0.7, # Keep this low for more deterministic responses
-#repetition_penalty=1.2, # Keep this moderate to avoid repetitive responses
-#max_new_tokens=60, # Maintain this limit
-#pad_token_id=tokenizer.eos_token_id,
-#truncation=True, # Enable truncation for longer prompts
-#
-
 def generate_response(user_input, model_id):
     # Ensure model and tokenizer are loaded
-    get_model_and_tokenizer(model_id) # Load the model/tokenizer if not already loaded
-
+    get_model_and_tokenizer(model_id)
+
     prompt = user_input
-    inputs = tokenizer([prompt], return_tensors="pt")
+    inputs = tokenizer([prompt], return_tensors="pt").to(device) # Move inputs to GPU
 
     generation_config = GenerationConfig(
         penalty_alpha=0.6,
@@ -91,10 +59,6 @@ def generate_response(user_input, model_id):
     response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True)
     cleaned_response = response.replace("User:", "").replace("Assistant:", "").strip()
     return cleaned_response.strip().split("\n")[0] # Keep only the first line of response
-    #return response.strip().split("Assistant:")[-1].strip()
-
-def formatted_prompt(question) -> str:
-    return f"<|startoftext|>User: {question}\nAssistant:"
 
 @app.route("/", methods=["GET"])
 def handle_get_request():
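
A quick, illustrative way to confirm the placement after this change (not part of the commit) is to log the device once get_model_and_tokenizer() has run:

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Model device: {next(model.parameters()).device}")  # cuda:0 on GPU hardware, cpu otherwise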