YALCINKAYA committed on
Commit 284c0f7 · verified · 1 Parent(s): 34139ad

Update app.py

Files changed (1)
  1. app.py +36 -35
app.py CHANGED
@@ -1,21 +1,23 @@
 import os
 import torch
 from flask import Flask, jsonify, request
-from flask_cors import CORS
-from transformers import GPTNeoForCausalLM, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
+from flask_cors import CORS
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig
+import re
+
 # Set the HF_HOME environment variable to a writable directory
-os.environ["HF_HOME"] = "/workspace/huggingface_cache" # Change this to a writable path in your space
+os.environ["HF_HOME"] = "/workspace/huggingface_cache"
 
 app = Flask(__name__)
 
 # Enable CORS for specific origins
-CORS(app, resources={r"api/predict/*": {"origins": ["http://localhost:3000", "https://main.dbn2ikif9ou3g.amplifyapp.com"]}})
-
+CORS(app, resources={r"/send_message": {"origins": ["http://localhost:3000", "https://main.dbn2ikif9ou3g.amplifyapp.com"]}})
+
 # Global variables for model and tokenizer
 model = None
 tokenizer = None
 
-def get_model_and_tokenizer(model_id):
+def get_model_and_tokenizer(model_id: str):
     global model, tokenizer
     if model is None or tokenizer is None:
         try:
@@ -23,33 +25,32 @@ def get_model_and_tokenizer(model_id):
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             tokenizer.pad_token = tokenizer.eos_token
 
-            print(f"Loading model for model_id: {model_id} on {device}")
-
-            bnb_config = BitsAndBytesConfig(
-                load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
+            print(f"Loading model for model_id: {model_id}")
+
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True
             )
 
             model = AutoModelForCausalLM.from_pretrained(
                 model_id, quantization_config=bnb_config, device_map="auto"
             )
 
-            model.config.use_cache=False
-            model.config.pretraining_tp=1
+            model.config.use_cache = False
+            model.config.pretraining_tp = 1
+            model.config.pad_token_id = tokenizer.eos_token_id  # Fix padding issue
 
-
         except Exception as e:
             print(f"Error loading model: {e}")
-            raise e # Raise the error to be caught in the POST request
-    else:
-        print(f"Model and tokenizer for {model_id} are already loaded.")
+            raise e
 
 def generate_response(user_input, model_id):
     # Ensure model and tokenizer are loaded
     get_model_and_tokenizer(model_id)
 
-    prompt = user_input
-
-    generation_config = GenerationConfig(
+    prompt = user_input
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    generation_config = GenerationConfig(
         penalty_alpha=0.6,
         do_sample=True,
         top_p=0.2,
@@ -57,18 +58,18 @@ def generate_response(user_input, model_id):
         temperature=0.3,
         repetition_penalty=1.2,
         max_new_tokens=60,
-        pad_token_id=tokenizer.eos_token_id,
-        stop_sequences=["User:", "Assistant:", "\n"],
+        pad_token_id=tokenizer.eos_token_id
     )
 
-
-    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
-
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    model.to(device)
+
     outputs = model.generate(**inputs, generation_config=generation_config)
-    response = (tokenizer.decode(outputs[0], skip_special_tokens=True))
-
-    cleaned_response = response.replace("User:", "").replace("Assistant:", "").strip()
-    return cleaned_response.strip().split("\n")[0] # Keep only the first line of response
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Clean up response
+    cleaned_response = re.sub(r"(User:|Assistant:)", "", response).strip()
+    return cleaned_response.split("\n")[0]  # Keep only the first line of response
 
 @app.route("/", methods=["GET"])
 def handle_get_request():
@@ -81,21 +82,21 @@ def handle_post_request():
     if data is None:
         return jsonify({"error": "No JSON data provided"}), 400
 
-    message = data.get("inputs", "No message provided.")
-    model_id = data.get("model_id", "YALCINKAYA/FinetunedByYalcin") # Default model if not provided
+    message = data.get("inputs", "No message provided.")
+    model_id = data.get("model_id", "YALCINKAYA/FinetunedByYalcin")
 
     try:
-        print(f"Loading")
-        # Generate a response from the model
+        print(f"Processing request")
         model_response = generate_response(message, model_id)
         return jsonify({
-            "received_message": model_response,
-            "model_id": model_id,
+            "received_message": model_response,
+            "model_id": model_id,
             "status": "POST request successful!"
         })
     except Exception as e:
+        error_message = str(e) if app.debug else "An error occurred while processing your request."
         print(f"Error handling POST request: {e}")
-        return jsonify({"error": "An error occurred while processing your request."}), 500
+        return jsonify({"error": error_message}), 500
 
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=7860)
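
For reference, a minimal client sketch against the updated handler is shown below. The POST route decorator itself is outside the visible hunks, so the "/send_message" path is an assumption taken from the new CORS resource pattern; the "inputs"/"model_id" request keys, the response fields, and the port match the handler and app.run() call shown in the diff.

# Hypothetical client for the updated endpoint. The "/send_message" path is
# assumed from the CORS configuration; the JSON keys mirror handle_post_request.
import requests

payload = {
    "inputs": "Hello, how are you?",
    # Optional: the handler falls back to this model_id when the key is missing.
    "model_id": "YALCINKAYA/FinetunedByYalcin",
}

resp = requests.post("http://localhost:7860/send_message", json=payload, timeout=120)
resp.raise_for_status()

data = resp.json()
print(data["received_message"])  # first line of the cleaned model reply
print(data["status"])            # "POST request successful!"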