import os

# Set the HF_HOME environment variable to a writable directory *before* importing
# transformers, so the Hugging Face cache location is picked up correctly
os.environ["HF_HOME"] = "/workspace/huggingface_cache"  # Change this to a writable path in your space

from flask import Flask, jsonify, request
from flask_cors import CORS
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
from peft import LoraConfig, AutoPeftModelForCausalLM
app = Flask(__name__)
# Enable CORS for the allowed frontend origins; the resource pattern must match the
# routes the frontend actually calls (e.g. /send_message)
CORS(app, resources={r"/*": {"origins": ["http://localhost:3000", "https://main.dbn2ikif9ou3g.amplifyapp.com"]}})
# Global variables for model and tokenizer
model = None
tokenizer = None
def initialize_model(model_id):
    global model, tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.pad_token = tokenizer.eos_token

        # Load the model with 4-bit NF4 quantization to fit in limited GPU memory
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=True,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
        )
        model.config.use_cache = False
        model.config.pretraining_tp = 1
    except Exception as e:
        print(f"Error loading model: {e}")
def generate_response(user_input, model_id):
    # NOTE: model_id is currently unused here; the globally initialized model is reused
    prompt = formatted_prompt(user_input)
    if model is None or tokenizer is None:
        return "Model or tokenizer not initialized."

    # Prepare the input tensors and move them to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)

    generation_config = GenerationConfig(
        max_new_tokens=100,
        min_length=5,
        temperature=0.7,  # only takes effect when do_sample=True
        do_sample=False,
        num_beams=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Generate and decode only the newly generated tokens,
    # so the prompt is not echoed back to the client
    outputs = model.generate(**inputs, generation_config=generation_config)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return response
def formatted_prompt(question) -> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"
@app.route("/", methods=["GET"])
def handle_get_request():
message = request.args.get("message", "No message provided.")
return jsonify({"message": message, "status": "GET request successful!"})
@app.route("/send_message", methods=["POST"])
def handle_post_request():
data = request.get_json()
if data is None:
return jsonify({"error": "No JSON data provided"}), 400
message = data.get("inputs", "No message provided.")
model_id = data.get("model_id", "YALCINKAYA/opsgenius-large") # Default model if not provided
# Generate a response from the model
model_response = generate_response(message, model_id)
return jsonify({
"received_message": model_response,
"status": "POST request successful!"
})
# Initialize the model and tokenizer when the app starts
initialize_model("YALCINKAYA/opsgenius-large")
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
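# Example request against the /send_message route defined above. The payload keys
# "inputs" and "model_id" match what handle_post_request reads; the host and port
# are assumptions based on the app.run call:
#
#   curl -X POST http://localhost:7860/send_message \
#        -H "Content-Type: application/json" \
#        -d '{"inputs": "What does this service do?"}'
#
# The JSON response carries the generated text under "received_message".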