import os
from flask import Flask, jsonify, request
from flask_cors import CORS
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig

# Set the HF_HOME environment variable to a writable directory
os.environ["HF_HOME"] = "/workspace/huggingface_cache"  # Change this to a writable path in your space

app = Flask(__name__)

# Enable CORS for the app's routes, restricted to the allowed frontend origins
CORS(app, resources={r"/*": {"origins": ["http://localhost:3000", "https://main.dbn2ikif9ou3g.amplifyapp.com"]}})

# Global variables for model and tokenizer
model = None
tokenizer = None

def initialize_model(model_id):
    """Load the tokenizer and a 4-bit quantized model into the global variables."""
    global model, tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.pad_token = tokenizer.eos_token
        # 4-bit NF4 quantization with double quantization to reduce GPU memory usage
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto"
        )
        model.config.use_cache = False
        model.config.pretraining_tp = 1
    except Exception as e:
        print(f"Error loading model: {e}")

def generate_response(user_input, model_id):
    # model_id is accepted for API compatibility; the model itself is loaded once at startup
    if model is None or tokenizer is None:
        return "Model or tokenizer not initialized."

    prompt = formatted_prompt(user_input)

    # Prepare the input tensors and move them to the model's device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    generation_config = GenerationConfig(
        max_new_tokens=100,
        min_length=5,
        do_sample=False,  # greedy decoding; set do_sample=True with a temperature for sampled outputs
        num_beams=1,
        pad_token_id=tokenizer.eos_token_id
    )

    # Generate and decode only the newly generated tokens so the prompt is not echoed back
    outputs = model.generate(**inputs, generation_config=generation_config)
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return response

def formatted_prompt(question) -> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

@app.route("/", methods=["GET"])
def handle_get_request():
    message = request.args.get("message", "No message provided.")
    return jsonify({"message": message, "status": "GET request successful!"})

@app.route("/send_message", methods=["POST"])
def handle_post_request():
    data = request.get_json()
    if data is None:
        return jsonify({"error": "No JSON data provided"}), 400

    message = data.get("inputs", "No message provided.") 
    model_id = data.get("model_id", "YALCINKAYA/opsgenius-large")  # Default model if not provided

    # Generate a response from the model
    model_response = generate_response(message, model_id)

    return jsonify({
        "received_message": model_response, 
        "status": "POST request successful!"
    })

# Initialize the model and tokenizer when the app starts
initialize_model("YALCINKAYA/opsgenius-large")

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
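
# Example client call (illustrative sketch, not part of the app): assuming the server is
# running locally on port 7860, a POST like the one below exercises /send_message. The
# payload keys match what handle_post_request reads ("inputs" and "model_id").
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/send_message",
#       json={"inputs": "Hello, how are you?", "model_id": "YALCINKAYA/opsgenius-large"},
#   )
#   print(resp.json()["received_message"])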