from flask import Flask, request, jsonify
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

app = Flask(__name__)

# Load the model and tokenizer
model_name = "dicta-il/dictalm2.0-instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure the tokenizer has a pad token; if not, add one and resize the embeddings.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Set the device to load the model onto
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


@app.route('/chat', methods=['POST'])
def chat():
    data = request.get_json(silent=True) or {}
    messages = data.get("messages", [])
    if not messages:
        return jsonify({"error": "No messages provided"}), 400

    # Combine the messages into a single prompt using the [INST] ... [/INST]
    # instruction template expected by the model.
    conversation = ""
    for i, message in enumerate(messages):
        role = message["role"]
        content = message["content"]
        if role == "user":
            if i == 0:
                conversation += f"[INST] {content} [/INST]"
            else:
                conversation += f" [INST] {content} [/INST]"
        elif role == "assistant":
            conversation += f" {content}"

    # Tokenize the combined conversation
    encoded = tokenizer(conversation, return_tensors="pt").to(device)

    # Generate a response with the model
    generated_ids = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_new_tokens=50,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True
    )

    # Decode only the newly generated tokens so the prompt is not echoed back
    # in the response.
    new_tokens = generated_ids[0][encoded['input_ids'].shape[1]:]
    decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return jsonify({"response": decoded})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
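
# ---------------------------------------------------------------------------
# Example request (a minimal sketch, assuming the server is running locally on
# port 5000; the curl invocation and the Hebrew prompt are illustrative only,
# not part of the app above):
#
#   curl -X POST http://localhost:5000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "שלום, מה שלומך?"}]}'
#
# The endpoint replies with JSON of the form {"response": "..."} containing
# the model's generated text.
# ---------------------------------------------------------------------------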