from flask import Flask, request, jsonify
from huggingface_hub import InferenceClient

app = Flask(__name__)
app.config["DEBUG"] = True  # Enable for debugging only; turn off in production

# Client for the hosted Mixtral instruct model on the Inference API
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
# Text generation with enhanced prompt formatting
def generate(
    prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0
):
    # Clamp temperature to a small positive value; sampling requires temperature > 0
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    # Enhanced prompt formatting: system prompt first, then each
    # (user_prompt, bot_response) turn from the history, then the new prompt
    formatted_prompt = f"{system_prompt}\n{', '.join(f'{user_prompt} ||| {bot_response}' for user_prompt, bot_response in history)}\n{prompt}"

    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )

    # Accumulate the streamed tokens into the final response string
    output = ""
    for response in stream:
        output += response.token.text
    return output
@app.route("/generate", methods=["POST"])  # the "/generate" path is an assumption; adjust to taste
def generate_text():
    data = request.json
    prompt = data.get("prompt")
    history = data.get("history", [])
    system_prompt = data.get("system_prompt")
    temperature = data.get("temperature", 0.9)
    max_new_tokens = data.get("max_new_tokens", 256)
    top_p = data.get("top_p", 0.95)
    repetition_penalty = data.get("repetition_penalty", 1.0)

    response = generate(
        prompt, history, system_prompt, temperature, max_new_tokens, top_p, repetition_penalty
    )
    return jsonify({"response": response})
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
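
With the server running, the endpoint can be exercised with a minimal client. Below is a sketch using the requests library, assuming the /generate route above and a local server on port 7860; the payload values are purely illustrative:

import requests

payload = {
    "prompt": "What is the capital of France?",
    "history": [],  # list of (user_prompt, bot_response) pairs
    "system_prompt": "You are a helpful assistant.",
}
resp = requests.post("http://localhost:7860/generate", json=payload)
print(resp.json()["response"])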