from flask import Flask, request, jsonify
from llama_cpp import Llama
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Initialize the Llama model with chat format "llama-2".
# n_ctx is raised from the library default (512) so the ~2000-token
# history budget used below actually fits in the model's context window.
llm = Llama(model_path="./law-chat.Q2_K.gguf", chat_format="llama-2", n_ctx=2048)

# Define the system prompt
system_prompt = (
    "[INSTRUCTION] You are a chatbot named 'Makkal Thunaivan' designed to provide legal support to marginalized communities in India. "
    "You were fine-tuned by Sathish Kumar and his team members at the University College of Engineering Dindigul. "
    "Developer team members include Karthikeyan as Model Trainer, Prashanna as Dataset Researcher, Nivas as Model Architect, and Sathish Kumar as Team Leader, Frontend Developer, and Model Tester. "
    "Your purpose is to answer questions related to Indian law and marginalized communities in India. "
    "You have been trained on various legal topics. "
    "Your responses should be concise, meaningful, and accurate. "
    "When a user asks for more information or details, provide a more comprehensive explanation. "
    "Your responses should be respectful and informative. "
    "Do not provide information unrelated to India or Indian law. "
    "Feel free to ask questions."
)

# Initialize the conversation history list with the system prompt
conversation_history = [{"role": "system", "content": system_prompt}]

# Fallback cap on the number of messages kept in the conversation history
# (token-based trimming in trim_conversation_history is the primary control)
MAX_CONVERSATION_HISTORY_SIZE = 2000

# Create a Flask application
app = Flask(__name__)

# Count the tokens in the conversation history with the model's own tokenizer.
# Note that llama-cpp-python's tokenize() expects UTF-8 bytes, not str.
def calculate_total_tokens(messages):
    try:
        total_tokens = sum(
            len(llm.tokenize(str(message["content"]).encode("utf-8"), add_bos=False, special=True))
            for message in messages
        )
        return total_tokens
    except Exception as e:
        logging.error(f"Error during tokenization: {e}")
        # Returning 0 skips trimming for this turn instead of crashing the request
        return 0
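
# Rough usage sketch (hypothetical values; the exact count depends on the
# model's vocabulary, so treat this only as a shape check):
#   calculate_total_tokens([{"role": "user", "content": "hello"}])  # -> small positive int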

# Trim the oldest user/assistant turns until the history fits the token budget.
# The system prompt at index 0 is always preserved.
def trim_conversation_history():
    global conversation_history
    total_tokens = calculate_total_tokens(conversation_history)
    # Keep the prompt under n_ctx (2048) minus the 200-token completion budget
    context_window_size = 1800

    while total_tokens > context_window_size and len(conversation_history) > 1:
        # Remove the oldest non-system message and recount
        conversation_history.pop(1)
        total_tokens = calculate_total_tokens(conversation_history)

# Generate a reply for the user's query, maintaining the shared history
def model(query):
    global conversation_history

    # Add the user's query to the conversation history
    conversation_history.append({"role": "user", "content": query})

    # If the history exceeds the model's context budget, trim it
    trim_conversation_history()

    # Generate chat completion with the conversation history
    try:
        response = llm.create_chat_completion(messages=conversation_history, max_tokens=200)

        # Extract the assistant's response from the completion dictionary
        if response and 'choices' in response and response['choices']:
            assistant_response = response['choices'][0]['message']['content']
            assistant_response = assistant_response.strip()

            # Add the assistant's response to the conversation history
            conversation_history.append({"role": "assistant", "content": assistant_response})

            # Return the assistant's response
            return assistant_response
        else:
            logging.error("Error: Invalid response structure.")
            return None
    except Exception as e:
        logging.error(f"Error during chat completion: {e}")
        return None

# Define the endpoint for the API
@app.route("/chat", methods=["GET"])
def chat_endpoint():
    global conversation_history

    # Get the query parameter from the request
    query = request.args.get("query")

    # If the "refresh" parameter is set to "true", reset the conversation history
    refresh = request.args.get("refresh")
    if refresh and refresh.lower() == "true":
        conversation_history = [{"role": "system", "content": system_prompt}]
        return jsonify({"response": "Conversation history cleared."})

    # If there is no query, return an error message
    if not query:
        return jsonify({"error": "Query parameter is required."}), 400

    # Call the model function with the query
    response = model(query)
    if response is None:
        return jsonify({"error": "An error occurred while processing the request."}), 500

    # Fallback: clear the history outright if it grows past the message cap
    if len(conversation_history) > MAX_CONVERSATION_HISTORY_SIZE:
        conversation_history = [{"role": "system", "content": system_prompt}]
        return jsonify({"response": response, "notification": "Conversation history was cleared due to exceeding maximum size."})

    logging.info(f"Assistant response: {response}")
    return jsonify({"response": response})
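
# Example requests (a sketch; assumes the server is reachable at http://localhost:5000
# and the sample query is purely illustrative):
#   curl "http://localhost:5000/chat?query=What+are+my+rights+under+Article+21"
#   curl "http://localhost:5000/chat?refresh=true"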

# Run the Flask app
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)