Spaces:

Detomo
/

Taiken_chatbot_API

Sleeping

File size: 3,242 Bytes

44a025a

import pandas as pd
import json
import re
import numpy as np
import os
from typing import List, Dict, Tuple, Any

from app.services.model_service import get_model, reload_embeddings

# Create the relative "data" directory at import time (no error if it already
# exists); the save functions in this module write their output files under it.
os.makedirs("data", exist_ok=True)


def remove_prefix(text: str, prefix_pattern: str) -> str:
    """
    Delete whatever ``prefix_pattern`` matches in ``text`` and trim whitespace.

    ``prefix_pattern`` is a regular expression. Every match is removed, so
    callers that only want a leading prefix stripped must anchor the pattern
    with ``^``. Leading and trailing whitespace is stripped from the result.
    """
    matcher = re.compile(prefix_pattern)
    without_prefix = matcher.sub("", text)
    return without_prefix.strip()


def process_file(file_path: str, file_type: str) -> List[Dict[str, str]]:
    """
    Process Excel or CSV file and extract question-answer pairs.

    Args:
        file_path: Path of the spreadsheet to read.
        file_type: Either "excel" (read with ``pd.read_excel``) or "csv"
            (read with ``pd.read_csv``).

    Returns:
        One ``{"question": ..., "answer": ...}`` dict per row that has both a
        '質問' and a '回答' value. A leading ``Q<number>.`` is stripped from the
        question and a leading ``A.`` from the answer, and both are trimmed.

    Raises:
        ValueError: If ``file_type`` is not supported, or if the file lacks
            the '質問' or '回答' column.
    """
    if file_type == "excel":
        df = pd.read_excel(file_path)
    elif file_type == "csv":
        df = pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file type. Use 'excel' or 'csv'.")

    # Check if the necessary columns exist
    if "質問" not in df.columns or "回答" not in df.columns:
        raise ValueError("The file must contain '質問' and '回答' columns.")

    # Only the two columns we actually read decide whether a row is usable.
    # A blanket dropna() would also discard valid pairs whenever some
    # unrelated column (notes, category, ...) happens to be empty.
    qa_frame = df.dropna(subset=["質問", "回答"])

    # Same stripping that remove_prefix() performs, with the patterns compiled
    # once instead of being looked up for every row.
    question_prefix = re.compile(r"^Q\d+\.\s*")
    answer_prefix = re.compile(r"^A\.\s*")

    # Iterate the columns directly: iterrows() builds a Series per row and
    # upcasts mixed dtypes, which can change how non-string cells stringify.
    return [
        {
            "question": question_prefix.sub("", str(raw_question)).strip(),
            "answer": answer_prefix.sub("", str(raw_answer)).strip(),
        }
        for raw_question, raw_answer in zip(qa_frame["質問"], qa_frame["回答"])
    ]


def save_raw_data(qa_list: List[Dict[str, str]]) -> None:
    """
    Write the question-answer pairs to ``data/raw.json``.

    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is rather than escaped. Any existing file at
    that path is overwritten.
    """
    target_path = "data/raw.json"
    with open(target_path, mode="w", encoding="utf-8") as out:
        out.write(json.dumps(qa_list, ensure_ascii=False, indent=2))


def create_and_save_embeddings(qa_list: List[Dict[str, str]]) -> None:
    """
    Encode all questions and answers and persist the results under ``data/``.

    Three files are written: ``question_embeddings.npy`` and
    ``answer_embeddings.npy`` hold the arrays returned by the model's
    ``encode`` call, and ``qa_data.json`` holds ``qa_list`` itself.
    """
    question_texts: List[str] = []
    answer_texts: List[str] = []
    for pair in qa_list:
        question_texts.append(pair["question"])
    for pair in qa_list:
        answer_texts.append(pair["answer"])

    # The model instance is supplied by the model service, not built here.
    encoder = get_model()

    # Encode both sides before anything is written to disk.
    question_vectors = encoder.encode(question_texts, convert_to_numpy=True)
    answer_vectors = encoder.encode(answer_texts, convert_to_numpy=True)

    np.save("data/question_embeddings.npy", question_vectors)
    np.save("data/answer_embeddings.npy", answer_vectors)

    # Store the source pairs next to the embedding files.
    with open("data/qa_data.json", "w", encoding="utf-8") as out:
        out.write(json.dumps(qa_list, ensure_ascii=False, indent=2))


def process_and_create_embeddings(file_path: str, file_type: str) -> Dict[str, Any]:
    """
    Run the full ingestion pipeline for one uploaded file.

    The steps are: parse the file into question-answer pairs, save the raw
    pairs, create and save embeddings, then call ``reload_embeddings()``.

    Returns a dict with ``"status": "success"`` and the number of pairs on
    success. Any exception raised by a step is caught and reported as
    ``{"status": "error", "message": <exception text>}`` instead of
    propagating.
    """
    try:
        pairs = process_file(file_path, file_type)
        save_raw_data(pairs)
        create_and_save_embeddings(pairs)

        # Refresh the embeddings held by the model service after the rewrite.
        reload_embeddings()
    except Exception as exc:
        return {"status": "error", "message": str(exc)}

    summary = {"total_qa_pairs": len(pairs)}
    return {
        "status": "success",
        "message": "Embeddings created successfully",
        "data": summary,
    }