"""Build FAISS indexes over the questions and answers of a Q&A dataset.

The script reads a JSON list of items (each with a "question" and an
"answer" key), embeds both text columns with a SentenceTransformer model,
stores each embedding matrix in its own flat L2 FAISS index, and writes the
two indexes plus a copy of the dataset to the current working directory.
"""
import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

RAW_DATA_PATH = "data/raw.json"
MODEL_NAME = "pkshatech/GLuCoSE-base-ja"
QUESTION_INDEX_PATH = "faiss_question.index"
ANSWER_INDEX_PATH = "faiss_answer.index"
QA_DATA_PATH = "qa_data.json"


def _build_index(embeddings, dim):
    """Return a flat L2 FAISS index of width *dim* filled with *embeddings*."""
    # FAISS expects float32 input; ascontiguousarray converts only when
    # needed instead of always copying the matrix.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index


# Load the dataset.
with open(RAW_DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Fail early with a clear message: with no rows there is no embedding
# dimension to read, and the shape lookup below would fail obscurely.
if not data:
    raise ValueError(f"{RAW_DATA_PATH} contains no question/answer items")

questions = [item["question"] for item in data]
answers = [item["answer"] for item in data]

# Load the embedding model.
model = SentenceTransformer(MODEL_NAME)

# Create embeddings for the questions and the answers.
question_embeddings = model.encode(questions)
answer_embeddings = model.encode(answers)

# Build and save the FAISS indexes. Both indexes use the question embedding
# width, as the same model produces both matrices.
dim = question_embeddings.shape[1]
index_q = _build_index(question_embeddings, dim)
index_a = _build_index(answer_embeddings, dim)

faiss.write_index(index_q, QUESTION_INDEX_PATH)
faiss.write_index(index_a, ANSWER_INDEX_PATH)

# Save the original data next to the indexes.
with open(QA_DATA_PATH, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)