# Streamlit RAG demo: answers Thai questions over a local text file using a
# Chroma vector store (SimCSE Thai embeddings) and the Pathumma (AIFT) LLM.
# Standard library
import os

# Third-party
import streamlit as st
from aift import setting
from aift.multimodal import textqa
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import ChatOllama, embeddings
# NOTE(review): intentionally imported after langchain.text_splitter above so
# this CharacterTextSplitter binding wins, matching the original import order.
from langchain_text_splitters import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
class CustomEmbeddings:
    """LangChain-compatible embedding adapter around a SentenceTransformer model.

    Exposes the sync/async ``embed_query`` / ``embed_documents`` quartet that
    LangChain vector stores expect, returning plain Python lists of floats.
    """

    def __init__(self, model_name="mrp/simcse-model-m-bert-thai-cased"):
        """Load the pre-trained sentence-embedding model.

        :param model_name: Hugging Face identifier of the model to load.
        """
        self.model = SentenceTransformer(model_name)

    def embed_query(self, text):
        """Embed a single query string.

        :param text: Input text to embed.
        :return: Embedding vector as a plain Python list.
        """
        # encode() expects a batch; take the first (only) row of the result.
        vectors = self.model.encode([text])
        return vectors[0].tolist()

    async def aembed_query(self, text):
        """Async variant of ``embed_query`` (delegates to the sync method)."""
        return self.embed_query(text)

    def embed_documents(self, texts):
        """Embed a batch of documents.

        :param texts: List of input texts to embed.
        :return: List of embedding vectors, one plain list per input text.
        """
        return [vector.tolist() for vector in self.model.encode(texts)]

    async def aembed_documents(self, texts):
        """Async variant of ``embed_documents`` (delegates to the sync method)."""
        return self.embed_documents(texts)
# Configure the Pathumma (AIFT) API key.
# SECURITY NOTE(review): the key was previously hard-coded here and has been
# committed to source control — it should be rotated. Prefer the AIFT_API_KEY
# environment variable; the embedded value remains only as a backward-compatible
# fallback so existing deployments keep working.
setting.set_api_key(os.environ.get('AIFT_API_KEY', 'T69FqnYgOdreO5G0nZaM8gHcjo1sifyU'))
# Define a simple wrapper for Pathumma
class PathummaModel:
    """Thin callable adapter over the AIFT ``textqa.generate`` endpoint."""

    def __init__(self):
        # Stateless: the aift client is configured globally via setting.set_api_key.
        pass

    def generate(self, instruction: str, return_json: bool = False):
        """Send an instruction to Pathumma and return its answer.

        :param instruction: Prompt text to send to the model.
        :param return_json: When True, unwrap the ``"content"`` field of the
            JSON response; otherwise return the raw response unchanged.
        """
        result = textqa.generate(instruction=instruction, return_json=return_json)
        if not return_json:
            return result
        return result.get("content", "")

    def __call__(self, input: str):
        """Allow the wrapper to be invoked directly like a function."""
        return self.generate(input, return_json=False)
# --- RAG pipeline setup (runs once at module import) ---

# Model used to answer questions after retrieval.
model_local = PathummaModel()

# 1. Load the source document and split it into large chunks.
#    chunk_overlap=0: chunks are disjoint; 7500 chars keeps whole sections together.
text_splitter = CharacterTextSplitter(chunk_size=7500, chunk_overlap=0)
documents = text_splitter.split_documents(TextLoader('./mainn.txt').load())

# 2. Embed every chunk with the Thai SimCSE model and index it in Chroma.
vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="rag-chroma",
    embedding=CustomEmbeddings(model_name="mrp/simcse-model-m-bert-thai-cased"),
)
retriever = vectorstore.as_retriever()

# Prompt template (Thai): "Answer the question using only the following context."
after_rag_template = """ตอบคำถามโดยพิจารณาจากบริบทต่อไปนี้เท่านั้น:
{context}
คำถาม: {question}
"""
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
# Query retriever for context and pass to Pathumma
def system_call(text_input):
    """Answer a question with RAG: retrieve context, prompt Pathumma, render.

    :param text_input: The user's question (Thai or English).
    :return: None — output is rendered via Streamlit.
    """
    question = text_input
    # Pull the most relevant chunks for this question from the vector store.
    retrieved_context = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in retrieved_context)
    prompt_value = after_rag_prompt.invoke({
        "context": context,
        "question": question,
    })
    # BUG FIX: ChatPromptTemplate.invoke() returns a ChatPromptValue object,
    # not a string; textqa.generate expects a plain-text instruction, so the
    # prompt must be serialized with to_string() before being sent.
    response = model_local(prompt_value.to_string())
    st.write("response")
    st.write(response)


system_call("ผมชื่ออะไรเหรอ")