# CPE-LLM-s1 / app.py
# Uploaded by Santipab (commit bde3dc5, verified) — HuggingFace listing header
# preserved as a comment so the file parses as Python.
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_ollama import embeddings
from langchain_ollama import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from aift.multimodal import textqa
from aift import setting
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
import streamlit as st
class CustomEmbeddings:
    """Adapter that exposes a SentenceTransformer model through the
    LangChain embeddings interface (embed_query / embed_documents plus
    their async counterparts)."""

    def __init__(self, model_name: str = "mrp/simcse-model-m-bert-thai-cased"):
        """Load the pre-trained SentenceTransformer named *model_name*."""
        self.model = SentenceTransformer(model_name)

    def embed_query(self, text):
        """Embed one query string.

        :param text: input text to embed
        :return: the embedding vector as a plain Python list
        """
        vectors = self.model.encode([text])
        # encode() returns an array; convert the single row to a list.
        return vectors[0].tolist()

    async def aembed_query(self, text):
        """Async variant of ``embed_query`` (delegates synchronously)."""
        return self.embed_query(text)

    def embed_documents(self, texts):
        """Embed a batch of documents.

        :param texts: list of input texts to embed
        :return: list of embedding vectors, each a plain Python list
        """
        matrix = self.model.encode(texts)
        return [row.tolist() for row in matrix]

    async def aembed_documents(self, texts):
        """Async variant of ``embed_documents`` (delegates synchronously)."""
        return self.embed_documents(texts)
# Configure the AIFT (Pathumma) service credentials.
# SECURITY: an API key was hard-coded here and has been published with this
# file — it should be rotated. Prefer supplying it via the AIFT_API_KEY
# environment variable; the embedded value remains only as a backward-
# compatible fallback.
import os

setting.set_api_key(os.environ.get("AIFT_API_KEY", "T69FqnYgOdreO5G0nZaM8gHcjo1sifyU"))
# Define a simple wrapper for Pathumma
class PathummaModel:
    """Thin callable wrapper around the Pathumma ``textqa`` endpoint."""

    def __init__(self):
        # Stateless: the aift module stores the API key globally.
        pass

    def generate(self, instruction: str, return_json: bool = False):
        """Send *instruction* to ``textqa.generate`` and return the reply.

        When *return_json* is True the service replies with a mapping and
        only its "content" field is returned (empty string if absent);
        otherwise the raw response is returned unchanged.
        """
        result = textqa.generate(instruction=instruction, return_json=return_json)
        if not return_json:
            return result
        return result.get("content", "")

    def __call__(self, input: str):
        """Invoke the model like a function (plain-text mode)."""
        return self.generate(input, return_json=False)
# Initialize the Pathumma LLM wrapper used for answer generation.
model_local = PathummaModel()
# 1. Load the knowledge-base text file and split it into chunks.
# NOTE(review): './mainn.txt' looks like a possible typo for 'main.txt' —
# confirm the file actually ships under this name.
raw_documents = TextLoader('./mainn.txt').load()
# Large chunks (7500 chars) with no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(chunk_size=7500, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
# 2. Embed every chunk with the Thai SimCSE model and store the vectors
# in a Chroma collection named "rag-chroma".
vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="rag-chroma",
    embedding=CustomEmbeddings(model_name="mrp/simcse-model-m-bert-thai-cased"),
)
# Retriever used later to fetch the chunks relevant to each question.
retriever = vectorstore.as_retriever()
# RAG prompt template (Thai). It reads: "Answer the question considering
# only the following context: {context}  Question: {question}".
after_rag_template = """ตอบคำถามโดยพิจารณาจากบริบทต่อไปนี้เท่านั้น:
{context}
คำถาม: {question}
"""
# Compile the template so it can be filled with context/question later.
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
def system_call(text_input):
    """Answer *text_input* with RAG: retrieve context chunks, build the
    prompt, query the Pathumma model, and render the result in Streamlit.

    :param text_input: the user's question (Thai or English)
    """
    question = text_input
    # Fetch the chunks most relevant to the question.
    retrieved_context = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in retrieved_context)
    # BUG FIX: ChatPromptTemplate.invoke() returns a ChatPromptValue object,
    # not a string; Pathumma's textqa endpoint expects the instruction as
    # plain text, so render the prompt with .to_string() before sending.
    prompt_text = after_rag_prompt.invoke({
        "context": context,
        "question": question,
    }).to_string()
    response = model_local(prompt_text)
    st.write("response")
    st.write(response)
system_call("ผมชื่ออะไรเหรอ")