# Streamlit RAG demo: answers Thai questions over a local text file using a
# Chroma vector store (SimCSE Thai embeddings) and the Pathumma (AIFT) LLM.
# Standard library
import os

# Third-party
import streamlit as st
from aift import setting
from aift.multimodal import textqa
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import ChatOllama, embeddings
# NOTE(review): intentionally imported after langchain.text_splitter above so
# this CharacterTextSplitter binding wins, matching the original import order.
from langchain_text_splitters import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
class CustomEmbeddings:
    """LangChain-compatible embedding adapter around a SentenceTransformer model.

    Exposes the sync/async ``embed_query`` / ``embed_documents`` quartet that
    LangChain vector stores expect, returning plain Python lists of floats.
    """

    def __init__(self, model_name="mrp/simcse-model-m-bert-thai-cased"):
        """Load the pre-trained sentence-embedding model.

        :param model_name: Hugging Face identifier of the model to load.
        """
        self.model = SentenceTransformer(model_name)

    def embed_query(self, text):
        """Embed a single query string.

        :param text: Input text to embed.
        :return: Embedding vector as a plain Python list.
        """
        # encode() expects a batch; take the first (only) row of the result.
        vectors = self.model.encode([text])
        return vectors[0].tolist()

    async def aembed_query(self, text):
        """Async variant of ``embed_query`` (delegates to the sync method)."""
        return self.embed_query(text)

    def embed_documents(self, texts):
        """Embed a batch of documents.

        :param texts: List of input texts to embed.
        :return: List of embedding vectors, one plain list per input text.
        """
        return [vector.tolist() for vector in self.model.encode(texts)]

    async def aembed_documents(self, texts):
        """Async variant of ``embed_documents`` (delegates to the sync method)."""
        return self.embed_documents(texts)
# Configure the Pathumma (AIFT) API key.
# SECURITY NOTE(review): the key was previously hard-coded here and has been
# committed to source control — it should be rotated. Prefer the AIFT_API_KEY
# environment variable; the embedded value remains only as a backward-compatible
# fallback so existing deployments keep working.
setting.set_api_key(os.environ.get('AIFT_API_KEY', 'T69FqnYgOdreO5G0nZaM8gHcjo1sifyU'))
# Define a simple wrapper for Pathumma
class PathummaModel:
    """Thin callable adapter over the AIFT ``textqa.generate`` endpoint."""

    def __init__(self):
        # Stateless: the aift client is configured globally via setting.set_api_key.
        pass

    def generate(self, instruction: str, return_json: bool = False):
        """Send an instruction to Pathumma and return its answer.

        :param instruction: Prompt text to send to the model.
        :param return_json: When True, unwrap the ``"content"`` field of the
            JSON response; otherwise return the raw response unchanged.
        """
        result = textqa.generate(instruction=instruction, return_json=return_json)
        if not return_json:
            return result
        return result.get("content", "")

    def __call__(self, input: str):
        """Allow the wrapper to be invoked directly like a function."""
        return self.generate(input, return_json=False)
# --- RAG pipeline setup (runs once at module import) ---

# Model used to answer questions after retrieval.
model_local = PathummaModel()

# 1. Load the source document and split it into large chunks.
#    chunk_overlap=0: chunks are disjoint; 7500 chars keeps whole sections together.
text_splitter = CharacterTextSplitter(chunk_size=7500, chunk_overlap=0)
documents = text_splitter.split_documents(TextLoader('./mainn.txt').load())

# 2. Embed every chunk with the Thai SimCSE model and index it in Chroma.
vectorstore = Chroma.from_documents(
    documents=documents,
    collection_name="rag-chroma",
    embedding=CustomEmbeddings(model_name="mrp/simcse-model-m-bert-thai-cased"),
)
retriever = vectorstore.as_retriever()

# Prompt template (Thai): "Answer the question using only the following context."
after_rag_template = """ตอบคำถามโดยพิจารณาจากบริบทต่อไปนี้เท่านั้น:
{context}
คำถาม: {question}
"""
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
# Query retriever for context and pass to Pathumma
def system_call(text_input):
    """Answer a question with RAG: retrieve context, prompt Pathumma, render.

    :param text_input: The user's question (Thai or English).
    :return: None — output is rendered via Streamlit.
    """
    question = text_input
    # Pull the most relevant chunks for this question from the vector store.
    retrieved_context = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in retrieved_context)
    prompt_value = after_rag_prompt.invoke({
        "context": context,
        "question": question,
    })
    # BUG FIX: ChatPromptTemplate.invoke() returns a ChatPromptValue object,
    # not a string; textqa.generate expects a plain-text instruction, so the
    # prompt must be serialized with to_string() before being sent.
    response = model_local(prompt_value.to_string())
    st.write("response")
    st.write(response)


system_call("ผมชื่ออะไรเหรอ")