# Source: Hugging Face Space (status at capture time: Sleeping)
import pdfplumber
import streamlit as st
import torch
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoModel, AutoModelForQuestionAnswering, AutoTokenizer
# Load the ThaiBERT question-answering tokenizer and model from Hugging Face.
# Both are loaded from the same checkpoint id.
# NOTE(review): this id looks like a base pre-trained checkpoint — confirm it
# actually ships a fine-tuned question-answering head.
QA_CHECKPOINT = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)
# Function for reading the text content of a PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, concatenated in page order.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; the app passes the
            object returned by the Streamlit file uploader.

    Returns:
        The page texts joined without a separator. Pages that yield no text
        contribute nothing, so the result may be an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() can return None for a page without extractable text
        # (e.g. a scanned image), which would make plain concatenation raise
        # TypeError; `or ""` treats such a page as empty.
        return "".join(page.extract_text() or "" for page in pdf.pages)
# Function for answering a question with ThaiBERT
def answer_question(question, context):
    """Extract the answer to ``question`` from ``context`` with the QA model.

    Uses the module-level ``tokenizer`` and ``model``. The answer is the span
    between the highest-scoring start token and the highest-scoring end token.

    Args:
        question: The user's question.
        context: Passage expected to contain the answer.

    Returns:
        The decoded answer text with special tokens removed. Empty when the
        predicted end position precedes the predicted start position.
    """
    # Trim only the passage (second sequence) when the pair is longer than the
    # tokenizer's configured maximum, so the question is always kept intact.
    inputs = tokenizer(
        question, context, return_tensors="pt", truncation="only_second"
    )
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    # The model returns a single output object; the logits are attributes of
    # it and must not be obtained by tuple-unpacking the result.
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1
    answer_ids = inputs["input_ids"][0][start_index:end_index].tolist()
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()


# Sentence-embedding checkpoint used to index the PDF chunks for retrieval.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"


class _MeanPooledEmbeddings:
    """Sentence embeddings from a Hugging Face encoder via masked mean pooling.

    Exposes ``embed_documents`` and ``embed_query``, the two methods the
    vector store is expected to call on its ``embedding`` argument.
    It keeps its own tokenizer so the module-level QA ``tokenizer`` is never
    rebound.
    """

    def __init__(self, model_name, batch_size=16):
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._encoder = AutoModel.from_pretrained(model_name)
        self._encoder.eval()
        self._batch_size = batch_size

    def embed_documents(self, texts):
        """Return one embedding (a list of floats) per input text."""
        texts = list(texts)
        vectors = []
        for offset in range(0, len(texts), self._batch_size):
            batch = texts[offset:offset + self._batch_size]
            encoded = self._tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            )
            with torch.no_grad():
                token_states = self._encoder(**encoded).last_hidden_state
            # Average only over real tokens; padding positions are masked out.
            mask = encoded["attention_mask"].unsqueeze(-1).to(token_states.dtype)
            pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            vectors.extend(pooled.tolist())
        return vectors

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return self.embed_documents([text])[0]


# Set up the web page interface with Streamlit
st.title("ThaiBERT PDF QA System")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Streamlit re-runs this script on every interaction, so the index is kept
    # in session state and rebuilt only when a different file is uploaded.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("indexed_file") != file_key:
        # Read the PDF content
        pdf_text = extract_text_from_pdf(uploaded_file)
        # Split the text into chunks for retrieval
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.create_documents([pdf_text])
        # Embed the chunks and index them in Chroma; there is nothing to
        # index when the PDF produced no text.
        st.session_state["vector_store"] = (
            Chroma.from_documents(
                documents=docs,
                embedding=_MeanPooledEmbeddings(EMBEDDING_MODEL_NAME),
            )
            if docs
            else None
        )
        st.session_state["indexed_file"] = file_key
    vector_store = st.session_state["vector_store"]

    # Input box for the question
    user_question = st.text_input("Ask a question about the PDF content")
    if user_question:
        if vector_store is None:
            st.warning("No extractable text was found in this PDF.")
        else:
            # No LLM is configured, so the answer is extracted by the QA model
            # from the single most similar chunk.
            matches = vector_store.similarity_search(user_question, k=1)
            response = (
                answer_question(user_question, matches[0].page_content)
                if matches
                else ""
            )
            st.write("Answer:", response)