import streamlit as st
import os
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

# Set up the Hugging Face model and access token
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # Change to your preferred model
access_token = os.getenv("HF_TOKEN")  # Your Hugging Face API token

# Set up the Hugging Face text-generation pipeline
hf_pipeline = pipeline("text-generation", model=model_name, token=access_token)
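# Note: Mixtral-8x7B is a large model; loading it locally through the
# transformers pipeline requires substantial GPU memory. A smaller instruct
# model, or capping output length with the pipeline's max_new_tokens argument,
# may be preferable on modest hardware.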

# Template for response generation
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

# Directory to store uploaded PDFs
pdfs_directory = '../pdfs'
os.makedirs(pdfs_directory, exist_ok=True)

# Initialize the embedding model
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Choose any sentence-transformers model

# Initialize the vector store for document indexing
vector_store = InMemoryVectorStore(embedding=embedding)
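# Note: InMemoryVectorStore keeps embeddings in process memory only, and a
# Streamlit script reruns top to bottom on every interaction, so this index is
# rebuilt on each run. Caching the store in st.session_state is one way to
# persist it across interactions (not shown here).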

# Function to save an uploaded PDF file to disk
def upload_pdf(file):
    with open(os.path.join(pdfs_directory, file.name), "wb") as f:
        f.write(file.getbuffer())

# Function to load PDF content
def load_pdf(file_path):
    loader = PDFPlumberLoader(file_path)
    documents = loader.load()
    return documents

# Function to split text into manageable chunks
def split_text(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True
    )
    return text_splitter.split_documents(documents)

# Function to index document chunks in the vector store
def index_docs(documents):
    vector_store.add_documents(documents)

# Function to retrieve the documents most relevant to a query
def retrieve_docs(query):
    return vector_store.similarity_search(query)

# Function to generate an answer based on the retrieved documents
def answer_question(question, documents):
    context = "\n\n".join([doc.page_content for doc in documents])
    prompt = ChatPromptTemplate.from_template(template)
    # Wrap the transformers pipeline so it can be used as a LangChain LLM
    hf_chain = HuggingFacePipeline(pipeline=hf_pipeline)
    # Pipe the formatted prompt into the Hugging Face model
    chain = prompt | hf_chain
    return chain.invoke({"question": question, "context": context})

# Streamlit file uploader for a PDF
uploaded_file = st.file_uploader(
    "Upload PDF",
    type="pdf",
    accept_multiple_files=False
)

if uploaded_file:
    # Save, load, split, and index the uploaded document
    upload_pdf(uploaded_file)
    documents = load_pdf(os.path.join(pdfs_directory, uploaded_file.name))
    chunked_documents = split_text(documents)
    index_docs(chunked_documents)

    # User input for a question
    question = st.chat_input()

    if question:
        st.chat_message("user").write(question)
        related_documents = retrieve_docs(question)
        answer = answer_question(question, related_documents)
        st.chat_message("assistant").write(answer)