import streamlit as st
import os
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

# Set up the Hugging Face model and access token
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # Change to your preferred model
access_token = os.getenv("HF_TOKEN")  # Your Hugging Face API token

# Set up the Hugging Face text-generation pipeline
hf_pipeline = pipeline("text-generation", model=model_name, token=access_token)
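# Note: Mixtral-8x7B is a large model; loading it locally through the
# transformers pipeline requires substantial GPU memory. A smaller instruct
# model, or capping output length with the pipeline's max_new_tokens argument,
# may be preferable on modest hardware.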

# Template for response generation
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

# Directory to store uploaded PDFs
pdfs_directory = '../pdfs'
os.makedirs(pdfs_directory, exist_ok=True)

# Initialize the embedding model
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Choose any sentence-transformers model

# Initialize the vector store for document indexing
vector_store = InMemoryVectorStore(embedding=embedding)
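# Note: InMemoryVectorStore keeps embeddings in process memory only, and a
# Streamlit script reruns top to bottom on every interaction, so this index is
# rebuilt on each run. Caching the store in st.session_state is one way to
# persist it across interactions (not shown here).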

# Function to save an uploaded PDF file to disk
def upload_pdf(file):
    with open(os.path.join(pdfs_directory, file.name), "wb") as f:
        f.write(file.getbuffer())

# Function to load PDF content
def load_pdf(file_path):
    loader = PDFPlumberLoader(file_path)
    documents = loader.load()
    return documents

# Function to split text into manageable chunks
def split_text(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True
    )
    return text_splitter.split_documents(documents)

# Function to index document chunks in the vector store
def index_docs(documents):
    vector_store.add_documents(documents)

# Function to retrieve the documents most relevant to a query
def retrieve_docs(query):
    return vector_store.similarity_search(query)

# Function to generate an answer based on the retrieved documents
def answer_question(question, documents):
    context = "\n\n".join([doc.page_content for doc in documents])
    prompt = ChatPromptTemplate.from_template(template)
    # Wrap the transformers pipeline so it can be used as a LangChain LLM
    hf_chain = HuggingFacePipeline(pipeline=hf_pipeline)
    # Pipe the formatted prompt into the Hugging Face model
    chain = prompt | hf_chain
    return chain.invoke({"question": question, "context": context})

# Streamlit file uploader for a PDF
uploaded_file = st.file_uploader(
    "Upload PDF",
    type="pdf",
    accept_multiple_files=False
)

if uploaded_file:
    # Save, load, split, and index the uploaded document
    upload_pdf(uploaded_file)
    documents = load_pdf(os.path.join(pdfs_directory, uploaded_file.name))
    chunked_documents = split_text(documents)
    index_docs(chunked_documents)

    # User input for a question
    question = st.chat_input()

    if question:
        st.chat_message("user").write(question)
        related_documents = retrieve_docs(question)
        answer = answer_question(question, related_documents)
        st.chat_message("assistant").write(answer)