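"""Streamlit RAG demo: upload a PDF, chunk and index it with embeddings, and
answer questions about it with a Hugging Face model via LangChain.

Assuming this file is saved as app.py and HF_TOKEN is set in the environment,
it can be launched with:

    streamlit run app.py
"""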
import streamlit as st
import os
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
# Set up the Hugging Face model and token
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # Change to your preferred model
access_token = os.getenv("HF_TOKEN")  # Your Hugging Face API token
# Set up the HuggingFace pipeline; max_new_tokens bounds the answer length (tune as needed)
hf_pipeline = pipeline(
    "text-generation", model=model_name, token=access_token, max_new_tokens=256
)
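# Note: Mixtral-8x7B-Instruct has ~47B parameters and needs tens of GB of GPU
# memory to run locally; a smaller instruct model (e.g.
# "mistralai/Mistral-7B-Instruct-v0.2") is a drop-in substitute here.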
# Template for response generation
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
# Directory to store uploaded PDFs
pdfs_directory = '../pdfs'
os.makedirs(pdfs_directory, exist_ok=True)
# Initialize the embedding model
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # Choose any embedding model
# Initialize the vector store for document indexing
vector_store = InMemoryVectorStore(embedding=embedding)
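# Note: InMemoryVectorStore holds embeddings in process memory only, so the
# index is lost whenever the app restarts; a persistent store such as FAISS
# or Chroma could be swapped in if the index should survive restarts.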
# Save an uploaded PDF to disk and return its path
def upload_pdf(file):
    file_path = os.path.join(pdfs_directory, file.name)
    with open(file_path, "wb") as f:
        f.write(file.getbuffer())
    return file_path
# Load the PDF's pages as LangChain documents
def load_pdf(file_path):
    loader = PDFPlumberLoader(file_path)
    return loader.load()
# Split documents into overlapping chunks for embedding
def split_text(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )
    return text_splitter.split_documents(documents)
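# With chunk_size=1000 and chunk_overlap=200, consecutive chunks share 200
# characters of text, so sentences that straddle a chunk boundary remain
# retrievable from at least one chunk.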
# Embed and index document chunks in the vector store
def index_docs(documents):
    vector_store.add_documents(documents)
# Retrieve the chunks most similar to the query
def retrieve_docs(query):
    return vector_store.similarity_search(query)
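# similarity_search returns the top-k most similar chunks (LangChain's default
# is k=4); pass k explicitly, e.g. similarity_search(query, k=4), to widen or
# narrow the retrieved context.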
# Generate an answer from the question and the retrieved chunks
def answer_question(question, documents):
    context = "\n\n".join(doc.page_content for doc in documents)
    prompt = ChatPromptTemplate.from_template(template)
    # Wrap the transformers pipeline so it composes with the prompt via LCEL
    hf_chain = HuggingFacePipeline(pipeline=hf_pipeline)
    chain = prompt | hf_chain
    return chain.invoke({"question": question, "context": context})
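# Caveat: a transformers text-generation pipeline returns the prompt together
# with the completion by default; building the pipeline with
# return_full_text=False keeps only the newly generated answer text.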
# Streamlit file uploader for a single PDF
uploaded_file = st.file_uploader(
    "Upload PDF",
    type="pdf",
    accept_multiple_files=False,
)
if uploaded_file:
    # Save, load, split, and index the uploaded PDF
    file_path = upload_pdf(uploaded_file)
    documents = load_pdf(file_path)
    chunked_documents = split_text(documents)
    index_docs(chunked_documents)

    # Chat input for the user's question
    question = st.chat_input()

    if question:
        st.chat_message("user").write(question)
        related_documents = retrieve_docs(question)
        answer = answer_question(question, related_documents)
        st.chat_message("assistant").write(answer)