import os
import tempfile
import gradio as gr
# NOTE: these import paths target a pre-0.1 LangChain release; newer versions
# moved most of these modules into the langchain_community package.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from PIL import Image
from transformers import pipeline
# Directory for temporarily storing extracted figures
FIGURES_DIR = tempfile.mkdtemp(prefix="figures_")
# Configure Hugging Face
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
# Initialize embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = None
# Initialize the image-captioning pipeline (only used by the optional image
# extraction helper below; the main text-only flow never calls it)
captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip2-flan-t5-xl",
    use_auth_token=HUGGINGFACEHUB_API_TOKEN,
)
# Initialize the LLM used for question answering
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 0.0, "max_length": 256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)
# Helper functions
def process_pdf(pdf_file):
    # Load the text content of the PDF
    loader = UnstructuredPDFLoader(pdf_file.name)
    docs = loader.load()
    raw_text = "\n".join(d.page_content for d in docs)
    # Image captioning is stubbed out in this demo: no images are actually
    # extracted, so `captions` stays empty. See caption_pdf_images below for
    # a sketch of what a full multimodal pipeline could do here.
    captions = []
    # Combine the plain text with any image captions
    combined = raw_text + "\n\n" + "\n".join(captions)
    return combined
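# Hypothetical helper, not wired into the Gradio flow: a sketch of how embedded
# images could be extracted with PyMuPDF and captioned with the `captioner`
# pipeline above. The function name, file naming, and the PyMuPDF dependency
# are assumptions for illustration, not part of the original app.
def caption_pdf_images(pdf_path):
    import fitz  # PyMuPDF; assumed installed (pip install pymupdf)
    captions = []
    doc = fitz.open(pdf_path)
    for page in doc:
        for img in page.get_images(full=True):
            xref = img[0]
            # Save each embedded image into the temporary figures directory
            image_bytes = doc.extract_image(xref)["image"]
            image_path = os.path.join(FIGURES_DIR, f"img_{xref}.png")
            with open(image_path, "wb") as f:
                f.write(image_bytes)
            # Run the BLIP-2 captioner on the saved image
            result = captioner(Image.open(image_path))
            captions.append(result[0]["generated_text"])
    return captions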
def build_index(text):
    global vector_store
    # Split the text into overlapping chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)
    # Build a fresh FAISS index over the chunks
    vector_store = FAISS.from_texts(chunks, embeddings)
def answer_query(query):
    # Guard against querying before any PDF has been indexed
    if vector_store is None:
        return "Please upload a PDF first."
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return qa.run(query)
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Multimodal RAG QA App")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(label="Ask a question", placeholder="Enter your question here...")
    output = gr.Textbox(label="Answer", interactive=False)

    def on_submit(pdf, question):
        # (Re)build the index whenever a PDF is provided
        if pdf is not None:
            text = process_pdf(pdf)
            build_index(text)
        if not question:
            return "Please enter a question."
        return answer_query(question)

    submit_btn = gr.Button("Get Answer")
    submit_btn.click(on_submit, inputs=[pdf_input, question_input], outputs=output)
if __name__ == "__main__":
    demo.launch()