import os
import tempfile
import gradio as gr
# NOTE: these import paths target a pre-0.1 LangChain release; newer versions
# moved most of these modules into the langchain_community package.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from PIL import Image
from transformers import pipeline
# Directory for temporarily storing extracted figures
FIGURES_DIR = tempfile.mkdtemp(prefix="figures_")
# Configure Hugging Face
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
# Initialize embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = None
# Initialize the image-captioning pipeline (only used by the optional image
# extraction helper below; the main text-only flow never calls it)
captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip2-flan-t5-xl",
    use_auth_token=HUGGINGFACEHUB_API_TOKEN,
)
# Initialize the LLM used for question answering
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 0.0, "max_length": 256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)
# Helper functions
def process_pdf(pdf_file):
    # Load the text content of the PDF
    loader = UnstructuredPDFLoader(pdf_file.name)
    docs = loader.load()
    raw_text = "\n".join(d.page_content for d in docs)
    # Image captioning is stubbed out in this demo: no images are actually
    # extracted, so `captions` stays empty. See caption_pdf_images below for
    # a sketch of what a full multimodal pipeline could do here.
    captions = []
    # Combine the plain text with any image captions
    combined = raw_text + "\n\n" + "\n".join(captions)
    return combined
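# Hypothetical helper, not wired into the Gradio flow: a sketch of how embedded
# images could be extracted with PyMuPDF and captioned with the `captioner`
# pipeline above. The function name, file naming, and the PyMuPDF dependency
# are assumptions for illustration, not part of the original app.
def caption_pdf_images(pdf_path):
    import fitz  # PyMuPDF; assumed installed (pip install pymupdf)
    captions = []
    doc = fitz.open(pdf_path)
    for page in doc:
        for img in page.get_images(full=True):
            xref = img[0]
            # Save each embedded image into the temporary figures directory
            image_bytes = doc.extract_image(xref)["image"]
            image_path = os.path.join(FIGURES_DIR, f"img_{xref}.png")
            with open(image_path, "wb") as f:
                f.write(image_bytes)
            # Run the BLIP-2 captioner on the saved image
            result = captioner(Image.open(image_path))
            captions.append(result[0]["generated_text"])
    return captions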
def build_index(text):
    global vector_store
    # Split the text into overlapping chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)
    # Build a fresh FAISS index over the chunks
    vector_store = FAISS.from_texts(chunks, embeddings)
def answer_query(query):
    # Guard against querying before any PDF has been indexed
    if vector_store is None:
        return "Please upload a PDF first."
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return qa.run(query)
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Multimodal RAG QA App")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(label="Ask a question", placeholder="Enter your question here...")
    output = gr.Textbox(label="Answer", interactive=False)

    def on_submit(pdf, question):
        # (Re)build the index whenever a PDF is provided
        if pdf is not None:
            text = process_pdf(pdf)
            build_index(text)
        if not question:
            return "Please enter a question."
        return answer_query(question)

    submit_btn = gr.Button("Get Answer")
    submit_btn.click(on_submit, inputs=[pdf_input, question_input], outputs=output)
if __name__ == "__main__":
    demo.launch()