import os
import tempfile

import gradio as gr
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from PIL import Image
from transformers import pipeline

# Temporary directory for figures extracted from uploaded PDFs
# (used by the optional caption_images sketch below)
FIGURES_DIR = tempfile.mkdtemp(prefix="figures_")

# Hugging Face API token, read from the environment
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Initialize embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = None

# Initialize image captioning pipeline (BLIP-2 is a multi-gigabyte model;
# Salesforce/blip-image-captioning-base is a much lighter drop-in alternative)
captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip2-flan-t5-xl",
    use_auth_token=HUGGINGFACEHUB_API_TOKEN,
)

# Initialize LLM for QA (use a small positive temperature; some Hub inference
# endpoints reject a temperature of exactly 0)
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 0.1, "max_length": 256},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
)
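
# Alternative, sketched here as an assumption rather than part of the original
# app: run the model locally with LangChain's HuggingFacePipeline instead of
# the Hub API. flan-t5-base is named only because it is small enough for CPU.
# from langchain.llms import HuggingFacePipeline
# llm = HuggingFacePipeline.from_model_id(
#     model_id="google/flan-t5-base", task="text2text-generation"
# )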

# Helper functions

def process_pdf(pdf_file):
    # Gradio may hand over a tempfile-like object or a plain path,
    # depending on the Gradio version
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Load the text content of the PDF
    loader = UnstructuredPDFLoader(pdf_path)
    docs = loader.load()

    # Plain text from the PDF
    raw_text = "\n".join(d.page_content for d in docs)

    # Image captions would be appended here; this demo skips actual image
    # extraction and leaves the list empty (see the caption_images sketch
    # below for one way to fill it in)
    captions = []

    # Combine text and captions
    combined = raw_text + "\n\n" + "\n".join(captions)
    return combined
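

# A minimal sketch of real image extraction + captioning, assuming PyMuPDF
# (pip install pymupdf) as an extra dependency. caption_images is a
# hypothetical helper, not part of the original demo; process_pdf could call
# it and splice the result into `captions`.

def caption_images(pdf_path):
    import fitz  # PyMuPDF (assumed dependency)

    captions = []
    doc = fitz.open(pdf_path)
    for page_index, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True)):
            # Render each embedded image to a file under FIGURES_DIR
            pix = fitz.Pixmap(doc, img[0])
            if pix.n - pix.alpha >= 4:  # CMYK and similar: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            out_path = os.path.join(FIGURES_DIR, f"p{page_index}_{img_index}.png")
            pix.save(out_path)
            # Caption with the BLIP-2 pipeline initialized above
            result = captioner(Image.open(out_path))
            captions.append(f"[Figure, page {page_index + 1}] {result[0]['generated_text']}")
    return captions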


def build_index(text):
    global vector_store
    # Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)

    # Build a fresh FAISS index (replacing any previous one)
    vector_store = FAISS.from_texts(chunks, embeddings)
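

# Optional persistence sketch: save_local/load_local are standard LangChain
# FAISS methods; the "faiss_index" folder name is just an example, and these
# helpers are not wired into the UI.

def save_index(path="faiss_index"):
    if vector_store is not None:
        vector_store.save_local(path)


def load_index(path="faiss_index"):
    global vector_store
    vector_store = FAISS.load_local(path, embeddings)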


def answer_query(query):
    # Guard against queries issued before any PDF has been indexed
    if vector_store is None:
        return "Please upload a PDF first."
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever()
    )
    return qa.run(query)
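

# A variant, as a sketch, that also surfaces the retrieved chunks.
# return_source_documents is a standard RetrievalQA option; answer_with_sources
# is a hypothetical helper added for illustration, not wired into the UI.

def answer_with_sources(query):
    if vector_store is None:
        return "Please upload a PDF first."
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
    )
    result = qa({"query": query})
    snippets = "\n---\n".join(d.page_content[:200] for d in result["source_documents"])
    return f"{result['result']}\n\nSources:\n{snippets}"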

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Multimodal RAG QA App")

    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        question_input = gr.Textbox(label="Ask a question", placeholder="Enter your question here...")

    output = gr.Textbox(label="Answer", interactive=False)

    def on_submit(pdf, question):
        # Check the question first so an empty submission does not trigger
        # a full PDF re-index
        if not question:
            return "Please enter a question."
        if pdf is not None:
            text = process_pdf(pdf)
            build_index(text)
        return answer_query(question)

    submit_btn = gr.Button("Get Answer")
    submit_btn.click(on_submit, inputs=[pdf_input, question_input], outputs=output)

if __name__ == "__main__":
    demo.launch()