Muzammil6376 committed on
Commit ced2810 · verified · 1 Parent(s): b6b04c5

Update app.py

Files changed (1):
  1. app.py +68 -147
app.py CHANGED
@@ -1,174 +1,95 @@
  import os
- import shutil
- from typing import List

  import gradio as gr
  from PIL import Image

- # PDF parsing
- from pypdf import PdfReader
- from unstructured.partition.pdf import partition_pdf
- from unstructured.partition.utils.constants import PartitionStrategy

- # Text splitting
- from langchain.text_splitter import CharacterTextSplitter

- # Vectorstore and embeddings
- from langchain_community.vectorstores import FAISS
- from langchain_huggingface import HuggingFaceEmbeddings
-
- # Vision-language captioning (BLIP)
- from transformers import BlipProcessor, BlipForConditionalGeneration
-
- # LLM via HF Inference API
- from huggingface_hub import InferenceClient
-
- # ── Globals ───────────────────────────────────────────────────────────────────
- retriever = None
- pdf_text: str = ""
-
- # ── Setup directories ──────────────────────────────────────────────────────────
- FIGURES_DIR = "figures"
- if os.path.exists(FIGURES_DIR):
-     shutil.rmtree(FIGURES_DIR)
- os.makedirs(FIGURES_DIR, exist_ok=True)
-
- # ── Models & Clients ───────────────────────────────────────────────────────────
- hf_client = InferenceClient()  # uses HUGGINGFACEHUB_API_TOKEN
-
- # Embeddings model (local lightweight SBERT)
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

- # BLIP for image captioning
- blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
- blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

- # ── Helper functions ───────────────────────────────────────────────────────────
-
- def generate_caption(image_path: str) -> str:
-     image = Image.open(image_path).convert("RGB")
-     inputs = blip_processor(image, return_tensors="pt")
-     outputs = blip_model.generate(**inputs)
-     return blip_processor.decode(outputs[0], skip_special_tokens=True)


  def process_pdf(pdf_file):
-     global retriever, pdf_text
-     if pdf_file is None:
-         return None, "❌ Please upload a PDF.", gr.update(interactive=False)
-
-     # read full text
-     reader = PdfReader(pdf_file.name)
-     pages = [p.extract_text() or "" for p in reader.pages]
-     pdf_text = "
-
- ".join(pages)
-
-     # extract elements with images via unstructured
-     try:
-         elements = partition_pdf(
-             filename=pdf_file.name,
-             strategy=PartitionStrategy.HI_RES,
-             extract_image_block_types=["Image", "Table"],
-             extract_image_block_output_dir=FIGURES_DIR,
-         )
-         text_elems = [e.text for e in elements if e.category not in ["Image", "Table"] and e.text]
-         image_files = [os.path.join(FIGURES_DIR, f) for f in os.listdir(FIGURES_DIR)
-                        if f.lower().endswith((".png", ".jpg", ".jpeg"))]
-     except:
-         text_elems = pages
-         image_files = []
-
-     # generate captions
-     captions = [generate_caption(img) for img in image_files]
-
-     # split text into chunks
-     splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-     chunks = []
-     for t in text_elems:
-         chunks.extend(splitter.split_text(t))
-
-     # combine text chunks and image captions
-     docs = chunks + captions
-
-     # embed and index
-     vectors = embeddings.embed_documents(docs)
-     pairs = list(zip(docs, vectors))
-     index = FAISS.from_embeddings(pairs)
-     retriever = index.as_retriever(search_kwargs={"k": 2})
-
-     status = f"✅ Indexed — {len(chunks)} text chunks + {len(captions)} captions"
-     return os.path.basename(pdf_file.name), status, gr.update(interactive=True)
-
-
- def ask_question(pdf_name, question):
-     if retriever is None:
-         return "❌ Please upload + index a PDF first."
-     if not question:
-         return "❌ Please ask something."
-
-     docs = retriever.get_relevant_documents(question)
-     context = "\n\n".join(d.page_content for d in docs)
-     prompt = f"Use the following excerpts to answer:\n{context}\nQuestion: {question}\nAnswer:"
-
-     res = hf_client.chat_completion(
-         model="google/gemma-3-27b-it",
-         messages=[{"role": "user", "content": prompt}],
-         max_tokens=128,
-         temperature=0.5,
-     )
-     return res["choices"][0]["message"]["content"].strip()


- def generate_summary():
-     if not pdf_text:
-         return "❌ Please index a PDF first."
-     return ask_question(None, f"Summarize concisely:\n{pdf_text[:2000]}")


- def extract_keywords():
-     if not pdf_text:
-         return "❌ Please index first."
-     return ask_question(None, f"Extract 10–15 key terms:\n{pdf_text[:2000]}")


- def clear_all():
-     global retriever, pdf_text
-     retriever = None
-     pdf_text = ""
-     shutil.rmtree(FIGURES_DIR, ignore_errors=True)
-     os.makedirs(FIGURES_DIR, exist_ok=True)
-     return None, "", gr.update(interactive=False)

- # ── Gradio UI ────────────────────────────────────────────────────────────────
- theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
- with gr.Blocks(theme=theme) as demo:
-     gr.Markdown("# Multimodal RAG with HF & LangChain")
-     with gr.Row():
-         with gr.Column():
-             pdf_disp = gr.Textbox(label="Active PDF", interactive=False)
-             pdf_file = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
-             btn_proc = gr.Button("📄 Process PDF")
-             status = gr.Textbox(label="Status", interactive=False)

-         with gr.Column():
-             q_in = gr.Textbox(label="Your question", interactive=False)
-             btn_ask = gr.Button("❓ Ask", interactive=False)
-             ans = gr.Textbox(label="Answer", interactive=False)

      with gr.Row():
-         btn_sum = gr.Button("📋 Summary", interactive=False)
-         sum_out = gr.Textbox(interactive=False)
-         btn_key = gr.Button("🏷️ Keywords", interactive=False)
-         key_out = gr.Textbox(interactive=False)

-     btn_clear = gr.Button("🗑️ Clear All")

-     btn_proc.click(process_pdf, [pdf_file], [pdf_disp, status, q_in])
-     btn_ask.click(ask_question, [pdf_disp, q_in], ans)
-     btn_sum.click(generate_summary, [], sum_out)
-     btn_key.click(extract_keywords, [], key_out)
-     btn_clear.click(clear_all, [], [pdf_disp, status, q_in])

  if __name__ == "__main__":
-     demo.launch(debug=True)
 
  import os
+ import tempfile

  import gradio as gr
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import UnstructuredPDFLoader
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFaceHub
  from PIL import Image
+ from transformers import pipeline

+ # Directories for temporary storage
+ FIGURES_DIR = tempfile.mkdtemp(prefix="figures_")

+ # Configure Hugging Face
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

+ # Initialize embeddings and vector store
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+ vector_store = None

+ # Initialize image captioning pipeline
+ captioner = pipeline("image-to-text", model="Salesforce/blip2-flan-t5-xl", use_auth_token=HUGGINGFACEHUB_API_TOKEN)

+ # Initialize LLM for QA
+ llm = HuggingFaceHub(
+     repo_id="google/flan-t5-xxl",
+     model_kwargs={"temperature": 0.0, "max_length": 256},
+     huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
+ )

+ # Helper functions

  def process_pdf(pdf_file):
+     # Load text content
+     loader = UnstructuredPDFLoader(pdf_file.name)
+     docs = loader.load()

+     # Basic text from PDF
+     raw_text = "\n".join([d.page_content for d in docs])

+     # Optionally extract images and caption them
+     # Here, we simply caption any embedded images
+     captions = []
+     # (In a real pipeline, extract and save images separately)
+     # For demo, we skip actual image files extraction

+     # Combine text and captions
+     combined = raw_text + "\n\n" + "\n".join(captions)
+     return combined


+ def build_index(text):
+     global vector_store
+     # Split into chunks
+     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+     chunks = splitter.split_text(text)

+     # Create or update FAISS index
+     vector_store = FAISS.from_texts(chunks, embeddings)


+ def answer_query(query):
+     qa = RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=vector_store.as_retriever()
+     )
+     return qa.run(query)
+
+ # Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# Multimodal RAG QA App")

      with gr.Row():
+         pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+         question_input = gr.Textbox(label="Ask a question", placeholder="Enter your question here...")
+
+     output = gr.Textbox(label="Answer", interactive=False)

+     def on_submit(pdf, question):
+         if pdf is not None:
+             text = process_pdf(pdf)
+             build_index(text)
+         if not question:
+             return "Please enter a question."
+         return answer_query(question)

+     submit_btn = gr.Button("Get Answer")
+     submit_btn.click(on_submit, inputs=[pdf_input, question_input], outputs=output)

  if __name__ == "__main__":
+     demo.launch()
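
The new version answers questions by splitting the PDF text with RecursiveCharacterTextSplitter, embedding the chunks into a FAISS index, and running a RetrievalQA chain over a HuggingFaceHub LLM. Below is a minimal sketch of that same retrieval flow outside the Gradio UI, using the model IDs from the committed code; the sample text and question are placeholders, and HUGGINGFACEHUB_API_TOKEN must be set in the environment, just as the app expects.

import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub

# Placeholder document text; in the app this comes from process_pdf().
text = "FAISS is a library for efficient similarity search over dense vectors."

# Split into overlapping chunks, as build_index() does.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(text)

# Embed the chunks and build the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(chunks, embeddings)

# Same hosted LLM configuration as the committed app.py.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 0.0, "max_length": 256},
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
)

# Retrieve relevant chunks and answer the question, as answer_query() does.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())
print(qa.run("What is FAISS used for?"))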