Muzammil6376 commited on
Commit
67a56f6
·
verified ·
1 Parent(s): 4bb4c94

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -0
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import PyPDF2
4
+ import gradio as gr
5
+ from PIL import Image
6
+
7
+ # Unstructured for rich PDF parsing
8
+ from unstructured.partition.pdf import partition_pdf
9
+ from unstructured.partition.utils.constants import PartitionStrategy
10
+
11
+ # Vision-language captioning (BLIP)
12
+ from transformers import BlipProcessor, BlipForConditionalGeneration
13
+
14
+ # LangChain vectorstore and embeddings
15
+ from langchain_community.vectorstores import FAISS
16
+ from langchain_community.embeddings import HuggingFaceEmbeddings
17
+
18
+ # HF Inference client for chat completions
19
+ from huggingface_hub import InferenceClient
20
+
21
# ── Globals ───────────────────────────────────────────────────────────────────
retriever = None          # FAISS retriever over the indexed multimodal corpus
current_pdf_name = None   # Basename of the PDF currently loaded
combined_texts = None     # Text chunks plus image captions used for indexing

# ── Setup: directories ─────────────────────────────────────────────────────────
# Start from an empty figures directory so stale extracted images never
# survive an app restart.
FIGURES_DIR = "figures"
if os.path.exists(FIGURES_DIR):
    shutil.rmtree(FIGURES_DIR)
os.makedirs(FIGURES_DIR, exist_ok=True)

# ── Models & Clients ───────────────────────────────────────────────────────────
# Chat model (Mistral-7B-Instruct) served via the HF Inference API.
chat_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Text embeddings (BAAI BGE) used by the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
# Image captioning (BLIP) — processor + generation model loaded once at startup.
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
41
+
42
def generate_caption(image_path: str) -> str:
    """Produce a natural-language caption for the image at *image_path* using BLIP."""
    img = Image.open(image_path).convert('RGB')
    tensors = blip_processor(img, return_tensors="pt")
    token_ids = blip_model.generate(**tensors)
    return blip_processor.decode(token_ids[0], skip_special_tokens=True)
51
+
52
+
53
def process_pdf(pdf_file) -> str:
    """
    Parse an uploaded PDF into text chunks and image captions, build a FAISS
    index over the combined corpus, and prepare the global retriever.

    Args:
        pdf_file: Either a filepath string or a file-like object exposing a
            ``.name`` attribute — gr.File produces either shape depending on
            the Gradio version and its ``type=`` setting.

    Returns:
        A human-readable status (or error) message for the Status textbox.
    """
    global current_pdf_name, retriever, combined_texts
    if pdf_file is None:
        return "❌ Please upload a PDF file."

    # Accept both a plain path (type="filepath") and a tempfile wrapper (type="file").
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    current_pdf_name = os.path.basename(pdf_path)

    # Reset the figures directory so images extracted from a previously
    # processed PDF do not leak into this document's captions and index.
    shutil.rmtree(FIGURES_DIR, ignore_errors=True)
    os.makedirs(FIGURES_DIR, exist_ok=True)

    # Extract text, table, and image blocks; HI_RES saves figure crops to disk.
    elements = partition_pdf(
        filename=pdf_path,
        strategy=PartitionStrategy.HI_RES,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_output_dir=FIGURES_DIR,
    )

    # Keep only textual elements; figures are handled via their saved files.
    text_elements = [el.text for el in elements
                     if el.category not in ["Image", "Table"] and el.text]
    image_files = [os.path.join(FIGURES_DIR, f)
                   for f in os.listdir(FIGURES_DIR)
                   if f.lower().endswith((".png", ".jpg", ".jpeg"))]

    # Caption every extracted figure with BLIP.
    captions = [generate_caption(img) for img in image_files]

    # Combine all pieces for indexing.
    combined_texts = text_elements + captions
    if not combined_texts:
        # FAISS.from_texts raises on an empty corpus (e.g. a scanned PDF with
        # nothing extractable) — report it instead of crashing the callback.
        return f"❌ No extractable text or images found in '{current_pdf_name}'."

    # Create FAISS index and a top-2 retriever over it.
    index = FAISS.from_texts(combined_texts, embeddings)
    retriever = index.as_retriever(search_kwargs={"k": 2})

    status = f"βœ… Indexed '{current_pdf_name}' β€” {len(text_elements)} text blocks + {len(captions)} image captions"
    return status
96
+
97
+
98
def ask_question(question: str) -> str:
    """
    Answer *question* about the indexed PDF: pull the most relevant chunks
    from the FAISS retriever and pass them as context to the chat model.
    """
    global retriever
    # Guard clauses: need an indexed document and a non-blank question.
    if retriever is None:
        return "❌ Please upload and process a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

    # Fetch the top-k chunks and join them into a single context string.
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n\n".join(d.page_content for d in relevant_docs)

    prompt = (
        "Use the following document excerpts to answer the question.\n\n"
        + context
        + "\n\n"
        + f"Question: {question}\n"
        + "Answer:"
    )

    reply = chat_client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=128,
        temperature=0.5,
    )
    return reply["choices"][0]["message"]["content"].strip()
125
+
126
+
127
def clear_interface():
    """
    Reset all global state and empty the figures directory.

    Returns:
        A pair of empty strings — one per wired output component
        (status_box, answer_output). The previous version returned a single
        "", which Gradio cannot distribute across two outputs and rejects
        at callback time.
    """
    global retriever, current_pdf_name, combined_texts
    retriever = None
    current_pdf_name = None
    combined_texts = None
    # ignore_errors: the directory may already have been removed externally.
    shutil.rmtree(FIGURES_DIR, ignore_errors=True)
    os.makedirs(FIGURES_DIR, exist_ok=True)
    return "", ""
136
+
137
# ── Gradio UI ────────────────────────────────────────────────────────────────
ui_theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
with gr.Blocks(theme=ui_theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
.main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
""") as demo:
    gr.Markdown("<div class='main-title'>DocQueryAI (Multimodal)</div>")

    with gr.Row():
        # Left column: upload and index a document.
        with gr.Column():
            gr.Markdown("## πŸ“„ Document Input")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            upload_box = gr.File(file_types=[".pdf"], type="file")
            process_btn = gr.Button("πŸ“€ Process Document", variant="primary")
            status_box = gr.Textbox(label="Status", interactive=False)

        # Right column: query the indexed document.
        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…")
            ask_btn = gr.Button("πŸ” Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS + BLIP | Gradio</div>")

    # Event wiring — NOTE(review): pdf_display is never written to by any
    # callback; confirm whether it was meant to receive the active PDF name.
    process_btn.click(fn=process_pdf, inputs=[upload_box], outputs=[status_box])
    ask_btn.click(fn=ask_question, inputs=[question_input], outputs=[answer_output])
    clear_btn.click(fn=clear_interface, outputs=[status_box, answer_output])

if __name__ == "__main__":
    demo.launch(debug=True, share=True)