prithivMLmods committed on
Commit
852f994
·
verified ·
1 Parent(s): 53656ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -81
app.py CHANGED
@@ -41,32 +41,37 @@ for name, model_id in MODEL_OPTIONS.items():
41
  image_extensions = Image.registered_extensions()
42
 
def identify_and_save_blob(blob_path):
    """Validate that a blob is an image and save a copy of it to disk.

    The file at ``blob_path`` is read as bytes, checked with PIL's
    ``Image.verify()``, and then written unchanged to a new file named
    ``temp_<uuid4>_media.png`` in the current working directory.

    Args:
        blob_path: Path of the file to inspect.

    Returns:
        A ``(filename, media_type)`` tuple. ``media_type`` is always
        ``"image"``.

    Raises:
        ValueError: If the file does not exist, is not a valid image, or any
            other error occurs while reading or writing it.
    """
    try:
        with open(blob_path, "rb") as file:
            blob_content = file.read()

        try:
            Image.open(io.BytesIO(blob_content)).verify()  # Validate image
        except (IOError, SyntaxError) as err:
            raise ValueError("Unsupported media type. Please upload a valid image.") from err

        # NOTE: the copy is always named ".png", whatever the real format is;
        # the bytes themselves are written unchanged.
        filename = f"temp_{uuid.uuid4()}_media.png"
        with open(filename, "wb") as f:
            f.write(blob_content)

        return filename, "image"
    except FileNotFoundError as err:
        raise ValueError(f"The file {blob_path} was not found.") from err
    except ValueError:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as err:
        raise ValueError(f"Error processing file: {err}") from err
61
 
62
  @spaces.GPU
63
  def qwen_inference(model_name, media_input, text_input=None):
 
64
  model = models[model_name]
65
  processor = processors[model_name]
66
 
67
  if isinstance(media_input, str):
68
  media_path = media_input
69
- if media_path.endswith(tuple(image_extensions.keys())):
70
  media_type = "image"
71
  else:
72
  try:
@@ -74,33 +79,52 @@ def qwen_inference(model_name, media_input, text_input=None):
74
  except Exception as e:
75
  raise ValueError("Unsupported media type. Please upload a valid image.")
76
 
77
- messages = [{
78
- "role": "user",
79
- "content": [
80
- {"type": media_type, media_type: media_path},
81
- {"type": "text", "text": text_input},
82
- ],
83
- }]
 
 
 
 
 
84
 
85
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
86
  image_inputs, _ = process_vision_info(messages)
87
- inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to("cuda")
 
 
 
 
 
88
 
89
- streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
 
 
90
  generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
 
91
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
92
  thread.start()
93
 
94
  buffer = ""
95
  for new_text in streamer:
96
  buffer += new_text
 
97
  buffer = buffer.replace("<|im_end|>", "")
98
  yield buffer
99
 
def format_plain_text(output_text):
    r"""Strip LaTeX math delimiters from the model output.

    Removes every occurrence of ``\(``, ``\)``, ``\[`` and ``\]``; all other
    characters are left untouched.

    Args:
        output_text: Text to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The text without LaTeX delimiters.
    """
    if not output_text:
        return ""
    for delimiter in ("\\(", "\\)", "\\[", "\\]"):
        output_text = output_text.replace(delimiter, "")
    return output_text
 
 
102
 
103
  def generate_document(media_path, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size):
 
104
  plain_text = format_plain_text(output_text)
105
  if file_format == "pdf":
106
  return generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
@@ -108,30 +132,65 @@ def generate_document(media_path, output_text, file_format, font_choice, font_si
108
  return generate_docx(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
109
 
def generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
    """Build a PDF containing the input image followed by the plain text.

    Args:
        media_path: Path of the image placed at the top of the document.
        plain_text: Text rendered below the image.
        font_choice: File name of a TTF font inside the ``font/`` directory;
            it is also used as the registered font name.
        font_size: Font size in points (anything ``int()`` accepts).
        line_spacing: Multiplier applied to the font size to get the leading.
        alignment: One of ``"Left"``, ``"Center"``, ``"Right"``, ``"Justified"``.
        image_size: One of ``"Small"``, ``"Medium"``, ``"Large"``.

    Returns:
        The name of the generated file, ``output_<uuid4>.pdf``.

    Raises:
        KeyError: If ``alignment`` or ``image_size`` is not a known option.
    """
    from xml.sax.saxutils import escape

    filename = f"output_{uuid.uuid4()}.pdf"
    doc = SimpleDocTemplate(
        filename,
        pagesize=A4,
        rightMargin=inch,
        leftMargin=inch,
        topMargin=inch,
        bottomMargin=inch,
    )

    font_path = f"font/{font_choice}"
    pdfmetrics.registerFont(TTFont(font_choice, font_path))

    styles = getSampleStyleSheet()
    body_style = styles["Normal"]
    body_style.fontName = font_choice
    body_style.fontSize = int(font_size)
    body_style.leading = int(font_size) * line_spacing
    body_style.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]

    image_sizes = {"Small": (200, 200), "Medium": (400, 400), "Large": (600, 600)}
    width, height = image_sizes[image_size]
    # A flowable wider than the frame makes doc.build() fail, so scale the
    # image down to fit. The 12pt allowance assumes the default frame padding
    # of 6pt per side - TODO confirm against the installed ReportLab version.
    max_width = doc.width - 12
    if width > max_width:
        height = height * max_width / width
        width = max_width

    # Paragraph parses XML-style markup: escape &, < and > coming from the
    # model output and keep explicit line breaks.
    markup = escape(plain_text).replace("\n", "<br/>")

    story = [
        RLImage(media_path, width=width, height=height),
        Spacer(1, 12),
        Paragraph(markup, body_style),
    ]
    doc.build(story)
    return filename
128
 
129
  def generate_docx(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
 
130
  filename = f"output_{uuid.uuid4()}.docx"
131
  doc = docx.Document()
132
- image_sizes = {"Small": docx.shared.Inches(2), "Medium": docx.shared.Inches(4), "Large": docx.shared.Inches(6)}
 
 
 
 
 
 
133
  doc.add_picture(media_path, width=image_sizes[image_size])
134
  doc.add_paragraph()
 
 
135
  paragraph = doc.add_paragraph()
136
  paragraph.paragraph_format.line_spacing = line_spacing
137
  paragraph.paragraph_format.alignment = {
@@ -140,78 +199,141 @@ def generate_docx(media_path, plain_text, font_choice, font_size, line_spacing,
140
  "Right": WD_ALIGN_PARAGRAPH.RIGHT,
141
  "Justified": WD_ALIGN_PARAGRAPH.JUSTIFY
142
  }[alignment]
143
- run = paragraph.add_run(format_plain_text(output_text))
144
  run.font.name = font_choice
145
  run.font.size = docx.shared.Pt(int(font_size))
 
146
  doc.save(filename)
147
  return filename
148
 
# CSS for compact styling
css = """
#output { height: 300px; overflow: auto; border: 1px solid #ccc; }
.submit-btn { background-color: #cf3434 !important; color: white !important; }
.submit-btn:hover { background-color: #ff2323 !important; }
.download-btn { background-color: #35a6d6 !important; color: white !important; }
.download-btn:hover { background-color: #22bcff !important; }
.compact { margin: 5px 0; }
"""


def _run_example(img, question, model):
    """Stream an inference for an example row (media, question, model)."""
    # A real generator function is needed so Gradio streams the output;
    # a lambda returning a generator object would not be treated as one.
    yield from qwen_inference(model, img, question)


# NOTE(review): the nesting below is reconstructed from a diff view that lost
# its indentation - confirm the layout against the running app.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Qwen2VL: Compact Vision & Language Processing")

    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(label="Model", choices=list(MODEL_OPTIONS.keys()), value="Latex OCR", elem_classes="compact")
            input_media = gr.File(label="Upload Image", type="filepath", elem_classes="compact")
            text_input = gr.Textbox(label="Question", placeholder="Ask about the image...", elem_classes="compact")
            submit_btn = gr.Button("Submit", elem_classes="submit-btn compact")
        with gr.Column(scale=1):
            # elem_id ties this box to the "#output" CSS rule, which
            # otherwise matches nothing.
            output_text = gr.Textbox(label="Output", lines=8, elem_id="output", elem_classes="compact")
            plain_text_output = gr.Textbox(label="Plain Text", lines=8, elem_classes="compact")

    # Stream the raw model output, then derive the delimiter-free version.
    submit_btn.click(
        qwen_inference, [model_choice, input_media, text_input], [output_text]
    ).then(format_plain_text, [output_text], [plain_text_output])

    # Examples section remains compact
    gr.Examples(
        examples=[
            ["examples/1.png", "summarize the letter", "Text Analogy Ocrtest"],
            ["examples/2.jpg", "Summarize the full image in detail", "Latex OCR"],
            ["examples/3.png", "Describe the photo", "Qwen2VL Base"],
            ["examples/4.png", "summarize and solve the problem", "Math Prase"],
        ],
        inputs=[input_media, text_input, model_choice],
        outputs=[output_text],
        fn=_run_example,
        cache_examples=False,
    )

    # Advanced options tucked into an accordion
    with gr.Accordion("Advanced Document Options", open=False):
        with gr.Row():
            line_spacing = gr.Dropdown(choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0], value=1.5, label="Line Spacing", elem_classes="compact")
            font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"], value="18", label="Font Size", elem_classes="compact")
        with gr.Row():
            font_choice = gr.Dropdown(
                choices=[
                    "DejaVuMathTeXGyre.ttf", "FiraCode-Medium.ttf", "InputMono-Light.ttf",
                    "JetBrainsMono-Thin.ttf", "ProggyCrossed Regular Mac.ttf", "SourceCodePro-Black.ttf",
                    "arial.ttf", "calibri.ttf", "mukta-malar-extralight.ttf", "noto-sans-arabic-medium.ttf",
                    "times new roman.ttf", "ANGSA.ttf", "Book-Antiqua.ttf", "CONSOLA.TTF", "COOPBL.TTF",
                    "Rockwell-Bold.ttf", "Candara Light.TTF", "Carlito-Regular.ttf", "Castellar.ttf",
                    "Courier New.ttf", "LSANS.TTF", "Lucida Bright Regular.ttf", "TRTempusSansITC.ttf",
                    "Verdana.ttf", "bell-mt.ttf", "eras-itc-light.ttf", "fonnts.com-aptos-light.ttf",
                    "georgia.ttf", "segoeuithis.ttf", "youyuan.TTF", "TfPonetoneExpanded-7BJZA.ttf",
                ],
                value="youyuan.TTF",
                label="Font Choice",
                elem_classes="compact",
            )
            alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Alignment", elem_classes="compact")
        with gr.Row():
            image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Small", label="Image Size", elem_classes="compact")
            file_format = gr.Radio(["pdf", "docx"], label="Format", value="pdf", elem_classes="compact")
        get_document_btn = gr.Button("Get Document", elem_classes="download-btn compact")

    # The download component is created here so it renders below the accordion.
    document_output = gr.File(label="Download Document")
    get_document_btn.click(
        generate_document,
        [input_media, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size],
        document_output,
    )

demo.launch(debug=True)
 
41
  image_extensions = Image.registered_extensions()
42
 
def identify_and_save_blob(blob_path):
    """Identify whether the blob is an image and save a copy of it.

    The file at ``blob_path`` is read as bytes, checked with PIL's
    ``Image.verify()``, and then written unchanged to a new file named
    ``temp_<uuid4>_media.png`` in the current working directory.

    Args:
        blob_path: Path of the file to inspect.

    Returns:
        A ``(filename, media_type)`` tuple. ``media_type`` is always
        ``"image"``.

    Raises:
        ValueError: If the file does not exist, is not a valid image, or any
            other error occurs while reading or writing it.
    """
    try:
        with open(blob_path, "rb") as file:
            blob_content = file.read()

        try:
            Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
        except (IOError, SyntaxError) as err:
            raise ValueError("Unsupported media type. Please upload a valid image.") from err

        # NOTE: the copy is always named ".png", whatever the real format is;
        # the bytes themselves are written unchanged.
        filename = f"temp_{uuid.uuid4()}_media.png"
        with open(filename, "wb") as f:
            f.write(blob_content)

        return filename, "image"
    except FileNotFoundError as err:
        raise ValueError(f"The file {blob_path} was not found.") from err
    except ValueError:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as err:
        raise ValueError(f"An error occurred while processing the file: {err}") from err
65
 
66
  @spaces.GPU
67
  def qwen_inference(model_name, media_input, text_input=None):
68
+ """Handles inference for the selected model."""
69
  model = models[model_name]
70
  processor = processors[model_name]
71
 
72
  if isinstance(media_input, str):
73
  media_path = media_input
74
+ if media_path.endswith(tuple([i for i in image_extensions.keys()])):
75
  media_type = "image"
76
  else:
77
  try:
 
79
  except Exception as e:
80
  raise ValueError("Unsupported media type. Please upload a valid image.")
81
 
82
+ messages = [
83
+ {
84
+ "role": "user",
85
+ "content": [
86
+ {
87
+ "type": media_type,
88
+ media_type: media_path
89
+ },
90
+ {"type": "text", "text": text_input},
91
+ ],
92
+ }
93
+ ]
94
 
95
+ text = processor.apply_chat_template(
96
+ messages, tokenize=False, add_generation_prompt=True
97
+ )
98
  image_inputs, _ = process_vision_info(messages)
99
+ inputs = processor(
100
+ text=[text],
101
+ images=image_inputs,
102
+ padding=True,
103
+ return_tensors="pt",
104
+ ).to("cuda")
105
 
106
+ streamer = TextIteratorStreamer(
107
+ processor.tokenizer, skip_prompt=True, skip_special_tokens=True
108
+ )
109
  generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
110
+
111
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
112
  thread.start()
113
 
114
  buffer = ""
115
  for new_text in streamer:
116
  buffer += new_text
117
+ # Remove <|im_end|> or similar tokens from the output
118
  buffer = buffer.replace("<|im_end|>", "")
119
  yield buffer
120
 
def format_plain_text(output_text):
    r"""Format the output text as plain text without LaTeX delimiters.

    Removes every occurrence of ``\(``, ``\)``, ``\[`` and ``\]``; all other
    characters are left untouched.

    Args:
        output_text: Text to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The text without LaTeX delimiters.
    """
    if not output_text:
        return ""
    plain_text = output_text
    for delimiter in ("\\(", "\\)", "\\[", "\\]"):
        plain_text = plain_text.replace(delimiter, "")
    return plain_text
125
 
126
  def generate_document(media_path, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size):
127
+ """Generates a document with the input image and plain text output."""
128
  plain_text = format_plain_text(output_text)
129
  if file_format == "pdf":
130
  return generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
 
132
  return generate_docx(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
133
 
def generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
    """Generate a PDF document with the input image followed by the text.

    Args:
        media_path: Path of the image placed at the top of the document.
        plain_text: Text rendered below the image.
        font_choice: File name of a TTF font inside the ``font/`` directory;
            it is also used as the registered font name.
        font_size: Font size in points (anything ``int()`` accepts).
        line_spacing: Multiplier applied to the font size to get the leading.
        alignment: One of ``"Left"``, ``"Center"``, ``"Right"``, ``"Justified"``.
        image_size: One of ``"Small"``, ``"Medium"``, ``"Large"``.

    Returns:
        The name of the generated file, ``output_<uuid4>.pdf``.

    Raises:
        KeyError: If ``alignment`` or ``image_size`` is not a known option.
    """
    from xml.sax.saxutils import escape

    filename = f"output_{uuid.uuid4()}.pdf"
    doc = SimpleDocTemplate(
        filename,
        pagesize=A4,
        rightMargin=inch,
        leftMargin=inch,
        topMargin=inch,
        bottomMargin=inch,
    )

    # Register font
    font_path = f"font/{font_choice}"
    pdfmetrics.registerFont(TTFont(font_choice, font_path))

    styles = getSampleStyleSheet()
    body_style = styles["Normal"]
    body_style.fontName = font_choice
    body_style.fontSize = int(font_size)
    body_style.leading = int(font_size) * line_spacing
    body_style.alignment = {
        "Left": 0,
        "Center": 1,
        "Right": 2,
        "Justified": 4,
    }[alignment]

    # Add image with size adjustment
    image_sizes = {
        "Small": (200, 200),
        "Medium": (400, 400),
        "Large": (600, 600),
    }
    width, height = image_sizes[image_size]
    # A flowable wider than the frame makes doc.build() fail, so scale the
    # image down to fit. The 12pt allowance assumes the default frame padding
    # of 6pt per side - TODO confirm against the installed ReportLab version.
    max_width = doc.width - 12
    if width > max_width:
        height = height * max_width / width
        width = max_width

    # Add plain text output. Paragraph parses XML-style markup: escape &, <
    # and > coming from the model output and keep explicit line breaks.
    markup = escape(plain_text).replace("\n", "<br/>")

    story = [
        RLImage(media_path, width=width, height=height),
        Spacer(1, 12),
        Paragraph(markup, body_style),
    ]
    doc.build(story)
    return filename
178
 
179
  def generate_docx(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
180
+ """Generates a DOCX document."""
181
  filename = f"output_{uuid.uuid4()}.docx"
182
  doc = docx.Document()
183
+
184
+ # Add image with size adjustment
185
+ image_sizes = {
186
+ "Small": docx.shared.Inches(2),
187
+ "Medium": docx.shared.Inches(4),
188
+ "Large": docx.shared.Inches(6)
189
+ }
190
  doc.add_picture(media_path, width=image_sizes[image_size])
191
  doc.add_paragraph()
192
+
193
+ # Add plain text output
194
  paragraph = doc.add_paragraph()
195
  paragraph.paragraph_format.line_spacing = line_spacing
196
  paragraph.paragraph_format.alignment = {
 
199
  "Right": WD_ALIGN_PARAGRAPH.RIGHT,
200
  "Justified": WD_ALIGN_PARAGRAPH.JUSTIFY
201
  }[alignment]
202
+ run = paragraph.add_run(plain_text)
203
  run.font.name = font_choice
204
  run.font.size = docx.shared.Pt(int(font_size))
205
+
206
  doc.save(filename)
207
  return filename
208
 
# Updated CSS for output styling
css = """
#output_text, #plain_text_output {
    height: 200px;
    overflow: auto;
    border: 1px solid #ccc;
}
.submit-btn {
    background-color: #cf3434 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #ff2323 !important;
}
.download-btn {
    background-color: #35a6d6 !important;
    color: white !important;
}
.download-btn:hover {
    background-color: #22bcff !important;
}
"""


def _run_example(img, question, model):
    """Stream an inference for an example row (media, question, model)."""
    # A real generator function is needed so Gradio streams the output;
    # a lambda returning a generator object would not be treated as one.
    yield from qwen_inference(model, img, question)


# Gradio app setup with optimized UI
# NOTE(review): the nesting below is reconstructed from a diff view that lost
# its indentation - confirm the layout against the running app.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Qwen2VL Models: Vision and Language Processing")

    with gr.Tab(label="Image Input"):
        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    label="Model Selection",
                    choices=list(MODEL_OPTIONS.keys()),
                    value="Latex OCR",
                )
                input_media = gr.File(label="Upload Image", type="filepath")
                text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
                submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text", lines=5, elem_id="output_text")
                plain_text_output = gr.Textbox(label="Standardized Plain Text", lines=5, elem_id="plain_text_output")

        with gr.Row():
            gr.Examples(
                examples=[
                    ["examples/1.png", "summarize the letter", "Text Analogy Ocrtest"],
                    ["examples/2.jpg", "Summarize the full image in detail", "Latex OCR"],
                    ["examples/3.png", "Describe the photo", "Qwen2VL Base"],
                    ["examples/4.png", "summarize and solve the problem", "Math Prase"],
                ],
                inputs=[input_media, text_input, model_choice],
                outputs=[output_text],
                fn=_run_example,
                cache_examples=False,
            )

        with gr.Accordion("Document Generation Options", open=False):
            line_spacing = gr.Dropdown(
                choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],
                value=1.5,
                label="Line Spacing",
            )
            font_size = gr.Dropdown(
                choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"],
                value="18",
                label="Font Size",
            )
            # Each choice is a file name looked up under "font/" when the
            # PDF is generated, so entries must match the file names exactly.
            font_choice = gr.Dropdown(
                choices=[
                    "DejaVuMathTeXGyre.ttf",
                    "FiraCode-Medium.ttf",
                    "InputMono-Light.ttf",
                    "JetBrainsMono-Thin.ttf",
                    "ProggyCrossed Regular Mac.ttf",
                    "SourceCodePro-Black.ttf",
                    "arial.ttf",
                    "calibri.ttf",
                    "mukta-malar-extralight.ttf",
                    "noto-sans-arabic-medium.ttf",
                    "times new roman.ttf",
                    "ANGSA.ttf",
                    "Book-Antiqua.ttf",
                    "CONSOLA.TTF",
                    "COOPBL.TTF",
                    "Rockwell-Bold.ttf",
                    "Candara Light.TTF",
                    "Carlito-Regular.ttf",
                    "Castellar.ttf",
                    "Courier New.ttf",
                    "LSANS.TTF",
                    "Lucida Bright Regular.ttf",
                    "TRTempusSansITC.ttf",
                    "Verdana.ttf",
                    "bell-mt.ttf",
                    "eras-itc-light.ttf",
                    "fonnts.com-aptos-light.ttf",
                    "georgia.ttf",
                    "segoeuithis.ttf",
                    "youyuan.TTF",
                    "TfPonetoneExpanded-7BJZA.ttf",
                ],
                value="youyuan.TTF",
                label="Font Choice",
            )
            alignment = gr.Dropdown(
                choices=["Left", "Center", "Right", "Justified"],
                value="Justified",
                label="Text Alignment",
            )
            image_size = gr.Dropdown(
                choices=["Small", "Medium", "Large"],
                value="Small",
                label="Image Size",
            )
            file_format = gr.Radio(["pdf", "docx"], label="File Format", value="pdf")
            get_document_btn = gr.Button(value="Get Document", elem_classes="download-btn")

    # Stream the raw model output, then derive the delimiter-free version.
    submit_btn.click(
        qwen_inference, [model_choice, input_media, text_input], [output_text]
    ).then(
        format_plain_text, [output_text], [plain_text_output]
    )

    # The download component is created here so it renders after the tab.
    document_output = gr.File(label="Download Document")
    get_document_btn.click(
        generate_document,
        [input_media, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size],
        document_output,
    )

demo.launch(debug=True)