prithivMLmods committed on
Commit 53656ad · verified · 1 Parent(s): 1e18569

Update app.py

Files changed (1)
  1. app.py +81 -207
app.py CHANGED
@@ -41,37 +41,32 @@ for name, model_id in MODEL_OPTIONS.items():
 image_extensions = Image.registered_extensions()
 
 def identify_and_save_blob(blob_path):
-    """Identifies if the blob is an image and saves it."""
     try:
         with open(blob_path, 'rb') as file:
             blob_content = file.read()
         try:
-            Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
-            extension = ".png"  # Default to PNG for saving
+            Image.open(io.BytesIO(blob_content)).verify()  # Validate image
+            extension = ".png"  # Default extension
             media_type = "image"
         except (IOError, SyntaxError):
             raise ValueError("Unsupported media type. Please upload a valid image.")
-
         filename = f"temp_{uuid.uuid4()}_media{extension}"
         with open(filename, "wb") as f:
             f.write(blob_content)
-
         return filename, media_type
-
     except FileNotFoundError:
         raise ValueError(f"The file {blob_path} was not found.")
     except Exception as e:
-        raise ValueError(f"An error occurred while processing the file: {e}")
+        raise ValueError(f"Error processing file: {e}")
 
 @spaces.GPU
 def qwen_inference(model_name, media_input, text_input=None):
-    """Handles inference for the selected model."""
     model = models[model_name]
     processor = processors[model_name]
 
     if isinstance(media_input, str):
         media_path = media_input
-        if media_path.endswith(tuple([i for i in image_extensions.keys()])):
+        if media_path.endswith(tuple(image_extensions.keys())):
             media_type = "image"
         else:
             try:
@@ -79,53 +74,33 @@ def qwen_inference(model_name, media_input, text_input=None):
             except Exception as e:
                 raise ValueError("Unsupported media type. Please upload a valid image.")
 
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": media_type,
-                    media_type: media_path
-                },
-                {"type": "text", "text": text_input},
-            ],
-        }
-    ]
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": media_type, media_type: media_path},
+            {"type": "text", "text": text_input},
+        ],
+    }]
 
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, _ = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        padding=True,
-        return_tensors="pt",
-    ).to("cuda")
+    inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to("cuda")
 
-    streamer = TextIteratorStreamer(
-        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
-    )
+    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
     buffer = ""
     for new_text in streamer:
         buffer += new_text
-        # Remove <|im_end|> or similar tokens from the output
         buffer = buffer.replace("<|im_end|>", "")
         yield buffer
 
 def format_plain_text(output_text):
-    """Formats the output text as plain text without LaTeX delimiters."""
-    # Remove LaTeX delimiters and convert to plain text
-    plain_text = output_text.replace("\\(", "").replace("\\)", "").replace("\\[", "").replace("\\]", "")
-    return plain_text
+    return output_text.replace("\\(", "").replace("\\)", "").replace("\\[", "").replace("\\]", "")
 
 def generate_document(media_path, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size):
-    """Generates a document with the input image and plain text output."""
     plain_text = format_plain_text(output_text)
     if file_format == "pdf":
         return generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
@@ -133,65 +108,30 @@ def generate_document(media_path, output_text, file_format, font_choice, font_si
         return generate_docx(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)
 
 def generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
-    """Generates a PDF document."""
     filename = f"output_{uuid.uuid4()}.pdf"
-    doc = SimpleDocTemplate(
-        filename,
-        pagesize=A4,
-        rightMargin=inch,
-        leftMargin=inch,
-        topMargin=inch,
-        bottomMargin=inch
-    )
+    doc = SimpleDocTemplate(filename, pagesize=A4, rightMargin=inch, leftMargin=inch, topMargin=inch, bottomMargin=inch)
     styles = getSampleStyleSheet()
     styles["Normal"].fontName = font_choice
     styles["Normal"].fontSize = int(font_size)
     styles["Normal"].leading = int(font_size) * line_spacing
-    styles["Normal"].alignment = {
-        "Left": 0,
-        "Center": 1,
-        "Right": 2,
-        "Justified": 4
-    }[alignment]
-
-    # Register font
+    styles["Normal"].alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]
     font_path = f"font/{font_choice}"
     pdfmetrics.registerFont(TTFont(font_choice, font_path))
-
     story = []
-
-    # Add image with size adjustment
-    image_sizes = {
-        "Small": (200, 200),
-        "Medium": (400, 400),
-        "Large": (600, 600)
-    }
+    image_sizes = {"Small": (200, 200), "Medium": (400, 400), "Large": (600, 600)}
     img = RLImage(media_path, width=image_sizes[image_size][0], height=image_sizes[image_size][1])
     story.append(img)
     story.append(Spacer(1, 12))
-
-    # Add plain text output
-    text = Paragraph(plain_text, styles["Normal"])
-    story.append(text)
-
+    story.append(Paragraph(plain_text, styles["Normal"]))
     doc.build(story)
     return filename
 
 def generate_docx(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
-    """Generates a DOCX document."""
     filename = f"output_{uuid.uuid4()}.docx"
     doc = docx.Document()
-
-    # Add image with size adjustment
-    image_sizes = {
-        "Small": docx.shared.Inches(2),
-        "Medium": docx.shared.Inches(4),
-        "Large": docx.shared.Inches(6)
-    }
+    image_sizes = {"Small": docx.shared.Inches(2), "Medium": docx.shared.Inches(4), "Large": docx.shared.Inches(6)}
     doc.add_picture(media_path, width=image_sizes[image_size])
     doc.add_paragraph()
-
-    # Add plain text output
     paragraph = doc.add_paragraph()
     paragraph.paragraph_format.line_spacing = line_spacing
     paragraph.paragraph_format.alignment = {
@@ -200,144 +140,78 @@ def generate_docx(media_path, plain_text, font_choice, font_size, line_spacing,
         "Right": WD_ALIGN_PARAGRAPH.RIGHT,
         "Justified": WD_ALIGN_PARAGRAPH.JUSTIFY
     }[alignment]
-    run = paragraph.add_run(plain_text)
+    run = paragraph.add_run(format_plain_text(output_text))
     run.font.name = font_choice
     run.font.size = docx.shared.Pt(int(font_size))
-
     doc.save(filename)
     return filename
 
-# CSS for output styling
+# CSS for compact styling
 css = """
-#output {
-    height: 500px;
-    overflow: auto;
-    border: 1px solid #ccc;
-}
-.submit-btn {
-    background-color: #cf3434 !important;
-    color: white !important;
-}
-.submit-btn:hover {
-    background-color: #ff2323 !important;
-}
-.download-btn {
-    background-color: #35a6d6 !important;
-    color: white !important;
-}
-.download-btn:hover {
-    background-color: #22bcff !important;
-}
+#output { height: 300px; overflow: auto; border: 1px solid #ccc; }
+.submit-btn { background-color: #cf3434 !important; color: white !important; }
+.submit-btn:hover { background-color: #ff2323 !important; }
+.download-btn { background-color: #35a6d6 !important; color: white !important; }
+.download-btn:hover { background-color: #22bcff !important; }
+.compact { margin: 5px 0; }
 """
 
-# Gradio app setup
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Qwen2VL Models: Vision and Language Processing")
-
-    with gr.Tab(label="Image Input"):
-
+    gr.Markdown("# Qwen2VL: Compact Vision & Language Processing")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            model_choice = gr.Dropdown(label="Model", choices=list(MODEL_OPTIONS.keys()), value="Latex OCR", elem_classes="compact")
+            input_media = gr.File(label="Upload Image", type="filepath", elem_classes="compact")
+            text_input = gr.Textbox(label="Question", placeholder="Ask about the image...", elem_classes="compact")
+            submit_btn = gr.Button("Submit", elem_classes="submit-btn compact")
+        with gr.Column(scale=1):
+            output_text = gr.Textbox(label="Output", lines=8, elem_classes="compact")
+            plain_text_output = gr.Textbox(label="Plain Text", lines=8, elem_classes="compact")
+
+    submit_btn.click(qwen_inference, [model_choice, input_media, text_input], [output_text]
+    ).then(lambda txt: format_plain_text(txt), [output_text], [plain_text_output])
+
+    # Examples section remains compact
+    gr.Examples(
+        examples=[
+            ["examples/1.png", "summarize the letter", "Text Analogy Ocrtest"],
+            ["examples/2.jpg", "Summarize the full image in detail", "Latex OCR"],
+            ["examples/3.png", "Describe the photo", "Qwen2VL Base"],
+            ["examples/4.png", "summarize and solve the problem", "Math Prase"],
+        ],
+        inputs=[input_media, text_input, model_choice],
+        outputs=[output_text, plain_text_output],
+        fn=lambda img, question, model: qwen_inference(model, img, question),
+        cache_examples=False
+    )
+
+    # Advanced options tucked into an accordion
+    with gr.Accordion("Advanced Document Options", open=False):
         with gr.Row():
-            with gr.Column():
-                model_choice = gr.Dropdown(
-                    label="Model Selection",
-                    choices=list(MODEL_OPTIONS.keys()),
-                    value="Latex OCR"
-                )
-                input_media = gr.File(
-                    label="Upload Image", type="filepath"
-                )
-                text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
-                submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
-
-            with gr.Column():
-                output_text = gr.Textbox(label="Output Text", lines=10)
-                plain_text_output = gr.Textbox(label="Standardized Plain Text", lines=10)
-
-        submit_btn.click(
-            qwen_inference, [model_choice, input_media, text_input], [output_text]
-        ).then(
-            lambda output_text: format_plain_text(output_text), [output_text], [plain_text_output]
-        )
-
-        # Add examples directly usable by clicking
+            line_spacing = gr.Dropdown(choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0], value=1.5, label="Line Spacing", elem_classes="compact")
+            font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"], value="18", label="Font Size", elem_classes="compact")
        with gr.Row():
-            gr.Examples(
-                examples=[
-                    ["examples/1.png", "summarize the letter", "Text Analogy Ocrtest"],
-                    ["examples/2.jpg", "Summarize the full image in detail", "Latex OCR"],
-                    ["examples/3.png", "Describe the photo", "Qwen2VL Base"],
-                    ["examples/4.png", "summarize and solve the problem", "Math Prase"],
-                ],
-                inputs=[input_media, text_input, model_choice],
-                outputs=[output_text, plain_text_output],
-                fn=lambda img, question, model: qwen_inference(model, img, question),
-                cache_examples=False,
-            )
-
+            font_choice = gr.Dropdown(
+                choices=["DejaVuMathTeXGyre.ttf", "FiraCode-Medium.ttf", "InputMono-Light.ttf",
+                         "JetBrainsMono-Thin.ttf", "ProggyCrossed Regular Mac.ttf", "SourceCodePro-Black.ttf",
+                         "arial.ttf", "calibri.ttf", "mukta-malar-extralight.ttf", "noto-sans-arabic-medium.ttf",
+                         "times new roman.ttf", "ANGSA.ttf", "Book-Antiqua.ttf", "CONSOLA.TTF", "COOPBL.TTF",
+                         "Rockwell-Bold.ttf", "Candara Light.TTF", "Carlito-Regular.ttf", "Castellar.ttf",
+                         "Courier New.ttf", "LSANS.TTF", "Lucida Bright Regular.ttf", "TRTempusSansITC.ttf",
+                         "Verdana.ttf", "bell-mt.ttf", "eras-itc-light.ttf", "fonnts.com-aptos-light.ttf",
+                         "georgia.ttf", "segoeuithis.ttf", "youyuan.TTF", "TfPonetoneExpanded-7BJZA.ttf"],
+                value="youyuan.TTF", label="Font Choice", elem_classes="compact")
+            alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Alignment", elem_classes="compact")
         with gr.Row():
-            with gr.Column():
-                line_spacing = gr.Dropdown(
-                    choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],
-                    value=1.5,
-                    label="Line Spacing"
-                )
-                font_size = gr.Dropdown(
-                    choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"],
-                    value="18",
-                    label="Font Size"
-                )
-                font_choice = gr.Dropdown(
-                    choices=[
-                        "DejaVuMathTeXGyre.ttf",
-                        "FiraCode-Medium.ttf",
-                        "InputMono-Light.ttf",
-                        "JetBrainsMono-Thin.ttf",
-                        "ProggyCrossed Regular Mac.ttf",
-                        "SourceCodePro-Black.ttf",
-                        "arial.ttf",
-                        "calibri.ttf",
-                        "mukta-malar-extralight.ttf",
-                        "noto-sans-arabic-medium.ttf",
-                        "times new roman.ttf",
-                        "ANGSA.ttf",
-                        "Book-Antiqua.ttf",
-                        "CONSOLA.TTF",
-                        "COOPBL.TTF",
-                        "Rockwell-Bold.ttf",
-                        "Candara Light.TTF",
-                        "Carlito-Regular.ttf Carlito-Regular.ttf",
-                        "Castellar.ttf",
-                        "Courier New.ttf",
-                        "LSANS.TTF",
-                        "Lucida Bright Regular.ttf",
-                        "TRTempusSansITC.ttf",
-                        "Verdana.ttf",
-                        "bell-mt.ttf",
-                        "eras-itc-light.ttf",
-                        "fonnts.com-aptos-light.ttf",
-                        "georgia.ttf",
-                        "segoeuithis.ttf",
-                        "youyuan.TTF",
-                        "TfPonetoneExpanded-7BJZA.ttf",
-                    ],
-                    value="youyuan.TTF",
-                    label="Font Choice"
-                )
-                alignment = gr.Dropdown(
-                    choices=["Left", "Center", "Right", "Justified"],
-                    value="Justified",
-                    label="Text Alignment"
-                )
-                image_size = gr.Dropdown(
-                    choices=["Small", "Medium", "Large"],
-                    value="Small",
-                    label="Image Size"
-                )
-                file_format = gr.Radio(["pdf", "docx"], label="File Format", value="pdf")
-                get_document_btn = gr.Button(value="Get Document", elem_classes="download-btn")
-
-        get_document_btn.click(
-            generate_document, [input_media, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size], gr.File(label="Download Document")
-        )
+            image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Small", label="Image Size", elem_classes="compact")
+            file_format = gr.Radio(["pdf", "docx"], label="Format", value="pdf", elem_classes="compact")
+            get_document_btn = gr.Button("Get Document", elem_classes="download-btn compact")
+
+    get_document_btn.click(
+        generate_document,
+        [input_media, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size],
+        gr.File(label="Download Document")
+    )
 
 demo.launch(debug=True)