Spaces:

Saurabh1207
/

VLM

Running

App Files Files Community

Saurabh Kumar committed on Sep 30, 2024

Commit

0be6d6e

verified ·

1 Parent(s): 7a40854

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -43

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import streamlit as st
 import torch
@@ -12,6 +12,7 @@ def init_qwen_model():
     return model, processor
 MODEL, PROCESSOR = init_qwen_model()
 # Streamlit app title
 st.title("OCR Image Text Extraction")
@@ -22,55 +23,63 @@ if uploaded_file is not None:
     # Open the uploaded image file
     image = Image.open(uploaded_file)
     st.image(image, caption="Uploaded Image", use_column_width=True)
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
-                {"type": "text", "text": "Run Optical Character recognition on the image."},
-            ],
-        }
-    ]
-    # Preparation for inference
-    text = PROCESSOR.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = PROCESSOR(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to("cpu")
-    # Inference: Generation of the output
-    generated_ids = MODEL.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = PROCESSOR.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    st.subheader("Extracted Text:")
-    st.write(output_text)
-    # Keyword search functionality
     st.subheader("Keyword Search")
     search_query = st.text_input("Enter keywords to search within the extracted text")
     if search_query:
-        # Check if the search query is in the extracted text
-        if search_query.lower() in output.lower():
-            highlighted_text = output.replace(search_query, f"**{search_query}**")
-            st.write(f"Matching Text: {highlighted_text}")
         else:
             st.write("No matching text found.")
 else:
-    st.info("Please upload an image to extract text.")

+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import streamlit as st
 import torch
     return model, processor
 MODEL, PROCESSOR = init_qwen_model()
 # Streamlit app title
 st.title("OCR Image Text Extraction")
     # Open the uploaded image file
     image = Image.open(uploaded_file)
     st.image(image, caption="Uploaded Image", use_column_width=True)
+    # Add the spinner here while the model is processing
+    with st.spinner("Extracting text..."):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image,
+                    },
+                    {"type": "text", "text": "Run Optical Character recognition on the image."},
+                ],
+            }
+        ]
+        # Preparation for inference
+        text = PROCESSOR.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = PROCESSOR(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cpu")
+        # Inference: Generation of the output
+        generated_ids = MODEL.generate(**inputs, max_new_tokens=128)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        structured_output = PROCESSOR.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        # Convert structured output to plain text
+        plain_text_output = " ".join(structured_output.split())  # Remove any extra spaces or line breaks
+    # Display extracted plain text after the spinner ends
+    st.subheader("Extracted Plain Text:")
+    st.write(plain_text_output)
+    # Keyword search functionality on plain text
     st.subheader("Keyword Search")
     search_query = st.text_input("Enter keywords to search within the extracted text")
     if search_query:
+        # Local import so this works whether or not the module imports `re` at the top
+        import re
+        # Case-insensitive literal search: re.escape stops the user's input being read as a regex.
+        # (str.replace has no `flags` argument, so it cannot do case-insensitive replacement.)
+        pattern = re.compile(re.escape(search_query), re.IGNORECASE)
+        # Bold every occurrence, keeping the casing found in the text; a callable replacement
+        # avoids backslash/group-reference interpretation of the matched text
+        highlighted_text, match_count = pattern.subn(
+            lambda match: f"**{match.group(0)}**", plain_text_output
+        )
+        if match_count:
+            # Bold markers are plain markdown; raw HTML is left disabled because the text
+            # comes from an uploaded image and should not be rendered as markup
+            st.markdown(f"Matching Text: {highlighted_text}")
         else:
             st.write("No matching text found.")
 else:
+    st.info("Please upload an image to extract text.")