Spaces:

Saurabh1207
/

VLM

Sleeping

App Files Files Community

Saurabh Kumar committed on Sep 30, 2024

Commit

f90e854

verified ·

1 Parent(s): 7ca46d0

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -62

app.py CHANGED Viewed

@@ -4,12 +4,12 @@ import streamlit as st
 import torch
 from PIL import Image
-@st.cache_resource
 # default: Load the model on the available device(s)
-model = Qwen2VLForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
-)
 # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
 # model = Qwen2VLForConditionalGeneration.from_pretrained(
 #     "Qwen/Qwen2-VL-7B-Instruct",
@@ -17,74 +17,73 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 #     attn_implementation="flash_attention_2",
 #     device_map="auto",
 # )
-# default processer
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
 # min_pixels = 256*28*28
 # max_pixels = 1280*28*28
 # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
 @st.cache_data
 # Streamlit app title
 st.title("OCR Image Text Extraction")
 # File uploader for images
 uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
-if uploaded_file is not None:
-    # Open the uploaded image file
-    image = Image.open(uploaded_file)
-    st.image(image, caption="Uploaded Image", use_column_width=True)
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
-                {"type": "text", "text": "Run Optical Character recognition on the image."},
-            ],
-        }
-    ]
-    # Preparation for inference
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to("cpu")
-    # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    st.subheader("Extracted Text:")
-    st.write(output_text)
-    # Keyword search functionality
-    st.subheader("Keyword Search")
-    search_query = st.text_input("Enter keywords to search within the extracted text")
-    if search_query:
-        # Check if the search query is in the extracted text
-        if search_query.lower() in extracted_text.lower():
-            highlighted_text = extracted_text.replace(search_query, f"**{search_query}**")
-            st.write(f"Matching Text: {highlighted_text}")
-        else:
-            st.write("No matching text found.")

 import torch
 from PIL import Image
 # default: Load the model on the available device(s)
+@st.cache_resource
+def init_qwen_model():
+    """Load the Qwen2-VL-7B-Instruct model together with its processor.
+
+    The function is wrapped in ``st.cache_resource``.
+
+    Returns:
+        A ``(model, processor)`` tuple.
+    """
+    checkpoint = "Qwen/Qwen2-VL-7B-Instruct"
+    qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
+        checkpoint,
+        torch_dtype="auto",
+        device_map="auto",
+    )
+    qwen_processor = AutoProcessor.from_pretrained(checkpoint)
+    return qwen_model, qwen_processor
 # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
 # model = Qwen2VLForConditionalGeneration.from_pretrained(
 #     "Qwen/Qwen2-VL-7B-Instruct",
 #     attn_implementation="flash_attention_2",
 #     device_map="auto",
 # )
 # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
 # min_pixels = 256*28*28
 # max_pixels = 1280*28*28
 # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
 @st.cache_data
+def get_qwen_text(uploaded_file):
+    """Run Qwen2-VL OCR on an uploaded image and return the extracted text.
+
+    Args:
+        uploaded_file: The value returned by ``st.file_uploader``; ``None``
+            when nothing has been uploaded yet.
+
+    Returns:
+        The decoded model output as a single string, or ``""`` when no file
+        was uploaded.
+    """
+    if uploaded_file is None:
+        return ""
+    # model/processor are not module-level names in this version of the app;
+    # fetch them from the cached loader instead of relying on globals.
+    model, processor = init_qwen_model()
+    # Open the uploaded image file
+    image = Image.open(uploaded_file)
+    st.image(image, caption="Uploaded Image", use_column_width=True)
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {"type": "text", "text": "Run Optical Character recognition on the image."},
+            ],
+        }
+    ]
+    # Preparation for inference
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    # The model is loaded with device_map="auto", so follow its device
+    # rather than assuming CPU.
+    inputs = inputs.to(model.device)
+    # Inference: Generation of the output
+    generated_ids = model.generate(**inputs, max_new_tokens=128)
+    # Drop the prompt tokens so only the newly generated tokens are decoded.
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    # batch_decode yields one string per batch item; the caller uses str
+    # methods on the result, so hand back a single string.
+    return "\n".join(output_text)
 # Streamlit app title
 st.title("OCR Image Text Extraction")
 # File uploader for images
 uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
+st.subheader("Extracted Text:")
+output = get_qwen_text(uploaded_file)
+# get_qwen_text can yield None (nothing uploaded) or a list of decoded
+# strings; normalise to one string so the search below can use str methods.
+if output is None:
+    output = ""
+elif not isinstance(output, str):
+    output = "\n".join(output)
+st.write(output)
+# Keyword search functionality
+st.subheader("Keyword Search")
+search_query = st.text_input("Enter keywords to search within the extracted text")
+if search_query:
+    import re  # local import: only the search branch needs it
+    # Match case-insensitively and bold every hit, keeping the casing that
+    # actually appears in the extracted text.
+    highlighted_text, hit_count = re.subn(
+        re.escape(search_query),
+        lambda match: f"**{match.group(0)}**",
+        output,
+        flags=re.IGNORECASE,
+    )
+    if hit_count:
+        st.write(f"Matching Text: {highlighted_text}")
+    else:
+        st.write("No matching text found.")