Spaces:

awacke1
/

TorchTransformers-Diffusion-CV-SFT

Running on CPU Upgrade

App Files Files Community

awacke1 committed 13 days ago

Commit

e1bf9f9

verified ·

1 Parent(s): de31118

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -10

app.py CHANGED Viewed

@@ -13,7 +13,6 @@ import logging
 import asyncio
 import aiofiles
 from io import BytesIO
-import threading
 # Logging setup
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -53,12 +52,16 @@ def get_gallery_files(file_types):
     return sorted([f for ext in file_types for f in glob.glob(f"*.{ext}")])
 def update_gallery():
-    media_files = get_gallery_files(["png"])
     if media_files:
         cols = st.sidebar.columns(2)
         for idx, file in enumerate(media_files[:gallery_size * 2]):
             with cols[idx % 2]:
-                st.image(Image.open(file), caption=file, use_container_width=True)
 # Model Loaders (Smaller, CPU-focused)
 def load_ocr_qwen2vl():
@@ -68,7 +71,7 @@ def load_ocr_qwen2vl():
     return processor, model
 def load_ocr_trocr():
-    model_id = "microsoft/trocr-small-handwritten"  # Smaller, ~250 MB
     processor = TrOCRProcessor.from_pretrained(model_id)
     model = VisionEncoderDecoderModel.from_pretrained(model_id, torch_dtype=torch.float32).to("cpu").eval()
     return processor, model
@@ -79,7 +82,7 @@ def load_image_gen():
     return pipeline
 def load_line_drawer():
-    # Simplified from your Torch Space (assuming edge detection)
     def edge_detection(image):
         img_np = np.array(image.convert("RGB"))
         gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
@@ -94,20 +97,23 @@ async def process_ocr(image, prompt, model_name, output_file):
     status.text(f"Processing {model_name} OCR... (0s)")
     if model_name == "Qwen2-VL-OCR-2B":
         processor, model = load_ocr_qwen2vl()
-        inputs = processor(text=prompt, images=image, return_tensors="pt").to("cpu")
         outputs = model.generate(**inputs, max_new_tokens=1024)
-        text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
     else:  # TrOCR
         processor, model = load_ocr_trocr()
         pixel_values = processor(images=image, return_tensors="pt").pixel_values.to("cpu")
         outputs = model.generate(pixel_values)
-        text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
     elapsed = int(time.time() - start_time)
     status.text(f"{model_name} OCR completed in {elapsed}s!")
     async with aiofiles.open(output_file, "w") as f:
-        await f.write(text)
     st.session_state['captured_images'].append(output_file)
-    return text
 async def process_image_gen(prompt, output_file):
     start_time = time.time()

 import asyncio
 import aiofiles
 from io import BytesIO
 # Logging setup
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
     return sorted([f for ext in file_types for f in glob.glob(f"*.{ext}")])
 def update_gallery():
     """Render saved PNG and TXT outputs as a two-column gallery in the sidebar.

     Files are discovered with ``get_gallery_files(["png", "txt"])`` and at most
     ``gallery_size * 2`` of them are shown, alternating between the two sidebar
     columns. PNG files are displayed as images captioned with their file name;
     TXT files are displayed as a text preview of at most 50 characters, with
     ``"..."`` appended when the file is longer, and the file name as tooltip.

     Nothing is rendered when no matching files exist.
     """
     # NOTE(review): gallery_size is read from the enclosing scope and is not
     # defined in this block - confirm it is set before this function runs.
     media_files = get_gallery_files(["png", "txt"])
     if media_files:
         cols = st.sidebar.columns(2)
         for idx, file in enumerate(media_files[:gallery_size * 2]):
             with cols[idx % 2]:
                 if file.endswith(".png"):
                     st.image(Image.open(file), caption=file, use_container_width=True)
                 elif file.endswith(".txt"):
                     # Read the file exactly once: a second read() on the same
                     # handle returns "" because the first call already consumed
                     # the stream, which previously left the preview empty.
                     with open(file, "r") as f:
                         content = f.read()
                     preview = content[:50] + "..." if len(content) > 50 else content
                     st.text(preview, help=file)
 # Model Loaders (Smaller, CPU-focused)
 def load_ocr_qwen2vl():
     return processor, model
 def load_ocr_trocr():
     """Load the TrOCR small handwritten checkpoint for CPU inference.

     Returns:
         A ``(processor, model)`` tuple. The model is loaded with float32
         weights, moved to the CPU and switched to evaluation mode.
     """
     checkpoint = "microsoft/trocr-small-handwritten"  # ~250 MB
     ocr_processor = TrOCRProcessor.from_pretrained(checkpoint)
     ocr_model = VisionEncoderDecoderModel.from_pretrained(
         checkpoint,
         torch_dtype=torch.float32,
     )
     # Pin to CPU and disable training-only behaviour before handing it out.
     ocr_model = ocr_model.to("cpu")
     ocr_model = ocr_model.eval()
     return ocr_processor, ocr_model
     return pipeline
 def load_line_drawer():
+    # Simplified OpenCV-based edge detection (CPU-friendly substitute for Torch Space UNet)
     def edge_detection(image):
         img_np = np.array(image.convert("RGB"))
         gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
     status.text(f"Processing {model_name} OCR... (0s)")
     if model_name == "Qwen2-VL-OCR-2B":
         processor, model = load_ocr_qwen2vl()
+        # Corrected input format: apply chat template
+        messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]}]
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True).to("cpu")
         outputs = model.generate(**inputs, max_new_tokens=1024)
+        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
     else:  # TrOCR
         processor, model = load_ocr_trocr()
         pixel_values = processor(images=image, return_tensors="pt").pixel_values.to("cpu")
         outputs = model.generate(pixel_values)
+        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
     elapsed = int(time.time() - start_time)
     status.text(f"{model_name} OCR completed in {elapsed}s!")
     async with aiofiles.open(output_file, "w") as f:
+        await f.write(result)
     st.session_state['captured_images'].append(output_file)
+    return result
 async def process_image_gen(prompt, output_file):
     start_time = time.time()