Spaces:

garychew
/

automate_url_captioner

Runtime error

App Files Files Community

garychew committed 16 days ago

Commit

e7a9c69

verified ·

1 Parent(s): 6595301

Create app.py

Browse files

Files changed (1) hide show

app.py +59 -0

app.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import requests
+from PIL import Image
+from io import BytesIO
+from bs4 import BeautifulSoup
+from transformers import AutoProcessor, BlipForConditionalGeneration
+# Load the pretrained processor and model
+processor = # fill the pretrained model
+model = # load the blip model
+# URL of the page to scrape
+url = "https://en.wikipedia.org/wiki/IBM"
+# Download the page
+response = requests.get(url)
+# Parse the page with BeautifulSoup
+soup = BeautifulSoup(response.text, 'html.parser')
+# Find all img elements
+img_elements = soup.find_all('img')
+# Open a file to write the captions
+with open("captions.txt", "w") as caption_file:
+    # Iterate over each img element
+    for img_element in img_elements:
+        img_url = img_element.get('src')
+        # Skip if the image is an SVG or too small (likely an icon)
+        if 'svg' in img_url or '1x1' in img_url:
+            continue
+        # Correct the URL if it's malformed
+        if img_url.startswith('//'):
+            img_url = 'https:' + img_url
+        elif not img_url.startswith('http://') and not img_url.startswith('https://'):
+            continue  # Skip URLs that don't start with http:// or https://
+        try:
+            # Download the image
+            response = requests.get(img_url)
+            # Convert the image data to a PIL Image
+            raw_image = Image.open(BytesIO(response.content))
+            if raw_image.size[0] * raw_image.size[1] < 400:  # Skip very small images
+                continue
+            raw_image = raw_image.convert('RGB')
+            # Process the image
+            inputs = processor(raw_image, return_tensors="pt")
+            # Generate a caption for the image
+            out = model.generate(**inputs, max_new_tokens=50)
+            # Decode the generated tokens to text
+            caption = processor.decode(out[0], skip_special_tokens=True)
+            # Write the caption to the file, prepended by the image URL
+            caption_file.write(f"{img_url}: {caption}\n")
+        except Exception as e:
+            print(f"Error processing image {img_url}: {e}")
+            continue