mknolan committed on
Commit e59dc66 · verified · 1 Parent(s): 79aa3b8

Upload InternVL2 implementation

.env.hf ADDED
@@ -0,0 +1,12 @@
+ # Environment variables for Hugging Face Spaces deployment
+ PORT=7860
+ FLASK_APP=image_descriptor.py
+ TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
+ HF_HOME=/app/.cache/huggingface
+ GRADIO_ANALYTICS_ENABLED=False
+ GRADIO_SERVER_NAME=0.0.0.0
+ GRADIO_SERVER_PORT=7860
+ # Uncomment to run only the API without the UI
+ # API_ONLY=true
+ # Uncomment to run both the UI and API
+ # ENABLE_UI=true
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data_temp/page_2.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+ # Set environment variables
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV PYTHONUNBUFFERED=1
+ ENV HF_HOME=/root/.cache/huggingface
+ ENV TRANSFORMERS_CACHE=/root/.cache/huggingface/transformers
+ ENV MPLCONFIGDIR=/tmp/matplotlib
+
+ # Create necessary directories with proper permissions
+ RUN mkdir -p /root/.cache/huggingface/transformers && \
+     mkdir -p /tmp/matplotlib && \
+     chmod -R 777 /root/.cache && \
+     chmod -R 777 /tmp/matplotlib
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     git \
+     curl \
+     ca-certificates \
+     python3-pip \
+     python3-dev \
+     python3-setuptools \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a working directory
+ WORKDIR /app
+
+ # Copy requirements file and install Python dependencies
+ COPY requirements.txt .
+
+ RUN pip3 install --no-cache-dir --upgrade pip && \
+     pip3 install --no-cache-dir -r requirements.txt && \
+     # Install additional dependencies for lmdeploy
+     pip3 install --no-cache-dir cmake && \
+     pip3 install --no-cache-dir ninja && \
+     # Install flash-attention for performance
+     pip3 install --no-cache-dir flash-attn
+
+ # Copy the application files
+ COPY . .
+
+ # Make port 7860 available for the app
+ EXPOSE 7860
+
+ # Start the application
+ CMD ["python3", "app_internvl2.py"]
HUGGING-FACE-DEPLOYMENT.md ADDED
@@ -0,0 +1,97 @@
1
+ # Hugging Face Spaces Deployment Guide
2
+
3
+ This guide provides detailed steps for deploying the Image Description application to Hugging Face Spaces.
4
+
5
+ ## Prerequisites
6
+
7
+ - A [Hugging Face](https://huggingface.co/) account
8
+ - A GitHub repository containing your application code
9
+ - The code should include all files created in this project
10
+
11
+ ## Step 1: Prepare Your GitHub Repository
12
+
13
+ 1. Make sure your GitHub repository contains all necessary files:
14
+ - `image_descriptor.py` (Flask API)
15
+ - `app.py` (Gradio UI)
16
+ - `Dockerfile`
17
+ - `entrypoint.sh`
18
+ - `image_descriptor_requirements.txt`
19
+ - `data_temp/` directory with sample images
20
+ - `.env` file with configuration
21
+ - `.gitignore`
22
+
23
+ 2. Commit all changes to your repository:
24
+ ```bash
25
+ git add .
26
+ git commit -m "Prepare for Hugging Face Spaces deployment"
27
+ git push
28
+ ```
29
+
30
+ ## Step 2: Create a New Hugging Face Space
31
+
32
+ 1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
33
+ 2. Click on "Create new Space"
34
+ 3. Fill out the form:
35
+ - **Owner**: Your Hugging Face username or organization
36
+ - **Space name**: Choose a unique name (e.g., "image-description-qwen")
37
+ - **License**: Choose an appropriate license (e.g., Apache 2.0)
38
+ - **SDK**: Select "Docker"
39
+ - **Hardware**: Select "A10G" for best performance with this model
40
+ - **Visibility**: Choose public or private based on your needs
41
+
42
+ 4. Click "Create Space"
43
+
44
+ ## Step 3: Link Your GitHub Repository
45
+
46
+ 1. On your new Space page, go to the "Settings" tab
47
+ 2. Under "Repository", click "Link external repository"
48
+ 3. Enter your GitHub repository URL
49
+ 4. Click "Link repository"
50
+ 5. Choose the branch to deploy (e.g., "main" or "master")
51
+ 6. Click "Save"
52
+
53
+ ## Step 4: Configure the Space
54
+
55
+ 1. Stay on the "Settings" tab
56
+ 2. Under "Variables", you can add environment variables if needed (most are already in the Dockerfile)
57
+ 3. Under "Advanced settings", enable "Secret HTTPS Frontend"
58
+
59
+ ## Step 5: Deploy and Monitor
60
+
61
+ 1. Go to the "Factory" tab to see the build progress
62
+ 2. Wait for the build and deployment to complete (this may take 10-20 minutes)
63
+ 3. Check the logs for any errors
64
+ 4. Once deployment is successful, go to the "App" tab to use your application
65
+
66
+ ## Step 6: Test Your Deployed Application
67
+
68
+ 1. On the "App" tab, upload an image
69
+ 2. Click "Analyze Image"
70
+ 3. Verify that the model generates descriptions correctly
71
+
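+ 
+ For a programmatic smoke test of the deployed Space, the `gradio_client` package can call it from Python. This is a minimal sketch: the Space id is a placeholder, and the argument list and `api_name` must match the inputs your Gradio app actually exposes (older gradio_client versions accept a plain file path; newer ones use `gradio_client.handle_file`).
+ 
+ ```python
+ from gradio_client import Client
+ 
+ # Placeholder Space id - replace with your own "owner/space-name"
+ client = Client("your-username/image-description-qwen")
+ 
+ result = client.predict(
+     "data_temp/page_2.png",   # local test image; must match the app's first input
+     api_name="/predict",      # default endpoint name; check client.view_api() if unsure
+ )
+ print(result)
+ ```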
72
+ ## Troubleshooting Common Issues
73
+
74
+ - **Build Failures**: Check the "Factory" logs for detailed error messages
75
+ - **Memory Issues**: If the model crashes, you may need to adjust the quantization settings or use a smaller model
76
+ - **Slow Performance**: The first run will be slow as the model is downloaded; subsequent runs will be faster
77
+ - **UI Issues**: If the UI loads but the model doesn't work, check the browser console and server logs
78
+
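+ 
+ For the memory case, one option is loading the model with tighter 4-bit quantization. A minimal sketch with `transformers` and `bitsandbytes`; the model id is only an example, so match it to whatever checkpoint the app is configured to load:
+ 
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+ 
+ # 4-bit NF4 quantization cuts VRAM use roughly in half compared with 8-bit loading
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+ 
+ model = AutoModelForCausalLM.from_pretrained(
+     "Qwen/Qwen-VL-Chat",            # example model id - use the one the app loads
+     trust_remote_code=True,         # Qwen-VL ships custom modeling code
+     quantization_config=bnb_config,
+     device_map="auto",
+ )
+ ```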
79
+ ## Updating Your Space
80
+
81
+ To update your Space after making changes to your code:
82
+
83
+ 1. Make changes to your code
84
+ 2. Commit and push to GitHub
85
+ 3. The Space will automatically rebuild and deploy the updated code
86
+
87
+ ## Sharing Your Space
88
+
89
+ - You can share your Space by giving others the URL
90
+ - You can embed your Space in other websites using the iframe code provided in the "Embed" tab
91
+ - You can collaborate with others by adding them as contributors in the "Settings" tab
92
+
93
+ ## Additional Resources
94
+
95
+ - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)
96
+ - [Docker SDK Documentation](https://huggingface.co/docs/hub/spaces-sdks-docker)
97
+ - [Gradio Documentation](https://www.gradio.app/docs/)
LOCAL-TESTING.md ADDED
@@ -0,0 +1,67 @@
1
+ # Local Testing Guide
2
+
3
+ Before deploying to Hugging Face Spaces, you may want to test the application locally. This guide provides instructions for local testing.
4
+
5
+ ## Prerequisites
6
+
7
+ - CUDA-capable GPU with at least 8GB VRAM
8
+ - Python 3.8+
9
+ - pip or conda package manager
10
+
11
+ ## Steps for Local Testing
12
+
13
+ 1. **Install Dependencies**
14
+
15
+ ```bash
16
+ pip install -r image_descriptor_requirements.txt
17
+ ```
18
+
19
+ 2. **Run in UI Mode**
20
+
21
+ ```bash
22
+ python app.py
23
+ ```
24
+
25
+ This will start the Gradio UI on http://localhost:7860. You can upload images and test the model.
26
+
27
+ 3. **Run in API-only Mode**
28
+
29
+ ```bash
30
+ FLASK_APP=image_descriptor.py flask run --host=0.0.0.0 --port=5000
31
+ ```
32
+
33
+ This will start just the Flask API on http://localhost:5000.
34
+
35
+ 4. **Test the Docker Container**
36
+
37
+ ```bash
38
+ # Build the container
39
+ docker build -t image-descriptor .
40
+
41
+ # Run the container
42
+ docker run -p 7860:7860 --gpus all image-descriptor
43
+ ```
44
+
45
+ The application will be available at http://localhost:7860.
46
+
47
+ ## Testing the API
48
+
49
+ You can test the API using curl:
50
+
51
+ ```bash
52
+ # Health check
53
+ curl http://localhost:5000/health
54
+
55
+ # Process an image
56
+ curl -X POST -F "image=@data_temp/page_2.png" http://localhost:5000/describe
57
+ ```
58
+
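+ 
+ The same checks can be scripted in Python with `requests`. The endpoints come from the curl commands above; the exact JSON fields in the response depend on how `image_descriptor.py` formats its output:
+ 
+ ```python
+ import requests
+ 
+ BASE_URL = "http://localhost:5000"
+ 
+ # Health check
+ print(requests.get(f"{BASE_URL}/health", timeout=10).json())
+ 
+ # Send an image to /describe as multipart/form-data
+ with open("data_temp/page_2.png", "rb") as f:
+     resp = requests.post(f"{BASE_URL}/describe", files={"image": f}, timeout=300)
+ 
+ resp.raise_for_status()
+ print(resp.json())  # field names depend on the Flask API's response format
+ ```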
59
+ ## Troubleshooting
60
+
61
+ - **GPU Memory Issues**: If you encounter GPU memory errors, try reducing batch sizes or using a smaller model.
62
+ - **Model Download Issues**: If the model download fails, try downloading it manually from Hugging Face and place it in the `.cache/huggingface/transformers` directory.
63
+ - **Dependencies**: Make sure you have the correct CUDA version installed for your GPU.
64
+
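+ 
+ For the manual-download case, `huggingface_hub` can prefetch the weights into a local cache before the app starts. A sketch, with the repo id taken from this project's README as an example:
+ 
+ ```python
+ from huggingface_hub import snapshot_download
+ 
+ # Download the model files ahead of time so the app finds them in its cache
+ snapshot_download(
+     repo_id="Qwen/Qwen2-VL-7B",                    # example - use the model id the app loads
+     cache_dir=".cache/huggingface/transformers",   # match the cache path the app expects
+ )
+ ```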
65
+ ## Next Steps
66
+
67
+ Once you've confirmed the application works locally, you can deploy it to Hugging Face Spaces following the instructions in the main README.md.
README-HF.md ADDED
@@ -0,0 +1,33 @@
+ # Image Description with Qwen2-VL-7B
+
+ This Hugging Face Space uses the powerful Qwen2-VL-7B vision language model to generate detailed descriptions of images.
+
+ ## About
+
+ Upload any image and get:
+ - A basic description
+ - A detailed analysis
+ - A technical assessment
+
+ The app uses the Qwen2-VL-7B model with 4-bit quantization to provide efficient and high-quality image analysis.
+
+ ## Usage
+
+ 1. Upload an image or use one of the example images
+ 2. Click "Analyze Image"
+ 3. View the three types of descriptions generated by the model
+
+ ## Examples
+
+ The space includes sample images in the data_temp folder that you can use to test the model.
+
+ ## Technical Details
+
+ - **Model**: Qwen2-VL-7B
+ - **Framework**: Gradio UI + Flask API backend
+ - **Quantization**: 4-bit for efficient inference
+ - **GPU**: A10G recommended
+
+ ## Credits
+
+ - [Qwen2-VL-7B model](https://huggingface.co/Qwen/Qwen2-VL-7B) by the Qwen team
README.md CHANGED
@@ -1,12 +1,135 @@
1
  ---
2
- title: Cursor Slides Internvl2
3
- emoji: 🐠
4
- colorFrom: yellow
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Image Description with Qwen-VL
3
+ emoji: 🖼️
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ sdk_version: 3.0.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Image Description Application with Qwen-VL
13
+
14
+ This application uses the advanced Qwen-VL-Chat vision language model to generate detailed descriptions for images. It's specifically set up to describe the image in the `data_temp` folder, but can also analyze any uploaded image.
15
+
16
+ ## Features
17
+
18
+ - Loads an image from the data_temp folder or via upload
19
+ - Generates multiple types of descriptions using state-of-the-art AI:
20
+ - Basic description (brief overview)
21
+ - Detailed analysis (comprehensive description)
22
+ - Technical analysis (assessment of technical aspects)
23
+ - Displays the image (optional)
24
+ - Uses 8-bit quantization for efficient model loading
25
+ - Provides a user-friendly Gradio UI
26
+
27
+ ## Requirements
28
+
29
+ - Python 3.8 or higher
30
+ - PyTorch
31
+ - Transformers (version 4.35.2+)
32
+ - Pillow
33
+ - Matplotlib
34
+ - Accelerate
35
+ - Bitsandbytes
36
+ - Safetensors
37
+ - Gradio for the web interface
38
+
39
+ ## Hardware Requirements
40
+
41
+ This application uses a vision-language model which requires:
42
+ - A CUDA-capable GPU with at least 8GB VRAM
43
+ - 8GB+ system RAM
44
+
45
+ ## Deployment Options
46
+
47
+ ### 1. Hugging Face Spaces (Recommended)
48
+
49
+ This repository is ready to be deployed on Hugging Face Spaces.
50
+
51
+ **Steps:**
52
+ 1. Create a new Space on [Hugging Face Spaces](https://huggingface.co/spaces)
53
+ 2. Select "Docker" as the Space SDK
54
+ 3. Link this GitHub repository
55
+ 4. Select a GPU (T4 or better is recommended)
56
+ 5. Create the Space
57
+
58
+ The application will automatically deploy with the Gradio UI frontend.
59
+
60
+ ### 2. AWS SageMaker
61
+
62
+ For production deployment on AWS SageMaker:
63
+
64
+ 1. Package the application using the provided Dockerfile
65
+ 2. Upload the Docker image to Amazon ECR
66
+ 3. Create a SageMaker Model using the ECR image
67
+ 4. Deploy an endpoint with an instance type like ml.g4dn.xlarge
68
+ 5. Set up API Gateway for HTTP access (optional)
69
+
70
+ Detailed AWS instructions can be found in the `docs/aws_deployment.md` file.
71
+
72
+ ### 3. Azure Machine Learning
73
+
74
+ For Azure deployment:
75
+
76
+ 1. Create an Azure ML workspace
77
+ 2. Register the model on Azure ML
78
+ 3. Create an inference configuration
79
+ 4. Deploy to AKS or ACI with a GPU-enabled instance
80
+
81
+ Detailed Azure instructions can be found in the `docs/azure_deployment.md` file.
82
+
83
+ ## How It Works
84
+
85
+ The application uses the Qwen-VL-Chat model, a state-of-the-art multimodal AI model that can understand and describe images with impressive detail.
86
+
87
+ The script:
88
+ 1. Processes the image with three different prompts:
89
+ - "Describe this image briefly in a single paragraph."
90
+ - "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
91
+ - "What can you tell me about the technical aspects of this image?"
92
+ 2. Uses 8-bit quantization to reduce memory requirements
93
+ 3. Formats and displays the results
94
+
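+ 
+ The sketch below condenses that flow, following the usage pattern from the Qwen-VL-Chat model card (interleaved image/text queries plus 8-bit loading via bitsandbytes). Treat it as illustrative; the actual script in this repository may differ in details:
+ 
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
+ MODEL_ID = "Qwen/Qwen-VL-Chat"  # illustrative checkpoint; the app may pin a different one
+ 
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     load_in_8bit=True,          # 8-bit quantization to reduce memory requirements
+     device_map="auto",
+ ).eval()
+ 
+ prompts = [
+     "Describe this image briefly in a single paragraph.",
+     "Analyze this image in detail. Describe the main elements, any text visible, "
+     "the colors, and the overall composition.",
+     "What can you tell me about the technical aspects of this image?",
+ ]
+ 
+ for prompt in prompts:
+     # Qwen-VL-Chat takes an interleaved list of image and text segments
+     query = tokenizer.from_list_format([
+         {"image": "data_temp/page_2.png"},
+         {"text": prompt},
+     ])
+     response, _ = model.chat(tokenizer, query=query, history=None)
+     print(f"\n== {prompt}\n{response}")
+ ```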
95
+ ## Repository Structure
96
+
97
+ - `app.py` - Gradio UI for web interface
98
+ - `Dockerfile` - For containerized deployment
99
+ - `requirements.txt` - Python dependencies
100
+ - `data_temp/` - Sample images for testing
101
+
102
+ ## Local Development
103
+
104
+ 1. Install the required packages:
105
+ ```
106
+ pip install -r requirements.txt
107
+ ```
108
+
109
+ 2. Run the Gradio UI:
110
+ ```
111
+ python app.py
112
+ ```
113
+
114
+ 3. Visit `http://localhost:7860` in your browser
115
+
116
+ ## Example Output
117
+
118
+ ```
119
+ Processing image: data_temp/page_2.png
120
+ Loading model...
121
+ Generating descriptions...
122
+
123
+ ==== Image Description Results (Qwen-VL) ====
124
+
125
+ Basic Description:
126
+ The image shows a webpage or document with text content organized in multiple columns.
127
+
128
+ Detailed Description:
129
+ The image displays a structured document or webpage with multiple sections of text organized in a grid layout. The content appears to be technical or educational in nature, with what looks like headings and paragraphs of text. The color scheme is primarily black text on a white background, creating a clean, professional appearance. There appear to be multiple columns of information, possibly representing different topics or categories. The layout suggests this might be documentation, a reference guide, or an educational resource related to technical content.
130
+
131
+ Technical Analysis:
132
+ This appears to be a screenshot of a digital document or webpage. The image quality is good with clear text rendering, suggesting it was captured at an appropriate resolution. The image uses a standard document layout with what appears to be a grid or multi-column structure. The screenshot has been taken of what seems to be a text-heavy interface with minimal graphics, consistent with technical documentation or reference materials.
133
+ ```
134
+
135
+ Note: Actual descriptions will vary based on the specific image content and may be more detailed than this example.
README_INTERNVL2.md ADDED
@@ -0,0 +1,103 @@
1
+ # Image Analysis with InternVL2
2
+
3
+ This project uses the InternVL2-40B-AWQ model for high-quality image analysis, description, and understanding. It provides a Gradio web interface for users to upload images and get detailed analysis.
4
+
5
+ ## Features
6
+
7
+ - **High-Quality Image Analysis**: Uses InternVL2-40B (4-bit quantized) for state-of-the-art image understanding
8
+ - **Multiple Analysis Types**: General description, text extraction, chart analysis, people description, and technical analysis
9
+ - **Simple UI**: User-friendly Gradio interface for easy image uploading and analysis
10
+ - **Efficient Resource Usage**: 4-bit quantized model (AWQ) for reduced memory footprint and faster inference
11
+
12
+ ## Requirements
13
+
14
+ The application requires:
15
+
16
+ - Python 3.9+
17
+ - CUDA-compatible GPU (recommended 24GB+ VRAM)
18
+ - Transformers 4.37.2+
19
+ - lmdeploy 0.5.3+
20
+ - Gradio 3.38.0
21
+ - Other dependencies in `requirements.txt`
22
+
23
+ ## Setup
24
+
25
+ ### Docker Setup (Recommended)
26
+
27
+ 1. **Build the Docker image**:
28
+ ```
29
+ docker build -t internvl2-image-analysis .
30
+ ```
31
+
32
+ 2. **Run the Docker container**:
33
+ ```
34
+ docker run --gpus all -p 7860:7860 internvl2-image-analysis
35
+ ```
36
+
37
+ ### Local Setup
38
+
39
+ 1. **Create a virtual environment**:
40
+ ```
41
+ python -m venv venv
42
+ source venv/bin/activate # On Windows: venv\Scripts\activate
43
+ ```
44
+
45
+ 2. **Install dependencies**:
46
+ ```
47
+ pip install -r requirements.txt
48
+ ```
49
+
50
+ 3. **Run the application**:
51
+ ```
52
+ python app_internvl2.py
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ 1. Open your browser and navigate to `http://localhost:7860`
58
+ 2. Upload an image using the upload box
59
+ 3. Choose an analysis type from the options
60
+ 4. Click "Analyze Image" and wait for the results
61
+
62
+ ### Analysis Types
63
+
64
+ - **General**: Provides a comprehensive description of the image content
65
+ - **Text**: Focuses on identifying and extracting text from the image
66
+ - **Chart**: Analyzes charts, graphs, and diagrams in detail
67
+ - **People**: Describes people in the image - appearance, actions, and expressions
68
+ - **Technical**: Provides technical analysis of objects and their relationships
69
+
70
+ ## Testing
71
+
72
+ To test the model directly from the command line:
73
+
74
+ ```
75
+ python test_internvl2.py --image path/to/your/image.jpg --prompt "Describe this image in detail."
76
+ ```
77
+
78
+ ## Deployment to Hugging Face
79
+
80
+ To deploy to Hugging Face Spaces:
81
+
82
+ ```
83
+ python upload_internvl2_to_hf.py
84
+ ```
85
+
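+ 
+ The upload script itself is not shown here. If you need to script the upload directly, `huggingface_hub` covers the common case; the Space id below is a placeholder, and the Space must use the Docker SDK:
+ 
+ ```python
+ from huggingface_hub import HfApi
+ 
+ api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
+ 
+ # Create the Space if it does not exist yet (placeholder repo id)
+ api.create_repo(
+     repo_id="your-username/internvl2-image-analysis",
+     repo_type="space",
+     space_sdk="docker",
+     exist_ok=True,
+ )
+ 
+ # Push the project files (Dockerfile, app_internvl2.py, requirements.txt, ...)
+ api.upload_folder(
+     folder_path=".",
+     repo_id="your-username/internvl2-image-analysis",
+     repo_type="space",
+ )
+ ```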
86
+ ## Model Details
87
+
88
+ This application uses InternVL2-40B-AWQ, a 4-bit quantized version of InternVL2-40B. The original model consists of:
89
+
90
+ - **Vision Component**: InternViT-6B-448px-V1-5
91
+ - **Language Component**: Nous-Hermes-2-Yi-34B
92
+ - **Total Parameters**: ~40B (6B vision + 34B language)
93
+
94
+ ## License
95
+
96
+ This project is released under the same license as the InternVL2 model, the MIT license.
97
+
98
+ ## Acknowledgements
99
+
100
+ - [OpenGVLab](https://github.com/OpenGVLab) for creating the InternVL2 models
101
+ - [Hugging Face](https://huggingface.co/) for model hosting
102
+ - [lmdeploy](https://github.com/InternLM/lmdeploy) for model optimization
103
+ - [Gradio](https://gradio.app/) for the web interface
app.py ADDED
@@ -0,0 +1,389 @@
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import torch
4
+ import os
5
+ import time
6
+ import numpy as np
7
+
8
+ # Set CUDA memory configuration to avoid fragmentation
9
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
10
+
11
+ # Import the models after setting memory configuration
12
+ from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
13
+
14
+ # Model configuration
15
+ CLIP_MODEL_ID = "openai/clip-vit-base-patch32" # Fast classification
16
+ DETAILED_MODEL_ID = "Salesforce/blip-image-captioning-large" # Use original BLIP instead of BLIP-2
17
+ USE_GPU = torch.cuda.is_available()
18
+
19
+ # Global variables
20
+ clip_model = None
21
+ clip_processor = None
22
+ detailed_model = None
23
+ detailed_processor = None
24
+
25
+ def load_clip_model():
26
+ """Load the CLIP model for fast classification"""
27
+ global clip_model, clip_processor
28
+
29
+ # Return if already loaded
30
+ if clip_model is not None and clip_processor is not None:
31
+ return True
32
+
33
+ print("Loading CLIP model...")
34
+ try:
35
+ # First clear any GPU memory
36
+ if torch.cuda.is_available():
37
+ torch.cuda.empty_cache()
38
+
39
+ # Load processor
40
+ clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
41
+
42
+ # Load model efficiently
43
+ if USE_GPU:
44
+ clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID).to("cuda")
45
+ else:
46
+ clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
47
+
48
+ # Set to evaluation mode
49
+ clip_model.eval()
50
+ print("CLIP model loaded successfully!")
51
+ return True
52
+ except Exception as e:
53
+ print(f"Error loading CLIP model: {str(e)}")
54
+ return False
55
+
56
+ def load_detailed_model():
57
+ """Load the BLIP model for detailed image analysis"""
58
+ global detailed_model, detailed_processor
59
+
60
+ # If already loaded, return
61
+ if detailed_model is not None and detailed_processor is not None:
62
+ return True
63
+
64
+ print("Loading BLIP model...")
65
+ try:
66
+ # Clear memory first
67
+ if torch.cuda.is_available():
68
+ torch.cuda.empty_cache()
69
+
70
+ # Load processor and model for original BLIP
71
+ detailed_processor = BlipProcessor.from_pretrained(DETAILED_MODEL_ID)
72
+
73
+ # For older models like BLIP, don't use device_map='auto' or load_in_8bit
74
+ # Instead, load the model and then move it to the device
75
+ detailed_model = BlipForConditionalGeneration.from_pretrained(
76
+ DETAILED_MODEL_ID,
77
+ torch_dtype=torch.float16 if USE_GPU else torch.float32
78
+ )
79
+
80
+ # Manually move model to GPU if available
81
+ if USE_GPU:
82
+ detailed_model = detailed_model.to("cuda")
83
+
84
+ # Set to evaluation mode
85
+ detailed_model.eval()
86
+ print("BLIP model loaded successfully!")
87
+ return True
88
+ except Exception as e:
89
+ print(f"Error loading BLIP model: {str(e)}")
90
+ if "CUDA out of memory" in str(e):
91
+ print("Not enough GPU memory for the detailed model")
92
+ return False
93
+
94
+ # Categories for image classification
95
+ CATEGORIES = [
96
+ "a photograph", "a painting", "a drawing", "a digital art",
97
+ "landscape", "portrait", "cityscape", "animals", "food", "vehicle",
98
+ "building", "nature", "people", "abstract art", "technology",
99
+ "interior", "exterior", "night scene", "beach", "mountains",
100
+ "forest", "water", "flowers", "sports",
101
+ "a person", "multiple people", "a child", "an elderly person",
102
+ "a dog", "a cat", "wildlife", "a bird", "a car", "a building",
103
+ "a presentation slide", "a graph", "a chart", "a diagram", "text document",
104
+ "a screenshot", "a map", "a table of data", "a scientific figure"
105
+ ]
106
+
107
+ def get_detailed_analysis(image):
108
+ """Get detailed analysis from the image using BLIP model"""
109
+ try:
110
+ start_time = time.time()
111
+
112
+ # Make sure the model is loaded
113
+ if not load_detailed_model():
114
+ return "Couldn't load detailed analysis model."
115
+
116
+ # Convert numpy array to PIL Image
117
+ if isinstance(image, np.ndarray):
118
+ image_pil = Image.fromarray(image).convert('RGB')
119
+ else:
120
+ # If somehow it's already a PIL Image
121
+ image_pil = image.convert('RGB')
122
+
123
+ # Resize the image to improve performance
124
+ max_size = 600 # Limit to 600px on the longest side
125
+ width, height = image_pil.size
126
+ if max(width, height) > max_size:
127
+ if width > height:
128
+ new_width = max_size
129
+ new_height = int(height * (max_size / width))
130
+ else:
131
+ new_height = max_size
132
+ new_width = int(width * (max_size / height))
133
+ image_pil = image_pil.resize((new_width, new_height), Image.LANCZOS)
134
+
135
+ device = "cuda" if USE_GPU else "cpu"
136
+
137
+ # Using an unconditional approach first - this usually works better
138
+ inputs = detailed_processor(image_pil, return_tensors="pt")
139
+ if USE_GPU:
140
+ inputs = {k: v.to(device) for k, v in inputs.items()}
141
+
142
+ with torch.no_grad():
143
+ # Get a basic unconditional caption
144
+ output_ids = detailed_model.generate(
145
+ **inputs,
146
+ max_length=50,
147
+ num_beams=5,
148
+ do_sample=False,
149
+ early_stopping=True
150
+ )
151
+ base_description = detailed_processor.decode(output_ids[0], skip_special_tokens=True)
152
+
153
+ # ULTRA-SIMPLE single-word prompts to avoid any echoing
154
+ analyses = {
155
+ "text": None, # Text content
156
+ "chart": None, # Chart analysis
157
+ "subject": None # Main subject
158
+ }
159
+
160
+ # Use the base description for context with ultra-simple prompts
161
+ ultra_simple_prompts = {
162
+ f"Text in {base_description[:20]}...": "text",
163
+ f"Charts in {base_description[:20]}...": "chart",
164
+ f"Subject of {base_description[:20]}...": "subject"
165
+ }
166
+
167
+ for prompt, analysis_type in ultra_simple_prompts.items():
168
+ # Process with prompt
169
+ inputs = detailed_processor(image_pil, text=prompt, return_tensors="pt")
170
+
171
+ if USE_GPU:
172
+ inputs = {k: v.to(device) for k, v in inputs.items()}
173
+
174
+ with torch.no_grad():
175
+ output_ids = detailed_model.generate(
176
+ **inputs,
177
+ max_length=75,
178
+ num_beams=3,
179
+ do_sample=True,
180
+ temperature=0.7,
181
+ repetition_penalty=1.2,
182
+ early_stopping=True
183
+ )
184
+
185
+ result = detailed_processor.decode(output_ids[0], skip_special_tokens=True)
186
+
187
+ # SUPER AGGRESSIVE cleaning
188
+ # First, remove anything that looks like a prefix before a colon
189
+ colon_parts = result.split(":")
190
+ if len(colon_parts) > 1:
191
+ # Take everything after the first colon
192
+ result = ":".join(colon_parts[1:]).strip()
193
+
194
+ # Remove the base description if it appears
195
+ if base_description in result:
196
+ result = result.replace(base_description, "").strip()
197
+
198
+ # Remove any part of the prompt
199
+ for p in ultra_simple_prompts.keys():
200
+ if p in result:
201
+ result = result.replace(p, "").strip()
202
+
203
+ # Remove the first 20 chars of base description if they appear
204
+ if base_description[:20] in result:
205
+ result = result.replace(base_description[:20], "").strip()
206
+
207
+ # Remove all common question patterns and filler text
208
+ remove_patterns = [
209
+ "text in", "charts in", "subject of",
210
+ "in detail", "describe", "this image", "the image",
211
+ "can you", "do you", "is there", "are there", "i can see",
212
+ "i see", "there is", "there are", "it looks like",
213
+ "appears to be", "seems to be", "might be", "could be",
214
+ "i think", "i believe", "probably", "possibly", "maybe",
215
+ "it is", "this is", "that is", "these are", "those are",
216
+ "image shows", "picture shows", "image contains", "picture contains",
217
+ "in the image", "in this image", "of this image", "from this image",
218
+ "based on", "according to", "looking at", "from what i can see",
219
+ "appears to show", "depicts", "represents", "illustrates", "demonstrates",
220
+ "presents", "displays", "portrays", "reveals", "indicates", "suggests",
221
+ "we can see", "you can see", "one can see"
222
+ ]
223
+
224
+ for pattern in remove_patterns:
225
+ if pattern.lower() in result.lower():
226
+ # Find and remove each occurrence
227
+ lower_result = result.lower()
228
+ while pattern.lower() in lower_result:
229
+ idx = lower_result.find(pattern.lower())
230
+ if idx >= 0:
231
+ result = result[:idx] + result[idx+len(pattern):]
232
+ lower_result = result.lower()
233
+
234
+ # Clean up any punctuation/formatting issues
235
+ result = result.strip()
236
+ while result and result[0] in ",.;:?!-":
237
+ result = result[1:].strip()
238
+
239
+ # Remove "..." if it appears
240
+ result = result.replace("...", "").strip()
241
+
242
+ # Fix capitalization
243
+ if result and len(result) > 0:
244
+ result = result[0].upper() + result[1:] if len(result) > 1 else result[0].upper()
245
+
246
+ analyses[analysis_type] = result
247
+
248
+ # Compose the final output
249
+ output_text = f"## Detailed Description\n{base_description}\n\n"
250
+
251
+ # Only show relevant sections
252
+ if analyses['text'] and len(analyses['text']) > 5 and not any(x in analyses['text'].lower() for x in ["no text", "not any text", "can't see", "cannot see", "don't see", "couldn't find"]):
253
+ output_text += f"## Text Content\n{analyses['text']}\n\n"
254
+
255
+ if analyses['chart'] and len(analyses['chart']) > 5 and not any(x in analyses['chart'].lower() for x in ["no chart", "not any chart", "no graph", "not any graph", "can't see", "cannot see", "don't see", "couldn't find"]):
256
+ output_text += f"## Chart Analysis\n{analyses['chart']}\n\n"
257
+
258
+ output_text += f"## Main Subject\n{analyses['subject'] or 'Unable to determine main subject.'}"
259
+
260
+ # Clear GPU memory
261
+ if USE_GPU:
262
+ torch.cuda.empty_cache()
263
+
264
+ elapsed_time = time.time() - start_time
265
+ return output_text
266
+
267
+ except Exception as e:
268
+ print(f"Error in detailed analysis: {str(e)}")
269
+ # Try to clean up memory in case of error
270
+ if USE_GPU:
271
+ torch.cuda.empty_cache()
272
+ return f"Error in detailed analysis: {str(e)}"
273
+
274
+ def get_clip_classification(image):
275
+ """Get fast classification using CLIP"""
276
+ if not load_clip_model():
277
+ return []
278
+
279
+ try:
280
+ # Process with CLIP
281
+ inputs = clip_processor(
282
+ text=CATEGORIES,
283
+ images=image,
284
+ return_tensors="pt",
285
+ padding=True
286
+ )
287
+
288
+ # Move to GPU if available
289
+ if USE_GPU:
290
+ inputs = {k: v.to("cuda") for k, v in inputs.items()}
291
+
292
+ # Get predictions
293
+ with torch.inference_mode():
294
+ outputs = clip_model(**inputs)
295
+
296
+ # Process results
297
+ logits_per_image = outputs.logits_per_image
298
+ probs = logits_per_image.softmax(dim=1)
299
+
300
+ # Get top predictions
301
+ values, indices = probs[0].topk(8)
302
+
303
+ # Format results
304
+ return [(CATEGORIES[idx], value.item() * 100) for value, idx in zip(values, indices)]
305
+ except Exception as e:
306
+ print(f"Error in CLIP classification: {str(e)}")
307
+ return []
308
+
309
+ def process_image(image, get_detailed=False):
310
+ """Process image with both fast and detailed analysis"""
311
+ if image is None:
312
+ return "Please upload an image to analyze."
313
+
314
+ try:
315
+ # Start timing
316
+ start_time = time.time()
317
+
318
+ # Preprocess image
319
+ if hasattr(image, 'mode') and image.mode != 'RGB':
320
+ image = image.convert('RGB')
321
+
322
+ # Resize for efficiency
323
+ if max(image.size) > 600: # Smaller max size for better performance
324
+ ratio = 600 / max(image.size)
325
+ new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
326
+ image = image.resize(new_size, Image.LANCZOS)
327
+
328
+ # Get fast classification first
329
+ categories = get_clip_classification(image)
330
+
331
+ result = "## Image Classification\n"
332
+ result += "This image appears to contain:\n"
333
+ for category, confidence in categories:
334
+ result += f"- {category.title()} ({confidence:.1f}%)\n"
335
+
336
+ # Add detailed analysis if requested
337
+ if get_detailed:
338
+ result += "\n## Detailed Analysis\n"
339
+ detailed_result = get_detailed_analysis(image)
340
+ result += detailed_result
341
+
342
+ # Add timing information
343
+ elapsed_time = time.time() - start_time
344
+ result += f"\n\nAnalysis completed in {elapsed_time:.2f} seconds."
345
+
346
+ # Clean up memory
347
+ if torch.cuda.is_available():
348
+ torch.cuda.empty_cache()
349
+
350
+ return result
351
+
352
+ except Exception as e:
353
+ print(f"Error: {str(e)}")
354
+ if torch.cuda.is_available():
355
+ torch.cuda.empty_cache()
356
+ return f"Error processing image: {str(e)}"
357
+
358
+ # Create interface with more options
359
+ with gr.Blocks(title="Enhanced Image Analyzer") as demo:
360
+ gr.Markdown("# Enhanced Image Analyzer")
361
+ gr.Markdown("Upload an image and choose between fast classification or detailed analysis.")
362
+
363
+ with gr.Row():
364
+ with gr.Column():
365
+ input_image = gr.Image(type="pil", label="Upload an image")
366
+ detailed_checkbox = gr.Checkbox(label="Get detailed analysis (slower but better quality)", value=False)
367
+ analyze_btn = gr.Button("Analyze Image", variant="primary")
368
+
369
+ with gr.Column():
370
+ output = gr.Markdown(label="Analysis Results")
371
+
372
+ analyze_btn.click(
373
+ fn=process_image,
374
+ inputs=[input_image, detailed_checkbox],
375
+ outputs=output
376
+ )
377
+
378
+ # Optional examples
379
+ if os.path.exists("data_temp"):
380
+ examples = [os.path.join("data_temp", f) for f in os.listdir("data_temp")
381
+ if f.endswith(('.png', '.jpg', '.jpeg'))]
382
+ if examples:
383
+ gr.Examples(examples=examples, inputs=input_image)
384
+
385
+ if __name__ == "__main__":
386
+ # Start with clean memory
387
+ if torch.cuda.is_available():
388
+ torch.cuda.empty_cache()
389
+ demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
app_internvl2.py ADDED
@@ -0,0 +1,165 @@
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import os
4
+ import time
5
+ import numpy as np
6
+ import torch
7
+ import math
8
+
9
+ # Import lmdeploy for InternVL2 model
10
+ from lmdeploy import pipeline, TurbomindEngineConfig
11
+ from lmdeploy.vl import load_image
12
+
13
+ # Set environment variables
14
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
15
+
16
+ # Model configuration
17
+ MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ" # 4-bit quantized model
18
+ USE_GPU = torch.cuda.is_available()
19
+
20
+ # Global variables for model
21
+ internvl2_pipeline = None
22
+
23
+ def load_internvl2_model():
24
+ """Load the InternVL2 model using lmdeploy"""
25
+ global internvl2_pipeline
26
+
27
+ # If already loaded, return
28
+ if internvl2_pipeline is not None:
29
+ return True
30
+
31
+ print("Loading InternVL2 model...")
32
+ try:
33
+ # Configure for AWQ quantized model
34
+ backend_config = TurbomindEngineConfig(model_format='awq')
35
+
36
+ # Create pipeline
37
+ internvl2_pipeline = pipeline(
38
+ MODEL_ID,
39
+ backend_config=backend_config,
40
+ log_level='INFO'
41
+ )
42
+
43
+ print("InternVL2 model loaded successfully!")
44
+ return True
45
+ except Exception as e:
46
+ print(f"Error loading InternVL2 model: {str(e)}")
47
+ if "CUDA out of memory" in str(e):
48
+ print("Not enough GPU memory for the model")
49
+ return False
50
+
51
+ def analyze_image(image, prompt):
52
+ """Analyze the image using InternVL2 model"""
53
+ try:
54
+ start_time = time.time()
55
+
56
+ # Make sure the model is loaded
57
+ if not load_internvl2_model():
58
+ return "Couldn't load InternVL2 model."
59
+
60
+ # Convert numpy array to PIL Image
61
+ if isinstance(image, np.ndarray):
62
+ image_pil = Image.fromarray(image).convert('RGB')
63
+ else:
64
+ # If somehow it's already a PIL Image
65
+ image_pil = image.convert('RGB')
66
+
67
+ # Run inference with the model
68
+ response = internvl2_pipeline((prompt, image_pil))
69
+
70
+ # Get the response text
71
+ result = response.text
72
+
73
+ elapsed_time = time.time() - start_time
74
+ return result
75
+
76
+ except Exception as e:
77
+ print(f"Error in image analysis: {str(e)}")
78
+ # Try to clean up memory in case of error
79
+ if USE_GPU:
80
+ torch.cuda.empty_cache()
81
+ return f"Error in image analysis: {str(e)}"
82
+
83
+ def process_image(image, analysis_type="general"):
84
+ """Process the image and return the analysis"""
85
+ if image is None:
86
+ return "Please upload an image."
87
+
88
+ # Define prompt based on analysis type
89
+ if analysis_type == "general":
90
+ prompt = "Describe this image in detail."
91
+ elif analysis_type == "text":
92
+ prompt = "What text can you see in this image? Please transcribe it accurately."
93
+ elif analysis_type == "chart":
94
+ prompt = "Analyze any charts, graphs or diagrams in this image in detail, including trends, data points, and conclusions."
95
+ elif analysis_type == "people":
96
+ prompt = "Describe the people in this image - their appearance, actions, and expressions."
97
+ elif analysis_type == "technical":
98
+ prompt = "Provide a technical analysis of this image, including object identification, spatial relationships, and any technical elements present."
99
+ else:
100
+ prompt = "Describe this image in detail."
101
+
102
+ start_time = time.time()
103
+
104
+ # Get analysis from the model
105
+ analysis = analyze_image(image, prompt)
106
+
107
+ elapsed_time = time.time() - start_time
108
+ return f"{analysis}\n\nAnalysis completed in {elapsed_time:.2f} seconds."
109
+
110
+ # Define the Gradio interface
111
+ def create_interface():
112
+ with gr.Blocks(title="Image Analysis with InternVL2") as demo:
113
+ gr.Markdown("# Image Analysis with InternVL2-40B")
114
+ gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")
115
+
116
+ with gr.Row():
117
+ with gr.Column(scale=1):
118
+ input_image = gr.Image(type="pil", label="Upload Image")
119
+ analysis_type = gr.Radio(
120
+ ["general", "text", "chart", "people", "technical"],
121
+ label="Analysis Type",
122
+ value="general"
123
+ )
124
+ submit_btn = gr.Button("Analyze Image")
125
+
126
+ with gr.Column(scale=2):
127
+ output_text = gr.Textbox(label="Analysis Result", lines=20)
128
+
129
+ submit_btn.click(
130
+ fn=process_image,
131
+ inputs=[input_image, analysis_type],
132
+ outputs=output_text
133
+ )
134
+
135
+ gr.Markdown("""
136
+ ## Analysis Types
137
+ - **General**: General description of the image
138
+ - **Text**: Focus on identifying and transcribing text in the image
139
+ - **Chart**: Detailed analysis of charts, graphs, and diagrams
140
+ - **People**: Description of people, their appearance and actions
141
+ - **Technical**: Technical analysis identifying objects and spatial relationships
142
+ """)
143
+
144
+ # Examples
145
+ gr.Examples(
146
+ examples=[
147
+ ["data_temp/page_2.png", "general"],
148
+ ["data_temp/page_2.png", "text"],
149
+ ["data_temp/page_2.png", "chart"]
150
+ ],
151
+ inputs=[input_image, analysis_type],
152
+ outputs=output_text,
153
+ fn=process_image,
154
+ cache_examples=True,
155
+ )
156
+
157
+ return demo
158
+
159
+ # Main function
160
+ if __name__ == "__main__":
161
+ # Create the Gradio interface
162
+ demo = create_interface()
163
+
164
+ # Launch the interface
165
+ demo.launch(share=False)
csv_to_html.py ADDED
@@ -0,0 +1,77 @@
1
+ import pandas as pd
2
+ from jinja2 import Template
3
+
4
+ # Read the CSV file
5
+ df = pd.read_csv('sample_data.csv')
6
+
7
+ # Calculate column totals (excluding the first column which contains row labels)
8
+ totals = df.iloc[:, 1:].sum()
9
+
10
+ # Create HTML template
11
+ html_template = """
12
+ <!DOCTYPE html>
13
+ <html>
14
+ <head>
15
+ <title>CSV Data Report</title>
16
+ <style>
17
+ body {
18
+ font-family: Arial, sans-serif;
19
+ margin: 20px;
20
+ }
21
+ table {
22
+ border-collapse: collapse;
23
+ width: 100%;
24
+ margin-bottom: 20px;
25
+ }
26
+ th, td {
27
+ border: 1px solid #ddd;
28
+ padding: 8px;
29
+ text-align: right;
30
+ }
31
+ th {
32
+ background-color: #f2f2f2;
33
+ }
34
+ .total-row {
35
+ font-weight: bold;
36
+ background-color: #e6e6e6;
37
+ }
38
+ h1 {
39
+ color: #333;
40
+ }
41
+ </style>
42
+ </head>
43
+ <body>
44
+ <h1>CSV Data Report</h1>
45
+
46
+ <h2>Raw Data</h2>
47
+ {{ raw_data | safe }}
48
+
49
+ <h2>Column Totals</h2>
50
+ <table>
51
+ <tr>
52
+ {% for column in totals.index %}
53
+ <th>{{ column }}</th>
54
+ {% endfor %}
55
+ </tr>
56
+ <tr class="total-row">
57
+ {% for value in totals.values %}
58
+ <td>{{ "{:,.2f}".format(value) }}</td>
59
+ {% endfor %}
60
+ </tr>
61
+ </table>
62
+ </body>
63
+ </html>
64
+ """
65
+
66
+ # Convert the raw data to HTML
67
+ raw_data_html = df.to_html(index=False, classes='table')
68
+
69
+ # Create the template and render it
70
+ template = Template(html_template)
71
+ html_output = template.render(raw_data=raw_data_html, totals=totals)
72
+
73
+ # Save the HTML output
74
+ with open('report.html', 'w') as f:
75
+ f.write(html_output)
76
+
77
+ print("Report has been generated as 'report.html'")
data_temp/page_2.png ADDED

Git LFS Details

  • SHA256: 20fab46eac6ed2f67da21966eabaa80d15857a4f9d069d7c8ee29d9eb6498990
  • Pointer size: 131 Bytes
  • Size of remote file: 274 kB
docs/aws_deployment.md ADDED
@@ -0,0 +1,170 @@
1
+ # AWS SageMaker Deployment Guide
2
+
3
+ This guide provides step-by-step instructions for deploying the Image Description application to AWS SageMaker.
4
+
5
+ ## Prerequisites
6
+
7
+ - AWS account with SageMaker permissions
8
+ - AWS CLI installed and configured
9
+ - Docker installed on your local machine
10
+ - The source code from this repository
11
+
12
+ ## Step 1: Create an Amazon ECR Repository
13
+
14
+ ```bash
15
+ aws ecr create-repository --repository-name image-descriptor
16
+ ```
17
+
18
+ Note the repository URI returned by this command. You'll use it in the next step.
19
+
20
+ ## Step 2: Build and Push the Docker Image
21
+
22
+ 1. Log in to ECR:
23
+
24
+ ```bash
25
+ aws ecr get-login-password --region your-region | docker login --username AWS --password-stdin your-account-id.dkr.ecr.your-region.amazonaws.com
26
+ ```
27
+
28
+ 2. Build the Docker image:
29
+
30
+ ```bash
31
+ docker build -t image-descriptor .
32
+ ```
33
+
34
+ 3. Tag and push the image:
35
+
36
+ ```bash
37
+ docker tag image-descriptor:latest your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest
38
+ docker push your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest
39
+ ```
40
+
41
+ ## Step 3: Create a SageMaker Model
42
+
43
+ 1. Create a model.json file:
44
+
45
+ ```json
46
+ {
47
+ "ModelName": "QwenVLImageDescriptor",
48
+ "PrimaryContainer": {
49
+ "Image": "your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest",
50
+ "Environment": {
51
+ "PORT": "8080"
52
+ }
53
+ },
54
+ "ExecutionRoleArn": "arn:aws:iam::your-account-id:role/service-role/AmazonSageMaker-ExecutionRole"
55
+ }
56
+ ```
57
+
58
+ 2. Create the SageMaker model:
59
+
60
+ ```bash
61
+ aws sagemaker create-model --cli-input-json file://model.json
62
+ ```
63
+
64
+ ## Step 4: Create an Endpoint Configuration
65
+
66
+ 1. Create a config.json file:
67
+
68
+ ```json
69
+ {
70
+ "EndpointConfigName": "QwenVLImageDescriptorConfig",
71
+ "ProductionVariants": [
72
+ {
73
+ "VariantName": "AllTraffic",
74
+ "ModelName": "QwenVLImageDescriptor",
75
+ "InstanceType": "ml.g5.2xlarge",
76
+ "InitialInstanceCount": 1
77
+ }
78
+ ]
79
+ }
80
+ ```
81
+
82
+ 2. Create the endpoint configuration:
83
+
84
+ ```bash
85
+ aws sagemaker create-endpoint-config --cli-input-json file://config.json
86
+ ```
87
+
88
+ ## Step 5: Create the Endpoint
89
+
90
+ ```bash
91
+ aws sagemaker create-endpoint --endpoint-name qwen-vl-image-descriptor --endpoint-config-name QwenVLImageDescriptorConfig
92
+ ```
93
+
94
+ This will take several minutes to deploy.
95
+
96
+ ## Step 6: Invoke the Endpoint
97
+
98
+ You can invoke the endpoint using the AWS SDK or AWS CLI.
99
+
100
+ Using Python SDK:
101
+
102
+ ```python
103
+ import boto3
104
+ import json
105
+ import base64
106
+ from PIL import Image
107
+ import io
108
+
109
+ # Initialize the SageMaker runtime client
110
+ runtime = boto3.client('sagemaker-runtime')
111
+
112
+ # Load and encode the image
113
+ with open('data_temp/page_2.png', 'rb') as f:
114
+ image_data = f.read()
115
+ image_b64 = base64.b64encode(image_data).decode('utf-8')
116
+
117
+ # Create the request payload
118
+ payload = {
119
+ 'image_data': image_b64
120
+ }
121
+
122
+ # Invoke the endpoint
123
+ response = runtime.invoke_endpoint(
124
+ EndpointName='qwen-vl-image-descriptor',
125
+ ContentType='application/json',
126
+ Body=json.dumps(payload)
127
+ )
128
+
129
+ # Parse the response
130
+ result = json.loads(response['Body'].read().decode())
131
+ print(json.dumps(result, indent=2))
132
+ ```
133
+
134
+ ## Step 7: Set Up API Gateway (Optional)
135
+
136
+ For public HTTP access, set up an API Gateway:
137
+
138
+ 1. Create a new REST API in API Gateway
139
+ 2. Create a new resource and POST method
140
+ 3. Configure the integration to use the SageMaker endpoint
141
+ 4. Deploy the API to a stage
142
+ 5. Note the API Gateway URL for client use
143
+
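+ 
+ Once the API is deployed, clients can call it over plain HTTPS instead of going through the SageMaker SDK. A sketch with `requests`; the invoke URL is a placeholder for your API Gateway stage, and the JSON shape mirrors the SageMaker example above:
+ 
+ ```python
+ import base64
+ import requests
+ 
+ API_URL = "https://abc123.execute-api.your-region.amazonaws.com/prod/describe"  # placeholder
+ 
+ with open("data_temp/page_2.png", "rb") as f:
+     image_b64 = base64.b64encode(f.read()).decode("utf-8")
+ 
+ resp = requests.post(API_URL, json={"image_data": image_b64}, timeout=120)
+ resp.raise_for_status()
+ print(resp.json())
+ ```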
144
+ ## Cost Optimization
145
+
146
+ To optimize costs:
147
+
148
+ 1. Use SageMaker Serverless Inference instead of a dedicated endpoint
149
+ 2. Implement auto-scaling for your endpoint
150
+ 3. Use Spot Instances for non-critical workloads
151
+ 4. Schedule endpoints to be active only during business hours
152
+
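+ 
+ For item 2, endpoint auto-scaling is configured through Application Auto Scaling rather than SageMaker itself. A sketch with boto3, using the endpoint and variant names created above; the capacity bounds and target value are examples:
+ 
+ ```python
+ import boto3
+ 
+ autoscaling = boto3.client("application-autoscaling")
+ resource_id = "endpoint/qwen-vl-image-descriptor/variant/AllTraffic"
+ 
+ # Register the endpoint variant as a scalable target (1-2 instances here)
+ autoscaling.register_scalable_target(
+     ServiceNamespace="sagemaker",
+     ResourceId=resource_id,
+     ScalableDimension="sagemaker:variant:DesiredInstanceCount",
+     MinCapacity=1,
+     MaxCapacity=2,
+ )
+ 
+ # Scale on invocations per instance (target value is an example)
+ autoscaling.put_scaling_policy(
+     PolicyName="invocations-target-tracking",
+     ServiceNamespace="sagemaker",
+     ResourceId=resource_id,
+     ScalableDimension="sagemaker:variant:DesiredInstanceCount",
+     PolicyType="TargetTrackingScaling",
+     TargetTrackingScalingPolicyConfiguration={
+         "TargetValue": 5.0,
+         "PredefinedMetricSpecification": {
+             "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance"
+         },
+     },
+ )
+ ```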
153
+ ## Monitoring
154
+
155
+ Set up CloudWatch Alarms to monitor:
156
+
157
+ 1. Endpoint invocation metrics
158
+ 2. Error rates
159
+ 3. Latency
160
+ 4. Instance utilization
161
+
162
+ ## Cleanup
163
+
164
+ To avoid ongoing charges, delete resources when not in use:
165
+
166
+ ```bash
167
+ aws sagemaker delete-endpoint --endpoint-name qwen-vl-image-descriptor
168
+ aws sagemaker delete-endpoint-config --endpoint-config-name QwenVLImageDescriptorConfig
169
+ aws sagemaker delete-model --model-name QwenVLImageDescriptor
170
+ ```
docs/azure_deployment.md ADDED
@@ -0,0 +1,365 @@
1
+ # Azure Machine Learning Deployment Guide
2
+
3
+ This guide provides step-by-step instructions for deploying the Image Description application to Azure Machine Learning.
4
+
5
+ ## Prerequisites
6
+
7
+ - Azure subscription
8
+ - Azure CLI installed and configured
9
+ - Azure Machine Learning workspace
10
+ - The source code from this repository
11
+
12
+ ## Step 1: Set Up Azure Machine Learning
13
+
14
+ 1. Create a Resource Group (if you don't have one):
15
+
16
+ ```bash
17
+ az group create --name image-descriptor-rg --location eastus
18
+ ```
19
+
20
+ 2. Create an Azure Machine Learning workspace:
21
+
22
+ ```bash
23
+ az ml workspace create --workspace-name image-descriptor-ws \
24
+ --resource-group image-descriptor-rg \
25
+ --location eastus
26
+ ```
27
+
28
+ ## Step 2: Create a Compute Cluster
29
+
30
+ Create a GPU-enabled compute cluster for training and inference:
31
+
32
+ ```bash
33
+ az ml compute create --name gpu-cluster \
34
+ --workspace-name image-descriptor-ws \
35
+ --resource-group image-descriptor-rg \
36
+ --type AmlCompute \
37
+ --min-instances 0 \
38
+ --max-instances 1 \
39
+ --size Standard_NC6s_v3
40
+ ```
41
+
42
+ ## Step 3: Prepare Environment Configuration
43
+
44
+ Create an environment.yml file to define dependencies:
45
+
46
+ ```yaml
47
+ name: image_descriptor_env
48
+ channels:
49
+ - pytorch
50
+ - conda-forge
51
+ - defaults
52
+ dependencies:
53
+ - python=3.9
54
+ - pip=23.0
55
+ - pytorch=2.0.0
56
+ - torchvision=0.15.0
57
+ - pip:
58
+ - transformers>=4.36.0
59
+ - accelerate>=0.25.0
60
+ - bitsandbytes>=0.41.0
61
+ - safetensors>=0.4.0
62
+ - flask>=2.3.2
63
+ - flask-cors>=4.0.0
64
+ - gunicorn>=21.2.0
65
+ - pillow>=10.0.0
66
+ - matplotlib>=3.7.0
67
+ - python-dotenv>=1.0.0
68
+ - azureml-core>=1.48.0
69
+ - azureml-defaults>=1.48.0
70
+ - inference-schema>=1.4.1
71
+ ```
72
+
73
+ ## Step 4: Create a Model Entry Script
74
+
75
+ Create a file called `score.py` to handle Azure ML model inference:
76
+
77
+ ```python
78
+ import json
79
+ import os
80
+ import io
81
+ import base64
82
+ import logging
83
+ import torch
84
+ from PIL import Image
85
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
86
+
87
+ # Configure logging
88
+ logging.basicConfig(level=logging.INFO)
89
+ logger = logging.getLogger(__name__)
90
+
91
+ # Global variables
92
+ model = None
93
+ processor = None
94
+ tokenizer = None
95
+
96
+ def init():
97
+ """Initialize the model when the service starts"""
98
+ global model, processor, tokenizer
99
+
100
+ logger.info("Loading model...")
101
+ model_id = "Qwen/Qwen2-VL-7B"
102
+
103
+ # Load model components with quantization for efficiency
104
+ processor = AutoProcessor.from_pretrained(model_id)
105
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
106
+
107
+ # Load model with 4-bit quantization to reduce memory requirements
108
+ model = AutoModelForCausalLM.from_pretrained(
109
+ model_id,
110
+ torch_dtype=torch.bfloat16,
111
+ load_in_4bit=True,
112
+ device_map="auto"
113
+ )
114
+ logger.info("Model loaded successfully")
115
+
116
+ def run(raw_data):
117
+ """Process an image and generate descriptions
118
+
119
+ Args:
120
+ raw_data: A JSON string containing the image as base64 encoded data
121
+
122
+ Returns:
123
+ A JSON string containing the descriptions
124
+ """
125
+ global model, processor, tokenizer
126
+
127
+ try:
128
+ # Parse input
129
+ data = json.loads(raw_data)
130
+
131
+ # Get the image data (from base64 or URL)
132
+ if 'image_data' in data:
133
+ image_bytes = base64.b64decode(data['image_data'])
134
+ image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
135
+ logger.info("Loaded image from base64 data")
136
+ elif 'image_url' in data:
137
+ # Handle image URLs (for Azure Storage or public URLs)
138
+ from urllib.request import urlopen
139
+ with urlopen(data['image_url']) as response:
140
+ image_bytes = response.read()
141
+ image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
142
+ logger.info(f"Loaded image from URL: {data['image_url']}")
143
+ else:
144
+ return json.dumps({"error": "No image data or URL provided"})
145
+
146
+ # Process the image
147
+ inputs = processor(
148
+ images=image,
149
+ return_tensors="pt"
150
+ ).to(model.device)
151
+
152
+ # Basic description prompt
153
+ prompt_basic = "Describe this image briefly."
154
+ input_ids_basic = tokenizer(prompt_basic, return_tensors="pt").input_ids.to(model.device)
155
+
156
+ # Detailed description prompt
157
+ prompt_detailed = "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
158
+ input_ids_detailed = tokenizer(prompt_detailed, return_tensors="pt").input_ids.to(model.device)
159
+
160
+ # Technical analysis prompt
161
+ prompt_technical = "What can you tell me about the technical aspects of this image?"
162
+ input_ids_technical = tokenizer(prompt_technical, return_tensors="pt").input_ids.to(model.device)
163
+
164
+ # Generate outputs for each prompt
165
+ # Basic description
166
+ with torch.no_grad():
167
+ output_basic = model.generate(
168
+ **inputs,
169
+ input_ids=input_ids_basic,
170
+ max_new_tokens=150,
171
+ do_sample=False
172
+ )
173
+ basic_description = tokenizer.decode(output_basic[0], skip_special_tokens=True).replace(prompt_basic, "").strip()
174
+
175
+ # Detailed description
176
+ with torch.no_grad():
177
+ output_detailed = model.generate(
178
+ **inputs,
179
+ input_ids=input_ids_detailed,
180
+ max_new_tokens=300,
181
+ do_sample=False
182
+ )
183
+ detailed_description = tokenizer.decode(output_detailed[0], skip_special_tokens=True).replace(prompt_detailed, "").strip()
184
+
185
+ # Technical analysis
186
+ with torch.no_grad():
187
+ output_technical = model.generate(
188
+ **inputs,
189
+ input_ids=input_ids_technical,
190
+ max_new_tokens=200,
191
+ do_sample=False
192
+ )
193
+ technical_analysis = tokenizer.decode(output_technical[0], skip_special_tokens=True).replace(prompt_technical, "").strip()
194
+
195
+ # Return the results
196
+ return json.dumps({
197
+ "success": True,
198
+ "basic_description": basic_description,
199
+ "detailed_description": detailed_description,
200
+ "technical_analysis": technical_analysis
201
+ })
202
+
203
+ except Exception as e:
204
+ logger.error(f"Error processing image: {str(e)}", exc_info=True)
205
+ return json.dumps({"error": f"Error generating description: {str(e)}"})
206
+ ```
207
+
208
+ ## Step 5: Register the Model
209
+
210
+ 1. Create a model.yml file:
211
+
212
+ ```yaml
213
+ $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
214
+ name: qwen-vl-image-descriptor
215
+ version: 1
216
+ description: Qwen2-VL-7B model for image description
217
+ path: .
218
+ ```
219
+
220
+ 2. Register the model:
221
+
222
+ ```bash
223
+ az ml model create --file model.yml \
224
+ --workspace-name image-descriptor-ws \
225
+ --resource-group image-descriptor-rg
226
+ ```
227
+
228
+ ## Step 6: Deploy as an Online Endpoint
229
+
230
+ 1. Create an endpoint.yml file:
231
+
232
+ ```yaml
233
+ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
234
+ name: image-descriptor-endpoint
235
+ description: Endpoint for image description
236
+ auth_mode: key
237
+ ```
238
+
239
+ 2. Create a deployment.yml file:
240
+
241
+ ```yaml
242
+ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
243
+ name: qwen-vl-deployment
244
+ endpoint_name: image-descriptor-endpoint
245
+ model: azureml:qwen-vl-image-descriptor:1
246
+ environment:
247
+ conda_file: environment.yml
248
+ image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest
249
+ instance_type: Standard_NC6s_v3
250
+ instance_count: 1
251
+ request_settings:
252
+ max_concurrent_requests_per_instance: 1
253
+ request_timeout_ms: 120000
254
+ ```
255
+
256
+ 3. Create the endpoint:
257
+
258
+ ```bash
259
+ az ml online-endpoint create --file endpoint.yml \
260
+ --workspace-name image-descriptor-ws \
261
+ --resource-group image-descriptor-rg
262
+ ```
263
+
264
+ 4. Create the deployment:
265
+
266
+ ```bash
267
+ az ml online-deployment create --file deployment.yml \
268
+ --workspace-name image-descriptor-ws \
269
+ --resource-group image-descriptor-rg
270
+ ```
271
+
272
+ 5. Allocate 100% traffic to the deployment:
273
+
274
+ ```bash
275
+ az ml online-endpoint update --name image-descriptor-endpoint \
276
+ --traffic "qwen-vl-deployment=100" \
277
+ --workspace-name image-descriptor-ws \
278
+ --resource-group image-descriptor-rg
279
+ ```
280
+
281
+ ## Step 7: Test the Endpoint
282
+
283
+ You can test the endpoint using the Azure ML SDK:
284
+
285
+ ```python
286
+ import json
287
+ import base64
288
+ from azure.ai.ml import MLClient
289
+ from azure.identity import DefaultAzureCredential
290
+ from azure.ai.ml.entities import ManagedOnlineEndpoint
291
+
292
+ # Get a handle to the workspace
293
+ credential = DefaultAzureCredential()
294
+ ml_client = MLClient(
295
+ credential=credential,
296
+ subscription_id="your-subscription-id",
297
+ resource_group_name="image-descriptor-rg",
298
+ workspace_name="image-descriptor-ws"
299
+ )
300
+
301
+ # Get endpoint
302
+ endpoint = ml_client.online_endpoints.get("image-descriptor-endpoint")
303
+
304
+ # Load and encode the image
305
+ with open('data_temp/page_2.png', 'rb') as f:
306
+ image_data = f.read()
307
+ image_b64 = base64.b64encode(image_data).decode('utf-8')
308
+
309
+ # Create the request payload
310
+ payload = {
311
+ 'image_data': image_b64
312
+ }
+
+ # invoke() expects a *path* to a JSON request file, so write the payload to disk first
+ with open('request.json', 'w') as f:
+     json.dump(payload, f)
313
+
314
+ # Invoke the endpoint
315
+ response = ml_client.online_endpoints.invoke(
316
+ endpoint_name="image-descriptor-endpoint",
317
+ request_file="request.json",
318
+ deployment_name="qwen-vl-deployment"
319
+ )
320
+
321
+ # Parse the response
322
+ result = json.loads(response)
323
+ print(json.dumps(result, indent=2))
324
+ ```
325
+
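+ If you prefer the CLI to the SDK, the same request can be sent with `az ml online-endpoint invoke` (a sketch; it assumes the payload has already been written to `request.json`, as in the Python example above):
+
+ ```bash
+ az ml online-endpoint invoke --name image-descriptor-endpoint \
+   --deployment-name qwen-vl-deployment \
+   --request-file request.json \
+   --workspace-name image-descriptor-ws \
+   --resource-group image-descriptor-rg
+ ```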
326
+ ## Cost Optimization
327
+
328
+ To optimize costs:
329
+
330
+ 1. Use a smaller compute size if possible
331
+ 2. Scale the deployment down when it is idle (managed online endpoints keep at least one instance running, so delete the deployment if you need true zero cost)
332
+ 3. Set up autoscaling rules (see the sketch after this list)
333
+ 4. Consider reserved instances for long-term deployments
334
+
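+ As a sketch of item 3, a managed online deployment can be wired to an Azure Monitor autoscale setting. The resource ID below is an assumption built from the names used in this guide; substitute your own subscription ID and adjust the instance bounds:
+
+ ```bash
+ # ARM resource ID of the deployment created in Step 6 (sketch; fill in your subscription ID)
+ DEPLOYMENT_ID="/subscriptions/<your-subscription-id>/resourceGroups/image-descriptor-rg/providers/Microsoft.MachineLearningServices/workspaces/image-descriptor-ws/onlineEndpoints/image-descriptor-endpoint/deployments/qwen-vl-deployment"
+
+ # Keep one instance warm and allow scaling out to two under load
+ az monitor autoscale create --name image-descriptor-autoscale \
+   --resource "$DEPLOYMENT_ID" \
+   --resource-group image-descriptor-rg \
+   --min-count 1 --max-count 2 --count 1
+ ```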
335
+ ## Monitoring
336
+
337
+ Monitor your endpoint using:
338
+
339
+ 1. Azure Monitor (see the example query after this list)
340
+ 2. Application Insights
341
+ 3. Azure ML metrics dashboard
342
+ 4. Alerts for anomalous traffic, latency, or error rates
343
+
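+ For example, basic request metrics for the endpoint can be pulled from Azure Monitor on the command line. This is a sketch: the resource ID follows the naming in this guide, and the metric names are assumptions, so check the Metrics blade in the portal for the exact names exposed by your endpoint:
+
+ ```bash
+ ENDPOINT_ID="/subscriptions/<your-subscription-id>/resourceGroups/image-descriptor-rg/providers/Microsoft.MachineLearningServices/workspaces/image-descriptor-ws/onlineEndpoints/image-descriptor-endpoint"
+
+ # Metric names are assumptions -- verify them in the Azure portal first
+ az monitor metrics list --resource "$ENDPOINT_ID" \
+   --metric RequestsPerMinute RequestLatency \
+   --interval PT5M
+ ```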
344
+ ## Cleanup
345
+
346
+ To avoid ongoing charges, delete resources when not in use:
347
+
348
+ ```bash
349
+ # Delete the endpoint
350
+ az ml online-endpoint delete --name image-descriptor-endpoint \
351
+ --workspace-name image-descriptor-ws \
352
+ --resource-group image-descriptor-rg -y
353
+
354
+ # Delete compute cluster
355
+ az ml compute delete --name gpu-cluster \
356
+ --workspace-name image-descriptor-ws \
357
+ --resource-group image-descriptor-rg -y
358
+
359
+ # Delete workspace (optional)
360
+ az ml workspace delete --name image-descriptor-ws \
361
+ --resource-group image-descriptor-rg -y
362
+
363
+ # Delete resource group (optional)
364
+ az group delete --name image-descriptor-rg -y
365
+ ```
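+
+ To confirm the cleanup worked, you can check whether the resource group still exists (prints `true` or `false`):
+
+ ```bash
+ az group exists --name image-descriptor-rg
+ ```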
entrypoint.sh ADDED
@@ -0,0 +1,20 @@
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ # Load environment variables
5
+ if [ -f .env ]; then source .env; fi  # tolerate a missing .env file under 'set -e'
6
+
7
+ # Check if we should run in UI or API mode
8
+ if [ "$ENABLE_UI" = "true" ]; then
9
+ echo "Starting Gradio UI with API backend..."
10
+ python app.py --api-server &  # start the API backend in the background ('exec' would replace this shell and skip the lines below)
11
+ sleep 5 # Give the API server time to start
12
+ exec python app.py
13
+ elif [ "$API_ONLY" = "true" ]; then
14
+ echo "Starting API server only..."
15
+ exec gunicorn --bind 0.0.0.0:$PORT --workers 1 --timeout 300 "image_descriptor:app"
16
+ else
17
+ # Default behavior for Hugging Face Spaces: run both
18
+ echo "Starting service in Hugging Face Spaces mode..."
19
+ exec python app.py
20
+ fi
image_descriptor.py ADDED
@@ -0,0 +1,169 @@
1
+ import os
2
+ import torch
3
+ import io
4
+ from PIL import Image
5
+ import matplotlib.pyplot as plt
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPImageProcessor
7
+ import logging
8
+ import time
9
+
10
+ # Configure logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Global variables to store the model and processors
18
+ model = None
19
+ processor = None
20
+ tokenizer = None
21
+
22
+ def describe_image(image_path=None, image_data=None, show_image=False):
23
+ """
24
+ Load an image and generate a description using Qwen2-VL-7B model.
25
+
26
+ Args:
27
+ image_path (str, optional): Path to the image file
28
+ image_data (bytes, optional): Raw image data
29
+ show_image (bool): Whether to display the image
30
+
31
+ Returns:
32
+ dict: Descriptions of the image
33
+ """
34
+ global model, processor, tokenizer
35
+
36
+ # Initialize model if not already loaded
37
+ if model is None or processor is None or tokenizer is None:
38
+ load_model()
39
+
40
+ # Check if we have valid input
41
+ if image_path is None and image_data is None:
42
+ return {"error": "No image provided"}
43
+
44
+ try:
45
+ # Load the image
46
+ if image_path is not None:
47
+ if not os.path.exists(image_path):
48
+ return {"error": f"Image not found at {image_path}"}
49
+ logger.info(f"Processing image from path: {image_path}")
50
+ image = Image.open(image_path).convert('RGB')
51
+ else:
52
+ logger.info("Processing image from uploaded data")
53
+ image = Image.open(io.BytesIO(image_data)).convert('RGB')
54
+
55
+ # Display the image if requested (for local testing only)
56
+ if show_image:
57
+ plt.figure(figsize=(10, 8))
58
+ plt.imshow(image)
59
+ plt.axis('off')
60
+ if image_path:
61
+ plt.title(os.path.basename(image_path))
62
+ plt.show()
63
+
64
+ # Process the image
65
+ logger.info("Generating descriptions...")
66
+
67
+ # Process image and create inputs for various prompts
68
+ pixel_values = processor(images=image, return_tensors="pt").to(model.device)
69
+
70
+ # Basic description prompt
71
+ prompt_basic = "Describe this image briefly."
72
+ input_ids_basic = tokenizer(prompt_basic, return_tensors="pt").input_ids.to(model.device)
73
+
74
+ # Detailed description prompt
75
+ prompt_detailed = "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
76
+ input_ids_detailed = tokenizer(prompt_detailed, return_tensors="pt").input_ids.to(model.device)
77
+
78
+ # Technical analysis prompt
79
+ prompt_technical = "What can you tell me about the technical aspects of this image?"
80
+ input_ids_technical = tokenizer(prompt_technical, return_tensors="pt").input_ids.to(model.device)
81
+
82
+ # Generate outputs for each prompt
83
+ # Basic description
84
+ with torch.no_grad():
85
+ output_basic = model.generate(
86
+ input_ids=input_ids_basic,
87
+ pixel_values=pixel_values.pixel_values,
88
+ max_new_tokens=150,
89
+ do_sample=False
90
+ )
91
+ basic_description = tokenizer.decode(output_basic[0], skip_special_tokens=True).replace(prompt_basic, "").strip()
92
+
93
+ # Detailed description
94
+ with torch.no_grad():
95
+ output_detailed = model.generate(
96
+ input_ids=input_ids_detailed,
97
+ pixel_values=pixel_values.pixel_values,
98
+ max_new_tokens=300,
99
+ do_sample=False
100
+ )
101
+ detailed_description = tokenizer.decode(output_detailed[0], skip_special_tokens=True).replace(prompt_detailed, "").strip()
102
+
103
+ # Technical analysis
104
+ with torch.no_grad():
105
+ output_technical = model.generate(
106
+ input_ids=input_ids_technical,
107
+ pixel_values=pixel_values.pixel_values,
108
+ max_new_tokens=200,
109
+ do_sample=False
110
+ )
111
+ technical_analysis = tokenizer.decode(output_technical[0], skip_special_tokens=True).replace(prompt_technical, "").strip()
112
+
113
+ return {
114
+ "success": True,
115
+ "basic_description": basic_description,
116
+ "detailed_description": detailed_description,
117
+ "technical_analysis": technical_analysis
118
+ }
119
+
120
+ except Exception as e:
121
+ logger.error(f"Error processing image: {str(e)}", exc_info=True)
122
+ return {"error": f"Error generating description: {str(e)}"}
123
+
124
+ def load_model():
125
+ """Load the model and related components"""
126
+ global model, processor, tokenizer
127
+
128
+ try:
129
+ logger.info("Loading model...")
130
+ model_id = "Qwen/Qwen2-VL-7B"
131
+
132
+ # Use explicit processor class instead of AutoProcessor
133
+ processor = CLIPImageProcessor.from_pretrained(model_id)
134
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
135
+
136
+ # Load model with 4-bit quantization to reduce memory requirements
137
+ model = AutoModelForCausalLM.from_pretrained(
138
+ model_id,
139
+ torch_dtype=torch.bfloat16,
140
+ load_in_4bit=True,
141
+ device_map="auto"
142
+ )
143
+ logger.info("Model loaded successfully")
144
+ return True
145
+ except Exception as e:
146
+ logger.error(f"Error loading model: {str(e)}", exc_info=True)
147
+ return False
148
+
149
+ def main():
150
+ """Run in command-line mode"""
151
+ # Path to the image
152
+ image_folder = "data_temp"
153
+ image_name = "page_2.png"
154
+ image_path = os.path.join(image_folder, image_name)
155
+
156
+ # Get the description
157
+ result = describe_image(image_path=image_path, show_image=True)
158
+
159
+ # Print the results
160
+ if "error" not in result:
161
+ print("\n==== Image Description Results (Qwen2-VL-7B) ====")
162
+ print(f"\nBasic Description:\n{result['basic_description']}")
163
+ print(f"\nDetailed Description:\n{result['detailed_description']}")
164
+ print(f"\nTechnical Analysis:\n{result['technical_analysis']}")
165
+ else:
166
+ print(result["error"]) # Print error message if there was an issue
167
+
168
+ if __name__ == "__main__":
169
+ main()
image_descriptor_requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ torch==2.0.1
2
+ torchvision==0.15.2
3
+ transformers==4.36.2
4
+ pillow==10.0.1
5
+ matplotlib==3.7.3
6
+ accelerate==0.27.2
7
+ bitsandbytes==0.41.3
8
+ safetensors==0.4.1
9
+ flask==2.3.3
10
+ flask-cors==4.0.0
11
+ gunicorn==21.2.0
12
+ python-dotenv==1.0.0
13
+ gradio==4.12.0
14
+ requests==2.31.0
readme.mdp ADDED
@@ -0,0 +1 @@
1
+ placeholder
report.html ADDED
@@ -0,0 +1,105 @@
1
+
2
+ <!DOCTYPE html>
3
+ <html>
4
+ <head>
5
+ <title>CSV Data Report</title>
6
+ <style>
7
+ body {
8
+ font-family: Arial, sans-serif;
9
+ margin: 20px;
10
+ }
11
+ table {
12
+ border-collapse: collapse;
13
+ width: 100%;
14
+ margin-bottom: 20px;
15
+ }
16
+ th, td {
17
+ border: 1px solid #ddd;
18
+ padding: 8px;
19
+ text-align: right;
20
+ }
21
+ th {
22
+ background-color: #f2f2f2;
23
+ }
24
+ .total-row {
25
+ font-weight: bold;
26
+ background-color: #e6e6e6;
27
+ }
28
+ h1 {
29
+ color: #333;
30
+ }
31
+ </style>
32
+ </head>
33
+ <body>
34
+ <h1>CSV Data Report</h1>
35
+
36
+ <h2>Raw Data</h2>
37
+ <table border="1" class="dataframe table">
38
+ <thead>
39
+ <tr style="text-align: right;">
40
+ <th>Product</th>
41
+ <th>Electronics</th>
42
+ <th>Clothing</th>
43
+ <th>Food</th>
44
+ <th>Books</th>
45
+ </tr>
46
+ </thead>
47
+ <tbody>
48
+ <tr>
49
+ <td>January</td>
50
+ <td>1200</td>
51
+ <td>800</td>
52
+ <td>500</td>
53
+ <td>300</td>
54
+ </tr>
55
+ <tr>
56
+ <td>February</td>
57
+ <td>950</td>
58
+ <td>750</td>
59
+ <td>600</td>
60
+ <td>400</td>
61
+ </tr>
62
+ <tr>
63
+ <td>March</td>
64
+ <td>1500</td>
65
+ <td>900</td>
66
+ <td>550</td>
67
+ <td>350</td>
68
+ </tr>
69
+ <tr>
70
+ <td>April</td>
71
+ <td>1100</td>
72
+ <td>850</td>
73
+ <td>450</td>
74
+ <td>250</td>
75
+ </tr>
76
+ </tbody>
77
+ </table>
78
+
79
+ <h2>Column Totals</h2>
80
+ <table>
81
+ <tr>
82
+
83
+ <th>Electronics</th>
84
+
85
+ <th>Clothing</th>
86
+
87
+ <th>Food</th>
88
+
89
+ <th>Books</th>
90
+
91
+ </tr>
92
+ <tr class="total-row">
93
+
94
+ <td>4,750.00</td>
95
+
96
+ <td>3,300.00</td>
97
+
98
+ <td>2,100.00</td>
99
+
100
+ <td>1,300.00</td>
101
+
102
+ </tr>
103
+ </table>
104
+ </body>
105
+ </html>
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ torch==2.1.0
2
+ torchvision==0.16.0
3
+ transformers>=4.37.2
4
+ lmdeploy>=0.5.3
5
+ pillow==10.0.1
6
+ numpy>=1.24.0
7
+ scipy==1.11.3
8
+ gradio==3.38.0
9
+ requests==2.31.0
10
+ accelerate==0.27.2
11
+ bitsandbytes==0.41.3
12
+ safetensors==0.4.1
13
+ openai
14
+ huggingface_hub
sample_data.csv ADDED
@@ -0,0 +1,5 @@
1
+ Product,Electronics,Clothing,Food,Books
2
+ January,1200,800,500,300
3
+ February,950,750,600,400
4
+ March,1500,900,550,350
5
+ April,1100,850,450,250
server_deploy.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+
3
+ # Server deployment script for Image Description Application
4
+ echo "Starting Image Description API server deployment"
5
+
6
+ # Create virtual environment if it doesn't exist
7
+ if [ ! -d "venv" ]; then
8
+ echo "Creating virtual environment..."
9
+ python3 -m venv venv
10
+ fi
11
+
12
+ # Activate virtual environment
13
+ echo "Activating virtual environment..."
14
+ source venv/bin/activate
15
+
16
+ # Install requirements
17
+ echo "Installing dependencies..."
18
+ pip install -r image_descriptor_requirements.txt
19
+
20
+ # Create necessary directories
21
+ echo "Setting up directories..."
22
+ mkdir -p uploads
23
+ mkdir -p logs
24
+
25
+ # Start the server with Gunicorn
26
+ echo "Starting server with Gunicorn..."
27
+ gunicorn --bind 0.0.0.0:8000 --workers 1 --timeout 300 "image_descriptor:app" \
28
+ --log-level info \
29
+ --access-logfile logs/access.log \
30
+ --error-logfile logs/error.log \
31
+ --daemon
32
+
33
+ echo "Server started in the background on port 8000"
34
+ echo "You can test it with: curl http://localhost:8000/health"
35
+ echo "To process an image: curl -X POST -F \"image=@data_temp/page_2.png\" http://localhost:8000/describe"
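Note: `entrypoint.sh` and the Gunicorn command above both load `image_descriptor:app`, but `image_descriptor.py` as uploaded only defines `describe_image()` and `load_model()`. Below is a minimal sketch of the missing Flask glue, assuming the `/health` and `/describe` routes and the `image` form field used by the curl examples above; it is an illustration to append to `image_descriptor.py`, not part of the uploaded code.

```python
# Sketch only: Flask wrapper so that "image_descriptor:app" resolves for Gunicorn.
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route("/health", methods=["GET"])
def health():
    # Liveness probe used by: curl http://localhost:8000/health
    return jsonify({"status": "ok"})

@app.route("/describe", methods=["POST"])
def describe():
    # Expects a multipart upload with an "image" field, matching the curl example above
    if "image" not in request.files:
        return jsonify({"error": "No image provided"}), 400
    result = describe_image(image_data=request.files["image"].read())
    return jsonify(result), (200 if "error" not in result else 500)
```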
test_internvl2.py ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Test script for InternVL2-40B-AWQ model using lmdeploy
6
+ """
7
+
8
+ import argparse
9
+ import os
10
+ import time
11
+ from PIL import Image
12
+
13
+ from lmdeploy import pipeline, TurbomindEngineConfig
14
+ from lmdeploy.vl import load_image
15
+
16
+
17
+ def parse_args():
18
+ parser = argparse.ArgumentParser(description="Test InternVL2 model")
19
+ parser.add_argument(
20
+ "--model",
21
+ type=str,
22
+ default="OpenGVLab/InternVL2-40B-AWQ",
23
+ help="Model name or path"
24
+ )
25
+ parser.add_argument(
26
+ "--image",
27
+ type=str,
28
+ required=True,
29
+ help="Path to the test image"
30
+ )
31
+ parser.add_argument(
32
+ "--prompt",
33
+ type=str,
34
+ default="Describe this image in detail.",
35
+ help="Prompt for the model"
36
+ )
37
+ return parser.parse_args()
38
+
39
+
40
+ def main():
41
+ args = parse_args()
42
+
43
+ print(f"Loading model: {args.model}")
44
+ start_time = time.time()
45
+
46
+ # Configure the engine for AWQ quantized model
47
+ backend_config = TurbomindEngineConfig(model_format='awq')
48
+
49
+ # Create pipeline
50
+ pipe = pipeline(args.model, backend_config=backend_config, log_level='INFO')
51
+
52
+ load_time = time.time() - start_time
53
+ print(f"Model loaded in {load_time:.2f} seconds")
54
+
55
+ # Load image
56
+ try:
57
+ if args.image.startswith(('http://', 'https://')):
58
+ image = load_image(args.image)
59
+ print(f"Loaded image from URL: {args.image}")
60
+ else:
61
+ image_path = os.path.abspath(args.image)
62
+ if not os.path.exists(image_path):
63
+ raise FileNotFoundError(f"Image not found: {image_path}")
64
+ image = Image.open(image_path).convert('RGB')
65
+ print(f"Loaded image from path: {image_path}")
66
+ except Exception as e:
67
+ print(f"Error loading image: {e}")
68
+ return
69
+
70
+ # Run inference
71
+ print(f"Running inference with prompt: '{args.prompt}'")
72
+ start_time = time.time()
73
+
74
+ response = pipe((args.prompt, image))
75
+
76
+ inference_time = time.time() - start_time
77
+ print(f"Inference completed in {inference_time:.2f} seconds")
78
+
79
+ # Print result
80
+ print("\n--- RESULT ---")
81
+ print(response.text)
82
+ print("-------------\n")
83
+
84
+
85
+ if __name__ == "__main__":
86
+ main()
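A typical invocation of this test script, using the sample image shipped in `data_temp/` (the prompt here is just an example):

```bash
python test_internvl2.py \
  --model OpenGVLab/InternVL2-40B-AWQ \
  --image data_temp/page_2.png \
  --prompt "Describe this slide in detail."
```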
upload_internvl2_to_hf.py ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Script to upload the InternVL2 files to Hugging Face Spaces
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import getpass
11
+ from huggingface_hub import HfApi, create_repo, upload_folder
12
+
13
+ # Default repository name
14
+ DEFAULT_REPO = "mknolan/cursor_slides_internvl2"
15
+
16
+ def main():
17
+ """Main function to upload files to Hugging Face Spaces"""
18
+ # Get Hugging Face token with WRITE access
19
+ token = getpass.getpass("Enter your Hugging Face token (with WRITE access): ")
20
+
21
+ # Get repository name
22
+ repo_name = input("Enter repository name (default: {}): ".format(DEFAULT_REPO)) or DEFAULT_REPO
23
+
24
+ print("Uploading to Space: {}".format(repo_name))
25
+
26
+ # Initialize Hugging Face API
27
+ api = HfApi(token=token)
28
+
29
+ try:
30
+ # Try to get the repository, create if it doesn't exist
31
+ try:
32
+ repo = api.repo_info(repo_id=repo_name, repo_type="space")
33
+ print("Repo {} ready".format(repo_name))
34
+ except Exception:
35
+ print("Creating new Space: {}".format(repo_name))
36
+ create_repo(
37
+ repo_id=repo_name,
38
+ token=token,
39
+ repo_type="space",
40
+ space_sdk="gradio",
41
+ private=False
42
+ )
43
+
44
+ # Upload the entire folder at once using upload_folder
45
+ print("Uploading files to Hugging Face Space...")
46
+ upload_folder(
47
+ folder_path=".", # Current directory
48
+ repo_id=repo_name,
49
+ repo_type="space",
50
+ ignore_patterns=[
51
+ ".git*",
52
+ "*__pycache__*",
53
+ "*.pyc",
54
+ ".DS_Store",
55
+ "*.ipynb_checkpoints*",
56
+ "venv",
57
+ ".env"
58
+ ],
59
+ commit_message="Upload InternVL2 implementation",
60
+ token=token
61
+ )
62
+
63
+ print("Upload completed!")
64
+ print("Check your Space at: https://huggingface.co/spaces/{}".format(repo_name))
65
+
66
+ except Exception as e:
67
+ print("Error: {}".format(e))
68
+ return 1
69
+
70
+ return 0
71
+
72
+ if __name__ == "__main__":
73
+ sys.exit(main())
upload_to_hf.py ADDED
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to upload the current directory to a Hugging Face Space
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from huggingface_hub import HfApi, create_repo
9
+
10
+ # Configuration
11
+ SPACE_NAME = "mknolan/cursor_slides" # Change this to your space name
12
+ TOKEN = input("Enter your Hugging Face token (with WRITE access): ")
13
+
14
+ # Initialize API
15
+ api = HfApi(token=TOKEN)
16
+
17
+ def upload_directory():
18
+ """Upload all files in the current directory to HF Space"""
19
+ print("Uploading to Space: {}".format(SPACE_NAME))
20
+
21
+ # Create repo if it doesn't exist (this is idempotent)
22
+ try:
23
+ create_repo(
24
+ repo_id=SPACE_NAME,
25
+ token=TOKEN,
26
+ repo_type="space",
27
+ exist_ok=True,
28
+ space_sdk="docker"
29
+ )
30
+ print("Repo {} ready".format(SPACE_NAME))
31
+ except Exception as e:
32
+ print("Note: repo may already exist, or creation failed: {}".format(str(e)))
33
+
34
+ # Gather all files to upload
35
+ files_to_upload = []
36
+ for root, _, files in os.walk("."):
37
+ # Skip .git and __pycache__ directories
38
+ if "/.git" in root or "__pycache__" in root:
39
+ continue
40
+
41
+ for file in files:
42
+ # Skip hidden files and .git files
43
+ if file.startswith(".git") or file.startswith("."):
44
+ continue
45
+
46
+ path = os.path.join(root, file)
47
+ # Skip this upload script itself
48
+ if path == "./upload_to_hf.py":
49
+ continue
50
+
51
+ files_to_upload.append(path)
52
+
53
+ print("Found {} files to upload".format(len(files_to_upload)))
54
+
55
+ # Upload each file
56
+ for i, path in enumerate(files_to_upload):
57
+ print("[{}/{}] Uploading {}...".format(i+1, len(files_to_upload), path))
58
+ try:
59
+ # Path in repo (remove leading ./)
60
+ path_in_repo = path[2:] if path.startswith("./") else path
61
+ api.upload_file(
62
+ path_or_fileobj=path,
63
+ path_in_repo=path_in_repo,
64
+ repo_id=SPACE_NAME,
65
+ repo_type="space"
66
+ )
67
+ except Exception as e:
68
+ print("Error uploading {}: {}".format(path, str(e)))
69
+
70
+ print("Upload completed!")
71
+ print("Check your Space at: https://huggingface.co/spaces/{}".format(SPACE_NAME))
72
+
73
+ if __name__ == "__main__":
74
+ upload_directory()