mknolan committed on
Commit e59dc66 · verified · 1 Parent(s): 79aa3b8

Upload InternVL2 implementation

.env.hf ADDED
@@ -0,0 +1,12 @@
+ # Environment variables for Hugging Face Spaces deployment
+ PORT=7860
+ FLASK_APP=image_descriptor.py
+ TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
+ HF_HOME=/app/.cache/huggingface
+ GRADIO_ANALYTICS_ENABLED=False
+ GRADIO_SERVER_NAME=0.0.0.0
+ GRADIO_SERVER_PORT=7860
+ # Uncomment to run only the API without the UI
+ # API_ONLY=true
+ # Uncomment to run both the UI and API
+ # ENABLE_UI=true
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data_temp/page_2.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+ # Set environment variables
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV PYTHONUNBUFFERED=1
+ ENV HF_HOME=/root/.cache/huggingface
+ ENV TRANSFORMERS_CACHE=/root/.cache/huggingface/transformers
+ ENV MPLCONFIGDIR=/tmp/matplotlib
+
+ # Create necessary directories with proper permissions
+ RUN mkdir -p /root/.cache/huggingface/transformers && \
+     mkdir -p /tmp/matplotlib && \
+     chmod -R 777 /root/.cache && \
+     chmod -R 777 /tmp/matplotlib
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     git \
+     curl \
+     ca-certificates \
+     python3-pip \
+     python3-dev \
+     python3-setuptools \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a working directory
+ WORKDIR /app
+
+ # Copy requirements file and install Python dependencies
+ COPY requirements.txt .
+
+ RUN pip3 install --no-cache-dir --upgrade pip && \
+     pip3 install --no-cache-dir -r requirements.txt && \
+     # Install additional dependencies for lmdeploy
+     pip3 install --no-cache-dir cmake && \
+     pip3 install --no-cache-dir ninja && \
+     # Install flash-attention for performance
+     pip3 install --no-cache-dir flash-attn
+
+ # Copy the application files
+ COPY . .
+
+ # Make port 7860 available for the app
+ EXPOSE 7860
+
+ # Start the application
+ CMD ["python3", "app_internvl2.py"]
HUGGING-FACE-DEPLOYMENT.md ADDED
@@ -0,0 +1,97 @@
1
+ # Hugging Face Spaces Deployment Guide
2
+
3
+ This guide provides detailed steps for deploying the Image Description application to Hugging Face Spaces.
4
+
5
+ ## Prerequisites
6
+
7
+ - A [Hugging Face](https://huggingface.co/) account
8
+ - A GitHub repository containing your application code
9
+ - The code should include all files created in this project
10
+
11
+ ## Step 1: Prepare Your GitHub Repository
12
+
13
+ 1. Make sure your GitHub repository contains all necessary files:
14
+ - `image_descriptor.py` (Flask API)
15
+ - `app.py` (Gradio UI)
16
+ - `Dockerfile`
17
+ - `entrypoint.sh`
18
+ - `image_descriptor_requirements.txt`
19
+ - `data_temp/` directory with sample images
20
+ - `.env` file with configuration
21
+ - `.gitignore`
22
+
23
+ 2. Commit all changes to your repository:
24
+ ```bash
25
+ git add .
26
+ git commit -m "Prepare for Hugging Face Spaces deployment"
27
+ git push
28
+ ```
29
+
30
+ ## Step 2: Create a New Hugging Face Space
31
+
32
+ 1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
33
+ 2. Click on "Create new Space"
34
+ 3. Fill out the form:
35
+ - **Owner**: Your Hugging Face username or organization
36
+ - **Space name**: Choose a unique name (e.g., "image-description-qwen")
37
+ - **License**: Choose an appropriate license (e.g., Apache 2.0)
38
+ - **SDK**: Select "Docker"
39
+ - **Hardware**: Select "A10G" for best performance with this model
40
+ - **Visibility**: Choose public or private based on your needs
41
+
42
+ 4. Click "Create Space"
43
+
44
+ ## Step 3: Link Your GitHub Repository
45
+
46
+ 1. On your new Space page, go to the "Settings" tab
47
+ 2. Under "Repository", click "Link external repository"
48
+ 3. Enter your GitHub repository URL
49
+ 4. Click "Link repository"
50
+ 5. Choose the branch to deploy (e.g., "main" or "master")
51
+ 6. Click "Save"
52
+
53
+ ## Step 4: Configure the Space
54
+
55
+ 1. Stay on the "Settings" tab
56
+ 2. Under "Variables", you can add environment variables if needed (most are already in the Dockerfile)
57
+ 3. Under "Advanced settings", enable "Secret HTTPS Frontend"
58
+
59
+ ## Step 5: Deploy and Monitor
60
+
61
+ 1. Go to the "Factory" tab to see the build progress
62
+ 2. Wait for the build and deployment to complete (this may take 10-20 minutes)
63
+ 3. Check the logs for any errors
64
+ 4. Once deployment is successful, go to the "App" tab to use your application
65
+
66
+ ## Step 6: Test Your Deployed Application
67
+
68
+ 1. On the "App" tab, upload an image
69
+ 2. Click "Analyze Image"
70
+ 3. Verify that the model generates descriptions correctly
71
+
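+ 
+ For a programmatic smoke test of the deployed Space, the `gradio_client` package can call it from Python. This is a minimal sketch: the Space id is a placeholder, and the argument list and `api_name` must match the inputs your Gradio app actually exposes (older gradio_client versions accept a plain file path; newer ones use `gradio_client.handle_file`).
+ 
+ ```python
+ from gradio_client import Client
+ 
+ # Placeholder Space id - replace with your own "owner/space-name"
+ client = Client("your-username/image-description-qwen")
+ 
+ result = client.predict(
+     "data_temp/page_2.png",   # local test image; must match the app's first input
+     api_name="/predict",      # default endpoint name; check client.view_api() if unsure
+ )
+ print(result)
+ ```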
72
+ ## Troubleshooting Common Issues
73
+
74
+ - **Build Failures**: Check the "Factory" logs for detailed error messages
75
+ - **Memory Issues**: If the model crashes, you may need to adjust the quantization settings or use a smaller model
76
+ - **Slow Performance**: The first run will be slow as the model is downloaded; subsequent runs will be faster
77
+ - **UI Issues**: If the UI loads but the model doesn't work, check the browser console and server logs
78
+
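+ 
+ For the memory case, one option is loading the model with tighter 4-bit quantization. A minimal sketch with `transformers` and `bitsandbytes`; the model id is only an example, so match it to whatever checkpoint the app is configured to load:
+ 
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+ 
+ # 4-bit NF4 quantization cuts VRAM use roughly in half compared with 8-bit loading
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+ 
+ model = AutoModelForCausalLM.from_pretrained(
+     "Qwen/Qwen-VL-Chat",            # example model id - use the one the app loads
+     trust_remote_code=True,         # Qwen-VL ships custom modeling code
+     quantization_config=bnb_config,
+     device_map="auto",
+ )
+ ```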
79
+ ## Updating Your Space
80
+
81
+ To update your Space after making changes to your code:
82
+
83
+ 1. Make changes to your code
84
+ 2. Commit and push to GitHub
85
+ 3. The Space will automatically rebuild and deploy the updated code
86
+
87
+ ## Sharing Your Space
88
+
89
+ - You can share your Space by giving others the URL
90
+ - You can embed your Space in other websites using the iframe code provided in the "Embed" tab
91
+ - You can collaborate with others by adding them as contributors in the "Settings" tab
92
+
93
+ ## Additional Resources
94
+
95
+ - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)
96
+ - [Docker SDK Documentation](https://huggingface.co/docs/hub/spaces-sdks-docker)
97
+ - [Gradio Documentation](https://www.gradio.app/docs/)
LOCAL-TESTING.md ADDED
@@ -0,0 +1,67 @@
1
+ # Local Testing Guide
2
+
3
+ Before deploying to Hugging Face Spaces, you may want to test the application locally. This guide provides instructions for local testing.
4
+
5
+ ## Prerequisites
6
+
7
+ - CUDA-capable GPU with at least 8GB VRAM
8
+ - Python 3.8+
9
+ - pip or conda package manager
10
+
11
+ ## Steps for Local Testing
12
+
13
+ 1. **Install Dependencies**
14
+
15
+ ```bash
16
+ pip install -r image_descriptor_requirements.txt
17
+ ```
18
+
19
+ 2. **Run in UI Mode**
20
+
21
+ ```bash
22
+ python app.py
23
+ ```
24
+
25
+ This will start the Gradio UI on http://localhost:7860. You can upload images and test the model.
26
+
27
+ 3. **Run in API-only Mode**
28
+
29
+ ```bash
30
+ FLASK_APP=image_descriptor.py flask run --host=0.0.0.0 --port=5000
31
+ ```
32
+
33
+ This will start just the Flask API on http://localhost:5000.
34
+
35
+ 4. **Test the Docker Container**
36
+
37
+ ```bash
38
+ # Build the container
39
+ docker build -t image-descriptor .
40
+
41
+ # Run the container
42
+ docker run -p 7860:7860 --gpus all image-descriptor
43
+ ```
44
+
45
+ The application will be available at http://localhost:7860.
46
+
47
+ ## Testing the API
48
+
49
+ You can test the API using curl:
50
+
51
+ ```bash
52
+ # Health check
53
+ curl http://localhost:5000/health
54
+
55
+ # Process an image
56
+ curl -X POST -F "image=@data_temp/page_2.png" http://localhost:5000/describe
57
+ ```
58
+
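+ 
+ The same checks can be scripted in Python with `requests`. The endpoints come from the curl commands above; the exact JSON fields in the response depend on how `image_descriptor.py` formats its output:
+ 
+ ```python
+ import requests
+ 
+ BASE_URL = "http://localhost:5000"
+ 
+ # Health check
+ print(requests.get(f"{BASE_URL}/health", timeout=10).json())
+ 
+ # Send an image to /describe as multipart/form-data
+ with open("data_temp/page_2.png", "rb") as f:
+     resp = requests.post(f"{BASE_URL}/describe", files={"image": f}, timeout=300)
+ 
+ resp.raise_for_status()
+ print(resp.json())  # field names depend on the Flask API's response format
+ ```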
59
+ ## Troubleshooting
60
+
61
+ - **GPU Memory Issues**: If you encounter GPU memory errors, try reducing batch sizes or using a smaller model.
62
+ - **Model Download Issues**: If the model download fails, try downloading it manually from Hugging Face and place it in the `.cache/huggingface/transformers` directory.
63
+ - **Dependencies**: Make sure you have the correct CUDA version installed for your GPU.
64
+
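+ 
+ For the manual-download case, `huggingface_hub` can prefetch the weights into a local cache before the app starts. A sketch, with the repo id taken from this project's README as an example:
+ 
+ ```python
+ from huggingface_hub import snapshot_download
+ 
+ # Download the model files ahead of time so the app finds them in its cache
+ snapshot_download(
+     repo_id="Qwen/Qwen2-VL-7B",                    # example - use the model id the app loads
+     cache_dir=".cache/huggingface/transformers",   # match the cache path the app expects
+ )
+ ```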
65
+ ## Next Steps
66
+
67
+ Once you've confirmed the application works locally, you can deploy it to Hugging Face Spaces following the instructions in the main README.md.
README-HF.md ADDED
@@ -0,0 +1,33 @@
+ # Image Description with Qwen2-VL-7B
+
+ This Hugging Face Space uses the powerful Qwen2-VL-7B vision language model to generate detailed descriptions of images.
+
+ ## About
+
+ Upload any image and get:
+ - A basic description
+ - A detailed analysis
+ - A technical assessment
+
+ The app uses the Qwen2-VL-7B model with 4-bit quantization to provide efficient and high-quality image analysis.
+
+ ## Usage
+
+ 1. Upload an image or use one of the example images
+ 2. Click "Analyze Image"
+ 3. View the three types of descriptions generated by the model
+
+ ## Examples
+
+ The space includes sample images in the data_temp folder that you can use to test the model.
+
+ ## Technical Details
+
+ - **Model**: Qwen2-VL-7B
+ - **Framework**: Gradio UI + Flask API backend
+ - **Quantization**: 4-bit for efficient inference
+ - **GPU**: A10G recommended
+
+ ## Credits
+
+ - [Qwen2-VL-7B model](https://huggingface.co/Qwen/Qwen2-VL-7B) by the Qwen team
README.md CHANGED
@@ -1,12 +1,135 @@
1
  ---
2
- title: Cursor Slides Internvl2
3
- emoji: 🐠
4
- colorFrom: yellow
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Image Description with Qwen-VL
3
+ emoji: 🖼️
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ sdk_version: 3.0.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Image Description Application with Qwen-VL
13
+
14
+ This application uses the advanced Qwen-VL-Chat vision language model to generate detailed descriptions for images. It's specifically set up to describe the image in the `data_temp` folder, but can also analyze any uploaded image.
15
+
16
+ ## Features
17
+
18
+ - Loads an image from the data_temp folder or via upload
19
+ - Generates multiple types of descriptions using state-of-the-art AI:
20
+ - Basic description (brief overview)
21
+ - Detailed analysis (comprehensive description)
22
+ - Technical analysis (assessment of technical aspects)
23
+ - Displays the image (optional)
24
+ - Uses 8-bit quantization for efficient model loading
25
+ - Provides a user-friendly Gradio UI
26
+
27
+ ## Requirements
28
+
29
+ - Python 3.8 or higher
30
+ - PyTorch
31
+ - Transformers (version 4.35.2+)
32
+ - Pillow
33
+ - Matplotlib
34
+ - Accelerate
35
+ - Bitsandbytes
36
+ - Safetensors
37
+ - Gradio for the web interface
38
+
39
+ ## Hardware Requirements
40
+
41
+ This application uses a vision-language model which requires:
42
+ - A CUDA-capable GPU with at least 8GB VRAM
43
+ - 8GB+ system RAM
44
+
45
+ ## Deployment Options
46
+
47
+ ### 1. Hugging Face Spaces (Recommended)
48
+
49
+ This repository is ready to be deployed on Hugging Face Spaces.
50
+
51
+ **Steps:**
52
+ 1. Create a new Space on [Hugging Face Spaces](https://huggingface.co/spaces)
53
+ 2. Select "Docker" as the Space SDK
54
+ 3. Link this GitHub repository
55
+ 4. Select a GPU (T4 or better is recommended)
56
+ 5. Create the Space
57
+
58
+ The application will automatically deploy with the Gradio UI frontend.
59
+
60
+ ### 2. AWS SageMaker
61
+
62
+ For production deployment on AWS SageMaker:
63
+
64
+ 1. Package the application using the provided Dockerfile
65
+ 2. Upload the Docker image to Amazon ECR
66
+ 3. Create a SageMaker Model using the ECR image
67
+ 4. Deploy an endpoint with an instance type like ml.g4dn.xlarge
68
+ 5. Set up API Gateway for HTTP access (optional)
69
+
70
+ Detailed AWS instructions can be found in the `docs/aws_deployment.md` file.
71
+
72
+ ### 3. Azure Machine Learning
73
+
74
+ For Azure deployment:
75
+
76
+ 1. Create an Azure ML workspace
77
+ 2. Register the model on Azure ML
78
+ 3. Create an inference configuration
79
+ 4. Deploy to AKS or ACI with a GPU-enabled instance
80
+
81
+ Detailed Azure instructions can be found in the `docs/azure_deployment.md` file.
82
+
83
+ ## How It Works
84
+
85
+ The application uses the Qwen-VL-Chat model, a state-of-the-art multimodal AI model that can understand and describe images with impressive detail.
86
+
87
+ The script:
88
+ 1. Processes the image with three different prompts:
89
+ - "Describe this image briefly in a single paragraph."
90
+ - "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
91
+ - "What can you tell me about the technical aspects of this image?"
92
+ 2. Uses 8-bit quantization to reduce memory requirements
93
+ 3. Formats and displays the results
94
+
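+ 
+ The sketch below condenses that flow, following the usage pattern from the Qwen-VL-Chat model card (interleaved image/text queries plus 8-bit loading via bitsandbytes). Treat it as illustrative; the actual script in this repository may differ in details:
+ 
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
+ MODEL_ID = "Qwen/Qwen-VL-Chat"  # illustrative checkpoint; the app may pin a different one
+ 
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     load_in_8bit=True,          # 8-bit quantization to reduce memory requirements
+     device_map="auto",
+ ).eval()
+ 
+ prompts = [
+     "Describe this image briefly in a single paragraph.",
+     "Analyze this image in detail. Describe the main elements, any text visible, "
+     "the colors, and the overall composition.",
+     "What can you tell me about the technical aspects of this image?",
+ ]
+ 
+ for prompt in prompts:
+     # Qwen-VL-Chat takes an interleaved list of image and text segments
+     query = tokenizer.from_list_format([
+         {"image": "data_temp/page_2.png"},
+         {"text": prompt},
+     ])
+     response, _ = model.chat(tokenizer, query=query, history=None)
+     print(f"\n== {prompt}\n{response}")
+ ```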
95
+ ## Repository Structure
96
+
97
+ - `app.py` - Gradio UI for web interface
98
+ - `Dockerfile` - For containerized deployment
99
+ - `requirements.txt` - Python dependencies
100
+ - `data_temp/` - Sample images for testing
101
+
102
+ ## Local Development
103
+
104
+ 1. Install the required packages:
105
+ ```
106
+ pip install -r requirements.txt
107
+ ```
108
+
109
+ 2. Run the Gradio UI:
110
+ ```
111
+ python app.py
112
+ ```
113
+
114
+ 3. Visit `http://localhost:7860` in your browser
115
+
116
+ ## Example Output
117
+
118
+ ```
119
+ Processing image: data_temp/page_2.png
120
+ Loading model...
121
+ Generating descriptions...
122
+
123
+ ==== Image Description Results (Qwen-VL) ====
124
+
125
+ Basic Description:
126
+ The image shows a webpage or document with text content organized in multiple columns.
127
+
128
+ Detailed Description:
129
+ The image displays a structured document or webpage with multiple sections of text organized in a grid layout. The content appears to be technical or educational in nature, with what looks like headings and paragraphs of text. The color scheme is primarily black text on a white background, creating a clean, professional appearance. There appear to be multiple columns of information, possibly representing different topics or categories. The layout suggests this might be documentation, a reference guide, or an educational resource related to technical content.
130
+
131
+ Technical Analysis:
132
+ This appears to be a screenshot of a digital document or webpage. The image quality is good with clear text rendering, suggesting it was captured at an appropriate resolution. The image uses a standard document layout with what appears to be a grid or multi-column structure. The screenshot has been taken of what seems to be a text-heavy interface with minimal graphics, consistent with technical documentation or reference materials.
133
+ ```
134
+
135
+ Note: Actual descriptions will vary based on the specific image content and may be more detailed than this example.
README_INTERNVL2.md ADDED
@@ -0,0 +1,103 @@
1
+ # Image Analysis with InternVL2
2
+
3
+ This project uses the InternVL2-40B-AWQ model for high-quality image analysis, description, and understanding. It provides a Gradio web interface for users to upload images and get detailed analysis.
4
+
5
+ ## Features
6
+
7
+ - **High-Quality Image Analysis**: Uses InternVL2-40B (4-bit quantized) for state-of-the-art image understanding
8
+ - **Multiple Analysis Types**: General description, text extraction, chart analysis, people description, and technical analysis
9
+ - **Simple UI**: User-friendly Gradio interface for easy image uploading and analysis
10
+ - **Efficient Resource Usage**: 4-bit quantized model (AWQ) for reduced memory footprint and faster inference
11
+
12
+ ## Requirements
13
+
14
+ The application requires:
15
+
16
+ - Python 3.9+
17
+ - CUDA-compatible GPU (recommended 24GB+ VRAM)
18
+ - Transformers 4.37.2+
19
+ - lmdeploy 0.5.3+
20
+ - Gradio 3.38.0
21
+ - Other dependencies in `requirements.txt`
22
+
23
+ ## Setup
24
+
25
+ ### Docker Setup (Recommended)
26
+
27
+ 1. **Build the Docker image**:
28
+ ```
29
+ docker build -t internvl2-image-analysis .
30
+ ```
31
+
32
+ 2. **Run the Docker container**:
33
+ ```
34
+ docker run --gpus all -p 7860:7860 internvl2-image-analysis
35
+ ```
36
+
37
+ ### Local Setup
38
+
39
+ 1. **Create a virtual environment**:
40
+ ```
41
+ python -m venv venv
42
+ source venv/bin/activate # On Windows: venv\Scripts\activate
43
+ ```
44
+
45
+ 2. **Install dependencies**:
46
+ ```
47
+ pip install -r requirements.txt
48
+ ```
49
+
50
+ 3. **Run the application**:
51
+ ```
52
+ python app_internvl2.py
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ 1. Open your browser and navigate to `http://localhost:7860`
58
+ 2. Upload an image using the upload box
59
+ 3. Choose an analysis type from the options
60
+ 4. Click "Analyze Image" and wait for the results
61
+
62
+ ### Analysis Types
63
+
64
+ - **General**: Provides a comprehensive description of the image content
65
+ - **Text**: Focuses on identifying and extracting text from the image
66
+ - **Chart**: Analyzes charts, graphs, and diagrams in detail
67
+ - **People**: Describes people in the image - appearance, actions, and expressions
68
+ - **Technical**: Provides technical analysis of objects and their relationships
69
+
70
+ ## Testing
71
+
72
+ To test the model directly from the command line:
73
+
74
+ ```
75
+ python test_internvl2.py --image path/to/your/image.jpg --prompt "Describe this image in detail."
76
+ ```
77
+
78
+ ## Deployment to Hugging Face
79
+
80
+ To deploy to Hugging Face Spaces:
81
+
82
+ ```
83
+ python upload_internvl2_to_hf.py
84
+ ```
85
+
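+ 
+ The upload script itself is not shown here. If you need to script the upload directly, `huggingface_hub` covers the common case; the Space id below is a placeholder, and the Space must use the Docker SDK:
+ 
+ ```python
+ from huggingface_hub import HfApi
+ 
+ api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
+ 
+ # Create the Space if it does not exist yet (placeholder repo id)
+ api.create_repo(
+     repo_id="your-username/internvl2-image-analysis",
+     repo_type="space",
+     space_sdk="docker",
+     exist_ok=True,
+ )
+ 
+ # Push the project files (Dockerfile, app_internvl2.py, requirements.txt, ...)
+ api.upload_folder(
+     folder_path=".",
+     repo_id="your-username/internvl2-image-analysis",
+     repo_type="space",
+ )
+ ```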
86
+ ## Model Details
87
+
88
+ This application uses InternVL2-40B-AWQ, a 4-bit quantized version of InternVL2-40B. The original model consists of:
89
+
90
+ - **Vision Component**: InternViT-6B-448px-V1-5
91
+ - **Language Component**: Nous-Hermes-2-Yi-34B
92
+ - **Total Parameters**: ~40B (6B vision + 34B language)
93
+
94
+ ## License
95
+
96
+ This project is released under the same license as the InternVL2 model, the MIT license.
97
+
98
+ ## Acknowledgements
99
+
100
+ - [OpenGVLab](https://github.com/OpenGVLab) for creating the InternVL2 models
101
+ - [Hugging Face](https://huggingface.co/) for model hosting
102
+ - [lmdeploy](https://github.com/InternLM/lmdeploy) for model optimization
103
+ - [Gradio](https://gradio.app/) for the web interface
app.py ADDED
@@ -0,0 +1,389 @@
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import torch
4
+ import os
5
+ import time
6
+ import numpy as np
7
+
8
+ # Set CUDA memory configuration to avoid fragmentation
9
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
10
+
11
+ # Import the models after setting memory configuration
12
+ from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
13
+
14
+ # Model configuration
15
+ CLIP_MODEL_ID = "openai/clip-vit-base-patch32" # Fast classification
16
+ DETAILED_MODEL_ID = "Salesforce/blip-image-captioning-large" # Use original BLIP instead of BLIP-2
17
+ USE_GPU = torch.cuda.is_available()
18
+
19
+ # Global variables
20
+ clip_model = None
21
+ clip_processor = None
22
+ detailed_model = None
23
+ detailed_processor = None
24
+
25
+ def load_clip_model():
26
+ """Load the CLIP model for fast classification"""
27
+ global clip_model, clip_processor
28
+
29
+ # Return if already loaded
30
+ if clip_model is not None and clip_processor is not None:
31
+ return True
32
+
33
+ print("Loading CLIP model...")
34
+ try:
35
+ # First clear any GPU memory
36
+ if torch.cuda.is_available():
37
+ torch.cuda.empty_cache()
38
+
39
+ # Load processor
40
+ clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
41
+
42
+ # Load model efficiently
43
+ if USE_GPU:
44
+ clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID).to("cuda")
45
+ else:
46
+ clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
47
+
48
+ # Set to evaluation mode
49
+ clip_model.eval()
50
+ print("CLIP model loaded successfully!")
51
+ return True
52
+ except Exception as e:
53
+ print(f"Error loading CLIP model: {str(e)}")
54
+ return False
55
+
56
+ def load_detailed_model():
57
+ """Load the BLIP model for detailed image analysis"""
58
+ global detailed_model, detailed_processor
59
+
60
+ # If already loaded, return
61
+ if detailed_model is not None and detailed_processor is not None:
62
+ return True
63
+
64
+ print("Loading BLIP model...")
65
+ try:
66
+ # Clear memory first
67
+ if torch.cuda.is_available():
68
+ torch.cuda.empty_cache()
69
+
70
+ # Load processor and model for original BLIP
71
+ detailed_processor = BlipProcessor.from_pretrained(DETAILED_MODEL_ID)
72
+
73
+ # For older models like BLIP, don't use device_map='auto' or load_in_8bit
74
+ # Instead, load the model and then move it to the device
75
+ detailed_model = BlipForConditionalGeneration.from_pretrained(
76
+ DETAILED_MODEL_ID,
77
+ torch_dtype=torch.float16 if USE_GPU else torch.float32
78
+ )
79
+
80
+ # Manually move model to GPU if available
81
+ if USE_GPU:
82
+ detailed_model = detailed_model.to("cuda")
83
+
84
+ # Set to evaluation mode
85
+ detailed_model.eval()
86
+ print("BLIP model loaded successfully!")
87
+ return True
88
+ except Exception as e:
89
+ print(f"Error loading BLIP model: {str(e)}")
90
+ if "CUDA out of memory" in str(e):
91
+ print("Not enough GPU memory for the detailed model")
92
+ return False
93
+
94
+ # Categories for image classification
95
+ CATEGORIES = [
96
+ "a photograph", "a painting", "a drawing", "a digital art",
97
+ "landscape", "portrait", "cityscape", "animals", "food", "vehicle",
98
+ "building", "nature", "people", "abstract art", "technology",
99
+ "interior", "exterior", "night scene", "beach", "mountains",
100
+ "forest", "water", "flowers", "sports",
101
+ "a person", "multiple people", "a child", "an elderly person",
102
+ "a dog", "a cat", "wildlife", "a bird", "a car", "a building",
103
+ "a presentation slide", "a graph", "a chart", "a diagram", "text document",
104
+ "a screenshot", "a map", "a table of data", "a scientific figure"
105
+ ]
106
+
107
+ def get_detailed_analysis(image):
108
+ """Get detailed analysis from the image using BLIP model"""
109
+ try:
110
+ start_time = time.time()
111
+
112
+ # Make sure the model is loaded
113
+ if not load_detailed_model():
114
+ return "Couldn't load detailed analysis model."
115
+
116
+ # Convert numpy array to PIL Image
117
+ if isinstance(image, np.ndarray):
118
+ image_pil = Image.fromarray(image).convert('RGB')
119
+ else:
120
+ # If somehow it's already a PIL Image
121
+ image_pil = image.convert('RGB')
122
+
123
+ # Resize the image to improve performance
124
+ max_size = 600 # Limit to 600px on the longest side
125
+ width, height = image_pil.size
126
+ if max(width, height) > max_size:
127
+ if width > height:
128
+ new_width = max_size
129
+ new_height = int(height * (max_size / width))
130
+ else:
131
+ new_height = max_size
132
+ new_width = int(width * (max_size / height))
133
+ image_pil = image_pil.resize((new_width, new_height), Image.LANCZOS)
134
+
135
+ device = "cuda" if USE_GPU else "cpu"
136
+
137
+ # Using an unconditional approach first - this usually works better
138
+ inputs = detailed_processor(image_pil, return_tensors="pt")
139
+ if USE_GPU:
140
+ inputs = {k: v.to(device) for k, v in inputs.items()}
141
+
142
+ with torch.no_grad():
143
+ # Get a basic unconditional caption
144
+ output_ids = detailed_model.generate(
145
+ **inputs,
146
+ max_length=50,
147
+ num_beams=5,
148
+ do_sample=False,
149
+ early_stopping=True
150
+ )
151
+ base_description = detailed_processor.decode(output_ids[0], skip_special_tokens=True)
152
+
153
+ # ULTRA-SIMPLE single-word prompts to avoid any echoing
154
+ analyses = {
155
+ "text": None, # Text content
156
+ "chart": None, # Chart analysis
157
+ "subject": None # Main subject
158
+ }
159
+
160
+ # Use the base description for context with ultra-simple prompts
161
+ ultra_simple_prompts = {
162
+ f"Text in {base_description[:20]}...": "text",
163
+ f"Charts in {base_description[:20]}...": "chart",
164
+ f"Subject of {base_description[:20]}...": "subject"
165
+ }
166
+
167
+ for prompt, analysis_type in ultra_simple_prompts.items():
168
+ # Process with prompt
169
+ inputs = detailed_processor(image_pil, text=prompt, return_tensors="pt")
170
+
171
+ if USE_GPU:
172
+ inputs = {k: v.to(device) for k, v in inputs.items()}
173
+
174
+ with torch.no_grad():
175
+ output_ids = detailed_model.generate(
176
+ **inputs,
177
+ max_length=75,
178
+ num_beams=3,
179
+ do_sample=True,
180
+ temperature=0.7,
181
+ repetition_penalty=1.2,
182
+ early_stopping=True
183
+ )
184
+
185
+ result = detailed_processor.decode(output_ids[0], skip_special_tokens=True)
186
+
187
+ # SUPER AGGRESSIVE cleaning
188
+ # First, remove anything that looks like a prefix before a colon
189
+ colon_parts = result.split(":")
190
+ if len(colon_parts) > 1:
191
+ # Take everything after the first colon
192
+ result = ":".join(colon_parts[1:]).strip()
193
+
194
+ # Remove the base description if it appears
195
+ if base_description in result:
196
+ result = result.replace(base_description, "").strip()
197
+
198
+ # Remove any part of the prompt
199
+ for p in ultra_simple_prompts.keys():
200
+ if p in result:
201
+ result = result.replace(p, "").strip()
202
+
203
+ # Remove the first 20 chars of base description if they appear
204
+ if base_description[:20] in result:
205
+ result = result.replace(base_description[:20], "").strip()
206
+
207
+ # Remove all common question patterns and filler text
208
+ remove_patterns = [
209
+ "text in", "charts in", "subject of",
210
+ "in detail", "describe", "this image", "the image",
211
+ "can you", "do you", "is there", "are there", "i can see",
212
+ "i see", "there is", "there are", "it looks like",
213
+ "appears to be", "seems to be", "might be", "could be",
214
+ "i think", "i believe", "probably", "possibly", "maybe",
215
+ "it is", "this is", "that is", "these are", "those are",
216
+ "image shows", "picture shows", "image contains", "picture contains",
217
+ "in the image", "in this image", "of this image", "from this image",
218
+ "based on", "according to", "looking at", "from what i can see",
219
+ "appears to show", "depicts", "represents", "illustrates", "demonstrates",
220
+ "presents", "displays", "portrays", "reveals", "indicates", "suggests",
221
+ "we can see", "you can see", "one can see"
222
+ ]
223
+
224
+ for pattern in remove_patterns:
225
+ if pattern.lower() in result.lower():
226
+ # Find and remove each occurrence
227
+ lower_result = result.lower()
228
+ while pattern.lower() in lower_result:
229
+ idx = lower_result.find(pattern.lower())
230
+ if idx >= 0:
231
+ result = result[:idx] + result[idx+len(pattern):]
232
+ lower_result = result.lower()
233
+
234
+ # Clean up any punctuation/formatting issues
235
+ result = result.strip()
236
+ while result and result[0] in ",.;:?!-":
237
+ result = result[1:].strip()
238
+
239
+ # Remove "..." if it appears
240
+ result = result.replace("...", "").strip()
241
+
242
+ # Fix capitalization
243
+ if result and len(result) > 0:
244
+ result = result[0].upper() + result[1:] if len(result) > 1 else result[0].upper()
245
+
246
+ analyses[analysis_type] = result
247
+
248
+ # Compose the final output
249
+ output_text = f"## Detailed Description\n{base_description}\n\n"
250
+
251
+ # Only show relevant sections
252
+ if analyses['text'] and len(analyses['text']) > 5 and not any(x in analyses['text'].lower() for x in ["no text", "not any text", "can't see", "cannot see", "don't see", "couldn't find"]):
253
+ output_text += f"## Text Content\n{analyses['text']}\n\n"
254
+
255
+ if analyses['chart'] and len(analyses['chart']) > 5 and not any(x in analyses['chart'].lower() for x in ["no chart", "not any chart", "no graph", "not any graph", "can't see", "cannot see", "don't see", "couldn't find"]):
256
+ output_text += f"## Chart Analysis\n{analyses['chart']}\n\n"
257
+
258
+ output_text += f"## Main Subject\n{analyses['subject'] or 'Unable to determine main subject.'}"
259
+
260
+ # Clear GPU memory
261
+ if USE_GPU:
262
+ torch.cuda.empty_cache()
263
+
264
+ elapsed_time = time.time() - start_time
265
+ return output_text
266
+
267
+ except Exception as e:
268
+ print(f"Error in detailed analysis: {str(e)}")
269
+ # Try to clean up memory in case of error
270
+ if USE_GPU:
271
+ torch.cuda.empty_cache()
272
+ return f"Error in detailed analysis: {str(e)}"
273
+
274
+ def get_clip_classification(image):
275
+ """Get fast classification using CLIP"""
276
+ if not load_clip_model():
277
+ return []
278
+
279
+ try:
280
+ # Process with CLIP
281
+ inputs = clip_processor(
282
+ text=CATEGORIES,
283
+ images=image,
284
+ return_tensors="pt",
285
+ padding=True
286
+ )
287
+
288
+ # Move to GPU if available
289
+ if USE_GPU:
290
+ inputs = {k: v.to("cuda") for k, v in inputs.items()}
291
+
292
+ # Get predictions
293
+ with torch.inference_mode():
294
+ outputs = clip_model(**inputs)
295
+
296
+ # Process results
297
+ logits_per_image = outputs.logits_per_image
298
+ probs = logits_per_image.softmax(dim=1)
299
+
300
+ # Get top predictions
301
+ values, indices = probs[0].topk(8)
302
+
303
+ # Format results
304
+ return [(CATEGORIES[idx], value.item() * 100) for value, idx in zip(values, indices)]
305
+ except Exception as e:
306
+ print(f"Error in CLIP classification: {str(e)}")
307
+ return []
308
+
309
+ def process_image(image, get_detailed=False):
310
+ """Process image with both fast and detailed analysis"""
311
+ if image is None:
312
+ return "Please upload an image to analyze."
313
+
314
+ try:
315
+ # Start timing
316
+ start_time = time.time()
317
+
318
+ # Preprocess image
319
+ if hasattr(image, 'mode') and image.mode != 'RGB':
320
+ image = image.convert('RGB')
321
+
322
+ # Resize for efficiency
323
+ if max(image.size) > 600: # Smaller max size for better performance
324
+ ratio = 600 / max(image.size)
325
+ new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
326
+ image = image.resize(new_size, Image.LANCZOS)
327
+
328
+ # Get fast classification first
329
+ categories = get_clip_classification(image)
330
+
331
+ result = "## Image Classification\n"
332
+ result += "This image appears to contain:\n"
333
+ for category, confidence in categories:
334
+ result += f"- {category.title()} ({confidence:.1f}%)\n"
335
+
336
+ # Add detailed analysis if requested
337
+ if get_detailed:
338
+ result += "\n## Detailed Analysis\n"
339
+ detailed_result = get_detailed_analysis(image)
340
+ result += detailed_result
341
+
342
+ # Add timing information
343
+ elapsed_time = time.time() - start_time
344
+ result += f"\n\nAnalysis completed in {elapsed_time:.2f} seconds."
345
+
346
+ # Clean up memory
347
+ if torch.cuda.is_available():
348
+ torch.cuda.empty_cache()
349
+
350
+ return result
351
+
352
+ except Exception as e:
353
+ print(f"Error: {str(e)}")
354
+ if torch.cuda.is_available():
355
+ torch.cuda.empty_cache()
356
+ return f"Error processing image: {str(e)}"
357
+
358
+ # Create interface with more options
359
+ with gr.Blocks(title="Enhanced Image Analyzer") as demo:
360
+ gr.Markdown("# Enhanced Image Analyzer")
361
+ gr.Markdown("Upload an image and choose between fast classification or detailed analysis.")
362
+
363
+ with gr.Row():
364
+ with gr.Column():
365
+ input_image = gr.Image(type="pil", label="Upload an image")
366
+ detailed_checkbox = gr.Checkbox(label="Get detailed analysis (slower but better quality)", value=False)
367
+ analyze_btn = gr.Button("Analyze Image", variant="primary")
368
+
369
+ with gr.Column():
370
+ output = gr.Markdown(label="Analysis Results")
371
+
372
+ analyze_btn.click(
373
+ fn=process_image,
374
+ inputs=[input_image, detailed_checkbox],
375
+ outputs=output
376
+ )
377
+
378
+ # Optional examples
379
+ if os.path.exists("data_temp"):
380
+ examples = [os.path.join("data_temp", f) for f in os.listdir("data_temp")
381
+ if f.endswith(('.png', '.jpg', '.jpeg'))]
382
+ if examples:
383
+ gr.Examples(examples=examples, inputs=input_image)
384
+
385
+ if __name__ == "__main__":
386
+ # Start with clean memory
387
+ if torch.cuda.is_available():
388
+ torch.cuda.empty_cache()
389
+ demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
app_internvl2.py ADDED
@@ -0,0 +1,165 @@
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import os
4
+ import time
5
+ import numpy as np
6
+ import torch
7
+ import math
8
+
9
+ # Import lmdeploy for InternVL2 model
10
+ from lmdeploy import pipeline, TurbomindEngineConfig
11
+ from lmdeploy.vl import load_image
12
+
13
+ # Set environment variables
14
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
15
+
16
+ # Model configuration
17
+ MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ" # 4-bit quantized model
18
+ USE_GPU = torch.cuda.is_available()
19
+
20
+ # Global variables for model
21
+ internvl2_pipeline = None
22
+
23
+ def load_internvl2_model():
24
+ """Load the InternVL2 model using lmdeploy"""
25
+ global internvl2_pipeline
26
+
27
+ # If already loaded, return
28
+ if internvl2_pipeline is not None:
29
+ return True
30
+
31
+ print("Loading InternVL2 model...")
32
+ try:
33
+ # Configure for AWQ quantized model
34
+ backend_config = TurbomindEngineConfig(model_format='awq')
35
+
36
+ # Create pipeline
37
+ internvl2_pipeline = pipeline(
38
+ MODEL_ID,
39
+ backend_config=backend_config,
40
+ log_level='INFO'
41
+ )
42
+
43
+ print("InternVL2 model loaded successfully!")
44
+ return True
45
+ except Exception as e:
46
+ print(f"Error loading InternVL2 model: {str(e)}")
47
+ if "CUDA out of memory" in str(e):
48
+ print("Not enough GPU memory for the model")
49
+ return False
50
+
51
+ def analyze_image(image, prompt):
52
+ """Analyze the image using InternVL2 model"""
53
+ try:
54
+ start_time = time.time()
55
+
56
+ # Make sure the model is loaded
57
+ if not load_internvl2_model():
58
+ return "Couldn't load InternVL2 model."
59
+
60
+ # Convert numpy array to PIL Image
61
+ if isinstance(image, np.ndarray):
62
+ image_pil = Image.fromarray(image).convert('RGB')
63
+ else:
64
+ # If somehow it's already a PIL Image
65
+ image_pil = image.convert('RGB')
66
+
67
+ # Run inference with the model
68
+ response = internvl2_pipeline((prompt, image_pil))
69
+
70
+ # Get the response text
71
+ result = response.text
72
+
73
+ elapsed_time = time.time() - start_time
74
+ return result
75
+
76
+ except Exception as e:
77
+ print(f"Error in image analysis: {str(e)}")
78
+ # Try to clean up memory in case of error
79
+ if USE_GPU:
80
+ torch.cuda.empty_cache()
81
+ return f"Error in image analysis: {str(e)}"
82
+
83
+ def process_image(image, analysis_type="general"):
84
+ """Process the image and return the analysis"""
85
+ if image is None:
86
+ return "Please upload an image."
87
+
88
+ # Define prompt based on analysis type
89
+ if analysis_type == "general":
90
+ prompt = "Describe this image in detail."
91
+ elif analysis_type == "text":
92
+ prompt = "What text can you see in this image? Please transcribe it accurately."
93
+ elif analysis_type == "chart":
94
+ prompt = "Analyze any charts, graphs or diagrams in this image in detail, including trends, data points, and conclusions."
95
+ elif analysis_type == "people":
96
+ prompt = "Describe the people in this image - their appearance, actions, and expressions."
97
+ elif analysis_type == "technical":
98
+ prompt = "Provide a technical analysis of this image, including object identification, spatial relationships, and any technical elements present."
99
+ else:
100
+ prompt = "Describe this image in detail."
101
+
102
+ start_time = time.time()
103
+
104
+ # Get analysis from the model
105
+ analysis = analyze_image(image, prompt)
106
+
107
+ elapsed_time = time.time() - start_time
108
+ return f"{analysis}\n\nAnalysis completed in {elapsed_time:.2f} seconds."
109
+
110
+ # Define the Gradio interface
111
+ def create_interface():
112
+ with gr.Blocks(title="Image Analysis with InternVL2") as demo:
113
+ gr.Markdown("# Image Analysis with InternVL2-40B")
114
+ gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")
115
+
116
+ with gr.Row():
117
+ with gr.Column(scale=1):
118
+ input_image = gr.Image(type="pil", label="Upload Image")
119
+ analysis_type = gr.Radio(
120
+ ["general", "text", "chart", "people", "technical"],
121
+ label="Analysis Type",
122
+ value="general"
123
+ )
124
+ submit_btn = gr.Button("Analyze Image")
125
+
126
+ with gr.Column(scale=2):
127
+ output_text = gr.Textbox(label="Analysis Result", lines=20)
128
+
129
+ submit_btn.click(
130
+ fn=process_image,
131
+ inputs=[input_image, analysis_type],
132
+ outputs=output_text
133
+ )
134
+
135
+ gr.Markdown("""
136
+ ## Analysis Types
137
+ - **General**: General description of the image
138
+ - **Text**: Focus on identifying and transcribing text in the image
139
+ - **Chart**: Detailed analysis of charts, graphs, and diagrams
140
+ - **People**: Description of people, their appearance and actions
141
+ - **Technical**: Technical analysis identifying objects and spatial relationships
142
+ """)
143
+
144
+ # Examples
145
+ gr.Examples(
146
+ examples=[
147
+ ["data_temp/page_2.png", "general"],
148
+ ["data_temp/page_2.png", "text"],
149
+ ["data_temp/page_2.png", "chart"]
150
+ ],
151
+ inputs=[input_image, analysis_type],
152
+ outputs=output_text,
153
+ fn=process_image,
154
+ cache_examples=True,
155
+ )
156
+
157
+ return demo
158
+
159
+ # Main function
160
+ if __name__ == "__main__":
161
+ # Create the Gradio interface
162
+ demo = create_interface()
163
+
164
+ # Launch the interface
165
+ demo.launch(share=False)
csv_to_html.py ADDED
@@ -0,0 +1,77 @@
1
+ import pandas as pd
2
+ from jinja2 import Template
3
+
4
+ # Read the CSV file
5
+ df = pd.read_csv('sample_data.csv')
6
+
7
+ # Calculate column totals (excluding the first column which contains row labels)
8
+ totals = df.iloc[:, 1:].sum()
9
+
10
+ # Create HTML template
11
+ html_template = """
12
+ <!DOCTYPE html>
13
+ <html>
14
+ <head>
15
+ <title>CSV Data Report</title>
16
+ <style>
17
+ body {
18
+ font-family: Arial, sans-serif;
19
+ margin: 20px;
20
+ }
21
+ table {
22
+ border-collapse: collapse;
23
+ width: 100%;
24
+ margin-bottom: 20px;
25
+ }
26
+ th, td {
27
+ border: 1px solid #ddd;
28
+ padding: 8px;
29
+ text-align: right;
30
+ }
31
+ th {
32
+ background-color: #f2f2f2;
33
+ }
34
+ .total-row {
35
+ font-weight: bold;
36
+ background-color: #e6e6e6;
37
+ }
38
+ h1 {
39
+ color: #333;
40
+ }
41
+ </style>
42
+ </head>
43
+ <body>
44
+ <h1>CSV Data Report</h1>
45
+
46
+ <h2>Raw Data</h2>
47
+ {{ raw_data | safe }}
48
+
49
+ <h2>Column Totals</h2>
50
+ <table>
51
+ <tr>
52
+ {% for column in totals.index %}
53
+ <th>{{ column }}</th>
54
+ {% endfor %}
55
+ </tr>
56
+ <tr class="total-row">
57
+ {% for value in totals.values %}
58
+ <td>{{ "{:,.2f}".format(value) }}</td>
59
+ {% endfor %}
60
+ </tr>
61
+ </table>
62
+ </body>
63
+ </html>
64
+ """
65
+
66
+ # Convert the raw data to HTML
67
+ raw_data_html = df.to_html(index=False, classes='table')
68
+
69
+ # Create the template and render it
70
+ template = Template(html_template)
71
+ html_output = template.render(raw_data=raw_data_html, totals=totals)
72
+
73
+ # Save the HTML output
74
+ with open('report.html', 'w') as f:
75
+ f.write(html_output)
76
+
77
+ print("Report has been generated as 'report.html'")
data_temp/page_2.png ADDED

Git LFS Details

  • SHA256: 20fab46eac6ed2f67da21966eabaa80d15857a4f9d069d7c8ee29d9eb6498990
  • Pointer size: 131 Bytes
  • Size of remote file: 274 kB
docs/aws_deployment.md ADDED
@@ -0,0 +1,170 @@
1
+ # AWS SageMaker Deployment Guide
2
+
3
+ This guide provides step-by-step instructions for deploying the Image Description application to AWS SageMaker.
4
+
5
+ ## Prerequisites
6
+
7
+ - AWS account with SageMaker permissions
8
+ - AWS CLI installed and configured
9
+ - Docker installed on your local machine
10
+ - The source code from this repository
11
+
12
+ ## Step 1: Create an Amazon ECR Repository
13
+
14
+ ```bash
15
+ aws ecr create-repository --repository-name image-descriptor
16
+ ```
17
+
18
+ Note the repository URI returned by this command. You'll use it in the next step.
19
+
20
+ ## Step 2: Build and Push the Docker Image
21
+
22
+ 1. Log in to ECR:
23
+
24
+ ```bash
25
+ aws ecr get-login-password --region your-region | docker login --username AWS --password-stdin your-account-id.dkr.ecr.your-region.amazonaws.com
26
+ ```
27
+
28
+ 2. Build the Docker image:
29
+
30
+ ```bash
31
+ docker build -t image-descriptor .
32
+ ```
33
+
34
+ 3. Tag and push the image:
35
+
36
+ ```bash
37
+ docker tag image-descriptor:latest your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest
38
+ docker push your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest
39
+ ```
40
+
41
+ ## Step 3: Create a SageMaker Model
42
+
43
+ 1. Create a model.json file:
44
+
45
+ ```json
46
+ {
47
+ "ModelName": "QwenVLImageDescriptor",
48
+ "PrimaryContainer": {
49
+ "Image": "your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest",
50
+ "Environment": {
51
+ "PORT": "8080"
52
+ }
53
+ },
54
+ "ExecutionRoleArn": "arn:aws:iam::your-account-id:role/service-role/AmazonSageMaker-ExecutionRole"
55
+ }
56
+ ```
57
+
58
+ 2. Create the SageMaker model:
59
+
60
+ ```bash
61
+ aws sagemaker create-model --cli-input-json file://model.json
62
+ ```
63
+
64
+ ## Step 4: Create an Endpoint Configuration
65
+
66
+ 1. Create a config.json file:
67
+
68
+ ```json
69
+ {
70
+ "EndpointConfigName": "QwenVLImageDescriptorConfig",
71
+ "ProductionVariants": [
72
+ {
73
+ "VariantName": "AllTraffic",
74
+ "ModelName": "QwenVLImageDescriptor",
75
+ "InstanceType": "ml.g5.2xlarge",
76
+ "InitialInstanceCount": 1
77
+ }
78
+ ]
79
+ }
80
+ ```
81
+
82
+ 2. Create the endpoint configuration:
83
+
84
+ ```bash
85
+ aws sagemaker create-endpoint-config --cli-input-json file://config.json
86
+ ```
87
+
88
+ ## Step 5: Create the Endpoint
89
+
90
+ ```bash
91
+ aws sagemaker create-endpoint --endpoint-name qwen-vl-image-descriptor --endpoint-config-name QwenVLImageDescriptorConfig
92
+ ```
93
+
94
+ This will take several minutes to deploy.
95
+
96
+ ## Step 6: Invoke the Endpoint
97
+
98
+ You can invoke the endpoint using the AWS SDK or AWS CLI.
99
+
100
+ Using Python SDK:
101
+
102
+ ```python
103
+ import boto3
104
+ import json
105
+ import base64
106
+ from PIL import Image
107
+ import io
108
+
109
+ # Initialize the SageMaker runtime client
110
+ runtime = boto3.client('sagemaker-runtime')
111
+
112
+ # Load and encode the image
113
+ with open('data_temp/page_2.png', 'rb') as f:
114
+ image_data = f.read()
115
+ image_b64 = base64.b64encode(image_data).decode('utf-8')
116
+
117
+ # Create the request payload
118
+ payload = {
119
+ 'image_data': image_b64
120
+ }
121
+
122
+ # Invoke the endpoint
123
+ response = runtime.invoke_endpoint(
124
+ EndpointName='qwen-vl-image-descriptor',
125
+ ContentType='application/json',
126
+ Body=json.dumps(payload)
127
+ )
128
+
129
+ # Parse the response
130
+ result = json.loads(response['Body'].read().decode())
131
+ print(json.dumps(result, indent=2))
132
+ ```
133
+
134
+ ## Step 7: Set Up API Gateway (Optional)
135
+
136
+ For public HTTP access, set up an API Gateway:
137
+
138
+ 1. Create a new REST API in API Gateway
139
+ 2. Create a new resource and POST method
140
+ 3. Configure the integration to use the SageMaker endpoint
141
+ 4. Deploy the API to a stage
142
+ 5. Note the API Gateway URL for client use
143
+
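+ 
+ Once the API is deployed, clients can call it over plain HTTPS instead of going through the SageMaker SDK. A sketch with `requests`; the invoke URL is a placeholder for your API Gateway stage, and the JSON shape mirrors the SageMaker example above:
+ 
+ ```python
+ import base64
+ import requests
+ 
+ API_URL = "https://abc123.execute-api.your-region.amazonaws.com/prod/describe"  # placeholder
+ 
+ with open("data_temp/page_2.png", "rb") as f:
+     image_b64 = base64.b64encode(f.read()).decode("utf-8")
+ 
+ resp = requests.post(API_URL, json={"image_data": image_b64}, timeout=120)
+ resp.raise_for_status()
+ print(resp.json())
+ ```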
144
+ ## Cost Optimization
145
+
146
+ To optimize costs:
147
+
148
+ 1. Use SageMaker Serverless Inference instead of a dedicated endpoint
149
+ 2. Implement auto-scaling for your endpoint
150
+ 3. Use Spot Instances for non-critical workloads
151
+ 4. Schedule endpoints to be active only during business hours
152
+
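+ 
+ For item 2, endpoint auto-scaling is configured through Application Auto Scaling rather than SageMaker itself. A sketch with boto3, using the endpoint and variant names created above; the capacity bounds and target value are examples:
+ 
+ ```python
+ import boto3
+ 
+ autoscaling = boto3.client("application-autoscaling")
+ resource_id = "endpoint/qwen-vl-image-descriptor/variant/AllTraffic"
+ 
+ # Register the endpoint variant as a scalable target (1-2 instances here)
+ autoscaling.register_scalable_target(
+     ServiceNamespace="sagemaker",
+     ResourceId=resource_id,
+     ScalableDimension="sagemaker:variant:DesiredInstanceCount",
+     MinCapacity=1,
+     MaxCapacity=2,
+ )
+ 
+ # Scale on invocations per instance (target value is an example)
+ autoscaling.put_scaling_policy(
+     PolicyName="invocations-target-tracking",
+     ServiceNamespace="sagemaker",
+     ResourceId=resource_id,
+     ScalableDimension="sagemaker:variant:DesiredInstanceCount",
+     PolicyType="TargetTrackingScaling",
+     TargetTrackingScalingPolicyConfiguration={
+         "TargetValue": 5.0,
+         "PredefinedMetricSpecification": {
+             "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance"
+         },
+     },
+ )
+ ```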
153
+ ## Monitoring
154
+
155
+ Set up CloudWatch Alarms to monitor:
156
+
157
+ 1. Endpoint invocation metrics
158
+ 2. Error rates
159
+ 3. Latency
160
+ 4. Instance utilization
161
+
162
+ ## Cleanup
163
+
164
+ To avoid ongoing charges, delete resources when not in use:
165
+
166
+ ```bash
167
+ aws sagemaker delete-endpoint --endpoint-name qwen-vl-image-descriptor
168
+ aws sagemaker delete-endpoint-config --endpoint-config-name QwenVLImageDescriptorConfig
169
+ aws sagemaker delete-model --model-name QwenVLImageDescriptor
170
+ ```
docs/azure_deployment.md ADDED
@@ -0,0 +1,365 @@
1
+ # Azure Machine Learning Deployment Guide
2
+
3
+ This guide provides step-by-step instructions for deploying the Image Description application to Azure Machine Learning.
4
+
5
+ ## Prerequisites
6
+
7
+ - Azure subscription
8
+ - Azure CLI installed and configured
9
+ - Azure Machine Learning workspace
10
+ - The source code from this repository
11
+
12
+ ## Step 1: Set Up Azure Machine Learning
13
+
14
+ 1. Create a Resource Group (if you don't have one):
15
+
16
+ ```bash
17
+ az group create --name image-descriptor-rg --location eastus
18
+ ```
19
+
20
+ 2. Create an Azure Machine Learning workspace:
21
+
22
+ ```bash
23
+ az ml workspace create --workspace-name image-descriptor-ws \
24
+ --resource-group image-descriptor-rg \
25
+ --location eastus
26
+ ```
27
+
28
+ ## Step 2: Create a Compute Cluster
29
+
30
+ Create a GPU-enabled compute cluster for training and inference:
31
+
32
+ ```bash
33
+ az ml compute create --name gpu-cluster \
34
+ --workspace-name image-descriptor-ws \
35
+ --resource-group image-descriptor-rg \
36
+ --type AmlCompute \
37
+ --min-instances 0 \
38
+ --max-instances 1 \
39
+ --size Standard_NC6s_v3
40
+ ```
41
+
42
+ ## Step 3: Prepare Environment Configuration
43
+
44
+ Create an environment.yml file to define dependencies:
45
+
46
+ ```yaml
47
+ name: image_descriptor_env
48
+ channels:
49
+ - pytorch
50
+ - conda-forge
51
+ - defaults
52
+ dependencies:
53
+ - python=3.9
54
+ - pip=23.0
55
+ - pytorch=2.0.0
56
+ - torchvision=0.15.0
57
+ - pip:
58
+ - transformers>=4.36.0
59
+ - accelerate>=0.25.0
60
+ - bitsandbytes>=0.41.0
61
+ - safetensors>=0.4.0
62
+ - flask>=2.3.2
63
+ - flask-cors>=4.0.0
64
+ - gunicorn>=21.2.0
65
+ - pillow>=10.0.0
66
+ - matplotlib>=3.7.0
67
+ - python-dotenv>=1.0.0
68
+ - azureml-core>=1.48.0
69
+ - azureml-defaults>=1.48.0
70
+ - inference-schema>=1.4.1
71
+ ```
72
+
73
+ ## Step 4: Create a Model Entry Script
74
+
75
+ Create a file called `score.py` to handle Azure ML model inference:
76
+
77
+ ```python
78
+ import json
79
+ import os
80
+ import io
81
+ import base64
82
+ import logging
83
+ import torch
84
+ from PIL import Image
85
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
86
+
87
+ # Configure logging
88
+ logging.basicConfig(level=logging.INFO)
89
+ logger = logging.getLogger(__name__)
90
+
91
+ # Global variables
92
+ model = None
93
+ processor = None
94
+ tokenizer = None
95
+
96
+ def init():
97
+ """Initialize the model when the service starts"""
98
+ global model, processor, tokenizer
99
+
100
+ logger.info("Loading model...")
101
+ model_id = "Qwen/Qwen2-VL-7B"
102
+
103
+ # Load model components with quantization for efficiency
104
+ processor = AutoProcessor.from_pretrained(model_id)
105
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
106
+
107
+ # Load model with 4-bit quantization to reduce memory requirements
108
+ model = AutoModelForCausalLM.from_pretrained(
109
+ model_id,
110
+ torch_dtype=torch.bfloat16,
111
+ load_in_4bit=True,
112
+ device_map="auto"
113
+ )
114
+ logger.info("Model loaded successfully")
115
+
116
+ def run(raw_data):
117
+ """Process an image and generate descriptions
118
+
119
+ Args:
120
+ raw_data: A JSON string containing the image as base64 encoded data
121
+
122
+ Returns:
123
+ A JSON string containing the descriptions
124
+ """
125
+ global model, processor, tokenizer
126
+
127
+ try:
128
+ # Parse input
129
+ data = json.loads(raw_data)
130
+
131
+ # Get the image data (from base64 or URL)
132
+ if 'image_data' in data:
133
+ image_bytes = base64.b64decode(data['image_data'])
134
+ image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
135
+ logger.info("Loaded image from base64 data")
136
+ elif 'image_url' in data:
137
+ # Handle image URLs (for Azure Storage or public URLs)
138
+ from urllib.request import urlopen
139
+ with urlopen(data['image_url']) as response:
140
+ image_bytes = response.read()
141
+ image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
142
+ logger.info(f"Loaded image from URL: {data['image_url']}")
143
+ else:
144
+ return json.dumps({"error": "No image data or URL provided"})
145
+
146
+ # Process the image
147
+ inputs = processor(
148
+ images=image,
149
+ return_tensors="pt"
150
+ ).to(model.device)
151
+
152
+ # Basic description prompt
153
+ prompt_basic = "Describe this image briefly."
154
+ input_ids_basic = tokenizer(prompt_basic, return_tensors="pt").input_ids.to(model.device)
155
+
156
+ # Detailed description prompt
157
+ prompt_detailed = "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
158
+ input_ids_detailed = tokenizer(prompt_detailed, return_tensors="pt").input_ids.to(model.device)
159
+
160
+ # Technical analysis prompt
161
+ prompt_technical = "What can you tell me about the technical aspects of this image?"
162
+ input_ids_technical = tokenizer(prompt_technical, return_tensors="pt").input_ids.to(model.device)
163
+
164
+ # Generate outputs for each prompt
165
+ # Basic description
166
+ with torch.no_grad():
167
+ output_basic = model.generate(
168
+ **inputs,
169
+ input_ids=input_ids_basic,
170
+ max_new_tokens=150,
171
+ do_sample=False
172
+ )
173
+ basic_description = tokenizer.decode(output_basic[0], skip_special_tokens=True).replace(prompt_basic, "").strip()
174
+
175
+ # Detailed description
176
+ with torch.no_grad():
177
+ output_detailed = model.generate(
178
+ **inputs,
179
+ input_ids=input_ids_detailed,
180
+ max_new_tokens=300,
181
+ do_sample=False
182
+ )
183
+ detailed_description = tokenizer.decode(output_detailed[0], skip_special_tokens=True).replace(prompt_detailed, "").strip()
184
+
185
+ # Technical analysis
186
+ with torch.no_grad():
187
+ output_technical = model.generate(
188
+ **inputs,
189
+ input_ids=input_ids_technical,
190
+ max_new_tokens=200,
191
+ do_sample=False
192
+ )
193
+ technical_analysis = tokenizer.decode(output_technical[0], skip_special_tokens=True).replace(prompt_technical, "").strip()
194
+
195
+ # Return the results
196
+ return json.dumps({
197
+ "success": True,
198
+ "basic_description": basic_description,
199
+ "detailed_description": detailed_description,
200
+ "technical_analysis": technical_analysis
201
+ })
202
+
203
+ except Exception as e:
204
+ logger.error(f"Error processing image: {str(e)}", exc_info=True)
205
+ return json.dumps({"error": f"Error generating description: {str(e)}"})
206
+ ```
207
+
208
+ ## Step 5: Register the Model
209
+
210
+ 1. Create a model.yml file:
211
+
212
+ ```yaml
213
+ $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
214
+ name: qwen-vl-image-descriptor
215
+ version: 1
216
+ description: Qwen2-VL-7B model for image description
217
+ path: .
218
+ ```
219
+
220
+ 2. Register the model:
221
+
222
+ ```bash
223
+ az ml model create --file model.yml \
224
+ --workspace-name image-descriptor-ws \
225
+ --resource-group image-descriptor-rg
226
+ ```
227
+
228
+ ## Step 6: Deploy as an Online Endpoint
229
+
230
+ 1. Create an endpoint.yml file:
231
+
232
+ ```yaml
233
+ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
234
+ name: image-descriptor-endpoint
235
+ description: Endpoint for image description
236
+ auth_mode: key
237
+ ```
238
+
239
+ 2. Create a deployment.yml file:
240
+
241
+ ```yaml
242
+ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
243
+ name: qwen-vl-deployment
244
+ endpoint_name: image-descriptor-endpoint
245
+ model: azureml:qwen-vl-image-descriptor:1
246
+ environment:
247
+ conda_file: environment.yml
248
+ image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest
249
+ instance_type: Standard_NC6s_v3
250
+ instance_count: 1
251
+ request_settings:
252
+ max_concurrent_requests_per_instance: 1
253
+ request_timeout_ms: 120000
254
+ ```
255
+
256
+ 3. Create the endpoint:
257
+
258
+ ```bash
259
+ az ml online-endpoint create --file endpoint.yml \
260
+ --workspace-name image-descriptor-ws \
261
+ --resource-group image-descriptor-rg
262
+ ```
263
+
264
+ 4. Create the deployment:
265
+
266
+ ```bash
267
+ az ml online-deployment create --file deployment.yml \
268
+ --workspace-name image-descriptor-ws \
269
+ --resource-group image-descriptor-rg
270
+ ```
271
+
272
+ 5. Allocate 100% traffic to the deployment:
273
+
274
+ ```bash
275
+ az ml online-endpoint update --name image-descriptor-endpoint \
276
+ --traffic "qwen-vl-deployment=100" \
277
+ --workspace-name image-descriptor-ws \
278
+ --resource-group image-descriptor-rg
279
+ ```
280
+
281
+ ## Step 7: Test the Endpoint
282
+
283
+ You can test the endpoint using the Azure ML SDK:
284
+
285
+ ```python
286
+ import json
287
+ import base64
288
+ from azure.ai.ml import MLClient
289
+ from azure.identity import DefaultAzureCredential
290
+ from azure.ai.ml.entities import ManagedOnlineEndpoint
291
+
292
+ # Get a handle to the workspace
293
+ credential = DefaultAzureCredential()
294
+ ml_client = MLClient(
295
+ credential=credential,
296
+ subscription_id="your-subscription-id",
297
+ resource_group_name="image-descriptor-rg",
298
+ workspace_name="image-descriptor-ws"
299
+ )
300
+
301
+ # Get endpoint
302
+ endpoint = ml_client.online_endpoints.get("image-descriptor-endpoint")
303
+
304
+ # Load and encode the image
305
+ with open('data_temp/page_2.png', 'rb') as f:
306
+ image_data = f.read()
307
+ image_b64 = base64.b64encode(image_data).decode('utf-8')
308
+
309
+ # Create the request payload
310
+ payload = {
311
+ 'image_data': image_b64
312
+ }
+
+ # invoke() expects a *path* to a JSON request file, so write the payload to disk first
+ with open('request.json', 'w') as f:
+     json.dump(payload, f)
313
+
314
+ # Invoke the endpoint
315
+ response = ml_client.online_endpoints.invoke(
316
+ endpoint_name="image-descriptor-endpoint",
317
+ request_file="request.json",
318
+ deployment_name="qwen-vl-deployment"
319
+ )
320
+
321
+ # Parse the response
322
+ result = json.loads(response)
323
+ print(json.dumps(result, indent=2))
324
+ ```
325
+
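+ If you prefer the CLI to the SDK, the same request can be sent with `az ml online-endpoint invoke` (a sketch; it assumes the payload has already been written to `request.json`, as in the Python example above):
+
+ ```bash
+ az ml online-endpoint invoke --name image-descriptor-endpoint \
+   --deployment-name qwen-vl-deployment \
+   --request-file request.json \
+   --workspace-name image-descriptor-ws \
+   --resource-group image-descriptor-rg
+ ```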
326
+ ## Cost Optimization
327
+
328
+ To optimize costs:
329
+
330
+ 1. Use a smaller compute size if possible
331
+ 2. Scale the deployment down when it is idle (managed online endpoints keep at least one instance running, so delete the deployment if you need true zero cost)
332
+ 3. Set up autoscaling rules (see the sketch after this list)
333
+ 4. Consider reserved instances for long-term deployments
334
+
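+ As a sketch of item 3, a managed online deployment can be wired to an Azure Monitor autoscale setting. The resource ID below is an assumption built from the names used in this guide; substitute your own subscription ID and adjust the instance bounds:
+
+ ```bash
+ # ARM resource ID of the deployment created in Step 6 (sketch; fill in your subscription ID)
+ DEPLOYMENT_ID="/subscriptions/<your-subscription-id>/resourceGroups/image-descriptor-rg/providers/Microsoft.MachineLearningServices/workspaces/image-descriptor-ws/onlineEndpoints/image-descriptor-endpoint/deployments/qwen-vl-deployment"
+
+ # Keep one instance warm and allow scaling out to two under load
+ az monitor autoscale create --name image-descriptor-autoscale \
+   --resource "$DEPLOYMENT_ID" \
+   --resource-group image-descriptor-rg \
+   --min-count 1 --max-count 2 --count 1
+ ```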
335
+ ## Monitoring
336
+
337
+ Monitor your endpoint using:
338
+
339
+ 1. Azure Monitor (see the example query after this list)
340
+ 2. Application Insights
341
+ 3. Azure ML metrics dashboard
342
+ 4. Alerts for anomalous traffic, latency, or error rates
343
+
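+ For example, basic request metrics for the endpoint can be pulled from Azure Monitor on the command line. This is a sketch: the resource ID follows the naming in this guide, and the metric names are assumptions, so check the Metrics blade in the portal for the exact names exposed by your endpoint:
+
+ ```bash
+ ENDPOINT_ID="/subscriptions/<your-subscription-id>/resourceGroups/image-descriptor-rg/providers/Microsoft.MachineLearningServices/workspaces/image-descriptor-ws/onlineEndpoints/image-descriptor-endpoint"
+
+ # Metric names are assumptions -- verify them in the Azure portal first
+ az monitor metrics list --resource "$ENDPOINT_ID" \
+   --metric RequestsPerMinute RequestLatency \
+   --interval PT5M
+ ```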
344
+ ## Cleanup
345
+
346
+ To avoid ongoing charges, delete resources when not in use:
347
+
348
+ ```bash
349
+ # Delete the endpoint
350
+ az ml online-endpoint delete --name image-descriptor-endpoint \
351
+ --workspace-name image-descriptor-ws \
352
+ --resource-group image-descriptor-rg -y
353
+
354
+ # Delete compute cluster
355
+ az ml compute delete --name gpu-cluster \
356
+ --workspace-name image-descriptor-ws \
357
+ --resource-group image-descriptor-rg -y
358
+
359
+ # Delete workspace (optional)
360
+ az ml workspace delete --name image-descriptor-ws \
361
+ --resource-group image-descriptor-rg -y
362
+
363
+ # Delete resource group (optional)
364
+ az group delete --name image-descriptor-rg -y
365
+ ```
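+
+ To confirm the cleanup worked, you can check whether the resource group still exists (prints `true` or `false`):
+
+ ```bash
+ az group exists --name image-descriptor-rg
+ ```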
entrypoint.sh ADDED
@@ -0,0 +1,20 @@
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ # Load environment variables
5
+ if [ -f .env ]; then source .env; fi  # tolerate a missing .env file under 'set -e'
6
+
7
+ # Check if we should run in UI or API mode
8
+ if [ "$ENABLE_UI" = "true" ]; then
9
+ echo "Starting Gradio UI with API backend..."
10
+ python app.py --api-server &  # start the API backend in the background ('exec' would replace this shell and skip the lines below)
11
+ sleep 5 # Give the API server time to start
12
+ exec python app.py
13
+ elif [ "$API_ONLY" = "true" ]; then
14
+ echo "Starting API server only..."
15
+ exec gunicorn --bind 0.0.0.0:$PORT --workers 1 --timeout 300 "image_descriptor:app"
16
+ else
17
+ # Default behavior for Hugging Face Spaces: run both
18
+ echo "Starting service in Hugging Face Spaces mode..."
19
+ exec python app.py
20
+ fi
image_descriptor.py ADDED
@@ -0,0 +1,169 @@
1
+ import os
2
+ import torch
3
+ import io
4
+ from PIL import Image
5
+ import matplotlib.pyplot as plt
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPImageProcessor
7
+ import logging
8
+ import time
9
+
10
+ # Configure logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Global variables to store the model and processors
18
+ model = None
19
+ processor = None
20
+ tokenizer = None
21
+
22
+ def describe_image(image_path=None, image_data=None, show_image=False):
23
+ """
24
+ Load an image and generate a description using Qwen2-VL-7B model.
25
+
26
+ Args:
27
+ image_path (str, optional): Path to the image file
28
+ image_data (bytes, optional): Raw image data
29
+ show_image (bool): Whether to display the image
30
+
31
+ Returns:
32
+ dict: Descriptions of the image
33
+ """
34
+ global model, processor, tokenizer
35
+
36
+ # Initialize model if not already loaded
37
+ if model is None or processor is None or tokenizer is None:
38
+ load_model()
39
+
40
+ # Check if we have valid input
41
+ if image_path is None and image_data is None:
42
+ return {"error": "No image provided"}
43
+
44
+ try:
45
+ # Load the image
46
+ if image_path is not None:
47
+ if not os.path.exists(image_path):
48
+ return {"error": f"Image not found at {image_path}"}
49
+ logger.info(f"Processing image from path: {image_path}")
50
+ image = Image.open(image_path).convert('RGB')
51
+ else:
52
+ logger.info("Processing image from uploaded data")
53
+ image = Image.open(io.BytesIO(image_data)).convert('RGB')
54
+
55
+ # Display the image if requested (for local testing only)
56
+ if show_image:
57
+ plt.figure(figsize=(10, 8))
58
+ plt.imshow(image)
59
+ plt.axis('off')
60
+ if image_path:
61
+ plt.title(os.path.basename(image_path))
62
+ plt.show()
63
+
64
+ # Process the image
65
+ logger.info("Generating descriptions...")
66
+
67
+ # Process image and create inputs for various prompts
68
+ pixel_values = processor(images=image, return_tensors="pt").to(model.device)
69
+
70
+ # Basic description prompt
71
+ prompt_basic = "Describe this image briefly."
72
+ input_ids_basic = tokenizer(prompt_basic, return_tensors="pt").input_ids.to(model.device)
73
+
74
+ # Detailed description prompt
75
+ prompt_detailed = "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
76
+ input_ids_detailed = tokenizer(prompt_detailed, return_tensors="pt").input_ids.to(model.device)
77
+
78
+ # Technical analysis prompt
79
+ prompt_technical = "What can you tell me about the technical aspects of this image?"
80
+ input_ids_technical = tokenizer(prompt_technical, return_tensors="pt").input_ids.to(model.device)
81
+
82
+ # Generate outputs for each prompt
83
+ # Basic description
84
+ with torch.no_grad():
85
+ output_basic = model.generate(
86
+ input_ids=input_ids_basic,
87
+ pixel_values=pixel_values.pixel_values,
88
+ max_new_tokens=150,
89
+ do_sample=False
90
+ )
91
+ basic_description = tokenizer.decode(output_basic[0], skip_special_tokens=True).replace(prompt_basic, "").strip()
92
+
93
+ # Detailed description
94
+ with torch.no_grad():
95
+ output_detailed = model.generate(
96
+ input_ids=input_ids_detailed,
97
+ pixel_values=pixel_values.pixel_values,
98
+ max_new_tokens=300,
99
+ do_sample=False
100
+ )
101
+ detailed_description = tokenizer.decode(output_detailed[0], skip_special_tokens=True).replace(prompt_detailed, "").strip()
102
+
103
+ # Technical analysis
104
+ with torch.no_grad():
105
+ output_technical = model.generate(
106
+ input_ids=input_ids_technical,
107
+ pixel_values=pixel_values.pixel_values,
108
+ max_new_tokens=200,
109
+ do_sample=False
110
+ )
111
+ technical_analysis = tokenizer.decode(output_technical[0], skip_special_tokens=True).replace(prompt_technical, "").strip()
112
+
113
+ return {
114
+ "success": True,
115
+ "basic_description": basic_description,
116
+ "detailed_description": detailed_description,
117
+ "technical_analysis": technical_analysis
118
+ }
119
+
120
+ except Exception as e:
121
+ logger.error(f"Error processing image: {str(e)}", exc_info=True)
122
+ return {"error": f"Error generating description: {str(e)}"}
123
+
124
+ def load_model():
125
+ """Load the model and related components"""
126
+ global model, processor, tokenizer
127
+
128
+ try:
129
+ logger.info("Loading model...")
130
+ model_id = "Qwen/Qwen2-VL-7B"
131
+
132
+ # Use explicit processor class instead of AutoProcessor
133
+ processor = CLIPImageProcessor.from_pretrained(model_id)
134
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
135
+
136
+ # Load model with 4-bit quantization to reduce memory requirements
137
+ model = AutoModelForCausalLM.from_pretrained(
138
+ model_id,
139
+ torch_dtype=torch.bfloat16,
140
+ load_in_4bit=True,
141
+ device_map="auto"
142
+ )
143
+ logger.info("Model loaded successfully")
144
+ return True
145
+ except Exception as e:
146
+ logger.error(f"Error loading model: {str(e)}", exc_info=True)
147
+ return False
148
+
149
+ def main():
150
+ """Run in command-line mode"""
151
+ # Path to the image
152
+ image_folder = "data_temp"
153
+ image_name = "page_2.png"
154
+ image_path = os.path.join(image_folder, image_name)
155
+
156
+ # Get the description
157
+ result = describe_image(image_path=image_path, show_image=True)
158
+
159
+ # Print the results
160
+ if "error" not in result:
161
+ print("\n==== Image Description Results (Qwen2-VL-7B) ====")
162
+ print(f"\nBasic Description:\n{result['basic_description']}")
163
+ print(f"\nDetailed Description:\n{result['detailed_description']}")
164
+ print(f"\nTechnical Analysis:\n{result['technical_analysis']}")
165
+ else:
166
+ print(result["error"]) # Print error message if there was an issue
167
+
168
+ if __name__ == "__main__":
169
+ main()
image_descriptor_requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ torch==2.0.1
2
+ torchvision==0.15.2
3
+ transformers==4.36.2
4
+ pillow==10.0.1
5
+ matplotlib==3.7.3
6
+ accelerate==0.27.2
7
+ bitsandbytes==0.41.3
8
+ safetensors==0.4.1
9
+ flask==2.3.3
10
+ flask-cors==4.0.0
11
+ gunicorn==21.2.0
12
+ python-dotenv==1.0.0
13
+ gradio==4.12.0
14
+ requests==2.31.0
readme.mdp ADDED
@@ -0,0 +1 @@
1
+ placeholder
report.html ADDED
@@ -0,0 +1,105 @@
1
+
2
+ <!DOCTYPE html>
3
+ <html>
4
+ <head>
5
+ <title>CSV Data Report</title>
6
+ <style>
7
+ body {
8
+ font-family: Arial, sans-serif;
9
+ margin: 20px;
10
+ }
11
+ table {
12
+ border-collapse: collapse;
13
+ width: 100%;
14
+ margin-bottom: 20px;
15
+ }
16
+ th, td {
17
+ border: 1px solid #ddd;
18
+ padding: 8px;
19
+ text-align: right;
20
+ }
21
+ th {
22
+ background-color: #f2f2f2;
23
+ }
24
+ .total-row {
25
+ font-weight: bold;
26
+ background-color: #e6e6e6;
27
+ }
28
+ h1 {
29
+ color: #333;
30
+ }
31
+ </style>
32
+ </head>
33
+ <body>
34
+ <h1>CSV Data Report</h1>
35
+
36
+ <h2>Raw Data</h2>
37
+ <table border="1" class="dataframe table">
38
+ <thead>
39
+ <tr style="text-align: right;">
40
+ <th>Product</th>
41
+ <th>Electronics</th>
42
+ <th>Clothing</th>
43
+ <th>Food</th>
44
+ <th>Books</th>
45
+ </tr>
46
+ </thead>
47
+ <tbody>
48
+ <tr>
49
+ <td>January</td>
50
+ <td>1200</td>
51
+ <td>800</td>
52
+ <td>500</td>
53
+ <td>300</td>
54
+ </tr>
55
+ <tr>
56
+ <td>February</td>
57
+ <td>950</td>
58
+ <td>750</td>
59
+ <td>600</td>
60
+ <td>400</td>
61
+ </tr>
62
+ <tr>
63
+ <td>March</td>
64
+ <td>1500</td>
65
+ <td>900</td>
66
+ <td>550</td>
67
+ <td>350</td>
68
+ </tr>
69
+ <tr>
70
+ <td>April</td>
71
+ <td>1100</td>
72
+ <td>850</td>
73
+ <td>450</td>
74
+ <td>250</td>
75
+ </tr>
76
+ </tbody>
77
+ </table>
78
+
79
+ <h2>Column Totals</h2>
80
+ <table>
81
+ <tr>
82
+
83
+ <th>Electronics</th>
84
+
85
+ <th>Clothing</th>
86
+
87
+ <th>Food</th>
88
+
89
+ <th>Books</th>
90
+
91
+ </tr>
92
+ <tr class="total-row">
93
+
94
+ <td>4,750.00</td>
95
+
96
+ <td>3,300.00</td>
97
+
98
+ <td>2,100.00</td>
99
+
100
+ <td>1,300.00</td>
101
+
102
+ </tr>
103
+ </table>
104
+ </body>
105
+ </html>
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ torch==2.1.0
2
+ torchvision==0.16.0
3
+ transformers>=4.37.2
4
+ lmdeploy>=0.5.3
5
+ pillow==10.0.1
6
+ numpy>=1.24.0
7
+ scipy==1.11.3
8
+ gradio==3.38.0
9
+ requests==2.31.0
10
+ accelerate==0.27.2
11
+ bitsandbytes==0.41.3
12
+ safetensors==0.4.1
13
+ openai
14
+ huggingface_hub
sample_data.csv ADDED
@@ -0,0 +1,5 @@
1
+ Product,Electronics,Clothing,Food,Books
2
+ January,1200,800,500,300
3
+ February,950,750,600,400
4
+ March,1500,900,550,350
5
+ April,1100,850,450,250
server_deploy.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+
3
+ # Server deployment script for Image Description Application
4
+ echo "Starting Image Description API server deployment"
5
+
6
+ # Create virtual environment if it doesn't exist
7
+ if [ ! -d "venv" ]; then
8
+ echo "Creating virtual environment..."
9
+ python3 -m venv venv
10
+ fi
11
+
12
+ # Activate virtual environment
13
+ echo "Activating virtual environment..."
14
+ source venv/bin/activate
15
+
16
+ # Install requirements
17
+ echo "Installing dependencies..."
18
+ pip install -r image_descriptor_requirements.txt
19
+
20
+ # Create necessary directories
21
+ echo "Setting up directories..."
22
+ mkdir -p uploads
23
+ mkdir -p logs
24
+
25
+ # Start the server with Gunicorn
26
+ echo "Starting server with Gunicorn..."
27
+ gunicorn --bind 0.0.0.0:8000 --workers 1 --timeout 300 "image_descriptor:app" \
28
+ --log-level info \
29
+ --access-logfile logs/access.log \
30
+ --error-logfile logs/error.log \
31
+ --daemon
32
+
33
+ echo "Server started in the background on port 8000"
34
+ echo "You can test it with: curl http://localhost:8000/health"
35
+ echo "To process an image: curl -X POST -F \"image=@data_temp/page_2.png\" http://localhost:8000/describe"
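Note: `entrypoint.sh` and the Gunicorn command above both load `image_descriptor:app`, but `image_descriptor.py` as uploaded only defines `describe_image()` and `load_model()`. Below is a minimal sketch of the missing Flask glue, assuming the `/health` and `/describe` routes and the `image` form field used by the curl examples above; it is an illustration to append to `image_descriptor.py`, not part of the uploaded code.

```python
# Sketch only: Flask wrapper so that "image_descriptor:app" resolves for Gunicorn.
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route("/health", methods=["GET"])
def health():
    # Liveness probe used by: curl http://localhost:8000/health
    return jsonify({"status": "ok"})

@app.route("/describe", methods=["POST"])
def describe():
    # Expects a multipart upload with an "image" field, matching the curl example above
    if "image" not in request.files:
        return jsonify({"error": "No image provided"}), 400
    result = describe_image(image_data=request.files["image"].read())
    return jsonify(result), (200 if "error" not in result else 500)
```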
test_internvl2.py ADDED
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Test script for InternVL2-40B-AWQ model using lmdeploy
6
+ """
7
+
8
+ import argparse
9
+ import os
10
+ import time
11
+ from PIL import Image
12
+
13
+ from lmdeploy import pipeline, TurbomindEngineConfig
14
+ from lmdeploy.vl import load_image
15
+
16
+
17
+ def parse_args():
18
+ parser = argparse.ArgumentParser(description="Test InternVL2 model")
19
+ parser.add_argument(
20
+ "--model",
21
+ type=str,
22
+ default="OpenGVLab/InternVL2-40B-AWQ",
23
+ help="Model name or path"
24
+ )
25
+ parser.add_argument(
26
+ "--image",
27
+ type=str,
28
+ required=True,
29
+ help="Path to the test image"
30
+ )
31
+ parser.add_argument(
32
+ "--prompt",
33
+ type=str,
34
+ default="Describe this image in detail.",
35
+ help="Prompt for the model"
36
+ )
37
+ return parser.parse_args()
38
+
39
+
40
+ def main():
41
+ args = parse_args()
42
+
43
+ print(f"Loading model: {args.model}")
44
+ start_time = time.time()
45
+
46
+ # Configure the engine for AWQ quantized model
47
+ backend_config = TurbomindEngineConfig(model_format='awq')
48
+
49
+ # Create pipeline
50
+ pipe = pipeline(args.model, backend_config=backend_config, log_level='INFO')
51
+
52
+ load_time = time.time() - start_time
53
+ print(f"Model loaded in {load_time:.2f} seconds")
54
+
55
+ # Load image
56
+ try:
57
+ if args.image.startswith(('http://', 'https://')):
58
+ image = load_image(args.image)
59
+ print(f"Loaded image from URL: {args.image}")
60
+ else:
61
+ image_path = os.path.abspath(args.image)
62
+ if not os.path.exists(image_path):
63
+ raise FileNotFoundError(f"Image not found: {image_path}")
64
+ image = Image.open(image_path).convert('RGB')
65
+ print(f"Loaded image from path: {image_path}")
66
+ except Exception as e:
67
+ print(f"Error loading image: {e}")
68
+ return
69
+
70
+ # Run inference
71
+ print(f"Running inference with prompt: '{args.prompt}'")
72
+ start_time = time.time()
73
+
74
+ response = pipe((args.prompt, image))
75
+
76
+ inference_time = time.time() - start_time
77
+ print(f"Inference completed in {inference_time:.2f} seconds")
78
+
79
+ # Print result
80
+ print("\n--- RESULT ---")
81
+ print(response.text)
82
+ print("-------------\n")
83
+
84
+
85
+ if __name__ == "__main__":
86
+ main()
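A typical invocation of this test script, using the sample image shipped in `data_temp/` (the prompt here is just an example):

```bash
python test_internvl2.py \
  --model OpenGVLab/InternVL2-40B-AWQ \
  --image data_temp/page_2.png \
  --prompt "Describe this slide in detail."
```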
upload_internvl2_to_hf.py ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Script to upload the InternVL2 files to Hugging Face Spaces
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import getpass
11
+ from huggingface_hub import HfApi, create_repo, upload_folder
12
+
13
+ # Default repository name
14
+ DEFAULT_REPO = "mknolan/cursor_slides_internvl2"
15
+
16
+ def main():
17
+ """Main function to upload files to Hugging Face Spaces"""
18
+ # Get Hugging Face token with WRITE access
19
+ token = getpass.getpass("Enter your Hugging Face token (with WRITE access): ")
20
+
21
+ # Get repository name
22
+ repo_name = input("Enter repository name (default: {}): ".format(DEFAULT_REPO)) or DEFAULT_REPO
23
+
24
+ print("Uploading to Space: {}".format(repo_name))
25
+
26
+ # Initialize Hugging Face API
27
+ api = HfApi(token=token)
28
+
29
+ try:
30
+ # Try to get the repository, create if it doesn't exist
31
+ try:
32
+ repo = api.repo_info(repo_id=repo_name, repo_type="space")
33
+ print("Repo {} ready".format(repo_name))
34
+ except Exception:
35
+ print("Creating new Space: {}".format(repo_name))
36
+ create_repo(
37
+ repo_id=repo_name,
38
+ token=token,
39
+ repo_type="space",
40
+ space_sdk="gradio",
41
+ private=False
42
+ )
43
+
44
+ # Upload the entire folder at once using upload_folder
45
+ print("Uploading files to Hugging Face Space...")
46
+ upload_folder(
47
+ folder_path=".", # Current directory
48
+ repo_id=repo_name,
49
+ repo_type="space",
50
+ ignore_patterns=[
51
+ ".git*",
52
+ "*__pycache__*",
53
+ "*.pyc",
54
+ ".DS_Store",
55
+ "*.ipynb_checkpoints*",
56
+ "venv",
57
+ ".env"
58
+ ],
59
+ commit_message="Upload InternVL2 implementation",
60
+ token=token
61
+ )
62
+
63
+ print("Upload completed!")
64
+ print("Check your Space at: https://huggingface.co/spaces/{}".format(repo_name))
65
+
66
+ except Exception as e:
67
+ print("Error: {}".format(e))
68
+ return 1
69
+
70
+ return 0
71
+
72
+ if __name__ == "__main__":
73
+ sys.exit(main())
upload_to_hf.py ADDED
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to upload the current directory to a Hugging Face Space
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from huggingface_hub import HfApi, create_repo
9
+
10
+ # Configuration
11
+ SPACE_NAME = "mknolan/cursor_slides" # Change this to your space name
12
+ TOKEN = input("Enter your Hugging Face token (with WRITE access): ")
13
+
14
+ # Initialize API
15
+ api = HfApi(token=TOKEN)
16
+
17
+ def upload_directory():
18
+ """Upload all files in the current directory to HF Space"""
19
+ print("Uploading to Space: {}".format(SPACE_NAME))
20
+
21
+ # Create repo if it doesn't exist (this is idempotent)
22
+ try:
23
+ create_repo(
24
+ repo_id=SPACE_NAME,
25
+ token=TOKEN,
26
+ repo_type="space",
27
+ exist_ok=True,
28
+ space_sdk="docker"
29
+ )
30
+ print("Repo {} ready".format(SPACE_NAME))
31
+ except Exception as e:
32
+ print("Note: repo may already exist, or creation failed: {}".format(str(e)))
33
+
34
+ # Gather all files to upload
35
+ files_to_upload = []
36
+ for root, _, files in os.walk("."):
37
+ # Skip .git and __pycache__ directories
38
+ if "/.git" in root or "__pycache__" in root:
39
+ continue
40
+
41
+ for file in files:
42
+ # Skip hidden files and .git files
43
+ if file.startswith(".git") or file.startswith("."):
44
+ continue
45
+
46
+ path = os.path.join(root, file)
47
+ # Skip this upload script itself
48
+ if path == "./upload_to_hf.py":
49
+ continue
50
+
51
+ files_to_upload.append(path)
52
+
53
+ print("Found {} files to upload".format(len(files_to_upload)))
54
+
55
+ # Upload each file
56
+ for i, path in enumerate(files_to_upload):
57
+ print("[{}/{}] Uploading {}...".format(i+1, len(files_to_upload), path))
58
+ try:
59
+ # Path in repo (remove leading ./)
60
+ path_in_repo = path[2:] if path.startswith("./") else path
61
+ api.upload_file(
62
+ path_or_fileobj=path,
63
+ path_in_repo=path_in_repo,
64
+ repo_id=SPACE_NAME,
65
+ repo_type="space"
66
+ )
67
+ except Exception as e:
68
+ print("Error uploading {}: {}".format(path, str(e)))
69
+
70
+ print("Upload completed!")
71
+ print("Check your Space at: https://huggingface.co/spaces/{}".format(SPACE_NAME))
72
+
73
+ if __name__ == "__main__":
74
+ upload_directory()