Upload InternVL2 implementation
- .env.hf +12 -0
- .gitattributes +1 -0
- Dockerfile +48 -0
- HUGGING-FACE-DEPLOYMENT.md +97 -0
- LOCAL-TESTING.md +67 -0
- README-HF.md +33 -0
- README.md +130 -7
- README_INTERNVL2.md +103 -0
- app.py +389 -0
- app_internvl2.py +165 -0
- csv_to_html.py +77 -0
- data_temp/page_2.png +3 -0
- docs/aws_deployment.md +170 -0
- docs/azure_deployment.md +365 -0
- entrypoint.sh +20 -0
- image_descriptor.py +169 -0
- image_descriptor_requirements.txt +14 -0
- readme.mdp +1 -0
- report.html +105 -0
- requirements.txt +14 -0
- sample_data.csv +5 -0
- server_deploy.sh +35 -0
- test_internvl2.py +86 -0
- upload_internvl2_to_hf.py +73 -0
- upload_to_hf.py +74 -0
.env.hf
ADDED
@@ -0,0 +1,12 @@
+# Environment variables for Hugging Face Spaces deployment
+PORT=7860
+FLASK_APP=image_descriptor.py
+TRANSFORMERS_CACHE=/app/.cache/huggingface/transformers
+HF_HOME=/app/.cache/huggingface
+GRADIO_ANALYTICS_ENABLED=False
+GRADIO_SERVER_NAME=0.0.0.0
+GRADIO_SERVER_PORT=7860
+# Uncomment to run only the API without the UI
+# API_ONLY=true
+# Uncomment to run both the UI and API
+# ENABLE_UI=true
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data_temp/page_2.png filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,48 @@
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV HF_HOME=/root/.cache/huggingface
+ENV TRANSFORMERS_CACHE=/root/.cache/huggingface/transformers
+ENV MPLCONFIGDIR=/tmp/matplotlib
+
+# Create necessary directories with proper permissions
+RUN mkdir -p /root/.cache/huggingface/transformers && \
+    mkdir -p /tmp/matplotlib && \
+    chmod -R 777 /root/.cache && \
+    chmod -R 777 /tmp/matplotlib
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    curl \
+    ca-certificates \
+    python3-pip \
+    python3-dev \
+    python3-setuptools \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create a working directory
+WORKDIR /app
+
+# Copy requirements file and install Python dependencies
+COPY requirements.txt .
+
+RUN pip3 install --no-cache-dir --upgrade pip && \
+    pip3 install --no-cache-dir -r requirements.txt && \
+    # Install additional dependencies for lmdeploy
+    pip3 install --no-cache-dir cmake && \
+    pip3 install --no-cache-dir ninja && \
+    # Install flash-attention for performance
+    pip3 install --no-cache-dir flash-attn
+
+# Copy the application files
+COPY . .
+
+# Make port 7860 available for the app
+EXPOSE 7860
+
+# Start the application
+CMD ["python3", "app_internvl2.py"]
HUGGING-FACE-DEPLOYMENT.md
ADDED
@@ -0,0 +1,97 @@
+# Hugging Face Spaces Deployment Guide
+
+This guide provides detailed steps for deploying the Image Description application to Hugging Face Spaces.
+
+## Prerequisites
+
+- A [Hugging Face](https://huggingface.co/) account
+- A GitHub repository containing your application code
+- The code should include all files created in this project
+
+## Step 1: Prepare Your GitHub Repository
+
+1. Make sure your GitHub repository contains all necessary files:
+   - `image_descriptor.py` (Flask API)
+   - `app.py` (Gradio UI)
+   - `Dockerfile`
+   - `entrypoint.sh`
+   - `image_descriptor_requirements.txt`
+   - `data_temp/` directory with sample images
+   - `.env` file with configuration
+   - `.gitignore`
+
+2. Commit all changes to your repository:
+   ```bash
+   git add .
+   git commit -m "Prepare for Hugging Face Spaces deployment"
+   git push
+   ```
+
+## Step 2: Create a New Hugging Face Space
+
+1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
+2. Click on "Create new Space"
+3. Fill out the form:
+   - **Owner**: Your Hugging Face username or organization
+   - **Space name**: Choose a unique name (e.g., "image-description-qwen")
+   - **License**: Choose an appropriate license (e.g., Apache 2.0)
+   - **SDK**: Select "Docker"
+   - **Hardware**: Select "A10G" for best performance with this model
+   - **Visibility**: Choose public or private based on your needs
+
+4. Click "Create Space"
+
+## Step 3: Link Your GitHub Repository
+
+1. On your new Space page, go to the "Settings" tab
+2. Under "Repository", click "Link external repository"
+3. Enter your GitHub repository URL
+4. Click "Link repository"
+5. Choose the branch to deploy (e.g., "main" or "master")
+6. Click "Save"
+
+## Step 4: Configure the Space
+
+1. Stay on the "Settings" tab
+2. Under "Variables", you can add environment variables if needed (most are already in the Dockerfile)
+3. Under "Advanced settings", enable "Secret HTTPS Frontend"
+
+## Step 5: Deploy and Monitor
+
+1. Go to the "Factory" tab to see the build progress
+2. Wait for the build and deployment to complete (this may take 10-20 minutes)
+3. Check the logs for any errors
+4. Once deployment is successful, go to the "App" tab to use your application
+
+## Step 6: Test Your Deployed Application
+
+1. On the "App" tab, upload an image
+2. Click "Analyze Image"
+3. Verify that the model generates descriptions correctly
+
+## Troubleshooting Common Issues
+
+- **Build Failures**: Check the "Factory" logs for detailed error messages
+- **Memory Issues**: If the model crashes, you may need to adjust the quantization settings or use a smaller model
+- **Slow Performance**: The first run will be slow as the model is downloaded; subsequent runs will be faster
+- **UI Issues**: If the UI loads but the model doesn't work, check the browser console and server logs
+
+## Updating Your Space
+
+To update your Space after making changes to your code:
+
+1. Make changes to your code
+2. Commit and push to GitHub
+3. The Space will automatically rebuild and deploy the updated code
+
+## Sharing Your Space
+
+- You can share your Space by giving others the URL
+- You can embed your Space in other websites using the iframe code provided in the "Embed" tab
+- You can collaborate with others by adding them as contributors in the "Settings" tab
+
+## Additional Resources
+
+- [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)
+- [Docker SDK Documentation](https://huggingface.co/docs/hub/spaces-sdks-docker)
+- [Gradio Documentation](https://www.gradio.app/docs/)
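
The steps above use the Spaces web UI. This commit also adds upload scripts (`upload_to_hf.py`, `upload_internvl2_to_hf.py`) whose contents are not shown in this diff; as a rough, generic sketch of doing the same thing programmatically with `huggingface_hub` (the repo id and folder path below are placeholders, not taken from those scripts):

```python
# Hypothetical programmatic alternative to the web-UI steps above.
# Assumes you are logged in via `huggingface-cli login` or HF_TOKEN.
from huggingface_hub import HfApi

api = HfApi()

# Create a Docker-SDK Space (no-op if it already exists)
api.create_repo(
    repo_id="your-username/image-description-qwen",  # placeholder
    repo_type="space",
    space_sdk="docker",
    exist_ok=True,
)

# Push the application files to the Space
api.upload_folder(
    folder_path=".",  # placeholder: the project root
    repo_id="your-username/image-description-qwen",
    repo_type="space",
)
```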
LOCAL-TESTING.md
ADDED
@@ -0,0 +1,67 @@
+# Local Testing Guide
+
+Before deploying to Hugging Face Spaces, you may want to test the application locally. This guide provides instructions for local testing.
+
+## Prerequisites
+
+- CUDA-capable GPU with at least 8GB VRAM
+- Python 3.8+
+- pip or conda package manager
+
+## Steps for Local Testing
+
+1. **Install Dependencies**
+
+   ```bash
+   pip install -r image_descriptor_requirements.txt
+   ```
+
+2. **Run in UI Mode**
+
+   ```bash
+   python app.py
+   ```
+
+   This will start the Gradio UI on http://localhost:7860. You can upload images and test the model.
+
+3. **Run in API-only Mode**
+
+   ```bash
+   FLASK_APP=image_descriptor.py flask run --host=0.0.0.0 --port=5000
+   ```
+
+   This will start just the Flask API on http://localhost:5000.
+
+4. **Test the Docker Container**
+
+   ```bash
+   # Build the container
+   docker build -t image-descriptor .
+
+   # Run the container
+   docker run -p 7860:7860 --gpus all image-descriptor
+   ```
+
+   The application will be available at http://localhost:7860.
+
+## Testing the API
+
+You can test the API using curl:
+
+```bash
+# Health check
+curl http://localhost:5000/health
+
+# Process an image
+curl -X POST -F "image=@data_temp/page_2.png" http://localhost:5000/describe
+```
+
+## Troubleshooting
+
+- **GPU Memory Issues**: If you encounter GPU memory errors, try reducing batch sizes or using a smaller model.
+- **Model Download Issues**: If the model download fails, try downloading it manually from Hugging Face and placing it in the `.cache/huggingface/transformers` directory.
+- **Dependencies**: Make sure you have the correct CUDA version installed for your GPU.
+
+## Next Steps
+
+Once you've confirmed the application works locally, you can deploy it to Hugging Face Spaces following the instructions in the main README.md.
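
The curl commands above can also be scripted. Below is a minimal Python client sketch, assuming the API is running on localhost:5000 and that `/describe` accepts a multipart form field named `image` as in the curl example; the response format is defined in `image_descriptor.py`, which is not shown in this diff, so the body is simply printed verbatim.

```python
# Minimal sketch of exercising the local Flask API, mirroring the curl commands above.
import requests

API_URL = "http://localhost:5000"  # assumed local address from the guide

# Health check
health = requests.get(f"{API_URL}/health")
print(health.status_code, health.text)

# Send an image for description (multipart field name "image", per the curl example)
with open("data_temp/page_2.png", "rb") as f:
    resp = requests.post(f"{API_URL}/describe", files={"image": f})

resp.raise_for_status()
print(resp.text)  # exact response schema depends on image_descriptor.py (not shown here)
```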
README-HF.md
ADDED
@@ -0,0 +1,33 @@
+# Image Description with Qwen2-VL-7B
+
+This Hugging Face Space uses the powerful Qwen2-VL-7B vision language model to generate detailed descriptions of images.
+
+## About
+
+Upload any image and get:
+- A basic description
+- A detailed analysis
+- A technical assessment
+
+The app uses the Qwen2-VL-7B model with 4-bit quantization to provide efficient, high-quality image analysis.
+
+## Usage
+
+1. Upload an image or use one of the example images
+2. Click "Analyze Image"
+3. View the three types of descriptions generated by the model
+
+## Examples
+
+The Space includes sample images in the `data_temp` folder that you can use to test the model.
+
+## Technical Details
+
+- **Model**: Qwen2-VL-7B
+- **Framework**: Gradio UI + Flask API backend
+- **Quantization**: 4-bit for efficient inference
+- **GPU**: A10G recommended
+
+## Credits
+
+- [Qwen2-VL-7B model](https://huggingface.co/Qwen/Qwen2-VL-7B) by the Qwen team
README.md
CHANGED
@@ -1,12 +1,135 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk:
-sdk_version:
+title: Image Description with Qwen-VL
+emoji: 🖼️
+colorFrom: indigo
+colorTo: purple
+sdk: docker
+sdk_version: 3.0.0
 app_file: app.py
 pinned: false
 ---
 
-
+# Image Description Application with Qwen-VL
+
+This application uses the advanced Qwen-VL-Chat vision language model to generate detailed descriptions for images. It is specifically set up to describe the image in the `data_temp` folder, but it can also analyze any uploaded image.
+
+## Features
+
+- Loads an image from the `data_temp` folder or via upload
+- Generates multiple types of descriptions using state-of-the-art AI:
+  - Basic description (brief overview)
+  - Detailed analysis (comprehensive description)
+  - Technical analysis (assessment of technical aspects)
+- Displays the image (optional)
+- Uses 8-bit quantization for efficient model loading
+- Provides a user-friendly Gradio UI
+
+## Requirements
+
+- Python 3.8 or higher
+- PyTorch
+- Transformers (version 4.35.2+)
+- Pillow
+- Matplotlib
+- Accelerate
+- Bitsandbytes
+- Safetensors
+- Gradio for the web interface
+
+## Hardware Requirements
+
+This application uses a vision-language model which requires:
+- A CUDA-capable GPU with at least 8GB VRAM
+- 8GB+ system RAM
+
+## Deployment Options
+
+### 1. Hugging Face Spaces (Recommended)
+
+This repository is ready to be deployed on Hugging Face Spaces.
+
+**Steps:**
+1. Create a new Space on [Hugging Face Spaces](https://huggingface.co/spaces)
+2. Select "Docker" as the Space SDK
+3. Link this GitHub repository
+4. Select a GPU (T4 or better is recommended)
+5. Create the Space
+
+The application will automatically deploy with the Gradio UI frontend.
+
+### 2. AWS SageMaker
+
+For production deployment on AWS SageMaker:
+
+1. Package the application using the provided Dockerfile
+2. Upload the Docker image to Amazon ECR
+3. Create a SageMaker Model using the ECR image
+4. Deploy an endpoint with an instance type like ml.g4dn.xlarge
+5. Set up API Gateway for HTTP access (optional)
+
+Detailed AWS instructions can be found in the `docs/aws_deployment.md` file.
+
+### 3. Azure Machine Learning
+
+For Azure deployment:
+
+1. Create an Azure ML workspace
+2. Register the model on Azure ML
+3. Create an inference configuration
+4. Deploy to AKS or ACI with a GPU-enabled instance
+
+Detailed Azure instructions can be found in the `docs/azure_deployment.md` file.
+
+## How It Works
+
+The application uses the Qwen-VL-Chat model, a state-of-the-art multimodal AI model that can understand and describe images with impressive detail.
+
+The script:
+1. Processes the image with three different prompts:
+   - "Describe this image briefly in a single paragraph."
+   - "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
+   - "What can you tell me about the technical aspects of this image?"
+2. Uses 8-bit quantization to reduce memory requirements
+3. Formats and displays the results
+
+## Repository Structure
+
+- `app.py` - Gradio UI for the web interface
+- `Dockerfile` - For containerized deployment
+- `requirements.txt` - Python dependencies
+- `data_temp/` - Sample images for testing
+
+## Local Development
+
+1. Install the required packages:
+   ```
+   pip install -r requirements.txt
+   ```
+
+2. Run the Gradio UI:
+   ```
+   python app.py
+   ```
+
+3. Visit `http://localhost:7860` in your browser
+
+## Example Output
+
+```
+Processing image: data_temp/page_2.png
+Loading model...
+Generating descriptions...
+
+==== Image Description Results (Qwen-VL) ====
+
+Basic Description:
+The image shows a webpage or document with text content organized in multiple columns.
+
+Detailed Description:
+The image displays a structured document or webpage with multiple sections of text organized in a grid layout. The content appears to be technical or educational in nature, with what looks like headings and paragraphs of text. The color scheme is primarily black text on a white background, creating a clean, professional appearance. There appear to be multiple columns of information, possibly representing different topics or categories. The layout suggests this might be documentation, a reference guide, or an educational resource related to technical content.
+
+Technical Analysis:
+This appears to be a screenshot of a digital document or webpage. The image quality is good with clear text rendering, suggesting it was captured at an appropriate resolution. The image uses a standard document layout with what appears to be a grid or multi-column structure. The screenshot has been taken of what seems to be a text-heavy interface with minimal graphics, consistent with technical documentation or reference materials.
+```
+
+Note: Actual descriptions will vary based on the specific image content and may be more detailed than this example.
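
The "How It Works" section above mentions 8-bit quantization, but the loading code itself lives in the application files rather than the README. As a rough sketch of that pattern with `transformers` and `bitsandbytes` (the model id and arguments here are illustrative assumptions, not copied from this commit):

```python
# Sketch only: 8-bit quantized loading of a chat-style vision-language model.
# The model id and exact arguments are assumptions for illustration.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen-VL-Chat"  # assumed Hub id for the Qwen-VL-Chat model

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # 8-bit weights via bitsandbytes
    device_map="auto",           # place layers on the available GPU(s)
    trust_remote_code=True,      # Qwen-VL ships custom modeling code
)
model.eval()
```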
README_INTERNVL2.md
ADDED
@@ -0,0 +1,103 @@
+# Image Analysis with InternVL2
+
+This project uses the InternVL2-40B-AWQ model for high-quality image analysis, description, and understanding. It provides a Gradio web interface where users can upload images and get a detailed analysis.
+
+## Features
+
+- **High-Quality Image Analysis**: Uses InternVL2-40B (4-bit quantized) for state-of-the-art image understanding
+- **Multiple Analysis Types**: General description, text extraction, chart analysis, people description, and technical analysis
+- **Simple UI**: User-friendly Gradio interface for easy image uploading and analysis
+- **Efficient Resource Usage**: 4-bit quantized model (AWQ) for a reduced memory footprint and faster inference
+
+## Requirements
+
+The application requires:
+
+- Python 3.9+
+- CUDA-compatible GPU (24GB+ VRAM recommended)
+- Transformers 4.37.2+
+- lmdeploy 0.5.3+
+- Gradio 3.38.0
+- Other dependencies in `requirements.txt`
+
+## Setup
+
+### Docker Setup (Recommended)
+
+1. **Build the Docker image**:
+   ```
+   docker build -t internvl2-image-analysis .
+   ```
+
+2. **Run the Docker container**:
+   ```
+   docker run --gpus all -p 7860:7860 internvl2-image-analysis
+   ```
+
+### Local Setup
+
+1. **Create a virtual environment**:
+   ```
+   python -m venv venv
+   source venv/bin/activate  # On Windows: venv\Scripts\activate
+   ```
+
+2. **Install dependencies**:
+   ```
+   pip install -r requirements.txt
+   ```
+
+3. **Run the application**:
+   ```
+   python app_internvl2.py
+   ```
+
+## Usage
+
+1. Open your browser and navigate to `http://localhost:7860`
+2. Upload an image using the upload box
+3. Choose an analysis type from the options
+4. Click "Analyze Image" and wait for the results
+
+### Analysis Types
+
+- **General**: Provides a comprehensive description of the image content
+- **Text**: Focuses on identifying and extracting text from the image
+- **Chart**: Analyzes charts, graphs, and diagrams in detail
+- **People**: Describes people in the image - appearance, actions, and expressions
+- **Technical**: Provides technical analysis of objects and their relationships
+
+## Testing
+
+To test the model directly from the command line:
+
+```
+python test_internvl2.py --image path/to/your/image.jpg --prompt "Describe this image in detail."
+```
+
+## Deployment to Hugging Face
+
+To deploy to Hugging Face Spaces:
+
+```
+python upload_internvl2_to_hf.py
+```
+
+## Model Details
+
+This application uses InternVL2-40B-AWQ, a 4-bit quantized version of InternVL2-40B. The original model consists of:
+
+- **Vision Component**: InternViT-6B-448px-V1-5
+- **Language Component**: Nous-Hermes-2-Yi-34B
+- **Total Parameters**: ~40B (6B vision + 34B language)
+
+## License
+
+This project is released under the same license as the InternVL2 model, which is the MIT license.
+
+## Acknowledgements
+
+- [OpenGVLab](https://github.com/OpenGVLab) for creating the InternVL2 models
+- [Hugging Face](https://huggingface.co/) for model hosting
+- [lmdeploy](https://github.com/InternLM/lmdeploy) for model optimization
+- [Gradio](https://gradio.app/) for the web interface
app.py
ADDED
@@ -0,0 +1,389 @@
+import gradio as gr
+from PIL import Image
+import torch
+import os
+import time
+import numpy as np
+
+# Set CUDA memory configuration to avoid fragmentation
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Import the models after setting memory configuration
+from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
+
+# Model configuration
+CLIP_MODEL_ID = "openai/clip-vit-base-patch32"  # Fast classification
+DETAILED_MODEL_ID = "Salesforce/blip-image-captioning-large"  # Use original BLIP instead of BLIP-2
+USE_GPU = torch.cuda.is_available()
+
+# Global variables
+clip_model = None
+clip_processor = None
+detailed_model = None
+detailed_processor = None
+
+def load_clip_model():
+    """Load the CLIP model for fast classification"""
+    global clip_model, clip_processor
+
+    # Return if already loaded
+    if clip_model is not None and clip_processor is not None:
+        return True
+
+    print("Loading CLIP model...")
+    try:
+        # First clear any GPU memory
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Load processor
+        clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
+
+        # Load model efficiently
+        if USE_GPU:
+            clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID).to("cuda")
+        else:
+            clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
+
+        # Set to evaluation mode
+        clip_model.eval()
+        print("CLIP model loaded successfully!")
+        return True
+    except Exception as e:
+        print(f"Error loading CLIP model: {str(e)}")
+        return False
+
+def load_detailed_model():
+    """Load the BLIP model for detailed image analysis"""
+    global detailed_model, detailed_processor
+
+    # If already loaded, return
+    if detailed_model is not None and detailed_processor is not None:
+        return True
+
+    print("Loading BLIP model...")
+    try:
+        # Clear memory first
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Load processor and model for original BLIP
+        detailed_processor = BlipProcessor.from_pretrained(DETAILED_MODEL_ID)
+
+        # For older models like BLIP, don't use device_map='auto' or load_in_8bit.
+        # Instead, load the model and then move it to the device.
+        detailed_model = BlipForConditionalGeneration.from_pretrained(
+            DETAILED_MODEL_ID,
+            torch_dtype=torch.float16 if USE_GPU else torch.float32
+        )
+
+        # Manually move model to GPU if available
+        if USE_GPU:
+            detailed_model = detailed_model.to("cuda")
+
+        # Set to evaluation mode
+        detailed_model.eval()
+        print("BLIP model loaded successfully!")
+        return True
+    except Exception as e:
+        print(f"Error loading BLIP model: {str(e)}")
+        if "CUDA out of memory" in str(e):
+            print("Not enough GPU memory for the detailed model")
+        return False
+
+# Categories for image classification
+CATEGORIES = [
+    "a photograph", "a painting", "a drawing", "a digital art",
+    "landscape", "portrait", "cityscape", "animals", "food", "vehicle",
+    "building", "nature", "people", "abstract art", "technology",
+    "interior", "exterior", "night scene", "beach", "mountains",
+    "forest", "water", "flowers", "sports",
+    "a person", "multiple people", "a child", "an elderly person",
+    "a dog", "a cat", "wildlife", "a bird", "a car", "a building",
+    "a presentation slide", "a graph", "a chart", "a diagram", "text document",
+    "a screenshot", "a map", "a table of data", "a scientific figure"
+]
+
+def get_detailed_analysis(image):
+    """Get detailed analysis from the image using BLIP model"""
+    try:
+        start_time = time.time()
+
+        # Make sure the model is loaded
+        if not load_detailed_model():
+            return "Couldn't load detailed analysis model."
+
+        # Convert numpy array to PIL Image
+        if isinstance(image, np.ndarray):
+            image_pil = Image.fromarray(image).convert('RGB')
+        else:
+            # If somehow it's already a PIL Image
+            image_pil = image.convert('RGB')
+
+        # Resize the image to improve performance
+        max_size = 600  # Limit to 600px on the longest side
+        width, height = image_pil.size
+        if max(width, height) > max_size:
+            if width > height:
+                new_width = max_size
+                new_height = int(height * (max_size / width))
+            else:
+                new_height = max_size
+                new_width = int(width * (max_size / height))
+            image_pil = image_pil.resize((new_width, new_height), Image.LANCZOS)
+
+        device = "cuda" if USE_GPU else "cpu"
+
+        # Using an unconditional approach first - this usually works better
+        inputs = detailed_processor(image_pil, return_tensors="pt")
+        if USE_GPU:
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            # Get a basic unconditional caption
+            output_ids = detailed_model.generate(
+                **inputs,
+                max_length=50,
+                num_beams=5,
+                do_sample=False,
+                early_stopping=True
+            )
+        base_description = detailed_processor.decode(output_ids[0], skip_special_tokens=True)
+
+        # ULTRA-SIMPLE single-word prompts to avoid any echoing
+        analyses = {
+            "text": None,     # Text content
+            "chart": None,    # Chart analysis
+            "subject": None   # Main subject
+        }
+
+        # Use the base description for context with ultra-simple prompts
+        ultra_simple_prompts = {
+            f"Text in {base_description[:20]}...": "text",
+            f"Charts in {base_description[:20]}...": "chart",
+            f"Subject of {base_description[:20]}...": "subject"
+        }
+
+        for prompt, analysis_type in ultra_simple_prompts.items():
+            # Process with prompt
+            inputs = detailed_processor(image_pil, text=prompt, return_tensors="pt")
+
+            if USE_GPU:
+                inputs = {k: v.to(device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                output_ids = detailed_model.generate(
+                    **inputs,
+                    max_length=75,
+                    num_beams=3,
+                    do_sample=True,
+                    temperature=0.7,
+                    repetition_penalty=1.2,
+                    early_stopping=True
+                )
+
+            result = detailed_processor.decode(output_ids[0], skip_special_tokens=True)
+
+            # SUPER AGGRESSIVE cleaning
+            # First, remove anything that looks like a prefix before a colon
+            colon_parts = result.split(":")
+            if len(colon_parts) > 1:
+                # Take everything after the first colon
+                result = ":".join(colon_parts[1:]).strip()
+
+            # Remove the base description if it appears
+            if base_description in result:
+                result = result.replace(base_description, "").strip()
+
+            # Remove any part of the prompt
+            for p in ultra_simple_prompts.keys():
+                if p in result:
+                    result = result.replace(p, "").strip()
+
+            # Remove the first 20 chars of base description if they appear
+            if base_description[:20] in result:
+                result = result.replace(base_description[:20], "").strip()
+
+            # Remove all common question patterns and filler text
+            remove_patterns = [
+                "text in", "charts in", "subject of",
+                "in detail", "describe", "this image", "the image",
+                "can you", "do you", "is there", "are there", "i can see",
+                "i see", "there is", "there are", "it looks like",
+                "appears to be", "seems to be", "might be", "could be",
+                "i think", "i believe", "probably", "possibly", "maybe",
+                "it is", "this is", "that is", "these are", "those are",
+                "image shows", "picture shows", "image contains", "picture contains",
+                "in the image", "in this image", "of this image", "from this image",
+                "based on", "according to", "looking at", "from what i can see",
+                "appears to show", "depicts", "represents", "illustrates", "demonstrates",
+                "presents", "displays", "portrays", "reveals", "indicates", "suggests",
+                "we can see", "you can see", "one can see"
+            ]
+
+            for pattern in remove_patterns:
+                if pattern.lower() in result.lower():
+                    # Find and remove each occurrence
+                    lower_result = result.lower()
+                    while pattern.lower() in lower_result:
+                        idx = lower_result.find(pattern.lower())
+                        if idx >= 0:
+                            result = result[:idx] + result[idx+len(pattern):]
+                            lower_result = result.lower()
+
+            # Clean up any punctuation/formatting issues
+            result = result.strip()
+            while result and result[0] in ",.;:?!-":
+                result = result[1:].strip()
+
+            # Remove "..." if it appears
+            result = result.replace("...", "").strip()
+
+            # Fix capitalization
+            if result and len(result) > 0:
+                result = result[0].upper() + result[1:] if len(result) > 1 else result[0].upper()
+
+            analyses[analysis_type] = result
+
+        # Compose the final output
+        output_text = f"## Detailed Description\n{base_description}\n\n"
+
+        # Only show relevant sections
+        if analyses['text'] and len(analyses['text']) > 5 and not any(x in analyses['text'].lower() for x in ["no text", "not any text", "can't see", "cannot see", "don't see", "couldn't find"]):
+            output_text += f"## Text Content\n{analyses['text']}\n\n"
+
+        if analyses['chart'] and len(analyses['chart']) > 5 and not any(x in analyses['chart'].lower() for x in ["no chart", "not any chart", "no graph", "not any graph", "can't see", "cannot see", "don't see", "couldn't find"]):
+            output_text += f"## Chart Analysis\n{analyses['chart']}\n\n"
+
+        output_text += f"## Main Subject\n{analyses['subject'] or 'Unable to determine main subject.'}"
+
+        # Clear GPU memory
+        if USE_GPU:
+            torch.cuda.empty_cache()
+
+        elapsed_time = time.time() - start_time
+        return output_text
+
+    except Exception as e:
+        print(f"Error in detailed analysis: {str(e)}")
+        # Try to clean up memory in case of error
+        if USE_GPU:
+            torch.cuda.empty_cache()
+        return f"Error in detailed analysis: {str(e)}"
+
+def get_clip_classification(image):
+    """Get fast classification using CLIP"""
+    if not load_clip_model():
+        return []
+
+    try:
+        # Process with CLIP
+        inputs = clip_processor(
+            text=CATEGORIES,
+            images=image,
+            return_tensors="pt",
+            padding=True
+        )
+
+        # Move to GPU if available
+        if USE_GPU:
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        # Get predictions
+        with torch.inference_mode():
+            outputs = clip_model(**inputs)
+
+        # Process results
+        logits_per_image = outputs.logits_per_image
+        probs = logits_per_image.softmax(dim=1)
+
+        # Get top predictions
+        values, indices = probs[0].topk(8)
+
+        # Format results
+        return [(CATEGORIES[idx], value.item() * 100) for value, idx in zip(values, indices)]
+    except Exception as e:
+        print(f"Error in CLIP classification: {str(e)}")
+        return []
+
+def process_image(image, get_detailed=False):
+    """Process image with both fast and detailed analysis"""
+    if image is None:
+        return "Please upload an image to analyze."
+
+    try:
+        # Start timing
+        start_time = time.time()
+
+        # Preprocess image
+        if hasattr(image, 'mode') and image.mode != 'RGB':
+            image = image.convert('RGB')
+
+        # Resize for efficiency
+        if max(image.size) > 600:  # Smaller max size for better performance
+            ratio = 600 / max(image.size)
+            new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
+            image = image.resize(new_size, Image.LANCZOS)
+
+        # Get fast classification first
+        categories = get_clip_classification(image)
+
+        result = "## Image Classification\n"
+        result += "This image appears to contain:\n"
+        for category, confidence in categories:
+            result += f"- {category.title()} ({confidence:.1f}%)\n"
+
+        # Add detailed analysis if requested
+        if get_detailed:
+            result += "\n## Detailed Analysis\n"
+            detailed_result = get_detailed_analysis(image)
+            result += detailed_result
+
+        # Add timing information
+        elapsed_time = time.time() - start_time
+        result += f"\n\nAnalysis completed in {elapsed_time:.2f} seconds."
+
+        # Clean up memory
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        return result
+
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return f"Error processing image: {str(e)}"
+
+# Create interface with more options
+with gr.Blocks(title="Enhanced Image Analyzer") as demo:
+    gr.Markdown("# Enhanced Image Analyzer")
+    gr.Markdown("Upload an image and choose between fast classification or detailed analysis.")
+
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(type="pil", label="Upload an image")
+            detailed_checkbox = gr.Checkbox(label="Get detailed analysis (slower but better quality)", value=False)
+            analyze_btn = gr.Button("Analyze Image", variant="primary")
+
+        with gr.Column():
+            output = gr.Markdown(label="Analysis Results")
+
+    analyze_btn.click(
+        fn=process_image,
+        inputs=[input_image, detailed_checkbox],
+        outputs=output
+    )
+
+    # Optional examples
+    if os.path.exists("data_temp"):
+        examples = [os.path.join("data_temp", f) for f in os.listdir("data_temp")
+                    if f.endswith(('.png', '.jpg', '.jpeg'))]
+        if examples:
+            gr.Examples(examples=examples, inputs=input_image)
+
+if __name__ == "__main__":
+    # Start with clean memory
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
app_internvl2.py
ADDED
@@ -0,0 +1,165 @@
+import gradio as gr
+from PIL import Image
+import os
+import time
+import numpy as np
+import torch
+import math
+
+# Import lmdeploy for InternVL2 model
+from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy.vl import load_image
+
+# Set environment variables
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Model configuration
+MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ"  # 4-bit quantized model
+USE_GPU = torch.cuda.is_available()
+
+# Global variables for model
+internvl2_pipeline = None
+
+def load_internvl2_model():
+    """Load the InternVL2 model using lmdeploy"""
+    global internvl2_pipeline
+
+    # If already loaded, return
+    if internvl2_pipeline is not None:
+        return True
+
+    print("Loading InternVL2 model...")
+    try:
+        # Configure for AWQ quantized model
+        backend_config = TurbomindEngineConfig(model_format='awq')
+
+        # Create pipeline
+        internvl2_pipeline = pipeline(
+            MODEL_ID,
+            backend_config=backend_config,
+            log_level='INFO'
+        )
+
+        print("InternVL2 model loaded successfully!")
+        return True
+    except Exception as e:
+        print(f"Error loading InternVL2 model: {str(e)}")
+        if "CUDA out of memory" in str(e):
+            print("Not enough GPU memory for the model")
+        return False
+
+def analyze_image(image, prompt):
+    """Analyze the image using InternVL2 model"""
+    try:
+        start_time = time.time()
+
+        # Make sure the model is loaded
+        if not load_internvl2_model():
+            return "Couldn't load InternVL2 model."
+
+        # Convert numpy array to PIL Image
+        if isinstance(image, np.ndarray):
+            image_pil = Image.fromarray(image).convert('RGB')
+        else:
+            # If somehow it's already a PIL Image
+            image_pil = image.convert('RGB')
+
+        # Run inference with the model
+        response = internvl2_pipeline((prompt, image_pil))
+
+        # Get the response text
+        result = response.text
+
+        elapsed_time = time.time() - start_time
+        return result
+
+    except Exception as e:
+        print(f"Error in image analysis: {str(e)}")
+        # Try to clean up memory in case of error
+        if USE_GPU:
+            torch.cuda.empty_cache()
+        return f"Error in image analysis: {str(e)}"
+
+def process_image(image, analysis_type="general"):
+    """Process the image and return the analysis"""
+    if image is None:
+        return "Please upload an image."
+
+    # Define prompt based on analysis type
+    if analysis_type == "general":
+        prompt = "Describe this image in detail."
+    elif analysis_type == "text":
+        prompt = "What text can you see in this image? Please transcribe it accurately."
+    elif analysis_type == "chart":
+        prompt = "Analyze any charts, graphs or diagrams in this image in detail, including trends, data points, and conclusions."
+    elif analysis_type == "people":
+        prompt = "Describe the people in this image - their appearance, actions, and expressions."
+    elif analysis_type == "technical":
+        prompt = "Provide a technical analysis of this image, including object identification, spatial relationships, and any technical elements present."
+    else:
+        prompt = "Describe this image in detail."
+
+    start_time = time.time()
+
+    # Get analysis from the model
+    analysis = analyze_image(image, prompt)
+
+    elapsed_time = time.time() - start_time
+    return f"{analysis}\n\nAnalysis completed in {elapsed_time:.2f} seconds."
+
+# Define the Gradio interface
+def create_interface():
+    with gr.Blocks(title="Image Analysis with InternVL2") as demo:
+        gr.Markdown("# Image Analysis with InternVL2-40B")
+        gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                input_image = gr.Image(type="pil", label="Upload Image")
+                analysis_type = gr.Radio(
+                    ["general", "text", "chart", "people", "technical"],
+                    label="Analysis Type",
+                    value="general"
+                )
+                submit_btn = gr.Button("Analyze Image")
+
+            with gr.Column(scale=2):
+                output_text = gr.Textbox(label="Analysis Result", lines=20)
+
+        submit_btn.click(
+            fn=process_image,
+            inputs=[input_image, analysis_type],
+            outputs=output_text
+        )
+
+        gr.Markdown("""
+        ## Analysis Types
+        - **General**: General description of the image
+        - **Text**: Focus on identifying and transcribing text in the image
+        - **Chart**: Detailed analysis of charts, graphs, and diagrams
+        - **People**: Description of people, their appearance and actions
+        - **Technical**: Technical analysis identifying objects and spatial relationships
+        """)
+
+        # Examples
+        gr.Examples(
+            examples=[
+                ["data_temp/page_2.png", "general"],
+                ["data_temp/page_2.png", "text"],
+                ["data_temp/page_2.png", "chart"]
+            ],
+            inputs=[input_image, analysis_type],
+            outputs=output_text,
+            fn=process_image,
+            cache_examples=True,
+        )
+
+    return demo
+
+# Main function
+if __name__ == "__main__":
+    # Create the Gradio interface
+    demo = create_interface()
+
+    # Launch the interface
+    demo.launch(share=False)
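
For reference, the same lmdeploy call can be driven without the Gradio UI (roughly what `test_internvl2.py`, not shown in this diff, presumably does). A minimal sketch using the imports already present in `app_internvl2.py`; the image path is an example placeholder:

```python
# Sketch: run one InternVL2-40B-AWQ query from a script instead of the Gradio app.
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# Build the pipeline once; AWQ weights need model_format='awq', as in app_internvl2.py
pipe = pipeline(
    "OpenGVLab/InternVL2-40B-AWQ",
    backend_config=TurbomindEngineConfig(model_format='awq'),
)

image = load_image("data_temp/page_2.png")  # helper imported in app_internvl2.py
response = pipe(("Describe this image in detail.", image))
print(response.text)
```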
csv_to_html.py
ADDED
@@ -0,0 +1,77 @@
+import pandas as pd
+from jinja2 import Template
+
+# Read the CSV file
+df = pd.read_csv('sample_data.csv')
+
+# Calculate column totals (excluding the first column which contains row labels)
+totals = df.iloc[:, 1:].sum()
+
+# Create HTML template
+html_template = """
+<!DOCTYPE html>
+<html>
+<head>
+    <title>CSV Data Report</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 20px;
+        }
+        table {
+            border-collapse: collapse;
+            width: 100%;
+            margin-bottom: 20px;
+        }
+        th, td {
+            border: 1px solid #ddd;
+            padding: 8px;
+            text-align: right;
+        }
+        th {
+            background-color: #f2f2f2;
+        }
+        .total-row {
+            font-weight: bold;
+            background-color: #e6e6e6;
+        }
+        h1 {
+            color: #333;
+        }
+    </style>
+</head>
+<body>
+    <h1>CSV Data Report</h1>
+
+    <h2>Raw Data</h2>
+    {{ raw_data | safe }}
+
+    <h2>Column Totals</h2>
+    <table>
+        <tr>
+            {% for column in totals.index %}
+            <th>{{ column }}</th>
+            {% endfor %}
+        </tr>
+        <tr class="total-row">
+            {% for value in totals.values %}
+            <td>{{ "{:,.2f}".format(value) }}</td>
+            {% endfor %}
+        </tr>
+    </table>
+</body>
+</html>
+"""
+
+# Convert the raw data to HTML
+raw_data_html = df.to_html(index=False, classes='table')
+
+# Create the template and render it
+template = Template(html_template)
+html_output = template.render(raw_data=raw_data_html, totals=totals)
+
+# Save the HTML output
+with open('report.html', 'w') as f:
+    f.write(html_output)
+
+print("Report has been generated as 'report.html'")
data_temp/page_2.png
ADDED
Binary image file (stored with Git LFS).
docs/aws_deployment.md
ADDED
@@ -0,0 +1,170 @@
+# AWS SageMaker Deployment Guide
+
+This guide provides step-by-step instructions for deploying the Image Description application to AWS SageMaker.
+
+## Prerequisites
+
+- AWS account with SageMaker permissions
+- AWS CLI installed and configured
+- Docker installed on your local machine
+- The source code from this repository
+
+## Step 1: Create an Amazon ECR Repository
+
+```bash
+aws ecr create-repository --repository-name image-descriptor
+```
+
+Note the repository URI returned by this command. You'll use it in the next step.
+
+## Step 2: Build and Push the Docker Image
+
+1. Log in to ECR:
+
+```bash
+aws ecr get-login-password --region your-region | docker login --username AWS --password-stdin your-account-id.dkr.ecr.your-region.amazonaws.com
+```
+
+2. Build the Docker image:
+
+```bash
+docker build -t image-descriptor .
+```
+
+3. Tag and push the image:
+
+```bash
+docker tag image-descriptor:latest your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest
+docker push your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest
+```
+
+## Step 3: Create a SageMaker Model
+
+1. Create a model.json file:
+
+```json
+{
+  "ModelName": "QwenVLImageDescriptor",
+  "PrimaryContainer": {
+    "Image": "your-account-id.dkr.ecr.your-region.amazonaws.com/image-descriptor:latest",
+    "Environment": {
+      "PORT": "8080"
+    }
+  },
+  "ExecutionRoleArn": "arn:aws:iam::your-account-id:role/service-role/AmazonSageMaker-ExecutionRole"
+}
+```
+
+2. Create the SageMaker model:
+
+```bash
+aws sagemaker create-model --cli-input-json file://model.json
+```
+
+## Step 4: Create an Endpoint Configuration
+
+1. Create a config.json file:
+
+```json
+{
+  "EndpointConfigName": "QwenVLImageDescriptorConfig",
+  "ProductionVariants": [
+    {
+      "VariantName": "AllTraffic",
+      "ModelName": "QwenVLImageDescriptor",
+      "InstanceType": "ml.g5.2xlarge",
+      "InitialInstanceCount": 1
+    }
+  ]
+}
+```
+
+2. Create the endpoint configuration:
+
+```bash
+aws sagemaker create-endpoint-config --cli-input-json file://config.json
+```
+
+## Step 5: Create the Endpoint
+
+```bash
+aws sagemaker create-endpoint --endpoint-name qwen-vl-image-descriptor --endpoint-config-name QwenVLImageDescriptorConfig
+```
+
+This will take several minutes to deploy.
+
+## Step 6: Invoke the Endpoint
+
+You can invoke the endpoint using the AWS SDK or AWS CLI.
+
+Using the Python SDK:
+
+```python
+import boto3
+import json
+import base64
+from PIL import Image
+import io
+
+# Initialize the SageMaker runtime client
+runtime = boto3.client('sagemaker-runtime')
+
+# Load and encode the image
+with open('data_temp/page_2.png', 'rb') as f:
+    image_data = f.read()
+image_b64 = base64.b64encode(image_data).decode('utf-8')
+
+# Create the request payload
+payload = {
+    'image_data': image_b64
+}
+
+# Invoke the endpoint
+response = runtime.invoke_endpoint(
+    EndpointName='qwen-vl-image-descriptor',
+    ContentType='application/json',
+    Body=json.dumps(payload)
+)
+
+# Parse the response
+result = json.loads(response['Body'].read().decode())
+print(json.dumps(result, indent=2))
+```
+
+## Step 7: Set Up API Gateway (Optional)
+
+For public HTTP access, set up an API Gateway:
+
+1. Create a new REST API in API Gateway
+2. Create a new resource and POST method
+3. Configure the integration to use the SageMaker endpoint
+4. Deploy the API to a stage
+5. Note the API Gateway URL for client use
+
+## Cost Optimization
+
+To optimize costs:
+
+1. Use SageMaker Serverless Inference instead of a dedicated endpoint
+2. Implement auto-scaling for your endpoint
+3. Use Spot Instances for non-critical workloads
+4. Schedule endpoints to be active only during business hours
+
+## Monitoring
+
+Set up CloudWatch Alarms to monitor:
+
+1. Endpoint invocation metrics
+2. Error rates
+3. Latency
+4. Instance utilization
+
+## Cleanup
+
+To avoid ongoing charges, delete resources when not in use:
+
+```bash
+aws sagemaker delete-endpoint --endpoint-name qwen-vl-image-descriptor
+aws sagemaker delete-endpoint-config --endpoint-config-name QwenVLImageDescriptorConfig
+aws sagemaker delete-model --model-name QwenVLImageDescriptor
+```
docs/azure_deployment.md
ADDED
@@ -0,0 +1,365 @@
1 |
+
# Azure Machine Learning Deployment Guide
|
2 |
+
|
3 |
+
This guide provides step-by-step instructions for deploying the Image Description application to Azure Machine Learning.
|
4 |
+
|
5 |
+
## Prerequisites
|
6 |
+
|
7 |
+
- Azure subscription
|
8 |
+
- Azure CLI installed and configured
|
9 |
+
- Azure Machine Learning workspace
|
10 |
+
- The source code from this repository
|
11 |
+
|
12 |
+
## Step 1: Set Up Azure Machine Learning
|
13 |
+
|
14 |
+
1. Create a Resource Group (if you don't have one):
|
15 |
+
|
16 |
+
```bash
|
17 |
+
az group create --name image-descriptor-rg --location eastus
|
18 |
+
```
|
19 |
+
|
20 |
+
2. Create an Azure Machine Learning workspace:
|
21 |
+
|
22 |
+
```bash
|
23 |
+
az ml workspace create --workspace-name image-descriptor-ws \
|
24 |
+
--resource-group image-descriptor-rg \
|
25 |
+
--location eastus
|
26 |
+
```
|
27 |
+
|
28 |
+
## Step 2: Create a Compute Cluster
|
29 |
+
|
30 |
+
Create a GPU-enabled compute cluster for experimentation and batch jobs (the managed online endpoint in Step 6 provisions its own `Standard_NC6s_v3` instances through `instance_type`, so it does not run on this cluster):
|
31 |
+
|
32 |
+
```bash
|
33 |
+
az ml compute create --name gpu-cluster \
|
34 |
+
--workspace-name image-descriptor-ws \
|
35 |
+
--resource-group image-descriptor-rg \
|
36 |
+
--type AmlCompute \
|
37 |
+
--min-instances 0 \
|
38 |
+
--max-instances 1 \
|
39 |
+
--size Standard_NC6s_v3
|
40 |
+
```
|
41 |
+
|
42 |
+
## Step 3: Prepare Environment Configuration
|
43 |
+
|
44 |
+
Create an environment.yml file to define dependencies:
|
45 |
+
|
46 |
+
```yaml
|
47 |
+
name: image_descriptor_env
|
48 |
+
channels:
|
49 |
+
- pytorch
|
50 |
+
- conda-forge
|
51 |
+
- defaults
|
52 |
+
dependencies:
|
53 |
+
- python=3.9
|
54 |
+
- pip=23.0
|
55 |
+
- pytorch=2.0.0
|
56 |
+
- torchvision=0.15.0
|
57 |
+
- pip:
|
58 |
+
- transformers>=4.36.0
|
59 |
+
- accelerate>=0.25.0
|
60 |
+
- bitsandbytes>=0.41.0
|
61 |
+
- safetensors>=0.4.0
|
62 |
+
- flask>=2.3.2
|
63 |
+
- flask-cors>=4.0.0
|
64 |
+
- gunicorn>=21.2.0
|
65 |
+
- pillow>=10.0.0
|
66 |
+
- matplotlib>=3.7.0
|
67 |
+
- python-dotenv>=1.0.0
|
68 |
+
- azureml-core>=1.48.0
|
69 |
+
- azureml-defaults>=1.48.0
|
70 |
+
- inference-schema>=1.4.1
|
71 |
+
```
|
72 |
+
|
73 |
+
## Step 4: Create a Model Entry Script
|
74 |
+
|
75 |
+
Create a file called `score.py` to handle Azure ML model inference:
|
76 |
+
|
77 |
+
```python
|
78 |
+
import json
|
79 |
+
import os
|
80 |
+
import io
|
81 |
+
import base64
|
82 |
+
import logging
|
83 |
+
import torch
|
84 |
+
from PIL import Image
|
85 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
|
86 |
+
|
87 |
+
# Configure logging
|
88 |
+
logging.basicConfig(level=logging.INFO)
|
89 |
+
logger = logging.getLogger(__name__)
|
90 |
+
|
91 |
+
# Global variables
|
92 |
+
model = None
|
93 |
+
processor = None
|
94 |
+
tokenizer = None
|
95 |
+
|
96 |
+
def init():
|
97 |
+
"""Initialize the model when the service starts"""
|
98 |
+
global model, processor, tokenizer
|
99 |
+
|
100 |
+
logger.info("Loading model...")
|
101 |
+
model_id = "Qwen/Qwen2-VL-7B"
|
102 |
+
|
103 |
+
# Load model components with quantization for efficiency
|
104 |
+
processor = AutoProcessor.from_pretrained(model_id)
|
105 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
106 |
+
|
107 |
+
# Load model with 4-bit quantization to reduce memory requirements
|
108 |
+
model = AutoModelForCausalLM.from_pretrained(
|
109 |
+
model_id,
|
110 |
+
torch_dtype=torch.bfloat16,
|
111 |
+
load_in_4bit=True,
|
112 |
+
device_map="auto"
|
113 |
+
)
|
114 |
+
logger.info("Model loaded successfully")
|
115 |
+
|
116 |
+
def run(raw_data):
|
117 |
+
"""Process an image and generate descriptions
|
118 |
+
|
119 |
+
Args:
|
120 |
+
raw_data: A JSON string containing the image as base64 encoded data
|
121 |
+
|
122 |
+
Returns:
|
123 |
+
A JSON string containing the descriptions
|
124 |
+
"""
|
125 |
+
global model, processor, tokenizer
|
126 |
+
|
127 |
+
try:
|
128 |
+
# Parse input
|
129 |
+
data = json.loads(raw_data)
|
130 |
+
|
131 |
+
# Get the image data (from base64 or URL)
|
132 |
+
if 'image_data' in data:
|
133 |
+
image_bytes = base64.b64decode(data['image_data'])
|
134 |
+
image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
|
135 |
+
logger.info("Loaded image from base64 data")
|
136 |
+
elif 'image_url' in data:
|
137 |
+
# Handle image URLs (for Azure Storage or public URLs)
|
138 |
+
from urllib.request import urlopen
|
139 |
+
with urlopen(data['image_url']) as response:
|
140 |
+
image_bytes = response.read()
|
141 |
+
image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
|
142 |
+
logger.info(f"Loaded image from URL: {data['image_url']}")
|
143 |
+
else:
|
144 |
+
return json.dumps({"error": "No image data or URL provided"})
|
145 |
+
|
146 |
+
# Process the image
|
147 |
+
inputs = processor(
|
148 |
+
images=image,
|
149 |
+
return_tensors="pt"
|
150 |
+
).to(model.device)
|
151 |
+
|
152 |
+
# Basic description prompt
|
153 |
+
prompt_basic = "Describe this image briefly."
|
154 |
+
input_ids_basic = tokenizer(prompt_basic, return_tensors="pt").input_ids.to(model.device)
|
155 |
+
|
156 |
+
# Detailed description prompt
|
157 |
+
prompt_detailed = "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
|
158 |
+
input_ids_detailed = tokenizer(prompt_detailed, return_tensors="pt").input_ids.to(model.device)
|
159 |
+
|
160 |
+
# Technical analysis prompt
|
161 |
+
prompt_technical = "What can you tell me about the technical aspects of this image?"
|
162 |
+
input_ids_technical = tokenizer(prompt_technical, return_tensors="pt").input_ids.to(model.device)
|
163 |
+
|
164 |
+
# Generate outputs for each prompt
|
165 |
+
# Basic description
|
166 |
+
with torch.no_grad():
|
167 |
+
output_basic = model.generate(
|
168 |
+
**inputs,
|
169 |
+
input_ids=input_ids_basic,
|
170 |
+
max_new_tokens=150,
|
171 |
+
do_sample=False
|
172 |
+
)
|
173 |
+
basic_description = tokenizer.decode(output_basic[0], skip_special_tokens=True).replace(prompt_basic, "").strip()
|
174 |
+
|
175 |
+
# Detailed description
|
176 |
+
with torch.no_grad():
|
177 |
+
output_detailed = model.generate(
|
178 |
+
**inputs,
|
179 |
+
input_ids=input_ids_detailed,
|
180 |
+
max_new_tokens=300,
|
181 |
+
do_sample=False
|
182 |
+
)
|
183 |
+
detailed_description = tokenizer.decode(output_detailed[0], skip_special_tokens=True).replace(prompt_detailed, "").strip()
|
184 |
+
|
185 |
+
# Technical analysis
|
186 |
+
with torch.no_grad():
|
187 |
+
output_technical = model.generate(
|
188 |
+
**inputs,
|
189 |
+
input_ids=input_ids_technical,
|
190 |
+
max_new_tokens=200,
|
191 |
+
do_sample=False
|
192 |
+
)
|
193 |
+
technical_analysis = tokenizer.decode(output_technical[0], skip_special_tokens=True).replace(prompt_technical, "").strip()
|
194 |
+
|
195 |
+
# Return the results
|
196 |
+
return json.dumps({
|
197 |
+
"success": True,
|
198 |
+
"basic_description": basic_description,
|
199 |
+
"detailed_description": detailed_description,
|
200 |
+
"technical_analysis": technical_analysis
|
201 |
+
})
|
202 |
+
|
203 |
+
except Exception as e:
|
204 |
+
logger.error(f"Error processing image: {str(e)}", exc_info=True)
|
205 |
+
return json.dumps({"error": f"Error generating description: {str(e)}"})
|
206 |
+
```
|
207 |
+
|
208 |
+
## Step 5: Register the Model
|
209 |
+
|
210 |
+
1. Create a model.yml file:
|
211 |
+
|
212 |
+
```yaml
|
213 |
+
$schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
|
214 |
+
name: qwen-vl-image-descriptor
|
215 |
+
version: 1
|
216 |
+
description: Qwen2-VL-7B model for image description
|
217 |
+
path: .
|
218 |
+
```
|
219 |
+
|
220 |
+
2. Register the model:
|
221 |
+
|
222 |
+
```bash
|
223 |
+
az ml model create --file model.yml \
|
224 |
+
--workspace-name image-descriptor-ws \
|
225 |
+
--resource-group image-descriptor-rg
|
226 |
+
```
|
227 |
+
|
228 |
+
## Step 6: Deploy as an Online Endpoint
|
229 |
+
|
230 |
+
1. Create an endpoint.yml file:
|
231 |
+
|
232 |
+
```yaml
|
233 |
+
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
|
234 |
+
name: image-descriptor-endpoint
|
235 |
+
description: Endpoint for image description
|
236 |
+
auth_mode: key
|
237 |
+
```
|
238 |
+
|
239 |
+
2. Create a deployment.yml file:
|
240 |
+
|
241 |
+
```yaml
|
242 |
+
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
|
243 |
+
name: qwen-vl-deployment
|
244 |
+
endpoint_name: image-descriptor-endpoint
|
245 |
+
model: azureml:qwen-vl-image-descriptor:1
|
246 |
+
environment:
|
247 |
+
conda_file: environment.yml
|
248 |
+
image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.6-cudnn8-ubuntu20.04:latest
|
249 |
+
instance_type: Standard_NC6s_v3
|
250 |
+
instance_count: 1
|
251 |
+
request_settings:
|
252 |
+
max_concurrent_requests_per_instance: 1
|
253 |
+
request_timeout_ms: 120000
|
254 |
+
```
|
255 |
+
|
256 |
+
3. Create the endpoint:
|
257 |
+
|
258 |
+
```bash
|
259 |
+
az ml online-endpoint create --file endpoint.yml \
|
260 |
+
--workspace-name image-descriptor-ws \
|
261 |
+
--resource-group image-descriptor-rg
|
262 |
+
```
|
263 |
+
|
264 |
+
4. Create the deployment:
|
265 |
+
|
266 |
+
```bash
|
267 |
+
az ml online-deployment create --file deployment.yml \
|
268 |
+
--workspace-name image-descriptor-ws \
|
269 |
+
--resource-group image-descriptor-rg
|
270 |
+
```
|
271 |
+
|
272 |
+
5. Allocate 100% traffic to the deployment:
|
273 |
+
|
274 |
+
```bash
|
275 |
+
az ml online-endpoint update --name image-descriptor-endpoint \
|
276 |
+
--traffic "qwen-vl-deployment=100" \
|
277 |
+
--workspace-name image-descriptor-ws \
|
278 |
+
--resource-group image-descriptor-rg
|
279 |
+
```
|
280 |
+
|
281 |
+
## Step 7: Test the Endpoint
|
282 |
+
|
283 |
+
You can test the endpoint using the Azure ML SDK:
|
284 |
+
|
285 |
+
```python
|
286 |
+
import json
|
287 |
+
import base64
|
288 |
+
from azure.ai.ml import MLClient
|
289 |
+
from azure.identity import DefaultAzureCredential
|
290 |
+
from azure.ai.ml.entities import ManagedOnlineEndpoint
|
291 |
+
|
292 |
+
# Get a handle to the workspace
|
293 |
+
credential = DefaultAzureCredential()
|
294 |
+
ml_client = MLClient(
|
295 |
+
credential=credential,
|
296 |
+
subscription_id="your-subscription-id",
|
297 |
+
resource_group_name="image-descriptor-rg",
|
298 |
+
workspace_name="image-descriptor-ws"
|
299 |
+
)
|
300 |
+
|
301 |
+
# Get endpoint
|
302 |
+
endpoint = ml_client.online_endpoints.get("image-descriptor-endpoint")
|
303 |
+
|
304 |
+
# Load and encode the image
|
305 |
+
with open('data_temp/page_2.png', 'rb') as f:
|
306 |
+
image_data = f.read()
|
307 |
+
image_b64 = base64.b64encode(image_data).decode('utf-8')
|
308 |
+
|
309 |
+
# Create the request payload
|
310 |
+
payload = {
|
311 |
+
'image_data': image_b64
|
312 |
+
}
|
313 |
+
|
314 |
+
# Write the payload to a request file (invoke() expects a file path, not a JSON string)
with open("request.json", "w") as f:
    json.dump(payload, f)

# Invoke the endpoint
|
315 |
+
response = ml_client.online_endpoints.invoke(
|
316 |
+
endpoint_name="image-descriptor-endpoint",
|
317 |
+
request_file="request.json",
|
318 |
+
deployment_name="qwen-vl-deployment"
|
319 |
+
)
|
320 |
+
|
321 |
+
# Parse the response
|
322 |
+
result = json.loads(response)
|
323 |
+
print(json.dumps(result, indent=2))
|
324 |
+
```
|
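Clients without the Azure ML SDK can call the endpoint over plain REST instead. A sketch under the same workspace names; it fetches the scoring URI and key once through the SDK (the `az ml online-endpoint show` and `az ml online-endpoint get-credentials` commands return the same values):

```python
import base64
import requests
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="your-subscription-id",
    resource_group_name="image-descriptor-rg",
    workspace_name="image-descriptor-ws",
)

# Scoring URI and primary key for direct REST calls
endpoint = ml_client.online_endpoints.get(name="image-descriptor-endpoint")
key = ml_client.online_endpoints.get_keys(name="image-descriptor-endpoint").primary_key

with open("data_temp/page_2.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    endpoint.scoring_uri,
    headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
    json={"image_data": image_b64},
    timeout=300,
)
print(response.status_code)
print(response.text)
```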
325 |
+
|
326 |
+
## Cost Optimization
|
327 |
+
|
328 |
+
To optimize costs (a manual scaling sketch follows this list):
|
329 |
+
|
330 |
+
1. Use a smaller compute size if possible
|
331 |
+
2. Let the compute cluster scale down to zero instances when idle (it was created with `--min-instances 0`); note that a managed online deployment always keeps at least one instance
|
332 |
+
3. Set up autoscaling rules
|
333 |
+
4. Consider reserved instances for long-term deployments
|
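When load is predictable, the deployment's instance count (or instance type) can also be adjusted manually from the SDK instead of through autoscale rules. A sketch, reusing the names from this guide:

```python
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="your-subscription-id",
    resource_group_name="image-descriptor-rg",
    workspace_name="image-descriptor-ws",
)

# Fetch the current deployment, change its size, and apply the update
deployment = ml_client.online_deployments.get(
    name="qwen-vl-deployment",
    endpoint_name="image-descriptor-endpoint",
)
deployment.instance_count = 1  # minimum for a managed online deployment
ml_client.online_deployments.begin_create_or_update(deployment).result()
```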
334 |
+
|
335 |
+
## Monitoring
|
336 |
+
|
337 |
+
Monitor your endpoint using:
|
338 |
+
|
339 |
+
1. Azure Monitor
|
340 |
+
2. Application Insights
|
341 |
+
3. Azure ML metrics dashboard
|
342 |
+
4. Set up alerts for anomalies
|
343 |
+
|
344 |
+
## Cleanup
|
345 |
+
|
346 |
+
To avoid ongoing charges, delete resources when not in use:
|
347 |
+
|
348 |
+
```bash
|
349 |
+
# Delete the endpoint
|
350 |
+
az ml online-endpoint delete --name image-descriptor-endpoint \
|
351 |
+
--workspace-name image-descriptor-ws \
|
352 |
+
--resource-group image-descriptor-rg -y
|
353 |
+
|
354 |
+
# Delete compute cluster
|
355 |
+
az ml compute delete --name gpu-cluster \
|
356 |
+
--workspace-name image-descriptor-ws \
|
357 |
+
--resource-group image-descriptor-rg -y
|
358 |
+
|
359 |
+
# Delete workspace (optional)
|
360 |
+
az ml workspace delete --name image-descriptor-ws \
|
361 |
+
--resource-group image-descriptor-rg -y
|
362 |
+
|
363 |
+
# Delete resource group (optional)
|
364 |
+
az group delete --name image-descriptor-rg -y
|
365 |
+
```
|
entrypoint.sh
ADDED
@@ -0,0 +1,20 @@
1 |
+
#!/bin/bash
|
2 |
+
set -e
|
3 |
+
|
4 |
+
# Load environment variables
|
5 |
+
if [ -f .env ]; then
    source .env
fi
|
6 |
+
|
7 |
+
# Check if we should run in UI or API mode
|
8 |
+
if [ "$ENABLE_UI" = "true" ]; then
|
9 |
+
echo "Starting Gradio UI with API backend..."
|
10 |
+
python app.py --api-server &  # background the API server so this script can continue
|
11 |
+
sleep 5 # Give the API server time to start
|
12 |
+
exec python app.py
|
13 |
+
elif [ "$API_ONLY" = "true" ]; then
|
14 |
+
echo "Starting API server only..."
|
15 |
+
exec gunicorn --bind 0.0.0.0:$PORT --workers 1 --timeout 300 "image_descriptor:app"
|
16 |
+
else
|
17 |
+
# Default behavior for Hugging Face Spaces: run both
|
18 |
+
echo "Starting service in Hugging Face Spaces mode..."
|
19 |
+
exec python app.py
|
20 |
+
fi
|
image_descriptor.py
ADDED
@@ -0,0 +1,169 @@
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import io
|
4 |
+
from PIL import Image
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPImageProcessor
|
7 |
+
import logging
|
8 |
+
import time
|
9 |
+
|
10 |
+
# Configure logging
|
11 |
+
logging.basicConfig(
|
12 |
+
level=logging.INFO,
|
13 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
14 |
+
)
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
# Global variables to store the model and processors
|
18 |
+
model = None
|
19 |
+
processor = None
|
20 |
+
tokenizer = None
|
21 |
+
|
22 |
+
def describe_image(image_path=None, image_data=None, show_image=False):
|
23 |
+
"""
|
24 |
+
Load an image and generate a description using Qwen2-VL-7B model.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
image_path (str, optional): Path to the image file
|
28 |
+
image_data (bytes, optional): Raw image data
|
29 |
+
show_image (bool): Whether to display the image
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
dict: Descriptions of the image
|
33 |
+
"""
|
34 |
+
global model, processor, tokenizer
|
35 |
+
|
36 |
+
# Initialize model if not already loaded
|
37 |
+
if model is None or processor is None or tokenizer is None:
|
38 |
+
load_model()
|
39 |
+
|
40 |
+
# Check if we have valid input
|
41 |
+
if image_path is None and image_data is None:
|
42 |
+
return {"error": "No image provided"}
|
43 |
+
|
44 |
+
try:
|
45 |
+
# Load the image
|
46 |
+
if image_path is not None:
|
47 |
+
if not os.path.exists(image_path):
|
48 |
+
return {"error": f"Image not found at {image_path}"}
|
49 |
+
logger.info(f"Processing image from path: {image_path}")
|
50 |
+
image = Image.open(image_path).convert('RGB')
|
51 |
+
else:
|
52 |
+
logger.info("Processing image from uploaded data")
|
53 |
+
image = Image.open(io.BytesIO(image_data)).convert('RGB')
|
54 |
+
|
55 |
+
# Display the image if requested (for local testing only)
|
56 |
+
if show_image:
|
57 |
+
plt.figure(figsize=(10, 8))
|
58 |
+
plt.imshow(image)
|
59 |
+
plt.axis('off')
|
60 |
+
if image_path:
|
61 |
+
plt.title(os.path.basename(image_path))
|
62 |
+
plt.show()
|
63 |
+
|
64 |
+
# Process the image
|
65 |
+
logger.info("Generating descriptions...")
|
66 |
+
|
67 |
+
# Process image and create inputs for various prompts
|
68 |
+
inputs = processor(images=image, return_tensors="pt").to(model.device)  # BatchFeature holding pixel_values
|
69 |
+
|
70 |
+
# Basic description prompt
|
71 |
+
prompt_basic = "Describe this image briefly."
|
72 |
+
input_ids_basic = tokenizer(prompt_basic, return_tensors="pt").input_ids.to(model.device)
|
73 |
+
|
74 |
+
# Detailed description prompt
|
75 |
+
prompt_detailed = "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
|
76 |
+
input_ids_detailed = tokenizer(prompt_detailed, return_tensors="pt").input_ids.to(model.device)
|
77 |
+
|
78 |
+
# Technical analysis prompt
|
79 |
+
prompt_technical = "What can you tell me about the technical aspects of this image?"
|
80 |
+
input_ids_technical = tokenizer(prompt_technical, return_tensors="pt").input_ids.to(model.device)
|
81 |
+
|
82 |
+
# Generate outputs for each prompt
|
83 |
+
# Basic description
|
84 |
+
with torch.no_grad():
|
85 |
+
output_basic = model.generate(
|
86 |
+
input_ids=input_ids_basic,
|
87 |
+
pixel_values=inputs.pixel_values,
|
88 |
+
max_new_tokens=150,
|
89 |
+
do_sample=False
|
90 |
+
)
|
91 |
+
basic_description = tokenizer.decode(output_basic[0], skip_special_tokens=True).replace(prompt_basic, "").strip()
|
92 |
+
|
93 |
+
# Detailed description
|
94 |
+
with torch.no_grad():
|
95 |
+
output_detailed = model.generate(
|
96 |
+
input_ids=input_ids_detailed,
|
97 |
+
pixel_values=inputs.pixel_values,
|
98 |
+
max_new_tokens=300,
|
99 |
+
do_sample=False
|
100 |
+
)
|
101 |
+
detailed_description = tokenizer.decode(output_detailed[0], skip_special_tokens=True).replace(prompt_detailed, "").strip()
|
102 |
+
|
103 |
+
# Technical analysis
|
104 |
+
with torch.no_grad():
|
105 |
+
output_technical = model.generate(
|
106 |
+
input_ids=input_ids_technical,
|
107 |
+
pixel_values=inputs.pixel_values,
|
108 |
+
max_new_tokens=200,
|
109 |
+
do_sample=False
|
110 |
+
)
|
111 |
+
technical_analysis = tokenizer.decode(output_technical[0], skip_special_tokens=True).replace(prompt_technical, "").strip()
|
112 |
+
|
113 |
+
return {
|
114 |
+
"success": True,
|
115 |
+
"basic_description": basic_description,
|
116 |
+
"detailed_description": detailed_description,
|
117 |
+
"technical_analysis": technical_analysis
|
118 |
+
}
|
119 |
+
|
120 |
+
except Exception as e:
|
121 |
+
logger.error(f"Error processing image: {str(e)}", exc_info=True)
|
122 |
+
return {"error": f"Error generating description: {str(e)}"}
|
123 |
+
|
124 |
+
def load_model():
|
125 |
+
"""Load the model and related components"""
|
126 |
+
global model, processor, tokenizer
|
127 |
+
|
128 |
+
try:
|
129 |
+
logger.info("Loading model...")
|
130 |
+
model_id = "Qwen/Qwen2-VL-7B"
|
131 |
+
|
132 |
+
# Use explicit processor class instead of AutoProcessor
|
133 |
+
processor = CLIPImageProcessor.from_pretrained(model_id)
|
134 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
135 |
+
|
136 |
+
# Load model with 4-bit quantization to reduce memory requirements
|
137 |
+
model = AutoModelForCausalLM.from_pretrained(
|
138 |
+
model_id,
|
139 |
+
torch_dtype=torch.bfloat16,
|
140 |
+
load_in_4bit=True,
|
141 |
+
device_map="auto"
|
142 |
+
)
|
143 |
+
logger.info("Model loaded successfully")
|
144 |
+
return True
|
145 |
+
except Exception as e:
|
146 |
+
logger.error(f"Error loading model: {str(e)}", exc_info=True)
|
147 |
+
return False
|
148 |
+
|
149 |
+
def main():
|
150 |
+
"""Run in command-line mode"""
|
151 |
+
# Path to the image
|
152 |
+
image_folder = "data_temp"
|
153 |
+
image_name = "page_2.png"
|
154 |
+
image_path = os.path.join(image_folder, image_name)
|
155 |
+
|
156 |
+
# Get the description
|
157 |
+
result = describe_image(image_path=image_path, show_image=True)
|
158 |
+
|
159 |
+
# Print the results
|
160 |
+
if "error" not in result:
|
161 |
+
print("\n==== Image Description Results (Qwen2-VL-7B) ====")
|
162 |
+
print(f"\nBasic Description:\n{result['basic_description']}")
|
163 |
+
print(f"\nDetailed Description:\n{result['detailed_description']}")
|
164 |
+
print(f"\nTechnical Analysis:\n{result['technical_analysis']}")
|
165 |
+
else:
|
166 |
+
print(result["error"]) # Print error message if there was an issue
|
167 |
+
|
168 |
+
if __name__ == "__main__":
|
169 |
+
main()
|
image_descriptor_requirements.txt
ADDED
@@ -0,0 +1,14 @@
1 |
+
torch==2.0.1
|
2 |
+
torchvision==0.15.2
|
3 |
+
transformers==4.36.2
|
4 |
+
pillow==10.0.1
|
5 |
+
matplotlib==3.7.3
|
6 |
+
accelerate==0.27.2
|
7 |
+
bitsandbytes==0.41.3
|
8 |
+
safetensors==0.4.1
|
9 |
+
flask==2.3.3
|
10 |
+
flask-cors==4.0.0
|
11 |
+
gunicorn==21.2.0
|
12 |
+
python-dotenv==1.0.0
|
13 |
+
gradio==4.12.0
|
14 |
+
requests==2.31.0
|
readme.mdp
ADDED
@@ -0,0 +1 @@
1 |
+
placeholder
|
report.html
ADDED
@@ -0,0 +1,105 @@
1 |
+
|
2 |
+
<!DOCTYPE html>
|
3 |
+
<html>
|
4 |
+
<head>
|
5 |
+
<title>CSV Data Report</title>
|
6 |
+
<style>
|
7 |
+
body {
|
8 |
+
font-family: Arial, sans-serif;
|
9 |
+
margin: 20px;
|
10 |
+
}
|
11 |
+
table {
|
12 |
+
border-collapse: collapse;
|
13 |
+
width: 100%;
|
14 |
+
margin-bottom: 20px;
|
15 |
+
}
|
16 |
+
th, td {
|
17 |
+
border: 1px solid #ddd;
|
18 |
+
padding: 8px;
|
19 |
+
text-align: right;
|
20 |
+
}
|
21 |
+
th {
|
22 |
+
background-color: #f2f2f2;
|
23 |
+
}
|
24 |
+
.total-row {
|
25 |
+
font-weight: bold;
|
26 |
+
background-color: #e6e6e6;
|
27 |
+
}
|
28 |
+
h1 {
|
29 |
+
color: #333;
|
30 |
+
}
|
31 |
+
</style>
|
32 |
+
</head>
|
33 |
+
<body>
|
34 |
+
<h1>CSV Data Report</h1>
|
35 |
+
|
36 |
+
<h2>Raw Data</h2>
|
37 |
+
<table border="1" class="dataframe table">
|
38 |
+
<thead>
|
39 |
+
<tr style="text-align: right;">
|
40 |
+
<th>Product</th>
|
41 |
+
<th>Electronics</th>
|
42 |
+
<th>Clothing</th>
|
43 |
+
<th>Food</th>
|
44 |
+
<th>Books</th>
|
45 |
+
</tr>
|
46 |
+
</thead>
|
47 |
+
<tbody>
|
48 |
+
<tr>
|
49 |
+
<td>January</td>
|
50 |
+
<td>1200</td>
|
51 |
+
<td>800</td>
|
52 |
+
<td>500</td>
|
53 |
+
<td>300</td>
|
54 |
+
</tr>
|
55 |
+
<tr>
|
56 |
+
<td>February</td>
|
57 |
+
<td>950</td>
|
58 |
+
<td>750</td>
|
59 |
+
<td>600</td>
|
60 |
+
<td>400</td>
|
61 |
+
</tr>
|
62 |
+
<tr>
|
63 |
+
<td>March</td>
|
64 |
+
<td>1500</td>
|
65 |
+
<td>900</td>
|
66 |
+
<td>550</td>
|
67 |
+
<td>350</td>
|
68 |
+
</tr>
|
69 |
+
<tr>
|
70 |
+
<td>April</td>
|
71 |
+
<td>1100</td>
|
72 |
+
<td>850</td>
|
73 |
+
<td>450</td>
|
74 |
+
<td>250</td>
|
75 |
+
</tr>
|
76 |
+
</tbody>
|
77 |
+
</table>
|
78 |
+
|
79 |
+
<h2>Column Totals</h2>
|
80 |
+
<table>
|
81 |
+
<tr>
|
82 |
+
|
83 |
+
<th>Electronics</th>
|
84 |
+
|
85 |
+
<th>Clothing</th>
|
86 |
+
|
87 |
+
<th>Food</th>
|
88 |
+
|
89 |
+
<th>Books</th>
|
90 |
+
|
91 |
+
</tr>
|
92 |
+
<tr class="total-row">
|
93 |
+
|
94 |
+
<td>4,750.00</td>
|
95 |
+
|
96 |
+
<td>3,300.00</td>
|
97 |
+
|
98 |
+
<td>2,100.00</td>
|
99 |
+
|
100 |
+
<td>1,300.00</td>
|
101 |
+
|
102 |
+
</tr>
|
103 |
+
</table>
|
104 |
+
</body>
|
105 |
+
</html>
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
1 |
+
torch==2.1.0
|
2 |
+
torchvision==0.16.0
|
3 |
+
transformers>=4.37.2
|
4 |
+
lmdeploy>=0.5.3
|
5 |
+
pillow==10.0.1
|
6 |
+
numpy>=1.24.0
|
7 |
+
scipy==1.11.3
|
8 |
+
gradio==3.38.0
|
9 |
+
requests==2.31.0
|
10 |
+
accelerate==0.27.2
|
11 |
+
bitsandbytes==0.41.3
|
12 |
+
safetensors==0.4.1
|
13 |
+
openai
|
14 |
+
huggingface_hub
|
sample_data.csv
ADDED
@@ -0,0 +1,5 @@
1 |
+
Product,Electronics,Clothing,Food,Books
|
2 |
+
January,1200,800,500,300
|
3 |
+
February,950,750,600,400
|
4 |
+
March,1500,900,550,350
|
5 |
+
April,1100,850,450,250
|
server_deploy.sh
ADDED
@@ -0,0 +1,35 @@
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Server deployment script for Image Description Application
|
4 |
+
echo "Starting Image Description API server deployment"
|
5 |
+
|
6 |
+
# Create virtual environment if it doesn't exist
|
7 |
+
if [ ! -d "venv" ]; then
|
8 |
+
echo "Creating virtual environment..."
|
9 |
+
python3 -m venv venv
|
10 |
+
fi
|
11 |
+
|
12 |
+
# Activate virtual environment
|
13 |
+
echo "Activating virtual environment..."
|
14 |
+
source venv/bin/activate
|
15 |
+
|
16 |
+
# Install requirements
|
17 |
+
echo "Installing dependencies..."
|
18 |
+
pip install -r image_descriptor_requirements.txt
|
19 |
+
|
20 |
+
# Create necessary directories
|
21 |
+
echo "Setting up directories..."
|
22 |
+
mkdir -p uploads
|
23 |
+
mkdir -p logs
|
24 |
+
|
25 |
+
# Start the server with Gunicorn
|
26 |
+
echo "Starting server with Gunicorn..."
|
27 |
+
gunicorn --bind 0.0.0.0:8000 --workers 1 --timeout 300 "image_descriptor:app" \
|
28 |
+
--log-level info \
|
29 |
+
--access-logfile logs/access.log \
|
30 |
+
--error-logfile logs/error.log \
|
31 |
+
--daemon
|
32 |
+
|
33 |
+
echo "Server started in the background on port 8000"
|
34 |
+
echo "You can test it with: curl http://localhost:8000/health"
|
35 |
+
echo "To process an image: curl -X POST -F \"image=@data_temp/page_2.png\" http://localhost:8000/describe"
|
test_internvl2.py
ADDED
@@ -0,0 +1,86 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
Test script for InternVL2-40B-AWQ model using lmdeploy
|
6 |
+
"""
|
7 |
+
|
8 |
+
import argparse
|
9 |
+
import os
|
10 |
+
import time
|
11 |
+
from PIL import Image
|
12 |
+
|
13 |
+
from lmdeploy import pipeline, TurbomindEngineConfig
|
14 |
+
from lmdeploy.vl import load_image
|
15 |
+
|
16 |
+
|
17 |
+
def parse_args():
|
18 |
+
parser = argparse.ArgumentParser(description="Test InternVL2 model")
|
19 |
+
parser.add_argument(
|
20 |
+
"--model",
|
21 |
+
type=str,
|
22 |
+
default="OpenGVLab/InternVL2-40B-AWQ",
|
23 |
+
help="Model name or path"
|
24 |
+
)
|
25 |
+
parser.add_argument(
|
26 |
+
"--image",
|
27 |
+
type=str,
|
28 |
+
required=True,
|
29 |
+
help="Path to the test image"
|
30 |
+
)
|
31 |
+
parser.add_argument(
|
32 |
+
"--prompt",
|
33 |
+
type=str,
|
34 |
+
default="Describe this image in detail.",
|
35 |
+
help="Prompt for the model"
|
36 |
+
)
|
37 |
+
return parser.parse_args()
|
38 |
+
|
39 |
+
|
40 |
+
def main():
|
41 |
+
args = parse_args()
|
42 |
+
|
43 |
+
print(f"Loading model: {args.model}")
|
44 |
+
start_time = time.time()
|
45 |
+
|
46 |
+
# Configure the engine for AWQ quantized model
|
47 |
+
backend_config = TurbomindEngineConfig(model_format='awq')
|
48 |
+
|
49 |
+
# Create pipeline
|
50 |
+
pipe = pipeline(args.model, backend_config=backend_config, log_level='INFO')
|
51 |
+
|
52 |
+
load_time = time.time() - start_time
|
53 |
+
print(f"Model loaded in {load_time:.2f} seconds")
|
54 |
+
|
55 |
+
# Load image
|
56 |
+
try:
|
57 |
+
if args.image.startswith(('http://', 'https://')):
|
58 |
+
image = load_image(args.image)
|
59 |
+
print(f"Loaded image from URL: {args.image}")
|
60 |
+
else:
|
61 |
+
image_path = os.path.abspath(args.image)
|
62 |
+
if not os.path.exists(image_path):
|
63 |
+
raise FileNotFoundError(f"Image not found: {image_path}")
|
64 |
+
image = Image.open(image_path).convert('RGB')
|
65 |
+
print(f"Loaded image from path: {image_path}")
|
66 |
+
except Exception as e:
|
67 |
+
print(f"Error loading image: {e}")
|
68 |
+
return
|
69 |
+
|
70 |
+
# Run inference
|
71 |
+
print(f"Running inference with prompt: '{args.prompt}'")
|
72 |
+
start_time = time.time()
|
73 |
+
|
74 |
+
response = pipe((args.prompt, image))
|
75 |
+
|
76 |
+
inference_time = time.time() - start_time
|
77 |
+
print(f"Inference completed in {inference_time:.2f} seconds")
|
78 |
+
|
79 |
+
# Print result
|
80 |
+
print("\n--- RESULT ---")
|
81 |
+
print(response.text)
|
82 |
+
print("-------------\n")
|
83 |
+
|
84 |
+
|
85 |
+
if __name__ == "__main__":
|
86 |
+
main()
|
upload_internvl2_to_hf.py
ADDED
@@ -0,0 +1,73 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
"""
|
5 |
+
Script to upload the InternVL2 files to Hugging Face Spaces
|
6 |
+
"""
|
7 |
+
|
8 |
+
import os
|
9 |
+
import sys
|
10 |
+
import getpass
|
11 |
+
from huggingface_hub import HfApi, create_repo, upload_folder
|
12 |
+
|
13 |
+
# Default repository name
|
14 |
+
DEFAULT_REPO = "mknolan/cursor_slides_internvl2"
|
15 |
+
|
16 |
+
def main():
|
17 |
+
"""Main function to upload files to Hugging Face Spaces"""
|
18 |
+
# Get Hugging Face token with WRITE access
|
19 |
+
token = getpass.getpass("Enter your Hugging Face token (with WRITE access): ")
|
20 |
+
|
21 |
+
# Get repository name
|
22 |
+
repo_name = input("Enter repository name (default: {}): ".format(DEFAULT_REPO)) or DEFAULT_REPO
|
23 |
+
|
24 |
+
print("Uploading to Space: {}".format(repo_name))
|
25 |
+
|
26 |
+
# Initialize Hugging Face API
|
27 |
+
api = HfApi(token=token)
|
28 |
+
|
29 |
+
try:
|
30 |
+
# Try to get the repository, create if it doesn't exist
|
31 |
+
try:
|
32 |
+
repo = api.repo_info(repo_id=repo_name, repo_type="space")
|
33 |
+
print("Repo {} ready".format(repo_name))
|
34 |
+
except Exception:
|
35 |
+
print("Creating new Space: {}".format(repo_name))
|
36 |
+
create_repo(
|
37 |
+
repo_id=repo_name,
|
38 |
+
token=token,
|
39 |
+
repo_type="space",
|
40 |
+
space_sdk="gradio",
|
41 |
+
private=False
|
42 |
+
)
|
43 |
+
|
44 |
+
# Upload the entire folder at once using upload_folder
|
45 |
+
print("Uploading files to Hugging Face Space...")
|
46 |
+
upload_folder(
|
47 |
+
folder_path=".", # Current directory
|
48 |
+
repo_id=repo_name,
|
49 |
+
repo_type="space",
|
50 |
+
ignore_patterns=[
|
51 |
+
".git*",
|
52 |
+
"*__pycache__*",
|
53 |
+
"*.pyc",
|
54 |
+
".DS_Store",
|
55 |
+
"*.ipynb_checkpoints*",
|
56 |
+
"venv",
|
57 |
+
".env"
|
58 |
+
],
|
59 |
+
commit_message="Upload InternVL2 implementation",
|
60 |
+
token=token
|
61 |
+
)
|
62 |
+
|
63 |
+
print("Upload completed!")
|
64 |
+
print("Check your Space at: https://huggingface.co/spaces/{}".format(repo_name))
|
65 |
+
|
66 |
+
except Exception as e:
|
67 |
+
print("Error: {}".format(e))
|
68 |
+
return 1
|
69 |
+
|
70 |
+
return 0
|
71 |
+
|
72 |
+
if __name__ == "__main__":
|
73 |
+
sys.exit(main())
|
upload_to_hf.py
ADDED
@@ -0,0 +1,74 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Script to upload the current directory to a Hugging Face Space
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
from huggingface_hub import HfApi, create_repo
|
9 |
+
|
10 |
+
# Configuration
|
11 |
+
SPACE_NAME = "mknolan/cursor_slides" # Change this to your space name
|
12 |
+
TOKEN = input("Enter your Hugging Face token (with WRITE access): ")
|
13 |
+
|
14 |
+
# Initialize API
|
15 |
+
api = HfApi(token=TOKEN)
|
16 |
+
|
17 |
+
def upload_directory():
|
18 |
+
"""Upload all files in the current directory to HF Space"""
|
19 |
+
print("Uploading to Space: {}".format(SPACE_NAME))
|
20 |
+
|
21 |
+
# Create repo if it doesn't exist (this is idempotent)
|
22 |
+
try:
|
23 |
+
create_repo(
|
24 |
+
repo_id=SPACE_NAME,
|
25 |
+
token=TOKEN,
|
26 |
+
repo_type="space",
|
27 |
+
exist_ok=True,
|
28 |
+
space_sdk="docker"
|
29 |
+
)
|
30 |
+
print("Repo {} ready".format(SPACE_NAME))
|
31 |
+
except Exception as e:
|
32 |
+
print("Note: Repo already exists or {}".format(str(e)))
|
33 |
+
|
34 |
+
# Gather all files to upload
|
35 |
+
files_to_upload = []
|
36 |
+
for root, _, files in os.walk("."):
|
37 |
+
# Skip .git and __pycache__ directories
|
38 |
+
if "/.git" in root or "/.__pycache__" in root:
|
39 |
+
continue
|
40 |
+
|
41 |
+
for file in files:
|
42 |
+
# Skip hidden files and .git files
|
43 |
+
if file.startswith(".git") or file.startswith("."):
|
44 |
+
continue
|
45 |
+
|
46 |
+
path = os.path.join(root, file)
|
47 |
+
# Skip this upload script itself
|
48 |
+
if path == "./upload_to_hf.py":
|
49 |
+
continue
|
50 |
+
|
51 |
+
files_to_upload.append(path)
|
52 |
+
|
53 |
+
print("Found {} files to upload".format(len(files_to_upload)))
|
54 |
+
|
55 |
+
# Upload each file
|
56 |
+
for i, path in enumerate(files_to_upload):
|
57 |
+
print("[{}/{}] Uploading {}...".format(i+1, len(files_to_upload), path))
|
58 |
+
try:
|
59 |
+
# Path in repo (remove leading ./)
|
60 |
+
path_in_repo = path[2:] if path.startswith("./") else path
|
61 |
+
api.upload_file(
|
62 |
+
path_or_fileobj=path,
|
63 |
+
path_in_repo=path_in_repo,
|
64 |
+
repo_id=SPACE_NAME,
|
65 |
+
repo_type="space"
|
66 |
+
)
|
67 |
+
except Exception as e:
|
68 |
+
print("Error uploading {}: {}".format(path, str(e)))
|
69 |
+
|
70 |
+
print("Upload completed!")
|
71 |
+
print("Check your Space at: https://huggingface.co/spaces/{}".format(SPACE_NAME))
|
72 |
+
|
73 |
+
if __name__ == "__main__":
|
74 |
+
upload_directory()
|