# Hugging Face Space: DeepLearning101/Multimodal-Playground (status: Running)
# File size: 7,820 bytes

import gradio as gr
import requests
import mimetypes
import json, os
import asyncio
import aiohttp

import subprocess

def upgrade_pip():
    """Upgrade pip for the running interpreter on a best-effort basis.

    Runs ``<interpreter> -m pip install --upgrade pip`` and prints whether the
    upgrade succeeded. A failure is only reported, never raised, so the rest
    of the module keeps loading.
    """
    # Use the documented ``sys.executable`` instead of ``os.sys.executable``,
    # which only works because the ``os`` module happens to import ``sys``.
    import sys

    command = [sys.executable, "-m", "pip", "install", "--upgrade", "pip"]
    try:
        subprocess.check_call(command)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: pip exited with a non-zero status.
        # OSError: the interpreter could not be launched at all.
        print("pip 升級失敗")
    else:
        print("pip 升級成功")

# Run the pip upgrade once, at import time.
upgrade_pip()

# Base URL and bearer token of the LLM service, both read from the
# environment; each is None when its variable is unset.
LLM_URL = os.getenv("LLM_URL")
LLM_API = os.getenv("LLM_API")

# Placeholder user ID sent along with every request.
USER_ID = "HuggingFace Space"

def _parse_stream_event(line):
    """Return the JSON object carried by one streamed line, or None.

    None is returned when the line has no ``data: `` marker, when the payload
    is not valid UTF-8 JSON, or when the decoded JSON is not an object.
    """
    # Cut at the FIRST marker only: the JSON payload may itself contain the
    # text "data: ", and splitting on every occurrence would truncate it.
    _, marker, payload = line.partition(b"data: ")
    if not marker:
        return None
    try:
        event = json.loads(payload.decode("utf-8"))
    except ValueError:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        return None
    return event if isinstance(event, dict) else None


async def send_chat_message(LLM_URL, LLM_API, category, file_id):
    """Send a chat message about an uploaded image and return the last thought.

    Args:
        LLM_URL: Base URL of the service; ``/chat-messages`` is appended.
        LLM_API: API key sent as a Bearer token.
        category: Text sent as the ``query`` of the message.
        file_id: ID of a previously uploaded file, attached as an image.

    Returns:
        The stripped ``thought`` of the last ``agent_thought`` event in the
        streamed response, or a string starting with ``"Error:"`` when the
        request fails or no usable thought was received.
    """
    endpoint = f"{LLM_URL}/chat-messages"
    payload = {
        "inputs": {},
        "query": category,
        "response_mode": "streaming",
        "conversation_id": "",
        "user": USER_ID,
        "files": [
            {
                "type": "image",
                "transfer_method": "local_file",
                "upload_file_id": file_id
            }
        ]
    }
    print("Sending chat message payload:", payload)  # Debug information
    last_thought = None
    async with aiohttp.ClientSession() as session:
        async with session.post(
            endpoint,
            headers={"Authorization": f"Bearer {LLM_API}"},
            json=payload
        ) as response:
            print("Request URL:", endpoint)
            print("Response status code:", response.status)
            if response.status == 404:
                return "Error: Endpoint not found (404)"
            if response.status >= 400:
                # Report other HTTP failures as such instead of letting them
                # surface as a misleading "no thought found" message.
                return f"Error: Request failed with status {response.status}"

            # The response body is consumed line by line; only the most
            # recent agent_thought event is kept.
            async for line in response.content:
                event = _parse_stream_event(line)
                if event is not None and event.get("event") == "agent_thought":
                    last_thought = event.get("thought")

    # Guard against a non-string thought, which has no .strip().
    if isinstance(last_thought, str) and last_thought:
        return last_thought.strip()
    return "Error: No thought found in the response"

async def upload_file(LLM_URL, LLM_API, file_path, user_id):
    """Upload a local file to the service's ``/files/upload`` endpoint.

    Args:
        LLM_URL: Base URL of the service.
        LLM_API: API key sent as a Bearer token.
        file_path: Path of the file to upload; may be None or empty when the
            caller has no file.
        user_id: Value sent in the ``user`` form field.

    Returns:
        The decoded JSON response on success, otherwise a string starting
        with ``"Error:"`` (missing file, 404 endpoint, or non-JSON response).
    """
    # Reject None/empty paths and directories up front: os.path.exists(None)
    # raises TypeError, and a directory would only fail later inside open().
    if not file_path or not os.path.isfile(file_path):
        return f"Error: File {file_path} not found"
    mime_type, _ = mimetypes.guess_type(file_path)
    with open(file_path, 'rb') as f:
        async with aiohttp.ClientSession() as session:
            form_data = aiohttp.FormData()
            # Send only the base name so the local directory layout is not
            # exposed as part of the multipart filename.
            form_data.add_field(
                'file',
                f,
                filename=os.path.basename(file_path),
                content_type=mime_type,
            )
            form_data.add_field('user', user_id)

            async with session.post(
                f"{LLM_URL}/files/upload",
                headers={"Authorization": f"Bearer {LLM_API}"},
                data=form_data
            ) as response:
                print("Upload response status code:", response.status)  # Debug information
                if response.status == 404:
                    return "Error: Endpoint not found (404)"
                response_text = await response.text()
                print("Raw upload response text:", response_text)  # Debug information
                try:
                    return json.loads(response_text)
                except json.JSONDecodeError:
                    return "Error: Invalid JSON response"

async def handle_input(file_path, category):
    """Upload the given image, then ask the LLM about it.

    Args:
        file_path: Path of the image file to upload.
        category: Text forwarded as the chat query.

    Returns:
        The chat result from ``send_chat_message``, or an error value when the
        upload fails or yields no file ID.
    """
    upload_response = await upload_file(LLM_URL, LLM_API, file_path, USER_ID)
    print("Upload response:", upload_response)  # Debug information

    # upload_file reports its own failures as plain "Error: ..." strings.
    # A case-sensitive `"error" in ...` test misses those, and calling .get()
    # on a string would raise AttributeError, so pass them through directly.
    if isinstance(upload_response, str):
        return upload_response
    # Any other non-object JSON (list, number, ...) cannot carry a file ID.
    if not isinstance(upload_response, dict):
        return "Error: No file ID returned from upload"
    if "error" in upload_response:
        return upload_response

    file_id = upload_response.get("id")  # Extract file ID from the response
    if not file_id:
        return "Error: No file ID returned from upload"

    chat_response = await send_chat_message(LLM_URL, LLM_API, category, file_id)
    print("Chat response:", chat_response)  # Debug information
    return chat_response

# Gradio input components: the image is handed to the handler as a file path.
file_input = gr.Image(type="filepath", label="圖片上傳")

# Categories offered by the radio selector.
_CATEGORY_CHOICES = [
    "機票",
    "計程車乘車證明",
    "通行明細 (etag)",
    "QRCODE發票",
    "超商高鐵車票",
    "高鐵車票",
    "超商台鐵車票",
    "台鐵車票",
]
category = gr.Radio(choices=_CATEGORY_CHOICES, label="Message Category")

# Example rows: [image path, category].
examples = [
    ["DEMO/boarding-pass.png", "機票"],
    ["DEMO/taxi.jpg", "計程車乘車證明"],
    ["DEMO/etag.jpg", "通行明細 (etag)"],
    ["DEMO/qrcode.jpg", "QRCODE發票"],
    ["DEMO/mthsr.JPG", "超商高鐵車票"],
    ["DEMO/thsr.jpg", "高鐵車票"],
    ["DEMO/mtra.jpg", "超商台鐵車票"],
    ["DEMO/tra.JPG", "台鐵車票"],
]

# Static HTML fragments for the page header. Each one is passed to gr.HTML
# when the Blocks layout is built.

# Page heading (<h1>).
TITLE = """<h1>Multimodal Playground 💬 輸入各種單據並選擇種類，解析得到各種關鍵資訊 </h1>"""
# Sub-heading (<h2>) with two external links.
SUBTITLE = """<h2><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2>"""
# List of external links (GitHub, community pages, blog posts), one per line.
LINKS = """
<a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
<a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>那些 AI Agent 要踩的坑</a>：探討多種 AI 代理人工具的應用經驗與挑戰，分享實用經驗與工具推薦。<br>
<a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>：淺顯介紹生成式人工智慧核心概念，強調硬體資源和數據的重要性。<br>
<a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工？</a>：回顧 LLM 領域探索歷程，討論硬體升級對 AI 開發的重要性。<br>
<a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>那些檢索增強生成要踩的坑</a>：探討 RAG 技術應用與挑戰，提供實用經驗分享和工具建議。<br>
<a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大型語言模型要踩的坑</a>：探討多種 LLM 工具的應用與挑戰，強調硬體資源的重要性。<br>
<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>Large Language Model，LLM</a>：探討 LLM 的發展與應用，強調硬體資源在開發中的關鍵作用。。<br>
<a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a>：深入探討影像生成與分割技術的應用，強調硬體資源的重要性。<br>
<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a>：探討 ASR 和 TTS 技術應用中的問題，強調數據質量的重要性。<br>
<a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a>：分享 NLP 領域的實踐經驗，強調數據質量對模型效果的影響。<br>
<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>：分享語音處理領域的實務經驗，強調資料品質對模型效果的影響。<br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
<a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""
# Build the page: the three HTML header fragments in order, followed by the
# upload form wired to handle_input.
with gr.Blocks() as iface:
    for html_fragment in (TITLE, SUBTITLE, LINKS):
        gr.HTML(html_fragment)

    gr.Interface(
        fn=handle_input,
        inputs=[file_input, category],
        outputs="text",
        allow_flagging="never",
        examples=examples,
    )

# Start the Gradio server.
iface.launch()