from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.llms import OpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
import os
from langchain.tools import BaseTool
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import requests
from PIL import Image
import gradio as gr
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

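# Azure OpenAI settings are read from environment variables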
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
DEP_NAME = os.getenv("deployment_name")

llm = AzureChatOpenAI(
    deployment_name=DEP_NAME,
    openai_api_base=OPENAI_API_BASE,
    openai_api_key=OPENAI_API_KEY,
    openai_api_version="2023-03-15-preview",
    model_name="gpt-3.5-turbo",
)

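# Load the BLIP image-captioning model and its processor, on GPU when available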
image_to_text_model = "Salesforce/blip-image-captioning-large"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)
processor = BlipProcessor.from_pretrained(image_to_text_model)

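# Caption the image at the given local file path with BLIP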
def describeImage(image_url):
    image_object = Image.open(image_url).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)

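# Expose the captioning function to the agent as a LangChain tool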
class DescribeImageTool(BaseTool):
    name = "Describe Image Tool"
    description = 'use this tool to describe an image.'

    def _run(self, url: str):
        description = describeImage(url)
        return description

    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")


tools = [DescribeImageTool()]

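# LLM-based translation helper: English to Chinese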
def enToChinese(english):
    pp = "Please translate the following sentence from English to Chinese:{english}"
    prompt = PromptTemplate(
        input_variables=["english"],
        template=pp
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(english)

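# LLM-based translation helper: Chinese to English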
def chToEnglish(chinese):
    pp = "Please translate the following sentence from Chinese to English:{chinese}"
    prompt = PromptTemplate(
        input_variables=["chinese"],
        template=pp
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(chinese)

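# Conversational ReAct agent with the image tool and a window memory of the last 5 exchanges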
agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True
    )
)

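# Translate the Chinese question to English, ask the agent about the image, then translate the answer back to Chinese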
def image_to_txt(image_url, user_input):
    user_input = chToEnglish(user_input)
    return enToChinese(agent(f"{user_input}:\n{image_url}")['output'])

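# Gradio UI: image picker, question box, answer box, and a submit button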
with gr.Blocks() as demo:
    image_url = gr.Image(type="filepath", label="请选择一张图片")  # "Please select an image"
    user_input = gr.Textbox(show_label=False, placeholder="请输入...", lines=1)  # "Please enter..."
    output = gr.Textbox(show_label=False, placeholder="", lines=1)
    submit_btn = gr.Button('提交', variant="primary")  # "Submit"
    submit_btn.click(image_to_txt, inputs=[image_url, user_input], outputs=output)

demo.launch()