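# app.py — Gradio demo: BLIP image captioning exposed as a LangChain tool and
# driven by a conversational agent on Azure OpenAI, with Chinese <-> English
# translation helpers for the UI.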
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.llms import OpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
import os
from langchain.tools import BaseTool
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import requests
from PIL import Image
import gradio as gr
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
# Azure OpenAI configuration is read from environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
DEP_NAME = os.getenv("deployment_name")

llm = AzureChatOpenAI(
    deployment_name=DEP_NAME,
    openai_api_base=OPENAI_API_BASE,
    openai_api_key=OPENAI_API_KEY,
    openai_api_version="2023-03-15-preview",
    model_name="gpt-3.5-turbo",
)
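# BLIP image-captioning model (Salesforce/blip-image-captioning-large);
# runs on the GPU when available, otherwise falls back to CPU.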
image_to_text_model = "Salesforce/blip-image-captioning-large"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)
processor = BlipProcessor.from_pretrained(image_to_text_model)
def describeImage(image_url):
    """Generate an English caption for the image at the given local file path."""
    # image_object = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    image_object = Image.open(image_url).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)
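# Illustrative usage (hypothetical local path):
#   describeImage("cat.jpg")  # -> e.g. "a cat sitting on a couch"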
class DescribeImageTool(BaseTool):
    name = "Describe Image Tool"
    description = 'use this tool to describe an image.'

    def _run(self, url: str):
        return describeImage(url)

    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")
tools = [DescribeImageTool()]
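# Translation helpers: the agent works in English, while the Gradio UI takes
# Chinese input and shows Chinese output, so prompts and answers are translated
# with simple LLMChain prompts.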
def enToChinese(english):
    """Translate an English sentence to Chinese via the LLM."""
    pp = "Please translate the following sentence from English to Chinese:{english}"
    prompt = PromptTemplate(input_variables=["english"], template=pp)
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(english)
def chToEnglish(chinese):
    """Translate a Chinese sentence to English via the LLM."""
    pp = "Please translate the following sentence from Chinese to English:{chinese}"
    prompt = PromptTemplate(input_variables=["chinese"], template=pp)
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(chinese)
# Conversational ReAct agent with a 5-turn windowed memory and access to the
# image-description tool.
agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True
    )
)
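# Illustrative direct call (hypothetical path); the agent returns a dict whose
# 'output' key holds the English answer:
#   agent("Describe this image:\n/tmp/example.jpg")["output"]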
def image_to_txt(image_url, user_input):
    # Translate the Chinese prompt to English, run the agent on the image path,
    # then translate the agent's English answer back to Chinese.
    user_input = chToEnglish(user_input)
    return enToChinese(agent(f"{user_input}:\n{image_url}")['output'])
with gr.Blocks() as demo:
    image_url = gr.Image(type="filepath", label="请选择一张图片")  # "Please select an image"
    user_input = gr.Textbox(show_label=False, placeholder="请输入...", lines=1)  # "Please enter..."
    output = gr.Textbox(show_label=False, placeholder="", lines=1)
    submit_btn = gr.Button('提交', variant="primary")  # "Submit"
    submit_btn.click(image_to_txt, inputs=[image_url, user_input], outputs=output)

demo.launch()