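"""Gradio demo: ask questions about an image in Chinese.

A BLIP captioning model is wrapped as a LangChain tool, a conversational agent
backed by Azure OpenAI decides when to call it, and two small LLM chains translate
the user's question from Chinese to English and the agent's answer back to Chinese.
"""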
import os

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

from langchain.agents import initialize_agent
from langchain.chains import LLMChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.tools import BaseTool

# Azure OpenAI credentials and deployment name come from environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
DEP_NAME = os.getenv("deployment_name")
llm = AzureChatOpenAI(deployment_name=DEP_NAME, openai_api_base=OPENAI_API_BASE,
                      openai_api_key=OPENAI_API_KEY,
                      openai_api_version="2023-03-15-preview", model_name="gpt-3.5-turbo")

# Load the BLIP captioning model and its processor, on GPU when available.
image_to_text_model = "Salesforce/blip-image-captioning-large"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)
processor = BlipProcessor.from_pretrained(image_to_text_model)

def describeImage(image_url):
    """Generate an English caption for the image at the given local file path."""
    # For a remote image, fetch it instead:
    # image_object = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    image_object = Image.open(image_url).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)


# LangChain tool wrapper so the agent can call the BLIP captioner on an image path.
class DescribeImageTool(BaseTool):
    name = "Describe Image Tool"
    description = 'use this tool to describe an image.'

    def _run(self, url: str):
        description = describeImage(url)
        return description

    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")

tools = [DescribeImageTool()]

def enToChinese(english):
    """Translate an English sentence into Chinese using the Azure OpenAI chat model."""
    prompt = PromptTemplate(
        input_variables=["english"],
        template="Please translate the following sentence from English to Chinese: {english}",
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(english)


def chToEnglish(chinese):
    """Translate a Chinese sentence into English using the Azure OpenAI chat model."""
    prompt = PromptTemplate(
        input_variables=["chinese"],
        template="Please translate the following sentence from Chinese to English: {chinese}",
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(chinese)


# Conversational ReAct agent with a 5-turn window memory; the image tool is its only tool.
agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True
    )
)

def image_to_txt(image_url, user_input):
    """Translate the Chinese question to English, run the agent on the image, and return a Chinese answer."""
    user_input = chToEnglish(user_input)
    return enToChinese(agent(f"{user_input}:\n{image_url}")['output'])

# Minimal Gradio UI: pick an image, ask a question in Chinese, get the answer in Chinese.
with gr.Blocks() as demo:
    image_url = gr.Image(type="filepath", label="请选择一张图片")  # "Please select an image"
    user_input = gr.Textbox(show_label=False, placeholder="请输入...", lines=1)  # "Please enter..."
    output = gr.Textbox(show_label=False, placeholder="", lines=1)
    submit_btn = gr.Button('提交', variant="primary")  # "Submit"
    submit_btn.click(image_to_txt, inputs=[image_url, user_input], outputs=output)

demo.launch()