Spaces:

Yiyuan
/

VSA

Runtime error

App Files Files Community

cnzzx committed on Oct 30, 2024

Commit

d2ca3e2

1 Parent(s): 271c21d

update

Browse files

Files changed (3) hide show

models/search_agent/mindsearch_agent.py +12 -6
models/search_agent/utils.py +173 -0
models/vsa_model.py +60 -21

models/search_agent/mindsearch_agent.py CHANGED Viewed

@@ -194,7 +194,7 @@ class MindSearchAgent(BaseAgent):
         WebSearchGraph.searcher_cfg = searcher_cfg
         super().__init__(llm=llm, action_executor=None, protocol=protocol)
-    def chat(self, message, **kwargs):
         if isinstance(message, str):
             message = [{'role': 'user', 'content': message}]
         elif isinstance(message, dict):
@@ -211,26 +211,32 @@ class MindSearchAgent(BaseAgent):
         agent_return.inner_steps = deepcopy(inner_history)
         for _ in range(self.max_turn):
             prompt = self._protocol.format(inner_step=inner_history)
             code = None
-            response = self.llm.chat(prompt, session_id=random.randint(0, 999999), **kwargs)
-            model_state = ModelStatusCode.END
             response = response.replace('<|plugin|>', '<|interpreter|>')
             _, language, action = self._protocol.parse(response)
             if not language and not action:
                 continue
             code = action['parameters']['command'] if action else ''
-            agent_return.state = self._determine_agent_state(model_state, code, agent_return)
             agent_return.response = language if not code else code
             inner_history.append({'role': 'language', 'content': language})
             print(colored(response, 'blue'))
             if code:
-                agent_return = self._process_code_simple(agent_return, inner_history,
-                                              code, as_dict, return_early)
             else:
                 agent_return.state = AgentStatusCode.END
                 return agent_return
         agent_return.state = AgentStatusCode.END
         return agent_return
     def stream_chat(self, message, **kwargs):
         if isinstance(message, str):

         WebSearchGraph.searcher_cfg = searcher_cfg
         super().__init__(llm=llm, action_executor=None, protocol=protocol)
+    def generate(self, message, **kwargs):
         if isinstance(message, str):
             message = [{'role': 'user', 'content': message}]
         elif isinstance(message, dict):
         agent_return.inner_steps = deepcopy(inner_history)
         for _ in range(self.max_turn):
             prompt = self._protocol.format(inner_step=inner_history)
+            prompt = [
+                ''.join([p['role'] + ': ' + p['content'] for p in prompt])
+            ]
             code = None
+            response = self.llm.generate(
+                prompt,
+                **kwargs,
+            )[0]
             response = response.replace('<|plugin|>', '<|interpreter|>')
             _, language, action = self._protocol.parse(response)
             if not language and not action:
                 continue
             code = action['parameters']['command'] if action else ''
             agent_return.response = language if not code else code
             inner_history.append({'role': 'language', 'content': language})
             print(colored(response, 'blue'))
             if code:
+                self._process_code(agent_return, inner_history, code, as_dict, return_early)
             else:
                 agent_return.state = AgentStatusCode.END
                 return agent_return
         agent_return.state = AgentStatusCode.END
         return agent_return
     def stream_chat(self, message, **kwargs):
         if isinstance(message, str):

models/search_agent/utils.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import copy
+import logging
+from typing import List, Optional, Union
+from lagent.llms.base_llm import BaseModel
+from lagent.schema import ModelStatusCode
+from lagent.utils.util import filter_suffix
+class LMDeployServer(BaseModel):
+    """
+    Args:
+        path (str): The path to the model.
+            It could be one of the following options:
+                - i) A local directory path of a turbomind model which is
+                    converted by `lmdeploy convert` command or download from
+                    ii) and iii).
+                - ii) The model_id of a lmdeploy-quantized model hosted
+                    inside a model repo on huggingface.co, such as
+                    "InternLM/internlm-chat-20b-4bit",
+                    "lmdeploy/llama2-chat-70b-4bit", etc.
+                - iii) The model_id of a model hosted inside a model repo
+                    on huggingface.co, such as "internlm/internlm-chat-7b",
+                    "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
+                    and so on.
+        model_name (str): needed when model_path is a pytorch model on
+            huggingface.co, such as "internlm-chat-7b",
+            "Qwen-7B-Chat ", "Baichuan2-7B-Chat" and so on.
+        server_name (str): host ip for serving
+        server_port (int): server port
+        tp (int): tensor parallel
+        log_level (str): set log level whose value among
+            [CRITICAL, ERROR, WARNING, INFO, DEBUG]
+    """
+    def __init__(self,
+                 path: str,
+                 model_name: Optional[str] = None,
+                 server_name: str = '0.0.0.0',
+                 server_port: int = 23333,
+                 tp: int = 1,
+                 log_level: str = 'WARNING',
+                 serve_cfg=dict(),
+                 **kwargs):
+        super().__init__(path=path, **kwargs)
+        self.model_name = model_name
+        # TODO get_logger issue in multi processing
+        import lmdeploy
+        self.client = lmdeploy.serve(
+            model_path=self.path,
+            model_name=model_name,
+            server_name=server_name,
+            server_port=server_port,
+            tp=tp,
+            log_level=log_level,
+            **serve_cfg)
+    def generate(self,
+                 inputs: Union[str, List[str]],
+                 session_id: int = 2967,
+                 sequence_start: bool = True,
+                 sequence_end: bool = True,
+                 ignore_eos: bool = False,
+                 skip_special_tokens: Optional[bool] = False,
+                 timeout: int = 30,
+                 **kwargs) -> List[str]:
+        """Start a new round conversation of a session. Return the chat
+        completions in non-stream mode.
+        Args:
+            inputs (str, List[str]): user's prompt(s) in this round
+            session_id (int): the identical id of a session
+            sequence_start (bool): start flag of a session
+            sequence_end (bool): end flag of a session
+            ignore_eos (bool): indicator for ignoring eos
+            skip_special_tokens (bool): Whether or not to remove special tokens
+                in the decoding. Default to be False.
+            timeout (int): max time to wait for response
+        Returns:
+            (a list of/batched) text/chat completion
+        """
+        batched = True
+        if isinstance(inputs, str):
+            inputs = [inputs]
+            batched = False
+        gen_params = self.update_gen_params(**kwargs)
+        max_new_tokens = gen_params.pop('max_new_tokens')
+        gen_params.update(max_tokens=max_new_tokens)
+        resp = [''] * len(inputs)
+        for text in self.client.completions_v1(
+                self.model_name,
+                inputs,
+                session_id=session_id,
+                sequence_start=sequence_start,
+                sequence_end=sequence_end,
+                stream=False,
+                ignore_eos=ignore_eos,
+                skip_special_tokens=skip_special_tokens,
+                timeout=timeout,
+                **gen_params):
+            resp = [
+                resp[i] + item['text']
+                for i, item in enumerate(text['choices'])
+            ]
+        # remove stop_words
+        resp = filter_suffix(resp, self.gen_params.get('stop_words'))
+        if not batched:
+            return resp[0]
+        return resp
+    def stream_chat(self,
+                    inputs: List[dict],
+                    session_id=0,
+                    sequence_start: bool = True,
+                    sequence_end: bool = True,
+                    stream: bool = True,
+                    ignore_eos: bool = False,
+                    skip_special_tokens: Optional[bool] = False,
+                    timeout: int = 30,
+                    **kwargs):
+        """Start a new round conversation of a session. Return the chat
+        completions in stream mode.
+        Args:
+            session_id (int): the identical id of a session
+            inputs (List[dict]): user's inputs in this round conversation
+            sequence_start (bool): start flag of a session
+            sequence_end (bool): end flag of a session
+            stream (bool): return in a streaming format if enabled
+            ignore_eos (bool): indicator for ignoring eos
+            skip_special_tokens (bool): Whether or not to remove special tokens
+                in the decoding. Default to be False.
+            timeout (int): max time to wait for response
+        Returns:
+            tuple(Status, str, int): status, text/chat completion,
+            generated token number
+        """
+        gen_params = self.update_gen_params(**kwargs)
+        max_new_tokens = gen_params.pop('max_new_tokens')
+        gen_params.update(max_tokens=max_new_tokens)
+        prompt = self.template_parser(inputs)
+        resp = ''
+        finished = False
+        stop_words = self.gen_params.get('stop_words')
+        for text in self.client.completions_v1(
+                self.model_name,
+                prompt,
+                session_id=session_id,
+                sequence_start=sequence_start,
+                sequence_end=sequence_end,
+                stream=stream,
+                ignore_eos=ignore_eos,
+                skip_special_tokens=skip_special_tokens,
+                timeout=timeout,
+                **gen_params):
+            resp += text['choices'][0]['text']
+            if not resp:
+                continue
+            # remove stop_words
+            for sw in stop_words:
+                if sw in resp:
+                    resp = filter_suffix(resp, stop_words)
+                    finished = True
+                    break
+            yield ModelStatusCode.STREAM_ING, resp, None
+            if finished:
+                break
+        yield ModelStatusCode.END, resp, None

models/vsa_model.py CHANGED Viewed

@@ -6,7 +6,6 @@
 # https://github.com/IDEA-Research/GroundingDINO
 # https://github.com/InternLM/MindSearch
 # --------------------------------------------------------
-import spaces
 import os
 import copy
@@ -25,7 +24,7 @@ from llava.mm_utils import process_images, tokenizer_image_token, get_model_name
 from datetime import datetime
 from lagent.actions import ActionExecutor, BingBrowser
-from lagent.llms import INTERNLM2_META, LMDeployServer
 from lagent.schema import AgentReturn, AgentStatusCode
 from lagent.schema import AgentStatusCode
 from .search_agent.mindsearch_agent import (
@@ -37,6 +36,7 @@ from .search_agent.mindsearch_prompt import (
     searcher_input_template_cn, searcher_input_template_en,
     searcher_system_prompt_cn, searcher_system_prompt_en
 )
 from typing import List, Union
@@ -210,7 +210,24 @@ class WebSearcher:
             raise Exception('Unsupported model for web searcher.')
         self.lang = lang
-        llm = LMDeployServer(
             path = model_path,
             model_name = model_name,
             meta_template = INTERNLM2_META,
@@ -219,7 +236,7 @@ class WebSearcher:
             temperature = temperature,
             max_new_tokens = max_new_tokens,
             repetition_penalty = repetition_penalty,
-            stop_words = ['<|im_end|>']
         )
         self.agent = MindSearchAgent(
             llm = llm,
@@ -259,6 +276,14 @@ class WebSearcher:
             with open('temp/search_result_{}.txt'.format(qid), 'w', encoding='utf-8') as wf:
                 wf.write(result)
             results.append(result)
         return results
@@ -296,28 +321,17 @@ class VisionSearchAssistant:
         self.vlm_load_4bit = vlm_load_4bit
         self.vlm_load_8bit = vlm_load_8bit
         self.use_correlate = True
-    @spaces.GPU
     def app_run(
         self,
         image: Union[str, Image.Image, np.ndarray],
         text: str,
         ground_classes: List[str] = COCO_CLASSES
-    ):
-        self.searcher = WebSearcher(
-            model_path = self.search_model
-        )
-        self.grounder = VisualGrounder(
-            model_path = self.ground_model,
-            device = self.ground_device,
-        )
-        self.vlm = VLM(
-            model_path = self.vlm_model,
-            device = self.vlm_device,
-            load_4bit = self.vlm_load_4bit,
-            load_8bit = self.vlm_load_8bit
-        )
         # Create and clear the temporary directory.
         if not os.access('temp', os.F_OK):
             os.makedirs('temp')
@@ -338,6 +352,10 @@ class VisionSearchAssistant:
             raise Exception('Unsupported input image format.')
         # Visual Grounding
         bboxes, labels, out_image = self.grounder(in_image, classes = ground_classes)
         yield out_image, 'ground'
@@ -352,7 +370,16 @@ class VisionSearchAssistant:
             det_images.append(in_image)
             labels.append('image')
         # Visual Captioning
         captions = []
         for det_image, label in zip(det_images, labels):
             inp = get_caption_prompt(label, text)
@@ -386,11 +413,20 @@ class VisionSearchAssistant:
         queries = [text + " " + query for query in queries]
         # Web Searching
         contexts = self.searcher(queries)
         yield contexts, 'search'
         # QA
         TOKEN_LIMIT = 3500
         max_length_per_context = TOKEN_LIMIT // len(contexts)
         for cid, context in enumerate(contexts):
@@ -403,4 +439,7 @@ class VisionSearchAssistant:
             wf.write(answer)
         print(answer)
-        yield answer, 'answer'

 # https://github.com/IDEA-Research/GroundingDINO
 # https://github.com/InternLM/MindSearch
 # --------------------------------------------------------
 import os
 import copy
 from datetime import datetime
 from lagent.actions import ActionExecutor, BingBrowser
+from lagent.llms import INTERNLM2_META, LMDeployServer, LMDeployPipeline
 from lagent.schema import AgentReturn, AgentStatusCode
 from lagent.schema import AgentStatusCode
 from .search_agent.mindsearch_agent import (
     searcher_input_template_cn, searcher_input_template_en,
     searcher_system_prompt_cn, searcher_system_prompt_en
 )
+from lmdeploy.messages import PytorchEngineConfig
 from typing import List, Union
             raise Exception('Unsupported model for web searcher.')
         self.lang = lang
+        backend_config = PytorchEngineConfig(
+            max_batch_size = 1,
+        )
+        # llm = LMDeployServer(
+        #     path = model_path,
+        #     model_name = model_name,
+        #     meta_template = INTERNLM2_META,
+        #     top_p = top_p,
+        #     top_k = top_k,
+        #     temperature = temperature,
+        #     max_new_tokens = max_new_tokens,
+        #     repetition_penalty = repetition_penalty,
+        #     stop_words = ['<|im_end|>'],
+        #     serve_cfg = dict(
+        #         backend_config = backend_config
+        #     )
+        # )
+        llm = LMDeployPipeline(
             path = model_path,
             model_name = model_name,
             meta_template = INTERNLM2_META,
             temperature = temperature,
             max_new_tokens = max_new_tokens,
             repetition_penalty = repetition_penalty,
+            stop_words = ['<|im_end|>'],
         )
         self.agent = MindSearchAgent(
             llm = llm,
             with open('temp/search_result_{}.txt'.format(qid), 'w', encoding='utf-8') as wf:
                 wf.write(result)
             results.append(result)
+        # for qid, query in enumerate(queries):
+        #     result = None
+        #     agent_return = self.agent.generate(query)
+        #     result = agent_return.response
+        #     assert result is not None
+        #     with open('temp/search_result_{}.txt'.format(qid), 'w', encoding='utf-8') as wf:
+        #         wf.write(result)
+        #     results.append(result)
         return results
         self.vlm_load_4bit = vlm_load_4bit
         self.vlm_load_8bit = vlm_load_8bit
         self.use_correlate = True
+        self.searcher = WebSearcher(
+            model_path = self.search_model
+        )
     def app_run(
         self,
         image: Union[str, Image.Image, np.ndarray],
         text: str,
         ground_classes: List[str] = COCO_CLASSES
+    ):
         # Create and clear the temporary directory.
         if not os.access('temp', os.F_OK):
             os.makedirs('temp')
             raise Exception('Unsupported input image format.')
         # Visual Grounding
+        self.grounder = VisualGrounder(
+            model_path = self.ground_model,
+            device = self.ground_device,
+        )
         bboxes, labels, out_image = self.grounder(in_image, classes = ground_classes)
         yield out_image, 'ground'
             det_images.append(in_image)
             labels.append('image')
+        del self.grounder
+        torch.cuda.empty_cache()
         # Visual Captioning
+        self.vlm = VLM(
+            model_path = self.vlm_model,
+            device = self.vlm_device,
+            load_4bit = self.vlm_load_4bit,
+            load_8bit = self.vlm_load_8bit
+        )
         captions = []
         for det_image, label in zip(det_images, labels):
             inp = get_caption_prompt(label, text)
         queries = [text + " " + query for query in queries]
+        del self.vlm
+        torch.cuda.empty_cache()
         # Web Searching
         contexts = self.searcher(queries)
         yield contexts, 'search'
         # QA
+        self.vlm = VLM(
+            model_path = self.vlm_model,
+            device = self.vlm_device,
+            load_4bit = self.vlm_load_4bit,
+            load_8bit = self.vlm_load_8bit
+        )
         TOKEN_LIMIT = 3500
         max_length_per_context = TOKEN_LIMIT // len(contexts)
         for cid, context in enumerate(contexts):
             wf.write(answer)
         print(answer)
+        yield answer, 'answer'
+        del self.vlm
+        torch.cuda.empty_cache()