geekyrakshit committed on
Commit fd0aa67 · 1 Parent(s): 7302c8f

add: FigureAnnotatorFromPageImage.extract_structured_output

medrag_multi_modal/assistant/figure_annotation.py CHANGED
@@ -4,20 +4,31 @@ from typing import Union
 import cv2
 import weave
 from PIL import Image
+from pydantic import BaseModel
 from rich.progress import track
 
 from ..utils import get_wandb_artifact, read_jsonl_file
 from .llm_client import LLMClient
 
 
+class FigureAnnotation(BaseModel):
+    figure_id: str
+    figure_description: str
+
+
+class FigureAnnotations(BaseModel):
+    annotations: list[FigureAnnotation]
+
+
 class FigureAnnotatorFromPageImage(weave.Model):
-    llm_client: LLMClient
+    figure_extraction_llm_client: LLMClient
+    structured_output_llm_client: LLMClient
 
     @weave.op()
     def annotate_figures(
         self, page_image: Image.Image
     ) -> dict[str, Union[Image.Image, str]]:
-        annotation = self.llm_client.predict(
+        annotation = self.figure_extraction_llm_client.predict(
             system_prompt="""
 You are an expert in the domain of scientific textbooks, especially medical texts.
 You are presented with a page from a scientific textbook from the domain of biology, specifically anatomy.
@@ -43,16 +54,27 @@ Here are some clues you need to follow:
         )
         return {"page_image": page_image, "annotations": annotation}
 
+    @weave.op
+    def extract_structured_output(self, annotations: str) -> FigureAnnotations:
+        return self.structured_output_llm_client.predict(
+            system_prompt="You are suppossed to extract a list of figure annotations consisting of figure IDs and corresponding figure descriptions.",
+            user_prompt=[annotations],
+            schema=FigureAnnotations,
+        )
+
     @weave.op()
     def predict(self, image_artifact_address: str):
         artifact_dir = get_wandb_artifact(image_artifact_address, "dataset")
         metadata = read_jsonl_file(os.path.join(artifact_dir, "metadata.jsonl"))
         annotations = []
         for item in track(metadata, description="Annotating images:"):
-            page_image = cv2.imread(
-                os.path.join(artifact_dir, f"page{item['page_idx']}.png")
-            )
+            page_image_file = os.path.join(artifact_dir, f"page{item['page_idx']}.png")
+            page_image = cv2.imread(page_image_file)
             page_image = cv2.cvtColor(page_image, cv2.COLOR_BGR2RGB)
             page_image = Image.fromarray(page_image)
-            annotations.append(self.annotate_figures(page_image=page_image))
+            figure_extracted_annotations = self.annotate_figures(page_image=page_image)
+            figure_extracted_annotations["annotations"] = self.extract_structured_output(
+                figure_extracted_annotations["annotations"]
+            ).model_dump()
+            annotations.append(figure_extracted_annotations)
         return annotations
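
The annotator now wires together two LLMClient instances: one extracts free-form figure descriptions from a page image, the other coerces that text into the FigureAnnotations schema via extract_structured_output, and predict() attaches the parsed result as a plain dict. A minimal usage sketch, assuming LLMClient is constructed from just a model name; the artifact address is a placeholder and the model names are illustrative picks from OPENAI_MODELS below:

from medrag_multi_modal.assistant.figure_annotation import FigureAnnotatorFromPageImage
from medrag_multi_modal.assistant.llm_client import LLMClient

annotator = FigureAnnotatorFromPageImage(
    figure_extraction_llm_client=LLMClient(model_name="gpt-4o"),
    structured_output_llm_client=LLMClient(model_name="gpt-4o-mini"),
)

# predict() reads metadata.jsonl from the W&B artifact, annotates each page
# image, and stores the structured annotations via model_dump().
page_annotations = annotator.predict(
    image_artifact_address="entity/project/pages:v0"  # placeholder artifact address
)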
medrag_multi_modal/assistant/llm_client.py CHANGED
@@ -12,6 +12,7 @@ from ..utils import base64_encode_image
 class ClientType(str, Enum):
     GEMINI = "gemini"
     MISTRAL = "mistral"
+    OPENAI = "openai"
 
 
 GOOGLE_MODELS = [
@@ -54,6 +55,8 @@ MISTRAL_MODELS = [
     "open-mixtral-8x22b",
 ]
 
+OPENAI_MODELS = ["gpt-4o", "gpt-4o-2024-08-06", "gpt-4o-mini", "gpt-4o-mini-2024-07-18"]
+
 
 class LLMClient(weave.Model):
     model_name: str
@@ -65,6 +68,8 @@ class LLMClient(weave.Model):
             client_type = ClientType.GEMINI
         elif model_name in MISTRAL_MODELS:
            client_type = ClientType.MISTRAL
+        elif model_name in OPENAI_MODELS:
+            client_type = ClientType.OPENAI
         else:
             raise ValueError(f"Invalid model name: {model_name}")
         super().__init__(model_name=model_name, client_type=client_type)
@@ -139,6 +144,51 @@ class LLMClient(weave.Model):
         )
         return response.choices[0].message.content
 
+    @weave.op()
+    def execute_openai_sdk(
+        self,
+        user_prompt: Union[str, list[str]],
+        system_prompt: Optional[Union[str, list[str]]] = None,
+        schema: Optional[Any] = None,
+    ) -> Union[str, Any]:
+        from openai import OpenAI
+
+        system_prompt = (
+            [system_prompt] if isinstance(system_prompt, str) else system_prompt
+        )
+        user_prompt = [user_prompt] if isinstance(user_prompt, str) else user_prompt
+
+        system_messages = [
+            {"role": "system", "content": prompt} for prompt in system_prompt
+        ]
+        user_messages = []
+        for prompt in user_prompt:
+            if isinstance(prompt, Image.Image):
+                user_messages.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": base64_encode_image(prompt, "image/png"),
+                        },
+                    },
+                )
+            else:
+                user_messages.append({"type": "text", "text": prompt})
+        messages = system_messages + [{"role": "user", "content": user_messages}]
+
+        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+        if schema is None:
+            completion = client.chat.completions.create(
+                model=self.model_name, messages=messages
+            )
+            return completion.choices[0].message.content
+
+        completion = weave.op()(client.beta.chat.completions.parse)(
+            model=self.model_name, messages=messages, response_format=schema
+        )
+        return completion.choices[0].message.parsed
+
     @weave.op()
     def predict(
         self,
@@ -150,5 +200,7 @@ class LLMClient(weave.Model):
             return self.execute_gemini_sdk(user_prompt, system_prompt, schema)
         elif self.client_type == ClientType.MISTRAL:
             return self.execute_mistral_sdk(user_prompt, system_prompt, schema)
+        elif self.client_type == ClientType.OPENAI:
+            return self.execute_openai_sdk(user_prompt, system_prompt, schema)
         else:
             raise ValueError(f"Invalid client type: {self.client_type}")
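
With the new OPENAI client type, any model name in OPENAI_MODELS routes predict() through execute_openai_sdk, which calls chat.completions.create for free-form text and beta.chat.completions.parse when a Pydantic schema is supplied. A rough usage sketch; the prompts are illustrative, OPENAI_API_KEY must be set in the environment, and a system prompt is passed in both calls since the new method builds system messages from it:

from medrag_multi_modal.assistant.figure_annotation import FigureAnnotations
from medrag_multi_modal.assistant.llm_client import LLMClient

client = LLMClient(model_name="gpt-4o-mini")  # resolves to ClientType.OPENAI

# Free-form completion: no schema, so the raw message content is returned as a string.
text = client.predict(
    user_prompt="FIG 1.2 shows the lobes of the human brain.",
    system_prompt="Describe the figures mentioned in the text.",
)

# Structured completion: the schema is forwarded as response_format and the
# parsed Pydantic object (here a FigureAnnotations instance) is returned.
structured = client.predict(
    user_prompt="FIG 1.2: lobes of the human brain; FIG 1.3: the brainstem.",
    system_prompt="Extract figure IDs and descriptions.",
    schema=FigureAnnotations,
)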
pyproject.toml CHANGED
@@ -43,6 +43,7 @@ dependencies = [
     "instructor>=1.6.3",
     "jsonlines>=4.0.0",
     "opencv-python>=4.10.0.84",
+    "openai>=1.52.2",
 ]
 
 [project.optional-dependencies]
@@ -71,6 +72,7 @@ core = [
     "instructor>=1.6.3",
     "jsonlines>=4.0.0",
     "opencv-python>=4.10.0.84",
+    "openai>=1.52.2",
 ]
 
 dev = ["pytest>=8.3.3", "isort>=5.13.2", "black>=24.10.0", "ruff>=0.6.9"]
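
The openai>=1.52.2 pin backs the new execute_openai_sdk path. A quick sanity check after installing, as a sketch that only assumes the package is importable in the active environment:

from importlib.metadata import version

# The parse() helper used for structured outputs needs a recent openai SDK,
# hence the ">=1.52.2" constraint added to both dependency groups above.
print(version("openai"))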