Spaces:

cambioml
/

parser-leaderboard

Runtime error

App Files Files Community

jojortz committed on Sep 9, 2024

Commit

26197e0

1 Parent(s): 9fe1137

add timeout to model run

Browse files

Files changed (2) hide show

extractors/model.py +120 -25
extractors/model_runner.py +21 -11

extractors/model.py CHANGED Viewed

@@ -20,15 +20,38 @@ class Model:
     BASE_URL: str | None = None
     API_KEY: str | None = None
     MODEL: str | None = None
     def __init_subclass__(cls) -> None:
         """Initialize subclass."""
         super().__init_subclass__()
     def __init__(self):
-        """Init self"""
-    def extract(self, file_path: str) -> str:
         """Extract model.
         Args:
@@ -39,11 +62,94 @@ class Model:
         """
         raise NotImplementedError("Model extract method is not implemented")
 class AnyParserModel(Model):
     BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
     API_KEY = os.getenv('ANYPARSER_RT_API_KEY')
-    def extract(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
@@ -107,7 +213,7 @@ class LlamaParseModel(Model):
         if not self.API_KEY:
             raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")
-    def extract(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
@@ -142,7 +248,7 @@ class UnstructuredModel(Model):
         """Init."""
         super().__init__()
-    def extract(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
@@ -155,8 +261,10 @@ class UnstructuredModel(Model):
             elements = partition(file_path)
-            parsed_text = "\n".join(str(element) for element in elements)
             markdown = parsed_text if parsed_text else "No content parsed"
             return markdown
         except Exception as e:
@@ -171,14 +279,9 @@ class GPTModel(Model):
     def __init__(self):
         """Init."""
         super().__init__()
-        if not self.API_KEY:
-            raise ValueError(
-                "The API key is required. Please set the OPENAI_API_KEY environment variable."
-            )
-        self._client = openai.OpenAI(api_key=self.API_KEY)
-    def extract(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
@@ -206,7 +309,7 @@ class GPTModel(Model):
                 {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": "Convert this image to markdown"},
                     *contents,
                 ],
                 }
@@ -226,21 +329,13 @@ class ClaudeModel(Model):
     BASE_URL = "http://103.114.163.134:3000/v1/"
     API_KEY = os.getenv("ANTHROPIC_API_KEY")
     MODEL = "claude-3-5-sonnet-20240620"
-    REQUIRES_OPENAI = True
     def __init__(self):
         """Init."""
         super().__init__()
-        if not self.API_KEY:
-            raise ValueError(
-                "The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
-            )
-        self._client = anthropic.Anthropic(
-            api_key=self.API_KEY,
-        )
-    def extract(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
@@ -251,7 +346,7 @@ class ClaudeModel(Model):
         """
         try:
-            prompt = "Convert this image to markdown."
             pdf_preprocessor = PdfPreprocessor()
             claude_postprocessor = ClaudePostprocessor()
             file_contents = pdf_preprocessor.run(file_path)
@@ -278,7 +373,7 @@ class ClaudeModel(Model):
             response = self._client.messages.create(
                 model="claude-3-5-sonnet-20240620", max_tokens=1024, messages=messages
             )
-            print(response.content[0].text)
             return claude_postprocessor.run(response.content[0].text)
         except Exception as e:
             return f"Error processing ClaudeModel: {str(e)}"

     BASE_URL: str | None = None
     API_KEY: str | None = None
     MODEL: str | None = None
+    REQUIRES_OPENAI: bool = False
+    REQUIRES_ANTHROPIC: bool = False
+    PROMPT: str = "Convert these images to markdown"
     def __init_subclass__(cls) -> None:
         """Initialize subclass."""
         super().__init_subclass__()
     def __init__(self):
         """Create the API client required by this model, if any.

         Subclasses opt in through the ``REQUIRES_OPENAI`` /
         ``REQUIRES_ANTHROPIC`` class flags. When neither flag is set,
         nothing is initialised and ``self._client`` is not created.
         ``REQUIRES_OPENAI`` takes precedence when both flags are set.

         Raises:
             ValueError: If a client is required but ``API_KEY`` or ``MODEL``
                 is empty.
         """
         wants_openai = self.REQUIRES_OPENAI
         if not wants_openai and not self.REQUIRES_ANTHROPIC:
             return

         # Both client flavours need the same two class-level settings.
         if not self.API_KEY:
             raise ValueError("Model api key is not provided")
         if not self.MODEL:
             raise ValueError("Model name is not provided")

         if not wants_openai:
             # BASE_URL is not forwarded to the Anthropic client.
             self._client = anthropic.Anthropic(api_key=self.API_KEY)
             return

         # Only pass base_url when one is configured, so the OpenAI client
         # otherwise falls back to its own default endpoint.
         client_options = {"api_key": self.API_KEY}
         if self.BASE_URL:
             client_options["base_url"] = self.BASE_URL
         self._client = openai.OpenAI(**client_options)
+    def run(self, file_path: str) -> str:
         """Extract model.
         Args:
         """
         raise NotImplementedError("Model extract method is not implemented")
class CambioVQA0713(Model):
    """Extractor backed by the ``cambiollm-dust-preview-0713`` model.

    The model is called through an OpenAI-style chat-completions client
    (``self._client``), which the base ``Model.__init__`` creates because
    ``REQUIRES_OPENAI`` is set.
    """

    BASE_URL = "http://44.242.239.38:8000/v1"
    # SECURITY(review): the fallback literal is a credential committed to
    # source control. It is kept only so existing deployments keep working;
    # prefer setting CAMBIO_API_KEY in the environment, then rotate the key
    # and drop the literal.
    API_KEY = os.getenv("CAMBIO_API_KEY") or "Cambioml2024!"
    MODEL = "cambiollm-dust-preview-0713"
    REQUIRES_OPENAI = True
    # Selects between the two sampling configurations used in ``run``.
    USE_BEAM_SEARCH = True
    PROMPT = (
        "Convert this image to markdown\n"
        "Output figures\n"
        "Output charts\n"
        "Output tables\n"
        "Output footnotes\n"
        "Output headers\n"
        "Output footers\n"
        "Output page nums"
    )

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Only the first item returned by ``PdfPreprocessor.run`` is sent to the
        model; any further pages are ignored.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an ``"Error processing with
            CambioVQA0713: ..."`` message if any step fails.
        """
        try:
            file_contents = PdfPreprocessor().run(file_path)
            if not file_contents:
                # Report a clear reason instead of an opaque IndexError below.
                raise ValueError("No pages were extracted from the file")

            # The item is embedded as-is in a JPEG data URL
            # (presumably base64-encoded image data - TODO confirm).
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": self.PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{file_contents[0]}",
                            },
                        },
                    ],
                }
            ]
            print('Cambio Model - ready to run: ', json.dumps(messages[0])[:200])

            if self.USE_BEAM_SEARCH:
                # NOTE(review): top_k/use_beam_search/best_of look like
                # server-specific sampling options passed via extra_body;
                # confirm against the serving backend.
                sampling = {
                    "top_p": 1,
                    "temperature": 0,
                    "extra_body": {
                        "top_k": -1,
                        "use_beam_search": True,
                        "best_of": 2,
                    },
                }
            else:
                sampling = {
                    "max_tokens": 1024,
                    "temperature": 0.3,
                    "top_p": 0.7,
                    "extra_body": {
                        "top_k": 20,
                    },
                }
            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
                **sampling,
            )

            markdown = response.choices[0].message.content
            print('Cambio Model - response: ', markdown)
            return markdown
        except Exception as e:
            # Best-effort boundary: callers expect a string, never an exception.
            print(f"Error processing input: {str(e)}")
            return f"Error processing with CambioVQA0713: {str(e)}"
 class AnyParserModel(Model):
     BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
     API_KEY = os.getenv('ANYPARSER_RT_API_KEY')
+    def run(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
         if not self.API_KEY:
             raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")
+    def run(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
         """Init."""
         super().__init__()
+    def run(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
             elements = partition(file_path)
+            # Combine the elements into a single string
+            parsed_text = "\n".join(element.text for element in elements if element.text)
+            # Handle case where no content is parsed
             markdown = parsed_text if parsed_text else "No content parsed"
             return markdown
         except Exception as e:
     def __init__(self):
         """Init."""
         super().__init__()
+    def run(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
                 {
                 "role": "user",
                 "content": [
+                    {"type": "text", "text": self.PROMPT},
                     *contents,
                 ],
                 }
     BASE_URL = "http://103.114.163.134:3000/v1/"
     API_KEY = os.getenv("ANTHROPIC_API_KEY")
     MODEL = "claude-3-5-sonnet-20240620"
+    REQUIRES_ANTHROPIC = True
     def __init__(self):
         """Init."""
         super().__init__()
+    def run(self, file_path: str) -> str:
         """Extract data in real-time.
         Args:
         """
         try:
+            prompt = self.PROMPT
             pdf_preprocessor = PdfPreprocessor()
             claude_postprocessor = ClaudePostprocessor()
             file_contents = pdf_preprocessor.run(file_path)
             response = self._client.messages.create(
                 model="claude-3-5-sonnet-20240620", max_tokens=1024, messages=messages
             )
+            print('-----------\n\n***Anthropic Response:\n\n ', response.content[0].text)
             return claude_postprocessor.run(response.content[0].text)
         except Exception as e:
             return f"Error processing ClaudeModel: {str(e)}"

extractors/model_runner.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import concurrent.futures
-from extractors.model import AnyParserModel, LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel
 ap_rt = AnyParserModel()
 lp = LlamaParseModel()
@@ -8,11 +10,11 @@ gpt = GPTModel()
 claude = ClaudeModel()
 model_function_map = {
-    "AnyParser": ap_rt.extract,
-    "LlamaParse": lp.extract,
-    "Unstructured": un.extract,
-    "GPT-4o-mini": gpt.extract,
-    "Claude-3.5-Sonnet": claude.extract,
 }
 models = [key for key in model_function_map]
@@ -23,14 +25,22 @@ def run_extract(model, file_path):
     markdown = extractor(file_path)
     return markdown
-def run_extract_parallel(model_a, model_b, pdf):
     with concurrent.futures.ThreadPoolExecutor() as executor:
         # Submit tasks to the executor for parallel execution
         future_a = executor.submit(run_extract, model_a, pdf)
         future_b = executor.submit(run_extract, model_b, pdf)
-        # Get the results as they complete
-        result_a = future_a.result()
-        result_b = future_b.result()
-    return result_a, result_b

 import concurrent.futures
+from extractors.model import LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel, AnyParserModel
+DEFAULT_TIMEOUT = 30
 ap_rt = AnyParserModel()
 lp = LlamaParseModel()
 claude = ClaudeModel()
 model_function_map = {
+    "AnyParser": ap_rt.run,
+    "LlamaParse": lp.run,
+    "Unstructured": un.run,
+    "GPT-4o-mini": gpt.run,
+    "Claude-3.5-Sonnet": claude.run,
 }
 models = [key for key in model_function_map]
     markdown = extractor(file_path)
     return markdown
def run_extract_parallel(model_a, model_b, pdf, timeout=DEFAULT_TIMEOUT):
    """Run two extractors on the same file concurrently, bounded by a timeout.

    Both extractions share a single deadline, so the call returns after at
    most ``timeout`` seconds in total rather than ``timeout`` per model.

    Args:
        model_a: Name of the first model, as accepted by ``run_extract``.
        model_b: Name of the second model, as accepted by ``run_extract``.
        pdf: Path of the file passed to both extractors.
        timeout: Maximum number of seconds to wait for both results;
            ``None`` waits indefinitely.

    Returns:
        tuple: ``(result_a, result_b)``. A slot holds
        ``"Error: Timeout after {timeout} seconds"`` when that extraction did
        not finish in time.

    Raises:
        Exception: Whatever ``run_extract`` raised for a finished extraction.
    """
    # Do not use the executor as a context manager: leaving a ``with`` block
    # calls ``shutdown(wait=True)``, which blocks until every worker finishes
    # and would make the timeout ineffective.
    executor = concurrent.futures.ThreadPoolExecutor()
    try:
        # Submit tasks to the executor for parallel execution
        future_a = executor.submit(run_extract, model_a, pdf)
        future_b = executor.submit(run_extract, model_b, pdf)

        finished, _pending = concurrent.futures.wait(
            (future_a, future_b), timeout=timeout
        )

        def _outcome(future):
            # Return the extraction result, or the timeout message if late.
            if future in finished:
                return future.result()
            return f"Error: Timeout after {timeout} seconds"

        result_a = _outcome(future_a)
        result_b = _outcome(future_b)
    finally:
        # Return immediately. A call that is already running cannot be
        # interrupted and keeps running in its worker thread until it returns.
        executor.shutdown(wait=False, cancel_futures=True)
    return result_a, result_b