jojortz committed on
Commit
26197e0
1 Parent(s): 9fe1137

add timeout to model run

Browse files
Files changed (2) hide show
  1. extractors/model.py +120 -25
  2. extractors/model_runner.py +21 -11
extractors/model.py CHANGED
@@ -20,15 +20,38 @@ class Model:
20
  BASE_URL: str | None = None
21
  API_KEY: str | None = None
22
  MODEL: str | None = None
 
 
 
 
23
 
24
  def __init_subclass__(cls) -> None:
25
  """Initialize subclass."""
26
  super().__init_subclass__()
27
 
28
  def __init__(self):
29
- """Init self"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- def extract(self, file_path: str) -> str:
32
  """Extract model.
33
 
34
  Args:
@@ -39,11 +62,94 @@ class Model:
39
  """
40
  raise NotImplementedError("Model extract method is not implemented")
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  class AnyParserModel(Model):
43
  BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
44
  API_KEY = os.getenv('ANYPARSER_RT_API_KEY')
45
 
46
- def extract(self, file_path: str) -> str:
47
  """Extract data in real-time.
48
 
49
  Args:
@@ -107,7 +213,7 @@ class LlamaParseModel(Model):
107
  if not self.API_KEY:
108
  raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")
109
 
110
- def extract(self, file_path: str) -> str:
111
  """Extract data in real-time.
112
 
113
  Args:
@@ -142,7 +248,7 @@ class UnstructuredModel(Model):
142
  """Init."""
143
  super().__init__()
144
 
145
- def extract(self, file_path: str) -> str:
146
  """Extract data in real-time.
147
 
148
  Args:
@@ -155,8 +261,10 @@ class UnstructuredModel(Model):
155
 
156
  elements = partition(file_path)
157
 
158
- parsed_text = "\n".join(str(element) for element in elements)
 
159
 
 
160
  markdown = parsed_text if parsed_text else "No content parsed"
161
  return markdown
162
  except Exception as e:
@@ -171,14 +279,9 @@ class GPTModel(Model):
171
  def __init__(self):
172
  """Init."""
173
  super().__init__()
174
- if not self.API_KEY:
175
- raise ValueError(
176
- "The API key is required. Please set the OPENAI_API_KEY environment variable."
177
- )
178
- self._client = openai.OpenAI(api_key=self.API_KEY)
179
 
180
 
181
- def extract(self, file_path: str) -> str:
182
  """Extract data in real-time.
183
 
184
  Args:
@@ -206,7 +309,7 @@ class GPTModel(Model):
206
  {
207
  "role": "user",
208
  "content": [
209
- {"type": "text", "text": "Convert this image to markdown"},
210
  *contents,
211
  ],
212
  }
@@ -226,21 +329,13 @@ class ClaudeModel(Model):
226
  BASE_URL = "http://103.114.163.134:3000/v1/"
227
  API_KEY = os.getenv("ANTHROPIC_API_KEY")
228
  MODEL = "claude-3-5-sonnet-20240620"
229
- REQUIRES_OPENAI = True
230
 
231
  def __init__(self):
232
  """Init."""
233
  super().__init__()
234
- if not self.API_KEY:
235
- raise ValueError(
236
- "The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
237
- )
238
- self._client = anthropic.Anthropic(
239
- api_key=self.API_KEY,
240
- )
241
-
242
 
243
- def extract(self, file_path: str) -> str:
244
  """Extract data in real-time.
245
 
246
  Args:
@@ -251,7 +346,7 @@ class ClaudeModel(Model):
251
  """
252
 
253
  try:
254
- prompt = "Convert this image to markdown."
255
  pdf_preprocessor = PdfPreprocessor()
256
  claude_postprocessor = ClaudePostprocessor()
257
  file_contents = pdf_preprocessor.run(file_path)
@@ -278,7 +373,7 @@ class ClaudeModel(Model):
278
  response = self._client.messages.create(
279
  model="claude-3-5-sonnet-20240620", max_tokens=1024, messages=messages
280
  )
281
- print(response.content[0].text)
282
  return claude_postprocessor.run(response.content[0].text)
283
  except Exception as e:
284
  return f"Error processing ClaudeModel: {str(e)}"
 
20
  BASE_URL: str | None = None
21
  API_KEY: str | None = None
22
  MODEL: str | None = None
23
+ REQUIRES_OPENAI: bool = False
24
+ REQUIRES_ANTHROPIC: bool = False
25
+ PROMPT: str = "Convert these images to markdown"
26
+
27
 
28
  def __init_subclass__(cls) -> None:
29
  """Initialize subclass."""
30
  super().__init_subclass__()
31
 
32
  def __init__(self):
33
+ if self.REQUIRES_OPENAI:
34
+ if not self.API_KEY:
35
+ raise ValueError("Model api key is not provided")
36
+ if not self.MODEL:
37
+ raise ValueError("Model name is not provided")
38
+ if self.BASE_URL:
39
+ self._client = openai.OpenAI(
40
+ base_url=self.BASE_URL,
41
+ api_key=self.API_KEY,
42
+ )
43
+ else:
44
+ self._client = openai.OpenAI(api_key=self.API_KEY)
45
+ elif self.REQUIRES_ANTHROPIC:
46
+ if not self.API_KEY:
47
+ raise ValueError("Model api key is not provided")
48
+ if not self.MODEL:
49
+ raise ValueError("Model name is not provided")
50
+ self._client = anthropic.Anthropic(
51
+ api_key=self.API_KEY,
52
+ )
53
 
54
+ def run(self, file_path: str) -> str:
55
  """Extract model.
56
 
57
  Args:
 
62
  """
63
  raise NotImplementedError("Model extract method is not implemented")
64
 
65
class CambioVQA0713(Model):
    """Extractor that sends a page image to the ``cambiollm-dust-preview-0713`` model.

    ``REQUIRES_OPENAI`` is set, so requests go through ``self._client`` using
    the OpenAI chat-completions call shape.
    """

    # NOTE(review): the endpoint is plain HTTP and the credential below is
    # committed to source. Consider loading the key from the environment, as
    # the other models in this file do, and rotating this value.
    BASE_URL = "http://44.242.239.38:8000/v1"
    API_KEY = "Cambioml2024!"
    MODEL = "cambiollm-dust-preview-0713"
    REQUIRES_OPENAI = True
    USE_BEAM_SEARCH = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Only the first item returned by the PDF preprocessor is sent to the
        model.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an ``"Error processing with
            CambioVQA0713: ..."`` message if anything fails (errors are
            reported in the return value, not raised).
        """
        try:
            file_contents = PdfPreprocessor().run(file_path)
            # presumably a list of base64-encoded JPEG strings - TODO confirm
            if not file_contents:
                # Fail with a readable message instead of an IndexError below.
                raise ValueError("no pages could be extracted from the input file")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Convert this image to markdown\nOutput figures\nOutput charts\nOutput tables\nOutput footnotes\nOutput headers\nOutput footers\nOutput page nums",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{file_contents[0]}",
                            },
                        },
                    ],
                }
            ]
            print('Cambio Model - ready to run: ', json.dumps(messages[0])[:200])

            # The two modes differ only in their sampling options.
            if self.USE_BEAM_SEARCH:
                sampling = {
                    "top_p": 1,
                    "temperature": 0,
                    "extra_body": {
                        "top_k": -1,
                        "use_beam_search": True,
                        "best_of": 2,
                    },
                }
            else:
                sampling = {
                    "max_tokens": 1024,
                    "temperature": 0.3,
                    "top_p": 0.7,
                    "extra_body": {
                        "top_k": 20,
                    },
                }

            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
                **sampling,
            )
            answer = response.choices[0].message.content
            print('Cambio Model - response: ', answer)

            return answer
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with CambioVQA0713: {str(e)}"
146
+
147
+
148
  class AnyParserModel(Model):
149
  BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
150
  API_KEY = os.getenv('ANYPARSER_RT_API_KEY')
151
 
152
+ def run(self, file_path: str) -> str:
153
  """Extract data in real-time.
154
 
155
  Args:
 
213
  if not self.API_KEY:
214
  raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")
215
 
216
+ def run(self, file_path: str) -> str:
217
  """Extract data in real-time.
218
 
219
  Args:
 
248
  """Init."""
249
  super().__init__()
250
 
251
+ def run(self, file_path: str) -> str:
252
  """Extract data in real-time.
253
 
254
  Args:
 
261
 
262
  elements = partition(file_path)
263
 
264
+ # Combine the elements into a single string
265
+ parsed_text = "\n".join(element.text for element in elements if element.text)
266
 
267
+ # Handle case where no content is parsed
268
  markdown = parsed_text if parsed_text else "No content parsed"
269
  return markdown
270
  except Exception as e:
 
279
  def __init__(self):
280
  """Init."""
281
  super().__init__()
 
 
 
 
 
282
 
283
 
284
+ def run(self, file_path: str) -> str:
285
  """Extract data in real-time.
286
 
287
  Args:
 
309
  {
310
  "role": "user",
311
  "content": [
312
+ {"type": "text", "text": self.PROMPT},
313
  *contents,
314
  ],
315
  }
 
329
  BASE_URL = "http://103.114.163.134:3000/v1/"
330
  API_KEY = os.getenv("ANTHROPIC_API_KEY")
331
  MODEL = "claude-3-5-sonnet-20240620"
332
+ REQUIRES_ANTHROPIC = True
333
 
334
  def __init__(self):
335
  """Init."""
336
  super().__init__()
 
 
 
 
 
 
 
 
337
 
338
+ def run(self, file_path: str) -> str:
339
  """Extract data in real-time.
340
 
341
  Args:
 
346
  """
347
 
348
  try:
349
+ prompt = self.PROMPT
350
  pdf_preprocessor = PdfPreprocessor()
351
  claude_postprocessor = ClaudePostprocessor()
352
  file_contents = pdf_preprocessor.run(file_path)
 
373
  response = self._client.messages.create(
374
  model="claude-3-5-sonnet-20240620", max_tokens=1024, messages=messages
375
  )
376
+ print('-----------\n\n***Anthropic Response:\n\n ', response.content[0].text)
377
  return claude_postprocessor.run(response.content[0].text)
378
  except Exception as e:
379
  return f"Error processing ClaudeModel: {str(e)}"
extractors/model_runner.py CHANGED
@@ -1,5 +1,7 @@
1
  import concurrent.futures
2
- from extractors.model import AnyParserModel, LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel
 
 
3
 
4
  ap_rt = AnyParserModel()
5
  lp = LlamaParseModel()
@@ -8,11 +10,11 @@ gpt = GPTModel()
8
  claude = ClaudeModel()
9
 
10
  model_function_map = {
11
- "AnyParser": ap_rt.extract,
12
- "LlamaParse": lp.extract,
13
- "Unstructured": un.extract,
14
- "GPT-4o-mini": gpt.extract,
15
- "Claude-3.5-Sonnet": claude.extract,
16
  }
17
 
18
  models = [key for key in model_function_map]
@@ -23,14 +25,22 @@ def run_extract(model, file_path):
23
  markdown = extractor(file_path)
24
  return markdown
25
 
26
- def run_extract_parallel(model_a, model_b, pdf):
 
27
  with concurrent.futures.ThreadPoolExecutor() as executor:
28
  # Submit tasks to the executor for parallel execution
29
  future_a = executor.submit(run_extract, model_a, pdf)
30
  future_b = executor.submit(run_extract, model_b, pdf)
31
 
32
- # Get the results as they complete
33
- result_a = future_a.result()
34
- result_b = future_b.result()
 
 
 
 
 
 
 
35
 
36
- return result_a, result_b
 
1
  import concurrent.futures
2
+ from extractors.model import LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel, AnyParserModel
3
+
4
+ DEFAULT_TIMEOUT = 30
5
 
6
  ap_rt = AnyParserModel()
7
  lp = LlamaParseModel()
 
10
  claude = ClaudeModel()
11
 
12
  model_function_map = {
13
+ "AnyParser": ap_rt.run,
14
+ "LlamaParse": lp.run,
15
+ "Unstructured": un.run,
16
+ "GPT-4o-mini": gpt.run,
17
+ "Claude-3.5-Sonnet": claude.run,
18
  }
19
 
20
  models = [key for key in model_function_map]
 
25
  markdown = extractor(file_path)
26
  return markdown
27
 
28
+
29
def run_extract_parallel(model_a, model_b, pdf, timeout=DEFAULT_TIMEOUT):
    """Run two extractors on the same file concurrently, bounded by a timeout.

    Both extractions are submitted at once and share a single deadline, so
    the call returns roughly ``timeout`` seconds after submission at the
    latest. A model that has not finished by then yields an
    ``"Error: Timeout after ... seconds"`` string in place of its result.

    Args:
        model_a: Name of the first model, as accepted by ``run_extract``.
        model_b: Name of the second model, as accepted by ``run_extract``.
        pdf: Path of the file to extract.
        timeout: Seconds to wait for the results; ``None`` waits indefinitely.

    Returns:
        tuple: ``(result_a, result_b)``.

    Note:
        A worker thread that is already running cannot be interrupted; after
        a timeout it keeps running in the background until its call returns.
    """
    import time

    # Deliberately not used as a context manager: leaving a ``with`` block
    # calls shutdown(wait=True), which blocks until both workers finish and
    # makes the timeout ineffective.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
    try:
        # Submit tasks to the executor for parallel execution
        future_a = executor.submit(run_extract, model_a, pdf)
        future_b = executor.submit(run_extract, model_b, pdf)

        deadline = None if timeout is None else time.monotonic() + timeout

        def _collect(future):
            """Return the future's result, or the timeout message at the deadline."""
            if deadline is None:
                remaining = None
            else:
                remaining = max(0.0, deadline - time.monotonic())
            try:
                return future.result(timeout=remaining)
            except concurrent.futures.TimeoutError:
                return f"Error: Timeout after {timeout} seconds"

        result_a = _collect(future_a)
        result_b = _collect(future_b)
    finally:
        # Return immediately; drop any work that has not started yet.
        executor.shutdown(wait=False, cancel_futures=True)

    return result_a, result_b