Spaces:
Sleeping
Sleeping
add timeout to model run
Browse files- extractors/model.py +120 -25
- extractors/model_runner.py +21 -11
extractors/model.py
CHANGED
@@ -20,15 +20,38 @@ class Model:
|
|
20 |
BASE_URL: str | None = None
|
21 |
API_KEY: str | None = None
|
22 |
MODEL: str | None = None
|
|
|
|
|
|
|
|
|
23 |
|
24 |
def __init_subclass__(cls) -> None:
|
25 |
"""Initialize subclass."""
|
26 |
super().__init_subclass__()
|
27 |
|
28 |
def __init__(self):
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
def
|
32 |
"""Extract model.
|
33 |
|
34 |
Args:
|
@@ -39,11 +62,94 @@ class Model:
|
|
39 |
"""
|
40 |
raise NotImplementedError("Model extract method is not implemented")
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
class AnyParserModel(Model):
|
43 |
BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
|
44 |
API_KEY = os.getenv('ANYPARSER_RT_API_KEY')
|
45 |
|
46 |
-
def
|
47 |
"""Extract data in real-time.
|
48 |
|
49 |
Args:
|
@@ -107,7 +213,7 @@ class LlamaParseModel(Model):
|
|
107 |
if not self.API_KEY:
|
108 |
raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")
|
109 |
|
110 |
-
def
|
111 |
"""Extract data in real-time.
|
112 |
|
113 |
Args:
|
@@ -142,7 +248,7 @@ class UnstructuredModel(Model):
|
|
142 |
"""Init."""
|
143 |
super().__init__()
|
144 |
|
145 |
-
def
|
146 |
"""Extract data in real-time.
|
147 |
|
148 |
Args:
|
@@ -155,8 +261,10 @@ class UnstructuredModel(Model):
|
|
155 |
|
156 |
elements = partition(file_path)
|
157 |
|
158 |
-
|
|
|
159 |
|
|
|
160 |
markdown = parsed_text if parsed_text else "No content parsed"
|
161 |
return markdown
|
162 |
except Exception as e:
|
@@ -171,14 +279,9 @@ class GPTModel(Model):
|
|
171 |
def __init__(self):
|
172 |
"""Init."""
|
173 |
super().__init__()
|
174 |
-
if not self.API_KEY:
|
175 |
-
raise ValueError(
|
176 |
-
"The API key is required. Please set the OPENAI_API_KEY environment variable."
|
177 |
-
)
|
178 |
-
self._client = openai.OpenAI(api_key=self.API_KEY)
|
179 |
|
180 |
|
181 |
-
def
|
182 |
"""Extract data in real-time.
|
183 |
|
184 |
Args:
|
@@ -206,7 +309,7 @@ class GPTModel(Model):
|
|
206 |
{
|
207 |
"role": "user",
|
208 |
"content": [
|
209 |
-
{"type": "text", "text":
|
210 |
*contents,
|
211 |
],
|
212 |
}
|
@@ -226,21 +329,13 @@ class ClaudeModel(Model):
|
|
226 |
BASE_URL = "http://103.114.163.134:3000/v1/"
|
227 |
API_KEY = os.getenv("ANTHROPIC_API_KEY")
|
228 |
MODEL = "claude-3-5-sonnet-20240620"
|
229 |
-
|
230 |
|
231 |
def __init__(self):
|
232 |
"""Init."""
|
233 |
super().__init__()
|
234 |
-
if not self.API_KEY:
|
235 |
-
raise ValueError(
|
236 |
-
"The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
|
237 |
-
)
|
238 |
-
self._client = anthropic.Anthropic(
|
239 |
-
api_key=self.API_KEY,
|
240 |
-
)
|
241 |
-
|
242 |
|
243 |
-
def
|
244 |
"""Extract data in real-time.
|
245 |
|
246 |
Args:
|
@@ -251,7 +346,7 @@ class ClaudeModel(Model):
|
|
251 |
"""
|
252 |
|
253 |
try:
|
254 |
-
prompt =
|
255 |
pdf_preprocessor = PdfPreprocessor()
|
256 |
claude_postprocessor = ClaudePostprocessor()
|
257 |
file_contents = pdf_preprocessor.run(file_path)
|
@@ -278,7 +373,7 @@ class ClaudeModel(Model):
|
|
278 |
response = self._client.messages.create(
|
279 |
model="claude-3-5-sonnet-20240620", max_tokens=1024, messages=messages
|
280 |
)
|
281 |
-
print(response.content[0].text)
|
282 |
return claude_postprocessor.run(response.content[0].text)
|
283 |
except Exception as e:
|
284 |
return f"Error processing ClaudeModel: {str(e)}"
|
|
|
20 |
BASE_URL: str | None = None
|
21 |
API_KEY: str | None = None
|
22 |
MODEL: str | None = None
|
23 |
+
REQUIRES_OPENAI: bool = False
|
24 |
+
REQUIRES_ANTHROPIC: bool = False
|
25 |
+
PROMPT: str = "Convert these images to markdown"
|
26 |
+
|
27 |
|
28 |
def __init_subclass__(cls) -> None:
|
29 |
"""Initialize subclass."""
|
30 |
super().__init_subclass__()
|
31 |
|
32 |
def __init__(self):
    """Validate configuration and build the provider client, if one is needed.

    When ``REQUIRES_OPENAI`` is set, ``self._client`` becomes an
    ``openai.OpenAI`` client (pointed at ``BASE_URL`` when that is set).
    Otherwise, when ``REQUIRES_ANTHROPIC`` is set, it becomes an
    ``anthropic.Anthropic`` client. When neither flag is set nothing is
    validated and no client is created.

    Raises:
        ValueError: If a client is required but ``API_KEY`` or ``MODEL`` is
            missing.
    """
    wants_openai = self.REQUIRES_OPENAI
    if not (wants_openai or self.REQUIRES_ANTHROPIC):
        return

    # Both providers need the same two settings; the key is checked first.
    if not self.API_KEY:
        raise ValueError("Model api key is not provided")
    if not self.MODEL:
        raise ValueError("Model name is not provided")

    if wants_openai:
        client_options = {"api_key": self.API_KEY}
        if self.BASE_URL:
            # Only forward base_url when one is configured.
            client_options["base_url"] = self.BASE_URL
        self._client = openai.OpenAI(**client_options)
        return

    self._client = anthropic.Anthropic(api_key=self.API_KEY)
|
53 |
|
54 |
+
def run(self, file_path: str) -> str:
|
55 |
"""Extract model.
|
56 |
|
57 |
Args:
|
|
|
62 |
"""
|
63 |
raise NotImplementedError("Model extract method is not implemented")
|
64 |
|
65 |
+
class CambioVQA0713(Model):
    """Extractor that sends a page image to the ``cambiollm-dust-preview-0713`` model.

    ``REQUIRES_OPENAI`` is set, so the base ``Model.__init__`` builds an
    ``openai.OpenAI`` client for ``BASE_URL`` and stores it on
    ``self._client``.
    """

    # NOTE(review): plain-HTTP endpoint addressed by IP; confirm this is intended.
    BASE_URL = "http://44.242.239.38:8000/v1"
    # SECURITY(review): this credential is committed in source. It should be
    # rotated and loaded from the environment like the other models' keys.
    API_KEY = "Cambioml2024!"
    MODEL = "cambiollm-dust-preview-0713"
    REQUIRES_OPENAI = True
    # Selects which of the two sampling configurations run() sends.
    USE_BEAM_SEARCH = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Only the first item returned by the PDF preprocessor is sent to the
        model; the prompt asks for a single image to be converted.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or a string starting with
                "Error processing with CambioVQA0713: " if anything fails.
        """
        try:
            file_contents = PdfPreprocessor().run(file_path)
            if not file_contents:
                # Fail with a readable message instead of an IndexError below.
                raise ValueError("no page images were produced for the input file")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Convert this image to markdown\nOutput figures\nOutput charts\nOutput tables\nOutput footnotes\nOutput headers\nOutput footers\nOutput page nums",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{file_contents[0]}",
                            },
                        },
                    ],
                }
            ]
            print('Cambio Model - ready to run: ', json.dumps(messages[0])[:200])

            # The two modes differ only in sampling options, so build those
            # separately and issue a single request.
            if self.USE_BEAM_SEARCH:
                sampling = {
                    "top_p": 1,
                    "temperature": 0,
                    "extra_body": {
                        "top_k": -1,
                        "use_beam_search": True,
                        "best_of": 2,
                    },
                }
            else:
                sampling = {
                    "max_tokens": 1024,
                    "temperature": 0.3,
                    "top_p": 0.7,
                    "extra_body": {
                        "top_k": 20,
                    },
                }

            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
                **sampling,
            )
            markdown = response.choices[0].message.content
            print('Cambio Model - response: ', markdown)

            return markdown
        except Exception as e:
            # Failures are reported as a string so the caller always gets text.
            print(f"Error processing input: {str(e)}")
            return f"Error processing with CambioVQA0713: {str(e)}"
|
146 |
+
|
147 |
+
|
148 |
class AnyParserModel(Model):
|
149 |
BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
|
150 |
API_KEY = os.getenv('ANYPARSER_RT_API_KEY')
|
151 |
|
152 |
+
def run(self, file_path: str) -> str:
|
153 |
"""Extract data in real-time.
|
154 |
|
155 |
Args:
|
|
|
213 |
if not self.API_KEY:
|
214 |
raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")
|
215 |
|
216 |
+
def run(self, file_path: str) -> str:
|
217 |
"""Extract data in real-time.
|
218 |
|
219 |
Args:
|
|
|
248 |
"""Init."""
|
249 |
super().__init__()
|
250 |
|
251 |
+
def run(self, file_path: str) -> str:
|
252 |
"""Extract data in real-time.
|
253 |
|
254 |
Args:
|
|
|
261 |
|
262 |
elements = partition(file_path)
|
263 |
|
264 |
+
# Combine the elements into a single string
|
265 |
+
parsed_text = "\n".join(element.text for element in elements if element.text)
|
266 |
|
267 |
+
# Handle case where no content is parsed
|
268 |
markdown = parsed_text if parsed_text else "No content parsed"
|
269 |
return markdown
|
270 |
except Exception as e:
|
|
|
279 |
def __init__(self):
|
280 |
"""Init."""
|
281 |
super().__init__()
|
|
|
|
|
|
|
|
|
|
|
282 |
|
283 |
|
284 |
+
def run(self, file_path: str) -> str:
|
285 |
"""Extract data in real-time.
|
286 |
|
287 |
Args:
|
|
|
309 |
{
|
310 |
"role": "user",
|
311 |
"content": [
|
312 |
+
{"type": "text", "text": self.PROMPT},
|
313 |
*contents,
|
314 |
],
|
315 |
}
|
|
|
329 |
BASE_URL = "http://103.114.163.134:3000/v1/"
|
330 |
API_KEY = os.getenv("ANTHROPIC_API_KEY")
|
331 |
MODEL = "claude-3-5-sonnet-20240620"
|
332 |
+
REQUIRES_ANTHROPIC = True
|
333 |
|
334 |
def __init__(self):
|
335 |
"""Init."""
|
336 |
super().__init__()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
337 |
|
338 |
+
def run(self, file_path: str) -> str:
|
339 |
"""Extract data in real-time.
|
340 |
|
341 |
Args:
|
|
|
346 |
"""
|
347 |
|
348 |
try:
|
349 |
+
prompt = self.PROMPT
|
350 |
pdf_preprocessor = PdfPreprocessor()
|
351 |
claude_postprocessor = ClaudePostprocessor()
|
352 |
file_contents = pdf_preprocessor.run(file_path)
|
|
|
373 |
response = self._client.messages.create(
|
374 |
model="claude-3-5-sonnet-20240620", max_tokens=1024, messages=messages
|
375 |
)
|
376 |
+
print('-----------\n\n***Anthropic Response:\n\n ', response.content[0].text)
|
377 |
return claude_postprocessor.run(response.content[0].text)
|
378 |
except Exception as e:
|
379 |
return f"Error processing ClaudeModel: {str(e)}"
|
extractors/model_runner.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
import concurrent.futures
|
2 |
-
from extractors.model import
|
|
|
|
|
3 |
|
4 |
ap_rt = AnyParserModel()
|
5 |
lp = LlamaParseModel()
|
@@ -8,11 +10,11 @@ gpt = GPTModel()
|
|
8 |
claude = ClaudeModel()
|
9 |
|
10 |
model_function_map = {
|
11 |
-
"AnyParser": ap_rt.
|
12 |
-
"LlamaParse": lp.
|
13 |
-
"Unstructured": un.
|
14 |
-
"GPT-4o-mini": gpt.
|
15 |
-
"Claude-3.5-Sonnet": claude.
|
16 |
}
|
17 |
|
18 |
models = [key for key in model_function_map]
|
@@ -23,14 +25,22 @@ def run_extract(model, file_path):
|
|
23 |
markdown = extractor(file_path)
|
24 |
return markdown
|
25 |
|
26 |
-
|
|
|
27 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
28 |
# Submit tasks to the executor for parallel execution
|
29 |
future_a = executor.submit(run_extract, model_a, pdf)
|
30 |
future_b = executor.submit(run_extract, model_b, pdf)
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
return result_a, result_b
|
|
|
1 |
import concurrent.futures
|
2 |
+
from extractors.model import LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel, AnyParserModel
|
3 |
+
|
4 |
+
DEFAULT_TIMEOUT = 30
|
5 |
|
6 |
ap_rt = AnyParserModel()
|
7 |
lp = LlamaParseModel()
|
|
|
10 |
claude = ClaudeModel()
|
11 |
|
12 |
# Display name -> bound ``run`` method of the matching extractor instance.
model_function_map = {
    display_name: extractor.run
    for display_name, extractor in (
        ("AnyParser", ap_rt),
        ("LlamaParse", lp),
        ("Unstructured", un),
        ("GPT-4o-mini", gpt),
        ("Claude-3.5-Sonnet", claude),
    )
}

# Model names in registration order.
models = list(model_function_map)
|
|
|
25 |
markdown = extractor(file_path)
|
26 |
return markdown
|
27 |
|
28 |
+
|
29 |
+
def run_extract_parallel(model_a, model_b, pdf, timeout=DEFAULT_TIMEOUT):
    """Run two extractions on the same file concurrently under one time budget.

    Both extractions are submitted to a thread pool and share a single
    deadline of ``timeout`` seconds. The pool is shut down without waiting,
    so a slow extractor cannot hold this call open past the deadline.
    Previously, leaving the ``with ThreadPoolExecutor()`` block waited for
    both workers to finish regardless of the timeout, and the two waits
    could add up to twice ``timeout``.

    Args:
        model_a: Name of the first model, passed to ``run_extract``.
        model_b: Name of the second model, passed to ``run_extract``.
        pdf: Path of the file to extract, passed to ``run_extract``.
        timeout: Overall budget in seconds; ``None`` waits indefinitely.

    Returns:
        tuple: ``(result_a, result_b)``. A result that missed the deadline is
            replaced by the string "Error: Timeout after {timeout} seconds".
    """
    import time  # local import keeps this block self-contained

    executor = concurrent.futures.ThreadPoolExecutor()
    try:
        # Submit tasks to the executor for parallel execution
        pending = [
            executor.submit(run_extract, model, pdf)
            for model in (model_a, model_b)
        ]
        deadline = None if timeout is None else time.monotonic() + timeout

        results = []
        for future in pending:
            if deadline is None:
                remaining = None
            else:
                # Give each wait only what is left of the shared budget.
                remaining = max(0.0, deadline - time.monotonic())
            try:
                results.append(future.result(timeout=remaining))
            except concurrent.futures.TimeoutError:
                results.append(f"Error: Timeout after {timeout} seconds")
    finally:
        # Threads cannot be killed; a worker that is already running finishes
        # in the background. Not waiting here is what makes the timeout real.
        executor.shutdown(wait=False, cancel_futures=True)

    result_a, result_b = results
    return result_a, result_b
|