Spaces:
Sleeping
Sleeping
import base64 | |
import json | |
import os | |
import requests | |
import anthropic | |
import openai | |
from dotenv import load_dotenv | |
from pathlib import Path | |
from llama_parse import LlamaParse | |
from llama_index.core import SimpleDirectoryReader | |
from unstructured.partition.auto import partition | |
from preprocessors.preprocessor import PdfPreprocessor | |
from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor | |
load_dotenv() | |
class Model: | |
BASE_URL: str | None = None | |
API_KEY: str | None = None | |
MODEL: str | None = None | |
REQUIRES_OPENAI: bool = False | |
REQUIRES_ANTHROPIC: bool = False | |
PROMPT: str = "Convert these images to markdown" | |
def __init_subclass__(cls) -> None: | |
"""Initialize subclass.""" | |
super().__init_subclass__() | |
def __init__(self): | |
if self.REQUIRES_OPENAI: | |
if not self.API_KEY: | |
raise ValueError("Model api key is not provided") | |
if not self.MODEL: | |
raise ValueError("Model name is not provided") | |
if self.BASE_URL: | |
self._client = openai.OpenAI( | |
base_url=self.BASE_URL, | |
api_key=self.API_KEY, | |
) | |
else: | |
self._client = openai.OpenAI(api_key=self.API_KEY) | |
elif self.REQUIRES_ANTHROPIC: | |
if not self.API_KEY: | |
raise ValueError("Model api key is not provided") | |
if not self.MODEL: | |
raise ValueError("Model name is not provided") | |
self._client = anthropic.Anthropic( | |
api_key=self.API_KEY, | |
) | |
def run(self, file_path: str) -> str: | |
"""Extract model. | |
Args: | |
file_path: path to file to extract | |
Returns: | |
str: output markdown | |
""" | |
raise NotImplementedError("Model extract method is not implemented") | |
class CambioVQA0713(Model):
    """Extraction backend that queries the Cambio model via the OpenAI client.

    Only the first item produced by ``PdfPreprocessor`` is sent to the model;
    any further items are ignored.
    """

    # NOTE(review): the endpoint address and the API key are committed in
    # source. Consider loading them from the environment like the other models.
    BASE_URL = "http://44.242.239.38:8000/v1"
    API_KEY = "Cambioml2024!"
    MODEL = "cambiollm-dust-preview-0713"
    REQUIRES_OPENAI = True
    # Selects between the beam-search and the sampling request parameters.
    USE_BEAM_SEARCH = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The model's reply, or a string starting with
            ``"Error processing with CambioVQA0713"`` if anything fails.
        """
        try:
            pdf_preprocessor = PdfPreprocessor()
            # Presumably one base64-encoded JPEG per page; confirm against
            # PdfPreprocessor. Only element 0 is used below.
            file_contents = pdf_preprocessor.run(file_path)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Convert this image to markdown\nOutput figures\nOutput charts\nOutput tables\nOutput footnotes\nOutput headers\nOutput footers\nOutput page nums",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{file_contents[0]}",
                            },
                        },
                    ],
                }
            ]
            # Truncated so the base64 payload does not flood the log.
            print('Cambio Model - ready to run: ', json.dumps(messages[0])[:200])

            # Only the decoding parameters differ between the two modes.
            if self.USE_BEAM_SEARCH:
                decoding_params = {
                    "top_p": 1,
                    "temperature": 0,
                    "extra_body": {
                        "top_k": -1,
                        "use_beam_search": True,
                        "best_of": 2,
                    },
                }
            else:
                decoding_params = {
                    "max_tokens": 1024,
                    "temperature": 0.3,
                    "top_p": 0.7,
                    "extra_body": {
                        "top_k": 20,
                    },
                }

            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
                **decoding_params,
            )
            markdown = response.choices[0].message.content
            print('Cambio Model - response: ', markdown)
            return markdown
        except Exception as e:
            # Best-effort boundary: report the failure as text, do not raise.
            print(f"Error processing input: {str(e)}")
            return f"Error processing with CambioVQA0713: {str(e)}"
class AnyParserModel(Model):
    """Extraction backend that posts the file to the AnyParser HTTP endpoint."""

    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
    # Read once, at class-definition time.
    API_KEY = os.getenv('ANYPARSER_RT_API_KEY')

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed. Only ``.pdf``
                and ``.docx`` files are accepted.

        Returns:
            str: The entries of the response's ``"markdown"`` field joined
            with newlines, or a string starting with ``"Error"`` when the
            file is missing or unsupported, the request fails, or the
            response cannot be interpreted.
        """
        path = Path(file_path)
        file_extension = path.suffix.lower().lstrip(".")

        # Check if the file exists
        if not path.is_file():
            return "Error: File does not exist"

        if file_extension not in ("pdf", "docx"):
            return "Error: Unsupported file type"

        # Encode the file content in base64 so it can travel inside JSON
        with open(file_path, "rb") as file:
            encoded_file = base64.b64encode(file.read()).decode("utf-8")

        # Create the JSON payload
        payload = {
            "file_content": encoded_file,
            "file_type": file_extension,
        }

        # Set the headers
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.API_KEY,
        }

        # Send the POST request; report network failures as an error string
        # like the other models instead of letting them propagate.
        try:
            response = requests.post(
                self.BASE_URL, headers=headers, data=json.dumps(payload), timeout=30
            )
        except requests.RequestException as e:
            return f"Error processing with AnyParserModel: {str(e)}"

        # Check if the request was successful
        if response.status_code != 200:
            return f"Error: {response.status_code}. Response: {response.text}"

        try:
            response_data = response.json()
            return "\n".join(response_data["markdown"])
        except (ValueError, KeyError, TypeError):
            # ValueError covers JSON decoding errors; KeyError and TypeError
            # cover a body that lacks a usable "markdown" field.
            return f"Error: Invalid JSON response. Response: {response.text}"
class LlamaParseModel(Model):
    """Extraction backend that runs LlamaParse through SimpleDirectoryReader."""

    BASE_URL = None
    # Read once, at class-definition time.
    API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

    def __init__(self):
        """Run the base initializer, then insist on a configured API key.

        Raises:
            ValueError: if ``API_KEY`` is empty.
        """
        super().__init__()
        if self.API_KEY:
            return
        raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")

    def run(self, file_path: str) -> str:
        """Parse a file into markdown.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The text of every loaded document separated by blank lines,
            or a string starting with ``"Error processing with LlamaParse"``
            if anything fails.
        """
        try:
            pdf_parser = LlamaParse(
                result_type="markdown",
                num_workers=4,
                verbose=True,
                language="en",
            )
            # Route ".pdf" inputs through the LlamaParse parser.
            reader = SimpleDirectoryReader(
                input_files=[file_path],
                file_extractor={".pdf": pdf_parser},
            )
            loaded_docs = reader.load_data()
            return "\n\n".join(doc.text for doc in loaded_docs)
        except Exception as err:
            print(f"Error processing input: {str(err)}")
            return f"Error processing with LlamaParse: {str(err)}"
class UnstructuredModel(Model):
    """Extraction backend built on ``unstructured``'s ``partition``."""

    BASE_URL = None
    API_KEY = None

    def __init__(self):
        """Delegate to the base initializer; no extra setup is needed."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Partition a file and return its text.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The non-empty element texts joined by newlines,
            ``"No content parsed"`` when there are none, or a string starting
            with ``"Error processing UnstructuredModel"`` if anything fails.
        """
        try:
            # Elements with empty text are skipped.
            text_chunks = [item.text for item in partition(file_path) if item.text]
            combined = "\n".join(text_chunks)
            # An empty result is reported explicitly, not as "".
            return combined or "No content parsed"
        except Exception as err:
            return f"Error processing UnstructuredModel: {str(err)}"
class GPTModel(Model):
    """Extraction backend that sends page images to an OpenAI chat model."""

    BASE_URL = None
    # Read once, at class-definition time.
    API_KEY = os.getenv("OPENAI_API_KEY")
    MODEL = "gpt-4o-mini"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Delegate to the base initializer, which creates the client."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Convert a file to markdown with the configured OpenAI model.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The post-processed model reply, or a string starting with
            ``"Error processing with GPTModel"`` if anything fails.
        """
        try:
            preprocessor = PdfPreprocessor()
            postprocessor = GPTPostprocessor()
            encoded_images = preprocessor.run(file_path)

            # One image part per item returned by the preprocessor.
            image_parts = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded}",
                    },
                }
                for encoded in encoded_images
            ]
            # The prompt comes first, followed by every image.
            user_message = {
                "role": "user",
                "content": [{"type": "text", "text": self.PROMPT}] + image_parts,
            }

            completion = self._client.chat.completions.create(
                model=self.MODEL,
                messages=[user_message],
            )
            reply_text = completion.choices[0].message.content
            return postprocessor.run(reply_text)
        except Exception as err:
            print(f"Error processing input: {str(err)}")
            return f"Error processing with GPTModel: {str(err)}"
class ClaudeModel(Model):
    """Extraction backend that sends page images to an Anthropic model."""

    # NOTE(review): declared, but the base initializer does not pass BASE_URL
    # to the Anthropic client. Confirm whether this endpoint is meant to be used.
    BASE_URL = "http://103.114.163.134:3000/v1/"
    # Read once, at class-definition time.
    API_KEY = os.getenv("ANTHROPIC_API_KEY")
    MODEL = "claude-3-5-sonnet-20240620"
    REQUIRES_ANTHROPIC = True
    # Upper bound on generated tokens per request.
    MAX_TOKENS = 1024

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The post-processed model reply, or a string starting with
            ``"Error processing ClaudeModel"`` if anything fails.
        """
        try:
            prompt = self.PROMPT
            pdf_preprocessor = PdfPreprocessor()
            claude_postprocessor = ClaudePostprocessor()
            file_contents = pdf_preprocessor.run(file_path)

            # One base64 image part per item returned by the preprocessor.
            contents = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": content,
                    },
                }
                for content in file_contents
            ]
            messages = [
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    *contents,
                ]}
            ]
            # Use the class-level MODEL so an override takes effect.
            response = self._client.messages.create(
                model=self.MODEL, max_tokens=self.MAX_TOKENS, messages=messages
            )
            reply_text = response.content[0].text
            print('-----------\n\n***Anthropic Response:\n\n ', reply_text)
            return claude_postprocessor.run(reply_text)
        except Exception as e:
            # Log the failure like the sibling models, then report it as text.
            print(f"Error processing input: {str(e)}")
            return f"Error processing ClaudeModel: {str(e)}"