import base64
import json
import os
from pathlib import Path

import requests

import anthropic
import openai
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from unstructured.partition.auto import partition

from preprocessors.preprocessor import PdfPreprocessor
from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor

load_dotenv()


class Model:
    """Base class for document-to-markdown extraction backends.

    Subclasses configure the class attributes below; ``__init__`` builds the
    appropriate API client (OpenAI-compatible or Anthropic) according to the
    ``REQUIRES_*`` flags. Subclasses implement :meth:`run`.
    """

    BASE_URL: str | None = None
    API_KEY: str | None = None
    MODEL: str | None = None
    REQUIRES_OPENAI: bool = False
    REQUIRES_ANTHROPIC: bool = False
    PROMPT: str = "Convert these images to markdown"

    def __init_subclass__(cls) -> None:
        """Initialize subclass."""
        super().__init_subclass__()

    def __init__(self):
        """Build the API client for the configured backend.

        Raises:
            ValueError: if ``API_KEY`` or ``MODEL`` is missing for a backend
                that requires them.
        """
        if self.REQUIRES_OPENAI:
            if not self.API_KEY:
                raise ValueError("Model api key is not provided")
            if not self.MODEL:
                raise ValueError("Model name is not provided")
            if self.BASE_URL:
                self._client = openai.OpenAI(
                    base_url=self.BASE_URL,
                    api_key=self.API_KEY,
                )
            else:
                self._client = openai.OpenAI(api_key=self.API_KEY)
        elif self.REQUIRES_ANTHROPIC:
            if not self.API_KEY:
                raise ValueError("Model api key is not provided")
            if not self.MODEL:
                raise ValueError("Model name is not provided")
            self._client = anthropic.Anthropic(
                api_key=self.API_KEY,
            )
        # Backends with neither flag set (e.g. HTTP/file based) create no client.

    def run(self, file_path: str) -> str:
        """Extract model.

        Args:
            file_path: path to file to extract

        Returns:
            str: output markdown
        """
        raise NotImplementedError("Model extract method is not implemented")


class CambioVQA0713(Model):
    """Cambio VLM served behind an OpenAI-compatible endpoint."""

    BASE_URL = "http://44.242.239.38:8000/v1"
    # SECURITY: hard-coded credential checked into source — move to an
    # environment variable (e.g. os.getenv("CAMBIO_API_KEY")) and rotate it.
    API_KEY = "Cambioml2024!"
    MODEL = "cambiollm-dust-preview-0713"
    REQUIRES_OPENAI = True
    USE_BEAM_SEARCH = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error string on failure.
        """
        try:
            pdf_preprocessor = PdfPreprocessor()
            file_contents = pdf_preprocessor.run(file_path)
            # NOTE: only the FIRST page image is submitted to the model;
            # remaining pages from the preprocessor are ignored.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Convert this image to markdown\nOutput figures\nOutput charts\nOutput tables\nOutput footnotes\nOutput headers\nOutput footers\nOutput page nums",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{file_contents[0]}",
                            },
                        },
                    ],
                }
            ]
            print('Cambio Model - ready to run: ', json.dumps(messages[0])[:200])
            if self.USE_BEAM_SEARCH:
                # Deterministic beam-search decoding (vLLM extra_body options).
                response = self._client.chat.completions.create(
                    model=self.MODEL,
                    messages=messages,
                    top_p=1,
                    temperature=0,
                    extra_body={
                        "top_k": -1,
                        "use_beam_search": True,
                        "best_of": 2,
                    },
                )
            else:
                # Sampled decoding fallback.
                response = self._client.chat.completions.create(
                    model=self.MODEL,
                    messages=messages,
                    max_tokens=1024,
                    temperature=0.3,
                    top_p=0.7,
                    extra_body={
                        "top_k": 20,
                    },
                )
            print('Cambio Model - response: ', response.choices[0].message.content)
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with CambioVQA0713: {str(e)}"


class AnyParserModel(Model):
    """AnyParser real-time extraction over a REST endpoint."""

    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
    API_KEY = os.getenv('ANYPARSER_RT_API_KEY')

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an error string on failure.
        """
        file_extension = Path(file_path).suffix.lower().lstrip(".")

        # Validate the input file before touching the network.
        if not Path(file_path).is_file():
            return "Error: File does not exist"

        if file_extension in ["pdf", "docx"]:
            # Encode the file content in base64 for the JSON payload.
            with open(file_path, "rb") as file:
                encoded_file = base64.b64encode(file.read()).decode("utf-8")
        else:
            return "Error: Unsupported file type"

        payload = {
            "file_content": encoded_file,
            "file_type": file_extension,
        }
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.API_KEY,
        }

        response = requests.post(
            self.BASE_URL, headers=headers, data=json.dumps(payload), timeout=30
        )

        if response.status_code == 200:
            try:
                response_data = response.json()
                # The endpoint returns markdown as a list of page strings.
                return "\n".join(response_data["markdown"])
            except json.JSONDecodeError:
                return f"Error: Invalid JSON response. Response: {response.text}"
        return f"Error: {response.status_code}. Response: {response.text}"


class LlamaParseModel(Model):
    """LlamaParse-backed extraction (API key read from the environment)."""

    BASE_URL = None
    API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

    def __init__(self):
        """Init.

        Raises:
            ValueError: if LLAMA_CLOUD_API_KEY is not set.
        """
        super().__init__()
        if not self.API_KEY:
            raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error string on failure.
        """
        try:
            parser = LlamaParse(
                result_type="markdown",
                num_workers=4,
                verbose=True,
                language="en",
            )
            file_extractor = {".pdf": parser}
            documents = SimpleDirectoryReader(
                input_files=[file_path], file_extractor=file_extractor
            ).load_data()
            return "\n\n".join(doc.text for doc in documents)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with LlamaParse: {str(e)}"


class UnstructuredModel(Model):
    """Local extraction via the `unstructured` partitioner (no API needed)."""

    BASE_URL = None
    API_KEY = None

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error string on failure.
        """
        try:
            elements = partition(file_path)
            # Combine the elements into a single string, skipping empty ones.
            parsed_text = "\n".join(element.text for element in elements if element.text)
            # Handle case where no content is parsed.
            return parsed_text if parsed_text else "No content parsed"
        except Exception as e:
            return f"Error processing UnstructuredModel: {str(e)}"


class GPTModel(Model):
    """OpenAI GPT vision extraction over all page images."""

    BASE_URL = None
    API_KEY = os.getenv("OPENAI_API_KEY")
    MODEL = "gpt-4o-mini"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error string on failure.
        """
        try:
            pdf_preprocessor = PdfPreprocessor()
            gpt_postprocessor = GPTPostprocessor()
            file_contents = pdf_preprocessor.run(file_path)
            # One image_url part per page, all sent in a single user message.
            contents = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{content}",
                    },
                }
                for content in file_contents
            ]
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.PROMPT},
                        *contents,
                    ],
                }
            ]
            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
            )
            return gpt_postprocessor.run(response.choices[0].message.content)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with GPTModel: {str(e)}"


class ClaudeModel(Model):
    """Anthropic Claude vision extraction over all page images."""

    # NOTE(review): BASE_URL is set but the Anthropic client is constructed
    # without it (api_key only) — confirm whether it should be passed through.
    BASE_URL = "http://103.114.163.134:3000/v1/"
    API_KEY = os.getenv("ANTHROPIC_API_KEY")
    MODEL = "claude-3-5-sonnet-20240620"
    REQUIRES_ANTHROPIC = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error string on failure.
        """
        try:
            prompt = self.PROMPT
            pdf_preprocessor = PdfPreprocessor()
            claude_postprocessor = ClaudePostprocessor()
            file_contents = pdf_preprocessor.run(file_path)
            # One base64 image part per page (Anthropic "image" source format).
            contents = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": content,
                    },
                }
                for content in file_contents
            ]
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        *contents,
                    ],
                }
            ]
            # Use the class-level MODEL rather than a duplicated literal so a
            # subclass override actually takes effect.
            response = self._client.messages.create(
                model=self.MODEL,
                max_tokens=1024,
                messages=messages,
            )
            print('-----------\n\n***Anthropic Response:\n\n ', response.content[0].text)
            return claude_postprocessor.run(response.content[0].text)
        except Exception as e:
            return f"Error processing ClaudeModel: {str(e)}"