import base64
import json
import os
from pathlib import Path

import anthropic
import openai
import requests
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
from unstructured.partition.auto import partition

from preprocessors.preprocessor import PdfPreprocessor
from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor

load_dotenv()


class Model:
    """Base class for document-to-markdown extraction backends.

    Subclasses override :meth:`extract` and may set the class attributes
    below (endpoint, credentials, model identifier).
    """

    BASE_URL: str | None = None
    API_KEY: str | None = None
    MODEL: str | None = None

    def extract(self, file_path: str) -> str:
        """Extract a document to markdown.

        Args:
            file_path: Path to the file to extract.

        Returns:
            str: The extracted markdown.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError("Model extract method is not implemented")


class AnyParserModel(Model):
    """Extraction via the AnyParser real-time HTTP API."""

    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
    API_KEY = os.getenv('ANYPARSER_RT_API_KEY')

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an "Error: ..." message.
        """
        path = Path(file_path)
        if not path.is_file():
            return "Error: File does not exist"

        file_extension = path.suffix.lower().lstrip(".")
        if file_extension not in ("pdf", "docx"):
            return "Error: Unsupported file type"

        # Base64-encode the raw file bytes for the JSON payload.
        encoded_file = base64.b64encode(path.read_bytes()).decode("utf-8")
        payload = {
            "file_content": encoded_file,
            "file_type": file_extension,
        }
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.API_KEY,
        }

        response = requests.post(
            self.BASE_URL, headers=headers, json=payload, timeout=30
        )
        if response.status_code != 200:
            return f"Error: {response.status_code} - Response: {response.text}"

        try:
            response_data = response.json()
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response - Response: {response.text}"
        # The API returns markdown as a list of text fragments.
        return "\n".join(response_data["markdown"])


class LlamaParseModel(Model):
    """Extraction via the LlamaParse cloud service."""

    BASE_URL = None
    API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

    def __init__(self):
        """Init; requires the LLAMA_CLOUD_API_KEY environment variable."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable."
            )

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an error message.
        """
        try:
            parser = LlamaParse(
                result_type="markdown",
                num_workers=4,
                verbose=True,
                language="en",
            )
            # Route .pdf files through LlamaParse; the reader handles the rest.
            documents = SimpleDirectoryReader(
                input_files=[file_path],
                file_extractor={".pdf": parser},
            ).load_data()
            return "\n\n".join(doc.text for doc in documents)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with LlamaParse: {str(e)}"


class UnstructuredModel(Model):
    """Local extraction via the `unstructured` partitioning library."""

    BASE_URL = None
    API_KEY = None

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted text, or an error message.
        """
        try:
            elements = partition(file_path)
            parsed_text = "\n".join(str(element) for element in elements)
            return parsed_text if parsed_text else "No content parsed"
        except Exception as e:
            return f"Error processing UnstructuredModel: {str(e)}"


class GPTModel(Model):
    """Extraction by sending page images to an OpenAI vision model."""

    BASE_URL = None
    API_KEY = os.getenv("OPENAI_API_KEY")
    MODEL = "gpt-4o-mini"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init; requires the OPENAI_API_KEY environment variable."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the OPENAI_API_KEY environment variable."
            )
        self._client = openai.OpenAI(api_key=self.API_KEY)

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown (post-processed model output).
        """
        pdf_preprocessor = PdfPreprocessor()
        gpt_postprocessor = GPTPostprocessor()
        # Preprocessor yields one base64-encoded JPEG per page.
        file_contents = pdf_preprocessor.run(file_path)

        image_parts = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{content}"},
            }
            for content in file_contents
        ]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Convert this image to markdown"},
                    *image_parts,
                ],
            }
        ]
        response = self._client.chat.completions.create(
            model=self.MODEL,
            messages=messages,
        )
        return gpt_postprocessor.run(response.choices[0].message.content)


class ClaudeModel(Model):
    """Extraction by sending page images to an Anthropic Claude model."""

    # NOTE(review): BASE_URL is never passed to the Anthropic client below —
    # requests go to Anthropic's default endpoint. Confirm whether this
    # hard-coded IP is intentional or leftover.
    BASE_URL = "http://103.114.163.134:3000/v1/"
    API_KEY = os.getenv("ANTHROPIC_API_KEY")
    MODEL = "claude-3-5-sonnet-20240620"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init; requires the ANTHROPIC_API_KEY environment variable."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
            )
        self._client = anthropic.Anthropic(api_key=self.API_KEY)

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an error message.
        """
        prompt = "Convert this image to markdown."
        pdf_preprocessor = PdfPreprocessor()
        claude_postprocessor = ClaudePostprocessor()
        # Preprocessor yields one base64-encoded JPEG per page.
        file_contents = pdf_preprocessor.run(file_path)

        image_parts = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": content,
                },
            }
            for content in file_contents
        ]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    *image_parts,
                ],
            }
        ]
        try:
            response = self._client.messages.create(
                model=self.MODEL,  # was hard-coded; now uses the class attribute
                max_tokens=1024,
                messages=messages,
            )
            return claude_postprocessor.run(response.content[0].text)
        except Exception as e:
            return f"Error processing ClaudeModel: {str(e)}"