"""Extraction backends (AnyParser, LlamaParse, Unstructured, GPT, Claude) sharing the Model interface."""
import base64
import json
import os
from pathlib import Path

import anthropic
import openai
import requests
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
from unstructured.partition.auto import partition

from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor
from preprocessors.preprocessor import PdfPreprocessor

load_dotenv()
class Model: | |
BASE_URL: str | None = None | |
API_KEY: str | None = None | |
MODEL: str | None = None | |
def __init_subclass__(cls) -> None: | |
"""Initialize subclass.""" | |
super().__init_subclass__() | |
def __init__(self): | |
"""Init self""" | |
def extract(self, file_path: str) -> str: | |
"""Extract model. | |
Args: | |
file_path: path to file to extract | |
Returns: | |
str: output markdown | |
""" | |
raise NotImplementedError("Model extract method is not implemented") | |
class AnyParserModel(Model):
    """Backend that posts base64-encoded files to the AnyParser real-time API."""

    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
    API_KEY = os.getenv('ANYPARSER_RT_API_KEY')

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an "Error: ..." message string.
        """
        path = Path(file_path)
        # Bug fix: every error branch previously returned a 2-tuple from a
        # function annotated ``-> str``; each now returns a single string.
        if not path.is_file():
            return "Error: File does not exist"
        file_extension = path.suffix.lower().lstrip(".")
        if file_extension not in ("pdf", "docx"):
            return "Error: Unsupported file type"
        # The API expects the raw file bytes base64-encoded inside JSON.
        encoded_file = base64.b64encode(path.read_bytes()).decode("utf-8")
        payload = {
            "file_content": encoded_file,
            "file_type": file_extension,
        }
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.API_KEY,
        }
        response = requests.post(
            self.BASE_URL, headers=headers, data=json.dumps(payload), timeout=30
        )
        if response.status_code != 200:
            return f"Error: {response.status_code}, Response: {response.text}"
        try:
            response_data = response.json()
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response, Response: {response.text}"
        # "markdown" holds a list of page strings; join into one document.
        # (A missing key propagates as KeyError, as in the original.)
        return "\n".join(response_data["markdown"])
class LlamaParseModel(Model):
    """Backend built on LlamaParse through llama-index's directory reader."""

    BASE_URL = None
    API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

    def __init__(self):
        """Fail fast when the LlamaCloud API key is not configured."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            pdf_parser = LlamaParse(
                result_type="markdown",
                num_workers=4,
                verbose=True,
                language="en",
            )
            # Route only .pdf files through LlamaParse; the reader handles
            # everything else with its defaults.
            reader = SimpleDirectoryReader(
                input_files=[file_path],
                file_extractor={".pdf": pdf_parser},
            )
            docs = reader.load_data()
            return "\n\n".join(doc.text for doc in docs)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with LlamaParse: {str(e)}"
class UnstructuredModel(Model):
    """Backend that delegates parsing to unstructured's ``partition``."""

    BASE_URL = None
    API_KEY = None

    def __init__(self):
        """Init."""
        super().__init__()

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            # Each partitioned element stringifies to its text content.
            pieces = [str(element) for element in partition(file_path)]
            joined = "\n".join(pieces)
            return joined if joined else "No content parsed"
        except Exception as e:
            return f"Error processing UnstructuredModel: {str(e)}"
class GPTModel(Model):
    """Backend that sends page images to the OpenAI chat-completions API."""

    BASE_URL = None
    API_KEY = os.getenv("OPENAI_API_KEY")
    MODEL = "gpt-4o-mini"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Create the OpenAI client; fail fast when the key is missing."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the OPENAI_API_KEY environment variable."
            )
        self._client = openai.OpenAI(api_key=self.API_KEY)

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        pdf_preprocessor = PdfPreprocessor()
        gpt_postprocessor = GPTPostprocessor()
        # Preprocessor output is embedded as base64 data URLs
        # (presumably one JPEG per page — confirm against PdfPreprocessor).
        file_contents = pdf_preprocessor.run(file_path)
        # Idiom: build the image parts with a comprehension instead of a
        # manual append loop.
        contents = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{content}"},
            }
            for content in file_contents
        ]
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Convert this image to markdown"},
                    *contents,
                ],
            }
        ]
        response = self._client.chat.completions.create(
            model=self.MODEL,
            messages=messages,
        )
        return gpt_postprocessor.run(response.choices[0].message.content)
class ClaudeModel(Model):
    """Backend that sends page images to the Anthropic messages API."""

    # NOTE(review): hard-coded raw-IP endpoint, and it is never passed to the
    # SDK client below (no base_url argument) — confirm whether it is needed.
    BASE_URL = "http://103.114.163.134:3000/v1/"
    API_KEY = os.getenv("ANTHROPIC_API_KEY")
    MODEL = "claude-3-5-sonnet-20240620"
    # NOTE(review): mirrors GPTModel's flag; verify callers really expect
    # this to be True on the Anthropic backend.
    REQUIRES_OPENAI = True

    def __init__(self):
        """Create the Anthropic client; fail fast when the key is missing."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
            )
        self._client = anthropic.Anthropic(
            api_key=self.API_KEY,
        )

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error string on failure.
        """
        prompt = "Convert this image to markdown."
        pdf_preprocessor = PdfPreprocessor()
        claude_postprocessor = ClaudePostprocessor()
        file_contents = pdf_preprocessor.run(file_path)
        contents = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": content,
                },
            }
            for content in file_contents
        ]
        messages = [
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                *contents,
            ]}
        ]
        try:
            # Fixes: use self.MODEL instead of repeating the model id, and
            # drop the stray debug print of the raw response text.
            response = self._client.messages.create(
                model=self.MODEL, max_tokens=1024, messages=messages
            )
            return claude_postprocessor.run(response.content[0].text)
        except Exception as e:
            return f"Error processing ClaudeModel: {str(e)}"