"""Gradio app that proofreads a PDF page by page with the OpenAI vision API.

Each page of the uploaded PDF is rendered to an image, sent to the chat
completions endpoint together with a fixed proofreading prompt (prefixed by
the user's own prompt), and the cleaned-up replies are returned as a list.
"""

from io import BytesIO
import base64
import logging
import os

from pdf2image import convert_from_path
import gradio as gr
import requests

logger = logging.getLogger(__name__)

# poppler-utils provides the binaries pdf2image relies on, so it is installed
# at start-up. "-y" answers apt's confirmation prompt; without it an
# unattended install stops at the prompt and nothing gets installed.
os.system("apt-get update")
os.system("apt-get install -y poppler-utils")

OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
OPENAI_MODEL = "gpt-4-vision-preview"
MAX_RESPONSE_TOKENS = 300
# Upper bound (seconds) for one API call so a stalled connection cannot hang
# the worker forever.
REQUEST_TIMEOUT_SECONDS = 120

# Fixed instructions appended to the user's prompt. This text is sent to the
# model verbatim, so it is kept exactly as originally written (typos included).
_REVIEW_INSTRUCTIONS = (
    " REPLY ONLY IN JSON FORMAT mentioned below. "
    "Perform OCR and look for mistakes, that can be spelling mistakes, "
    "grammar mistakes, contexual errors and wrong definitions. "
    "The spellings are according to British English, "
    "don't change them to American. "
    "Do not provide OCR if there's no mistake on the page. "
    "The output you provide should be only of the images you find mistakes in. "
    "For example, at one point the definition of NAME is wrong, "
    "fix all the similar and any other mistakes you find. "
    "The changes you make to these mistakes should be described "
    "what the mistake was and why it had to changed. "
    "Make sure to Identify the page number "
    "and write it in your response as well. "
    "DO NOT MISS ANY WRONG DEFINITIONS, ALL THE DATA SHOULD BE FIXED. \n"
    'JSON Respsonse Format: { "page_number": "page number", '
    '"original_text": "original text", '
    '"corrected_text": "corrected text", } '
)

# Substrings stripped from every model reply, applied in this exact order
# (the order matters because removing one token can expose another).
_UNWANTED_TOKENS = ("`", "json ", "{", "}", '"')


def convert_pdf_to_images(pdf_file):
    """Render the PDF at path *pdf_file* to images.

    Returns whatever ``pdf2image.convert_from_path`` returns for that path.
    """
    return convert_from_path(pdf_file)


def encode_image_to_base64(image):
    """Return *image* serialised as JPEG and encoded as a base64 text string.

    *image* must offer ``save(file_obj, format=...)``; the JPEG bytes are
    written to an in-memory buffer, never to disk.
    """
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def _build_prompt(user_prompt):
    """Return the user's prompt followed by the fixed proofreading instructions."""
    return f"{user_prompt}{_REVIEW_INSTRUCTIONS}"


def _extract_content(response):
    """Return the assistant message text from a decoded API reply, or ``""``.

    Error payloads are logged and treated as "no content" so one failing page
    does not abort the whole document.
    """
    if not isinstance(response, dict):
        return ""
    error = response.get("error")
    if error:
        logger.warning("OpenAI API returned an error: %s", error)
        return ""
    # ``or [{}]`` also covers an explicit empty ``choices`` list, which would
    # otherwise raise IndexError on ``[0]``.
    choices = response.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content") or ""


def _clean_response(text):
    """Strip back-ticks, the ``json `` marker, braces and double quotes."""
    for token in _UNWANTED_TOKENS:
        text = text.replace(token, "")
    return text


def process_pdf(pdf_file, user_prompt):
    """Proofread every page of an uploaded PDF and collect the model's replies.

    Args:
        pdf_file: The upload handed over by Gradio; either a path string or
            an object whose ``name`` attribute is the path.
        user_prompt: Free text placed in front of the fixed instructions.

    Returns:
        A list of cleaned reply strings, one per page for which the model
        returned non-empty content. Pages without content are skipped.

    Raises:
        gr.Error: If no file was uploaded or ``OPENAI_API_KEY`` is not set.
    """
    if not pdf_file:
        raise gr.Error("Please upload a PDF file.")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # Without a key every request would be rejected and the caller would
        # only ever see an empty result with no explanation.
        raise gr.Error("The OPENAI_API_KEY environment variable is not set.")

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    # The prompt does not depend on the page, so build it once.
    combined_prompt = _build_prompt(user_prompt)

    content_responses = []
    for image in convert_pdf_to_images(pdf_path):
        base64_image = encode_image_to_base64(image)
        payload = {
            "model": OPENAI_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": combined_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": MAX_RESPONSE_TOKENS,
        }
        response = requests.post(
            OPENAI_CHAT_URL,
            headers=headers,
            json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS,
        ).json()

        response_content = _extract_content(response)
        if response_content:
            content_responses.append(_clean_response(response_content))

    return content_responses


iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(type="filepath"),
        gr.Textbox(label="Enter your custom prompt"),
    ],
    outputs="json",
)

iface.queue().launch(share=False)