File size: 7,061 Bytes
e5f4b97
6e805b9
 
 
 
 
0e7b36e
6e805b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5f4b97
bd3440d
 
 
 
 
6e805b9
 
 
50c7387
6e805b9
50c7387
a36aede
bd3440d
 
6e805b9
 
 
 
f68828e
 
 
 
 
 
 
 
6e805b9
 
bd3440d
 
 
 
 
 
 
 
2cf3347
bd3440d
 
 
 
 
e5f4b97
0e7b36e
 
 
 
 
 
 
e5f4b97
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import gradio as gr
import pymupdf  # PyMuPDF for handling PDF files
from PIL import Image
import os 
from functions import get_image_informations
from dataSchema import *
# import shutil



def Noc_timeSheet_pdf_to_img(pdf_path, output_path, dpi: int = 300, quality: int = 95):
    """Render the first page of a timesheet PDF to a cropped JPEG.

    Only the left half of the page is kept. The band from the top of the
    page down to 30% of its height and the band from 42% of its height
    down to the bottom are stacked vertically into one image; the strip
    between 30% and 42% is dropped.

    Args:
        pdf_path: Path of the PDF to render.
        output_path: Destination path of the JPEG file.
        dpi: Resolution used when rasterising the page.
        quality: JPEG quality passed to PIL when saving.

    Returns:
        ``output_path``, after the image has been written.
    """
    pdf_document = pymupdf.open(pdf_path)
    try:
        # Page index 0 is the first page of the document.
        page = pdf_document.load_page(0)
        pix = page.get_pixmap(dpi=dpi)
        # NOTE(review): assumes the pixmap is 3-channel RGB without alpha.
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Release the document even if rendering fails.
        pdf_document.close()

    width, height = image.size
    half_width = width // 2
    end_y_first_table = int(height * 0.30)
    start_y_total_table = int(height * 0.42)

    upper = image.crop((0, 0, half_width, end_y_first_table))
    lower = image.crop((0, start_y_total_table, half_width, height))
    upper_width, upper_height = upper.size
    _, lower_height = lower.size

    # Stack the two crops: upper part on top, lower part directly below it.
    combined_image = Image.new("RGB", (upper_width, upper_height + lower_height))
    combined_image.paste(upper, (0, 0))
    combined_image.paste(lower, (0, upper_height))

    combined_image.save(output_path, "JPEG", quality=quality)

    # TODO: optionally upload output_path to S3 and return the resulting URL
    # instead (needs S3_BUCKET, S3_REGION, S3_URL and boto3).
    return output_path
 
def Clauses_in_invoice(pdf_path: str) -> bool:
    """Report whether the last page of a PDF mentions the word "clauses".

    The text of the last page is extracted and searched case-insensitively
    for the substring ``"clauses"``.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        True if the last page's text contains "clauses", otherwise False.
    """
    pdf_document = pymupdf.open(pdf_path)
    try:
        last_page = pdf_document.load_page(pdf_document.page_count - 1)
        text = last_page.get_text()
    finally:
        # Close the document even when page loading or text extraction fails.
        pdf_document.close()
    return "clauses" in text.lower()
 
def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
    """Render every page of a PDF to its own JPEG file.

    Files are written to ``folder_path`` (created if missing) and named
    ``<pdf base name>_page_<n>.jpg`` with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF to render.
        folder_path: Directory that receives the page images.
        dpi: Resolution used when rasterising each page.
        quality: JPEG quality passed to PIL when saving.

    Returns:
        List of the written image paths, in page order.
    """
    pdf_document = pymupdf.open(pdf_path)
    try:
        folder_path = folder_path.rstrip(os.sep)
        os.makedirs(folder_path, exist_ok=True)

        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        image_paths = []
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            output_path = os.path.join(folder_path, f"{pdf_name}_page_{page_num + 1}.jpg")
            image.save(output_path, "JPEG", quality=quality)

            # TODO: optionally upload output_path to S3 and collect the URL
            # instead (needs S3_BUCKET, S3_REGION, S3_URL and boto3).
            image_paths.append(output_path)
    finally:
        # Release the document even if rendering or saving a page fails.
        pdf_document.close()
    return image_paths

def delete_images(image_paths):
    """Remove each listed file from disk, printing what happened to it.

    A path that does not exist is reported and skipped. Any exception
    raised while handling a path is printed and does not stop the loop.

    Args:
        image_paths: Iterable of file paths to delete.
    """
    for target in image_paths:
        try:
            # Guard clause: nothing to remove when the path is absent.
            if not os.path.exists(target):
                print(f"File not found: {target}")
                continue
            os.remove(target)
            print(f"Deleted: {target}")
        except Exception as err:
            print(f"Error deleting {target}: {err}")

def noc_invoice_extraction(pdf_path: str, folder_path):
    """Extract invoice data from a PDF by parsing its rendered page images.

    Every page is rendered to a JPEG in ``folder_path`` and then parsed
    according to its position:

    * page 1: invoice header information
    * page 2: first item page
    * following pages: further item pages, appended to ``data["items"]``
    * totals page: the second-to-last page when the last page is a clauses
      page (see ``Clauses_in_invoice``), otherwise the last page
    * clauses page: the last page, only when present

    The rendered images are deleted before returning, whether extraction
    succeeded or raised.

    Args:
        pdf_path: Path of the invoice PDF.
        folder_path: Directory used for the temporary page images.

    Returns:
        Dict merged from the results of all page parsers.

    Raises:
        IndexError: If the PDF has fewer than two pages.
    """
    image_paths = Noc_invoice_pdf_to_img(pdf_path, folder_path)
    try:
        data = {}
        data.update(get_image_informations(
            image_paths[0], invoice_first_page_prompt, Noc_PurchaseOrder_information_parser))
        data.update(get_image_informations(
            image_paths[1], invoice_item_page1_prompt, Noc_PurchaseOrder_item1_parser))

        has_clauses = Clauses_in_invoice(pdf_path)
        if has_clauses:
            # Trailing pages are [..., totals, clauses].
            extra_item_pages = image_paths[2:-2]
            total_page = image_paths[-2]
        else:
            # Trailing pages are [..., totals]. The totals page is the LAST
            # page here; previously index -2 was parsed both as an item page
            # and as the totals page while the last page was never read.
            extra_item_pages = image_paths[2:-1]
            total_page = image_paths[-1]

        for page_image in extra_item_pages:
            new_item = get_image_informations(
                page_image, invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
            data["items"].extend(new_item["items"])

        data.update(get_image_informations(
            total_page, invoice_total_page_prompt, Noc_PurchaseOrder_total_parser))

        if has_clauses:
            data.update(get_image_informations(
                image_paths[-1], invoice_clauses_page_prompt, Noc_PurchaseOrder_clauses_parser))
        return data
    finally:
        # Always remove the temporary page images, even when parsing fails.
        delete_images(image_paths)
  

def process_pdf(file, option):
    """Gradio callback: run the extraction selected by ``option`` on a PDF.

    Args:
        file: Uploaded file. Either an object exposing the path as ``.name``
            or a plain path string; ``None`` when nothing was uploaded.
        option: One of "Noc_timesheet_resdiential",
            "Noc_timesheet_rotational" or "Noc_invoice".

    Returns:
        The extraction result for a valid option, or a human-readable
        message when no file was given, the option is unknown, or the
        extraction raised an exception.
    """
    if file is None:
        return "Please upload a PDF file."

    try:
        # Accept both file-like uploads (path in .name) and bare path strings.
        file_path = getattr(file, "name", file)

        if option == "Noc_invoice":
            save_dir = "uploaded_files"
            os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist
            return noc_invoice_extraction(file_path, save_dir)

        # Both timesheet variants share the same pipeline and differ only in
        # the prompt/parser pair handed to the extractor.
        if option == "Noc_timesheet_resdiential":
            prompt, parser = Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser
        elif option == "Noc_timesheet_rotational":
            prompt, parser = Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser
        else:
            # Unknown or unselected option: say so instead of returning None.
            return "Invalid option selected."

        Noc_timeSheet_pdf_to_img(file_path, "output.jpg")
        return get_image_informations("output.jpg", prompt, parser)
    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the output box.
        return f"An error occurred: {e}"

# Define the Gradio interface
demo = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF"),  # File upload input
        gr.Radio(["Noc_timesheet_resdiential","Noc_timesheet_rotational", "Noc_invoice"], label="Choose an option")  # Radio buttons for options
    ],
    outputs="text",  # Text output
    title="PDF Processor",
    description="Upload a PDF and choose an option to process the text."
)

# Embed the interface in a Blocks layout that also shows example documents.
with gr.Blocks() as app:
    demo.render()
    gr.Markdown("### pdf examples")  # Section title
    with gr.Row():
        gr.Image("TS.png", label="NOC timesheet example")
        gr.Image("invoice.png", label="NOC invoice example")

# Launch the Blocks layout (not the bare interface) so the example images
# defined above are actually part of the served page.
app.launch()