File size: 1,667 Bytes
422beae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57


import os

import gradio as gr
import csv
import fitz  # PyMuPDF


def pdf_to_csv(pdf_file):
    """Write a placeholder CSV for an uploaded PDF and return the CSV path.

    No text is read from the PDF itself: the CSV contains one row with the
    upload's base file name followed by one fixed sample row.

    Args:
        pdf_file: Either a filesystem path, or an object whose ``name``
            attribute holds the path (for example a temporary-file wrapper).

    Returns:
        The CSV file name ``"extracted_text.csv"``, relative to the current
        working directory, so Gradio can offer it as a download.
    """
    # Uploads may arrive as an object exposing ``.name`` or as a bare path;
    # only the base name is needed, so both forms are reduced the same way.
    source_path = getattr(pdf_file, "name", pdf_file)
    file_name = os.path.basename(source_path)

    text_lines = [
        f"File Name: {file_name}",
        ' 地区 (Region): 2010\n*   收入/支出金额 (Income/Expense Amount): +10,000.00\n* ',
    ]

    csv_filename = "extracted_text.csv"
    # Each entry becomes its own single-column row. newline="" lets the csv
    # module control line endings; utf-8-sig prepends a byte-order mark.
    with open(csv_filename, "w", newline="", encoding="utf-8-sig") as csvfile:
        csv.writer(csvfile).writerows([line] for line in text_lines)

    return csv_filename


def pdf_to_pngs(pdf_file):
    """Render every page of a PDF to its own PNG file and return the paths.

    Args:
        pdf_file: Value handed directly to ``fitz.open`` to open the PDF.

    Returns:
        A list of the PNG file names that were written, in page order
        (``page_1.png``, ``page_2.png``, ...). The names are relative, so
        the files land in the current working directory.
    """
    outputs = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_file) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap()  # Rasterize the page

            # File names are 1-based to match human page numbering.
            output_path = f'page_{page_num + 1}.png'
            pix.save(output_path)
            print(f'Saved {output_path}')
            outputs.append(output_path)
    return outputs

# Single-page Gradio interface: upload one PDF and get back one PNG per page.
# The labels describe PNG output because the interface is wired to pdf_to_pngs.
demo = gr.Interface(
    fn=pdf_to_pngs,
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=gr.File(label="Download PNGs"),
    title="PDF to PNG Converter",
    description="Upload a PDF file, render each page as an image, and download the PNG files.",
)

demo.launch()