JasonData committed on
Commit
422beae
·
verified ·
1 Parent(s): d2e606b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -36
app.py CHANGED
@@ -1,36 +1,56 @@
1
-
2
-
3
- import os
4
-
5
- import gradio as gr
6
- import csv
7
-
8
- def pdf_to_csv(pdf_file):
9
- # Open the uploaded PDF file (pdf_file is a TemporaryFile)
10
- # pdf_reader = PyPDF2.PdfReader(pdf_file.name)
11
- text_lines = []
12
-
13
- file_name = os.path.basename(pdf_file.name)
14
-
15
- text_lines.append(f"File Name: {file_name}")
16
- csv_filename = "extracted_text.csv"
17
- # Write each line into the CSV file (each line in its own row)
18
- with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
19
- writer = csv.writer(csvfile)
20
- for line in text_lines:
21
- writer.writerow([line])
22
-
23
- # Return the CSV file path so Gradio can offer it as a download
24
- return csv_filename
25
-
26
- # Create a simple single-page Gradio interface
27
- demo = gr.Interface(
28
- fn=pdf_to_csv,
29
- inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
30
- outputs=gr.File(label="Download CSV"),
31
- title="PDF to CSV Converter",
32
- description="Upload a PDF file, extract its text line-by-line, and download a CSV."
33
- )
34
-
35
- demo.launch()
36
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import os
4
+
5
+ import gradio as gr
6
+ import csv
7
+ import fitz # PyMuPDF
8
+
9
+
10
def pdf_to_csv(pdf_file):
    """Write a small CSV describing the uploaded PDF and return its path.

    No text is extracted from the PDF: the CSV holds one row with the
    upload's base file name followed by one hard-coded sample row.

    Args:
        pdf_file: The uploaded file. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes the path through a
            ``name`` attribute (e.g. a tempfile wrapper).

    Returns:
        The path of the CSV that was written: ``"extracted_text.csv"``,
        relative to the current working directory.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # The upload may arrive as a plain path or as a file-like wrapper,
    # so resolve both shapes to a path before taking the base name.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name
    file_name = os.path.basename(pdf_path)

    text_lines = [
        f"File Name: {file_name}",
        # Hard-coded sample row (placeholder content, not read from the PDF).
        ' 地区 (Region): 2010\n* 收入/支出金额 (Income/Expense Amount): +10,000.00\n* ',
    ]

    csv_filename = "extracted_text.csv"
    # "utf-8-sig" prepends a BOM so spreadsheet tools detect the encoding
    # and display the non-ASCII text correctly.
    with open(csv_filename, "w", newline="", encoding="utf-8-sig") as csvfile:
        # One CSV row per entry, each in a single column.
        csv.writer(csvfile).writerows([line] for line in text_lines)

    # Return the CSV file path so Gradio can offer it as a download.
    return csv_filename
28
+
29
+
30
def pdf_to_pngs(pdf_file):
    """Render every page of a PDF to its own PNG file.

    Args:
        pdf_file: The PDF to render. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes the path through a
            ``name`` attribute (e.g. a tempfile wrapper). Any other value
            is handed to ``fitz.open`` unchanged.

    Returns:
        A list of the PNG paths written, in page order: ``"page_1.png"``,
        ``"page_2.png"``, ... (relative to the current working directory).
    """
    # Resolve a file-like upload to its path; plain paths pass through.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    outputs = []
    # The context manager closes the document even if rendering a page
    # fails, so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap()  # Rasterize the page at default settings.

            # Output files are numbered from 1, matching page numbers.
            output_path = f"page_{page_num + 1}.png"
            pix.save(output_path)
            print(f"Saved {output_path}")
            outputs.append(output_path)
    return outputs
45
+
46
# Single-page Gradio interface: upload one PDF, download one PNG per page.
# The labels, title and description describe what `pdf_to_pngs` returns
# (a list of PNG paths), not a CSV.
demo = gr.Interface(
    fn=pdf_to_pngs,
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=gr.File(label="Download PNGs", file_count="multiple"),
    title="PDF to PNG Converter",
    description="Upload a PDF file, render each page as an image, and download the PNG files.",
)

demo.launch()
56
+