napatswift committed on
Commit
c6d506e
·
1 Parent(s): bfa762a

Add application file

Browse files
Files changed (1) hide show
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz
3
+ import re
4
+ import os
5
+ import requests
6
+
7
# Thai month names in calendar order. The mapping below is 1-based
# (มกราคม -> 1 ... ธันวาคม -> 12) so that the value can be written directly
# as the month component of a date; a 0-based index would render every
# month one too low.
months = (
    'มกราคม',
    'กุมภาพันธ์',
    'มีนาคม',
    'เมษายน',
    'พฤษภาคม',
    'มิถุนายน',
    'กรกฎาคม',
    'สิงหาคม',
    'กันยายน',
    'ตุลาคม',
    'พฤศจิกายน',
    'ธันวาคม',
)
# Rebinds ``months`` from the ordered names to a ``{name: month_number}`` dict.
months = {name.strip(): number for number, name in enumerate(months, start=1)}
20
+
21
def download_pdf(url):
    """
    Download a PDF from ``url`` into the current working directory, reusing a cached copy.

    The local file name is the last segment of the URL *path* (query string and
    fragment are ignored). If a file with that name already exists it is
    returned as-is and no network request is made.

    Args:
        url (str): The URL of the PDF file to download.

    Returns:
        str: The absolute path to the file on the local filesystem.

    Raises:
        ValueError: If no usable file name can be derived from ``url``.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Function-scope import so the block does not depend on the file's import header.
    from urllib.parse import urlparse

    # Use only the path component so "?query" / "#fragment" never end up in the name.
    filename = os.path.basename(urlparse(url).path)
    if filename in ("", ".", ".."):
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    # Cache hit: reuse the file that is already on disk.
    if os.path.exists(filename):
        return os.path.abspath(filename)

    # A timeout keeps the UI from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Fail before writing: otherwise an error page would be cached as the "PDF"
    # and returned on every later call.
    response.raise_for_status()

    with open(filename, "wb") as f:
        f.write(response.content)

    return os.path.abspath(filename)
49
+
50
def greet(pdf_file: gr.File, link: str, replacer_string):
    """
    Convert an uploaded or linked PDF into Markdown text with a metadata header.

    Args:
        pdf_file: The uploaded file object (its ``.name`` is used as the path),
            or ``None`` when nothing was uploaded.
        link (str): URL of a PDF to download when no file was uploaded.
        replacer_string (str): Comma-separated ``from=to`` pairs applied, in
            order, as plain-text replacements on the extracted text. A built-in
            default is used when this is empty.

    Returns:
        tuple[str, str]: The resulting Markdown twice (once for the rendered
        view, once for the raw text area), or a pair of prompt messages when
        neither a file nor a link was supplied.
    """
    # An empty textbox arrives as "" rather than None, so test truthiness.
    if pdf_file is None and not link:
        return "# Please updload file or link to ratchakitcha file", "Please add file"

    if not replacer_string:
        replacer_string = ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9'

    if pdf_file:
        pdf_path = pdf_file.name
    else:
        # The link must be handed to the downloader; calling it with no
        # argument raised TypeError on every link-only request.
        pdf_path = download_pdf(link.strip())

    doc = fitz.open(pdf_path)
    try:
        md_string = read_lines(doc)
    finally:
        # Release the file handle even if text extraction fails.
        doc.close()

    replacer = re.findall('(([^=]*)=([^,]*),?)', replacer_string)
    for _, s_from, s_to in replacer:
        # str.replace('') would insert s_to between every character.
        if s_from:
            md_string = md_string.replace(s_from, s_to)

    md_string = get_metainfo(md_string)
    return md_string, md_string
66
+
67
def get_metainfo(md_string):
    """
    Prepend a front-matter block built from the page header found in the text.

    The first header of the form
    ``หน้า <n> เล่ม <n> ตอนที่ <n> <letters> ราชกิจจานุเบกษา <n> <word> <n>``
    supplies the front-matter values; the word before the last number is
    looked up in the module-level ``months`` table and left as-is when it is
    not found there. Every such header in the text is then rewritten as a
    Markdown comment line (``[//]: # (...)``).

    Args:
        md_string (str): Text extracted from the document.

    Returns:
        str: The text with headers commented out and the front-matter block
        prepended, or ``md_string`` unchanged when no header is present.
    """
    # Raw strings: "\d" / "\s" in a normal literal are invalid escape sequences.
    fields_pattern = r'หน้า \d+\s+เล่ม (\d+) ตอนที่ (\d+) ([ก-ฮ]+) ราชกิจจานุเบกษา (\d+) ([^\s]+) (\d+)\s+'
    first_header = re.search(fields_pattern, md_string)
    if first_header is None:
        # No header to read metadata from; indexing the empty match list
        # here raised IndexError.
        return md_string

    info = list(first_header.groups())
    info[4] = months.get(info[4], info[4])

    header_pattern = r'(หน้า \d+\s+เล่ม \d+ ตอนที่ \d+ [ก-ฮ]+ ราชกิจจานุเบกษา \d+ [^\s]+ \d+)\s+'
    md_string = re.sub(header_pattern, r'\n[//]: # (\1)\n\n', md_string)

    md_string = """---
เล่ม: {}
ตอนที่: {}
ประเภท: {}
date: {}-{}-{}
---
""".format(*info) + md_string
    return md_string
83
+
84
def read_lines(doc):
    """
    Flatten the words of every page of ``doc`` into one text string.

    Words are ordered by their top coordinate, then their left coordinate.
    Each time the top coordinate changes, a line break is emitted: a single
    newline while the page is still in its "header" state, and a blank line
    afterwards when the word starts right of x=100 (presumably an indented
    paragraph start; confirm against real documents). The first drawing
    whose top edge lies strictly between two consecutive word rows ends the
    header state and is rendered as a ``----`` rule; later drawings on that
    page are ignored.

    Args:
        doc: Object exposing ``pages()``; each page must provide
            ``get_text_words()`` (8-item word tuples) and ``get_drawings()``
            (dicts with a ``'rect'`` entry that has a ``y0`` attribute).

    Returns:
        str: The concatenated text, each word followed by a space and each
        page terminated by a newline.
    """
    pieces = []
    for page in doc.pages():
        ordered_words = sorted(page.get_text_words(), key=lambda w: (w[1], w[0]))
        pending_rules = page.get_drawings()
        previous_top = 0
        in_header = True

        for left, top, _right, _bottom, text, _block, _line, _word in ordered_words:
            if top != previous_top:
                if in_header:
                    pieces.append('\n')
                elif left > 100:
                    pieces.append('\n\n')

                # A drawing between the previous row and this one separates
                # the header from the body; only the first hit is used.
                rule_crossed = any(
                    previous_top < shape['rect'].y0 < top for shape in pending_rules
                )
                if rule_crossed:
                    pieces.append('\n----\n\n')
                    pending_rules = []
                    in_header = False

            pieces.append(text.strip() + ' ')
            previous_top = top

        pieces.append('\n')
    return ''.join(pieces)
108
+
109
# Gradio front end: three inputs (PDF upload, PDF link, replacement rules)
# are passed to ``greet``; its two return values fill a rendered Markdown
# view and a plain text area. One example row is pre-filled with a link and
# the replacement rules.
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.File(),
        gr.Text(),
        gr.Text(interactive=True),
    ],
    outputs=[
        gr.Markdown(),
        gr.TextArea(),
    ],
    examples=[
        [
            None,
            'https://ratchakitcha.soc.go.th/documents/140A014N0000000002600.pdf',
            ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9',
        ],
    ],
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)