Spaces:
Runtime error
Runtime error
File size: 4,016 Bytes
c6d506e a845606 c6d506e 2872dd7 c6d506e a845606 c6d506e 097da64 c6d506e 712498e 2872dd7 c6d506e 097da64 c6d506e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import gradio as gr
import fitz
import re
import os
import requests
# Thai month names in calendar order (January .. December).
_THAI_MONTH_NAMES = (
    "มกราคม",
    "กุมภาพันธ์",
    "มีนาคม",
    "เมษายน",
    "พฤษภาคม",
    "มิถุนายน",
    "กรกฎาคม",
    "สิงหาคม",
    "กันยายน",
    "ตุลาคม",
    "พฤศจิกายน",
    "ธันวาคม",
)

# Maps a Thai month name to its 1-based month number (มกราคม -> 1, ธันวาคม -> 12).
# Numbering starts at 1 so the value can be written directly into a date field;
# a 0-based index would render January as month 0.
months = {name: number for number, name in enumerate(_THAI_MONTH_NAMES, start=1)}
def download_pdf(url):
    """
    Download a PDF from ``url`` into the current working directory.

    The local file name is the last ``/``-separated segment of the URL. If a
    file with that name already exists it is reused and no request is made.

    Args:
        url (str): The URL of the PDF file to download.

    Returns:
        str: Absolute path to the (existing or newly downloaded) local file.

    Raises:
        ValueError: If the URL has no final path segment to use as a file name
            (for example an empty string or a URL ending in ``/``).
        requests.HTTPError: If the server answers with an error status.
    """
    # Extract the filename from the URL
    filename = url.split("/")[-1]
    if not filename:
        raise ValueError("Cannot derive a file name from URL: {!r}".format(url))

    path = os.path.abspath(filename)

    # Reuse a previously downloaded copy instead of fetching it again
    if os.path.exists(path):
        return path

    # A timeout keeps the UI from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Fail before writing anything: otherwise an HTML error page would be
    # stored under the PDF's name and served from the cache on every later call.
    response.raise_for_status()

    with open(path, "wb") as f:
        f.write(response.content)

    return path
def greet(pdf_file: gr.File, pdf_url: str, replacer_string):
    """
    Convert a PDF (uploaded or fetched by URL) into Markdown text.

    The PDF text is extracted with ``read_lines``, the ``from=to`` replacement
    rules are applied in order, and ``get_metainfo`` adds the front matter.

    Args:
        pdf_file: Uploaded file object exposing a ``name`` path, or ``None``.
            Takes precedence over ``pdf_url`` when both are given.
        pdf_url (str): URL of the PDF, used only when no file was uploaded.
        replacer_string (str): Comma-separated ``from=to`` rules. Whitespace is
            significant (the default rules rely on a leading space). When
            empty, a built-in default rule set is used.

    Returns:
        tuple[str, str]: The same Markdown string twice, one per output
        component, or a pair of prompt messages when no input was supplied.
    """
    # Empty text boxes arrive as "" rather than None, so test for falsiness.
    has_url = bool(pdf_url and pdf_url.strip())
    if pdf_file is None and not has_url:
        return "# Please updload file or link to ratchakitcha file", "Please add file"

    if not replacer_string:
        replacer_string = ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9'

    if pdf_file:
        pdf_path = pdf_file.name
    else:
        pdf_path = download_pdf(pdf_url.strip())

    # Close the document even if text extraction fails.
    doc = fitz.open(pdf_path)
    try:
        md_string = read_lines(doc)
    finally:
        doc.close()

    for s_from, s_to in re.findall(r'([^=]*)=([^,]*),?', replacer_string):
        # str.replace('', x) would insert x between every character, so a rule
        # with an empty left-hand side is skipped.
        if s_from:
            md_string = md_string.replace(s_from, s_to)

    md_string = get_metainfo(md_string)
    return md_string, md_string
def get_metainfo(md_string):
    """
    Prepend a front-matter block built from the first page header found.

    The header is the line matching
    ``หน้า <n> เล่ม <n> ตอน… <n> <letters> ราชกิจจานุเบกษา <n> <word> <n>``.
    Every occurrence of that header in the text is rewritten as a Markdown
    comment (``[//]: # (...)``). The six values captured from the first
    occurrence fill the front matter, with the fifth one (the month word)
    translated through ``months`` when it is a known name.

    Args:
        md_string (str): Extracted document text.

    Returns:
        str: ``md_string`` unchanged when no header is found; otherwise the
        front matter followed by the text with headers commented out.
    """
    # Raw strings: \d and \s are regex escapes, not Python string escapes.
    fields_pattern = r'หน้า \d+\s+เล่ม (\d+) ตอน[^\s]+ (\d+) ([ก-ฮ]+) ราชกิจจานุเบกษา (\d+) ([^\s]+) (\d+)\s+'
    header_pattern = r'(หน้า \d+\s+เล่ม \d+ ตอน[^\s]+ \d+ [ก-ฮ]+ ราชกิจจานุเบกษา \d+ [^\s]+ \d+)\s+'

    # Only the first header is needed for the metadata, so stop at the first hit.
    first_header = re.search(fields_pattern, md_string)
    if first_header is None:
        return md_string

    info = list(first_header.groups())
    # Unknown month words are kept verbatim.
    info[4] = months.get(info[4], info[4])

    body = re.sub(header_pattern, r'\n[//]: # (\1)\n\n', md_string)
    front_matter = """---
เล่ม: {}
ตอนที่: {}
ประเภท: {}
date: {}-{}-{}
---
""".format(*info)
    return front_matter + body
def read_lines(doc):
    """
    Flatten the words of every page of ``doc`` into one text string.

    Words are visited top-to-bottom, then left-to-right. Each word is followed
    by a space. Whenever the vertical position changes:

    * while still in the page header, a single newline is emitted;
    * after the header, a blank line is emitted only if the word starts
      right of x = 100 (presumably an indented paragraph start — TODO confirm);
    * if a drawing's top edge lies between the previous and the current
      line, a ``----`` rule is emitted, the header ends, and the remaining
      drawings on the page are ignored.

    A newline is appended after each page.

    Args:
        doc: Object whose ``pages()`` yields pages providing
            ``get_text_words()`` (tuples starting ``x0, y0, x1, y1, text``)
            and ``get_drawings()`` (dicts with a ``'rect'`` that has ``y0``).

    Returns:
        str: The assembled text; ``''`` for a document without pages.
    """
    # Collect pieces and join once; repeated ``+=`` on a str is quadratic.
    chunks = []
    for page in doc.pages():
        words = sorted(page.get_text_words(), key=lambda w: (w[1], w[0]))
        drawings = page.get_drawings()
        curr_y = 0
        is_header = True  # every page starts inside its header

        for x0, y0, _x1, _y1, text, *_rest in words:
            if y0 != curr_y:
                if is_header:
                    chunks.append('\n')
                elif x0 > 100:
                    chunks.append('\n\n')

                # First drawing crossed between two text lines ends the header.
                if any(curr_y < d['rect'].y0 < y0 for d in drawings):
                    chunks.append('\n----\n\n')
                    drawings = []
                    is_header = False

            chunks.append(text.strip() + ' ')
            curr_y = y0

        chunks.append('\n')
    return ''.join(chunks)
# UI wiring: the three inputs map positionally onto greet(pdf_file, pdf_url,
# replacer_string); the two outputs receive the two strings greet returns.
demo = gr.Interface(
    fn=greet,
    inputs=[gr.File(), gr.Text(), gr.Text(interactive=True)],
    outputs=[gr.TextArea(), gr.Markdown()],
    # One example row: no uploaded file, a PDF URL, and the replacement rules.
    examples=[[
        None,
        'https://ratchakitcha.soc.go.th/documents/140A014N0000000002600.pdf',
        ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9',
    ]],
)

if __name__ == "__main__":
    demo.launch(debug=True)