# Hugging Face Space page status when this file was captured: Runtime error
import os
import re
from urllib.parse import urlsplit

import fitz
import gradio as gr
import requests
# Thai month names in calendar order.
_THAI_MONTH_NAMES = (
    "มกราคม",
    "กุมภาพันธ์",
    "มีนาคม",
    "เมษายน",
    "พฤษภาคม",
    "มิถุนายน",
    "กรกฎาคม",
    "สิงหาคม",
    "กันยายน",
    "ตุลาคม",
    "พฤศจิกายน",
    "ธันวาคม",
)

# Month name -> calendar month number. Numbering starts at 1 so that the
# value can be written straight into a date (มกราคม is month 1, not 0).
months = {name: number for number, name in enumerate(_THAI_MONTH_NAMES, start=1)}
def download_pdf(url, timeout=30):
    """
    Download a PDF from a URL into the current working directory.

    The local file name is the last path segment of the URL; the query string
    and fragment are ignored. If a file with that name already exists it is
    reused and no request is made.

    Args:
        url (str): The URL of the PDF file to download.
        timeout (float): Seconds passed to ``requests.get`` as its timeout, so
            an unresponsive server cannot block the caller indefinitely.

    Returns:
        str: The absolute path to the file on the local filesystem.

    Raises:
        ValueError: If no usable file name can be derived from the URL.
        requests.HTTPError: If the server answers with an error status.
    """
    # Only the path component names the file; "?a=b" / "#frag" must not leak
    # into the local file name.
    filename = urlsplit(url).path.rsplit("/", 1)[-1]
    if filename in ("", ".", ".."):
        raise ValueError("Cannot derive a file name from URL: {!r}".format(url))

    # Reuse a previously downloaded copy.
    if os.path.exists(filename):
        return os.path.abspath(filename)

    response = requests.get(url, timeout=timeout)
    # Fail before writing anything: otherwise an error page would be stored
    # under the PDF's name and then served from the cache on every later call.
    response.raise_for_status()

    with open(filename, "wb") as f:
        f.write(response.content)
    return os.path.abspath(filename)
def greet(pdf_file, pdf_url: str, replacer_string):
    """
    Convert a PDF (uploaded or downloaded) to Markdown text.

    Args:
        pdf_file: The uploaded file as delivered by the ``gr.File`` input:
            either an object exposing the local path as ``.name`` or a plain
            path string. ``None`` when nothing was uploaded.
        pdf_url (str): URL of a PDF to download; used only when no file was
            uploaded. ``None``, empty and whitespace-only values count as
            "not provided".
        replacer_string (str): Comma-separated ``from=to`` pairs applied to the
            extracted text in order. A built-in default is used when empty.

    Returns:
        tuple[str, str]: The Markdown text twice (one value per output
        component), or a pair of prompt messages when neither a file nor a
        URL was supplied.
    """
    # A cleared text box yields "" rather than None, so test for emptiness.
    pdf_url = (pdf_url or "").strip()
    if not pdf_file and not pdf_url:
        return "# Please upload file or link to ratchakitcha file", "Please add file"

    if not replacer_string:
        replacer_string = ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9'

    if pdf_file:
        # Accept both a file-like wrapper (path in .name) and a bare path.
        pdf_path = getattr(pdf_file, "name", pdf_file)
    else:
        pdf_path = download_pdf(pdf_url)

    # Close the document as soon as its text has been extracted.
    with fitz.open(pdf_path) as doc:
        md_string = read_lines(doc)

    for s_from, s_to in re.findall(r'([^=]*)=([^,]*),?', replacer_string):
        # str.replace('', x) would insert x between every character.
        if s_from:
            md_string = md_string.replace(s_from, s_to)

    md_string = get_metainfo(md_string)
    return md_string, md_string
# Running page header. Group 1 is the whole header (without the trailing
# whitespace); groups 2-7 are the six fields written into the front matter,
# in template order.
_PAGE_HEADER_RE = re.compile(
    r'(หน้า \d+\s+เล่ม (\d+) ตอน[^\s]+ (\d+) ([ก-ฮ]+) ราชกิจจานุเบกษา (\d+) ([^\s]+) (\d+))\s+'
)


def get_metainfo(md_string):
    """
    Prepend a front-matter block built from the first page header found.

    Every page header matched in ``md_string`` is rewritten into a
    ``[//]: # (...)`` Markdown comment line. The fields of the first header
    fill the front matter; its fifth field is looked up in the module-level
    ``months`` mapping and left unchanged when it is not a key there.

    Args:
        md_string (str): Text extracted from the PDF.

    Returns:
        str: ``md_string`` unchanged when no header matches; otherwise the
        front-matter block followed by the text with headers commented out.
    """
    first_header = _PAGE_HEADER_RE.search(md_string)
    if first_header is None:
        return md_string

    info = list(first_header.groups()[1:])
    info[4] = months.get(info[4], info[4])

    body = _PAGE_HEADER_RE.sub(r'\n[//]: # (\1)\n\n', md_string)
    front_matter = (
        "---\n"
        "เล่ม: {}\n"
        "ตอนที่: {}\n"
        "ประเภท: {}\n"
        "date: {}-{}-{}\n"
        "---\n"
    ).format(*info)
    return front_matter + body
def read_lines(doc):
    """
    Extract the text of every page of ``doc`` as one Markdown-ish string.

    Words are ordered by their top coordinate, then by their left coordinate,
    and emitted followed by a single space. A change of top coordinate marks a
    new visual line:

    * while still in the page header (before the first drawing has been
      passed), each new visual line starts on a new output line;
    * afterwards, a new visual line opens a new paragraph only when its first
      word starts right of x = 100; otherwise the text keeps flowing.

    The first drawing whose top lies strictly between two consecutive visual
    lines ends the header and is rendered as a ``----`` rule; the page's
    remaining drawings are then ignored. Each page ends with a newline.

    Args:
        doc: An opened document exposing ``pages()``; each page must provide
            ``get_text_words()`` and ``get_drawings()``.

    Returns:
        str: The concatenated text of all pages.
    """
    # Collect fragments and join once instead of growing a string with +=.
    pieces = []
    for page in doc.pages():
        ordered_words = sorted(page.get_text_words(), key=lambda w: (w[1], w[0]))
        drawings = page.get_drawings()
        in_header = True
        prev_y = 0

        for x0, y0, _, _, text, _, _, _ in ordered_words:
            if y0 != prev_y:
                if in_header:
                    pieces.append('\n')
                elif x0 > 100:
                    pieces.append('\n\n')

                for drawing in drawings:
                    if prev_y < drawing['rect'].y0 < y0:
                        pieces.append('\n----\n\n')
                        # Only the first separator matters; stop checking
                        # drawings for the rest of this page.
                        drawings = []
                        in_header = False
                        break

            pieces.append(text.strip() + ' ')
            prev_y = y0

        pieces.append('\n')
    return ''.join(pieces)
# Web UI wiring: three inputs (uploaded PDF, PDF URL, replacement rules) feed
# `greet`, whose two return values fill a plain text area and a rendered
# Markdown view. One ready-made example row uses a URL instead of an upload.
_example_row = [
    None,
    'https://ratchakitcha.soc.go.th/documents/140A014N0000000002600.pdf',
    ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9',
]

demo = gr.Interface(
    fn=greet,
    inputs=[gr.File(), gr.Text(), gr.Text(interactive=True)],
    outputs=[gr.TextArea(), gr.Markdown()],
    examples=[_example_row],
)

# Start the server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch(debug=True)