Spaces:

napatswift
/

ratchakitcha-pdf2md

Runtime error

napatswift

Update

712498e over 2 years ago

4.02 kB

	import gradio as gr
	import fitz
	import re
	import os
	import requests

	# Thai month names in calendar order, mapped to their 1-based month number
	# (มกราคม/January -> 1 ... ธันวาคม/December -> 12). get_metainfo() looks a
	# month name up here when it builds the "date:" front-matter line, so the
	# numbering has to be 1-based for the emitted date to be a real calendar month.
	months = {
	    name: number
	    for number, name in enumerate(
	        (
	            "มกราคม",
	            "กุมภาพันธ์",
	            "มีนาคม",
	            "เมษายน",
	            "พฤษภาคม",
	            "มิถุนายน",
	            "กรกฎาคม",
	            "สิงหาคม",
	            "กันยายน",
	            "ตุลาคม",
	            "พฤศจิกายน",
	            "ธันวาคม",
	        ),
	        start=1,
	    )
	}

	def download_pdf(url):
	    """
	    Download the PDF at ``url`` into the current working directory.

	    The local filename is the last segment of the URL path (query string and
	    fragment are ignored). If a file with that name already exists it is reused
	    and nothing is downloaded.

	    Args:
	        url (str): The URL of the PDF file to download.

	    Returns:
	        str: Absolute path of the (existing or freshly downloaded) local file.

	    Raises:
	        ValueError: If no usable filename can be derived from ``url``.
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an HTTP error status.
	    """
	    # Function-scope import keeps the file's top-level import block untouched.
	    from urllib.parse import urlsplit

	    # Use only the path component so "?download=1" or "#page=2" never become
	    # part of the cached filename.
	    filename = urlsplit(url).path.rsplit("/", 1)[-1]
	    if filename in ("", ".", ".."):
	        raise ValueError("cannot derive a PDF filename from URL: {!r}".format(url))

	    local_path = os.path.abspath(filename)

	    # Reuse a previously downloaded copy.
	    if os.path.exists(filename):
	        return local_path

	    response = requests.get(url, timeout=30)
	    # Fail loudly on 4xx/5xx instead of caching an error page as if it were a PDF.
	    response.raise_for_status()

	    with open(filename, "wb") as f:
	        f.write(response.content)

	    return local_path

	def greet(pdf_file: gr.File, pdf_url: str, replacer_string):
	    """
	    Convert a PDF (uploaded file or URL) to Markdown text.

	    Args:
	        pdf_file: Uploaded file object (its ``.name`` is used as the path), or
	            a falsy value when nothing was uploaded.
	        pdf_url (str): URL of the PDF, used only when no file was uploaded.
	        replacer_string (str): Comma-separated ``from=to`` substitution rules
	            applied to the extracted text in the order given. ``from`` may be
	            several characters long and keeps its leading whitespace. When
	            empty, a built-in default rule set is used.

	    Returns:
	        tuple[str, str]: The Markdown text twice (one value per output
	        component), or a pair of hint messages when neither a file nor a URL
	        was supplied.
	    """
	    # An empty text box arrives as "" rather than None, so test truthiness.
	    pdf_url = (pdf_url or "").strip()
	    if not pdf_file and not pdf_url:
	        return "# Please upload file or link to ratchakitcha file", "Please add file"

	    if not replacer_string:
	        replacer_string = ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9'

	    pdf_path = pdf_file.name if pdf_file else download_pdf(pdf_url)

	    # Close the document even if text extraction raises.
	    doc = fitz.open(pdf_path)
	    try:
	        md_string = read_lines(doc)
	    finally:
	        doc.close()

	    # Each rule is "<from>=<to>". <from> must be non-empty and may span several
	    # characters (e.g. "้หนา=หน้า"); a single-character capture would turn the
	    # rule " า=ำ" into a global "า" -> "ำ" replacement.
	    for s_from, s_to in re.findall(r'([^=,]+)=([^,]*)', replacer_string):
	        md_string = md_string.replace(s_from, s_to)

	    md_string = get_metainfo(md_string)
	    return md_string, md_string

	def get_metainfo(md_string):
	    """
	    Prepend front matter built from the first page-header line found in the text.

	    A header line has the shape matched by the pattern below
	    ("หน้า N เล่ม N ตอน… N X ราชกิจจานุเบกษา N <month> N"). If none is found the
	    text is returned unchanged. Otherwise every header occurrence is rewritten
	    into a Markdown comment line (``[//]: # (...)``) and a ``---`` delimited
	    front-matter block built from the first occurrence is put in front.

	    Args:
	        md_string (str): Extracted document text.

	    Returns:
	        str: The text, with front matter and commented-out headers when a
	        header was found.
	    """
	    # One raw-string pattern serves both extraction and substitution: group 1
	    # is the whole header (without trailing whitespace), groups 2-7 its fields.
	    header = re.compile(
	        r'(หน้า \d+\s+เล่ม (\d+) ตอน[^\s]+ (\d+) ([ก-ฮ]+) ราชกิจจานุเบกษา (\d+) ([^\s]+) (\d+))\s+'
	    )

	    first = header.search(md_string)
	    if first is None:
	        return md_string

	    info = list(first.groups()[1:])
	    # Field 4 is the month word; swap in its number when the name is known,
	    # otherwise keep the original token.
	    info[4] = months.get(info[4], info[4])

	    md_string = header.sub(r'\n[//]: # (\1)\n\n', md_string)

	    # Built line by line so the emitted text does not depend on how this
	    # source file happens to be indented.
	    front_matter = '\n'.join([
	        '---',
	        'เล่ม: {}',
	        'ตอนที่: {}',
	        'ประเภท: {}',
	        'date: {}-{}-{}',
	        '---',
	        '',
	    ]).format(*info)
	    return front_matter + md_string

	# Concatenate the words of every page of `doc` into one string and return it.
	# `doc` must provide .pages(); each page must provide .get_text_words() and
	# .get_drawings() (presumably a PyMuPDF document -- confirm against the caller).
	# NOTE(review): the indentation of this block is not visible in this copy of
	# the file, so which statements sit inside which loop/branch must be
	# confirmed against the original before changing any logic.
	def read_lines(doc):
	lines = ''
	for page in doc.pages():
	words = page.get_text_words()
	# Order words by their second, then first tuple element (unpacked below as
	# y0 and x0).
	words.sort(key=lambda x: (x[1], x[0]))
	# y0 of the previously emitted word; a different y0 is treated as a new line.
	curr_y = 0
	drawings = page.get_drawings()
	# Starts True; set to False once the '----' marker is emitted below.
	is_header = True
	for x0,y0,x1,y1,text, _, _, _ in words:
	if y0 != curr_y:
	# While is_header is set a single newline is added; otherwise a blank line is
	# added only when x0 > 100 (presumably an indented paragraph start -- confirm).
	if is_header:
	lines += '\n'
	elif x0 > 100:
	lines += '\n\n'

	# Look for a drawing whose rect.y0 lies strictly between the previous word's y0
	# and this word's y0; on the first hit emit a '----' marker, drop the remaining
	# drawings, leave header mode and stop searching.
	for l in drawings:
	r = l['rect']
	if curr_y < r.y0 < y0:
	lines += '\n----\n\n'; drawings = []; is_header = False; break;

	# Append the word followed by a single space and remember its y0.
	lines += text.strip() + ' '
	curr_y = y0
	lines += '\n'
	return lines

	# Gradio front end: a file upload, a URL box and an editable replacement-rule
	# box feed greet(); its two return values go to a raw-text area and a
	# rendered-Markdown view.
	_input_components = [gr.File(), gr.Text(), gr.Text(interactive=True)]
	_output_components = [gr.TextArea(), gr.Markdown()]

	# One ready-made example: no upload, a gazette PDF URL, and the default rules.
	_example_rows = [
	    [
	        None,
	        'https://ratchakitcha.soc.go.th/documents/140A014N0000000002600.pdf',
	        ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9',
	    ],
	]

	demo = gr.Interface(
	    fn=greet,
	    inputs=_input_components,
	    outputs=_output_components,
	    examples=_example_rows,
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(debug=True)