File size: 4,016 Bytes
c6d506e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a845606
 
c6d506e
 
2872dd7
 
 
c6d506e
 
 
a845606
c6d506e
 
 
 
 
 
 
 
 
097da64
c6d506e
712498e
2872dd7
c6d506e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
097da64
c6d506e
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import gradio as gr
import fitz
import re
import os
import requests

# Thai month name -> calendar month number (1 = January ... 12 = December).
# The numbers are 1-based so that the value can be written directly into the
# `date:` field of the generated front matter.
months = {
    "มกราคม": 1,
    "กุมภาพันธ์": 2,
    "มีนาคม": 3,
    "เมษายน": 4,
    "พฤษภาคม": 5,
    "มิถุนายน": 6,
    "กรกฎาคม": 7,
    "สิงหาคม": 8,
    "กันยายน": 9,
    "ตุลาคม": 10,
    "พฤศจิกายน": 11,
    "ธันวาคม": 12,
}

def download_pdf(url, timeout=30):
    """
    Downloads a PDF file from a given URL and saves it to the current working directory. If a file with the same
    name already exists there, the function returns the path to the existing file instead of downloading it again.

    Args:
        url (str): The URL of the PDF file to download.
        timeout (float): Seconds to wait for the server before giving up. Defaults to 30.

    Returns:
        str: The absolute path to the downloaded (or already cached) file on the local filesystem.

    Raises:
        ValueError: If no filename can be derived from the URL path.
        requests.HTTPError: If the server answers with an error status code.
    """
    from urllib.parse import urlsplit

    # Derive the filename from the URL *path* only, so that a query string or
    # fragment ("file.pdf?download=1") does not end up in the local filename.
    filename = os.path.basename(urlsplit(url).path)
    if not filename:
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    # Check if the file already exists on the local filesystem
    if os.path.exists(filename):
        # If it does, return the path to the existing file
        return os.path.abspath(filename)

    # If the file doesn't exist, download it from the URL
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx; otherwise the error page would be written to disk
    # and then returned as the "cached PDF" on every subsequent call.
    response.raise_for_status()

    # Save the downloaded file to the local filesystem
    with open(filename, "wb") as f:
        f.write(response.content)

    # Return the path to the downloaded file
    return os.path.abspath(filename)

def greet(pdf_file, pdf_url: str, replacer_string):
    """Convert a PDF (uploaded or fetched from a URL) into Markdown text.

    Args:
        pdf_file: The uploaded file, or None/empty when nothing was uploaded.
            Either an object exposing the local path as ``.name`` or a plain
            path string is accepted.
            NOTE(review): which of the two Gradio passes depends on the
            installed Gradio version -- confirm.
        pdf_url: URL of the PDF; only used when no file was uploaded.
        replacer_string: Comma-separated ``from=to`` replacement rules applied
            to the extracted text. A built-in default is used when empty.

    Returns:
        tuple[str, str]: The generated Markdown twice (once for each output
        component), or a pair of hint messages when neither a file nor a URL
        was supplied.
    """
    # An untouched text box may arrive as "" rather than None, so test for
    # emptiness instead of identity.
    pdf_url = (pdf_url or "").strip()
    if not pdf_file and not pdf_url:
        return "# Please upload file or link to ratchakitcha file", "Please add file"

    if not replacer_string:
        replacer_string = ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9'

    if pdf_file:
        pdf_path = getattr(pdf_file, "name", pdf_file)
    else:
        pdf_path = download_pdf(pdf_url)

    doc = fitz.open(pdf_path)
    try:
        md_string = read_lines(doc)
    finally:
        # Release the file handle even if text extraction fails.
        doc.close()

    # Each rule is "<from>=<to>", rules are separated by commas. Leading
    # spaces are part of <from> on purpose, so nothing is stripped here.
    for s_from, s_to in re.findall('([^=]*)=([^,]*),?', replacer_string):
        # str.replace('', x) would insert x between every character.
        if s_from:
            md_string = md_string.replace(s_from, s_to)

    md_string = get_metainfo(md_string)
    return md_string, md_string

def get_metainfo(md_string):
    """Prepend front matter built from the first page-header line.

    The header line is matched by one regular expression that captures, in
    order, the values written to the เล่ม, ตอนที่ and ประเภท fields and the
    three components of the ``date`` field. When a header is found:

    * every header occurrence in the text is turned into a Markdown comment
      (``[//]: # (...)``), and
    * a ``---`` delimited front-matter block built from the FIRST occurrence
      is prepended.

    Args:
        md_string (str): Extracted text.

    Returns:
        str: The text with front matter prepended, or ``md_string`` unchanged
        when no header line is found.
    """
    # Raw string: \d and \s are regex escapes, not Python string escapes.
    # Group 1 is the whole header (without trailing whitespace); groups 2-7
    # are the individual fields.
    header = re.compile(
        r'(หน้า \d+\s+เล่ม (\d+) ตอน[^\s]+ (\d+) ([ก-ฮ]+) ราชกิจจานุเบกษา (\d+) ([^\s]+) (\d+))\s+'
    )
    first = header.search(md_string)
    if first is None:
        return md_string

    volume, issue, category, day, month, year = first.groups()[1:]
    # Translate the month name via the module-level table; fall back to the
    # raw text when the name is not in the table.
    month = months.get(month, month)

    body = header.sub(r'\n[//]: # (\1)\n\n', md_string)

    front_matter = """---
เล่ม: {}
ตอนที่: {}
ประเภท: {}
date: {}-{}-{}
---
""".format(volume, issue, category, day, month, year)
    return front_matter + body

def read_lines(doc):
    """Flatten the words of every page of ``doc`` into one text string.

    Per page, words are ordered by ``(y0, x0)`` and emitted separated by a
    single space. Whenever the ``y0`` of a word differs from the previous
    word's, a new visual line starts:

    * while still in the header part of the page, a ``'\\n'`` is emitted;
    * afterwards, a ``'\\n\\n'`` is emitted only when the line starts at
      ``x0 > 100``; other lines are joined to the running text.

    The header part ends at the first drawing whose ``rect.y0`` lies strictly
    between the previous and the current line; a ``----`` rule is emitted
    there and the page's remaining drawings are ignored. Each page ends with
    a ``'\\n'``.

    Args:
        doc: Object whose ``pages()`` yields pages providing
            ``get_text_words()`` (8-tuples ``x0, y0, x1, y1, text, ...``) and
            ``get_drawings()`` (dicts with a ``'rect'`` entry).

    Returns:
        str: The concatenated text of all pages.
    """
    pieces = []
    for page in doc.pages():
        ordered = sorted(page.get_text_words(), key=lambda w: (w[1], w[0]))
        rules = page.get_drawings()
        in_header = True
        prev_y = 0
        for word in ordered:
            left, top, _, _, text, _, _, _ = word
            if top != prev_y:
                if in_header:
                    pieces.append('\n')
                elif left > 100:
                    pieces.append('\n\n')

                # A drawing between the previous and the current line marks
                # the end of the header; only the first one is honoured.
                if any(prev_y < shape['rect'].y0 < top for shape in rules):
                    pieces.append('\n----\n\n')
                    rules = []
                    in_header = False

            pieces.append(text.strip() + ' ')
            prev_y = top
        pieces.append('\n')
    return ''.join(pieces)

# Web UI: three inputs feed `greet`, whose two return values fill the outputs.
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.File(),                  # 1st argument of greet: uploaded PDF
        gr.Text(),                  # 2nd argument of greet: URL of the PDF
        gr.Text(interactive=True),  # 3rd argument of greet: replacement rules
    ],
    outputs=[
        gr.TextArea(),  # first return value, shown as plain text
        gr.Markdown(),  # second return value, rendered as Markdown
    ],
    examples=[
        [
            None,
            'https://ratchakitcha.soc.go.th/documents/140A014N0000000002600.pdf',
            ' ำ=ำ, า=ำ,้หนา=หน้า,่เลม=เล่ม,๐=0,๑=1,๒=2,๓=3,๔=4,๕=5,๖=6,๗=7,๘=8,๙=9',
        ],
    ],
)

if __name__ == "__main__":
    demo.launch(debug=True)