File size: 1,813 Bytes
b2f3d99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import gradio as gr
import re
import json
from pyidaungsu import pds

# Function to split text into bite-sized paragraphs
def split_text_into_paragraphs(text, max_chars=1000):
    """Group Burmese sentences into paragraphs of at most ``max_chars`` characters.

    The text is cut after every sentence terminator ("။"). Each sentence keeps
    its own terminator, surrounding whitespace is dropped, and consecutive
    sentences are concatenated until adding the next one would push the
    paragraph past ``max_chars``.

    Args:
        text: Input text. ``None`` or an empty/whitespace-only string yields
            an empty list.
        max_chars: Upper bound on paragraph length, terminators included.
            A single sentence longer than this becomes a paragraph on its own
            (sentences are never cut in the middle).

    Returns:
        A list of non-empty paragraph strings, in input order. No terminator
        is invented: trailing text without "။" is returned as written.
    """
    if not text:
        return []

    # Each match is either one sentence including its "။", or the trailing
    # remainder that has no terminator. Matching (instead of splitting) keeps
    # the original punctuation, so nothing has to be re-appended afterwards.
    pieces = re.findall(r'[^။]*။|[^။]+', text)

    paragraphs = []
    current = []
    current_len = 0
    for piece in pieces:
        sentence = piece.strip()
        if not sentence:
            # Whitespace-only remainder; nothing to emit.
            continue
        # Flush only when there is something to flush, so an oversized first
        # sentence never produces an empty paragraph.
        if current and current_len + len(sentence) > max_chars:
            paragraphs.append("".join(current))
            current = []
            current_len = 0
        current.append(sentence)
        current_len += len(sentence)
    if current:
        paragraphs.append("".join(current))

    return paragraphs

# Function to extract keywords using the pyidaungsu library
def extract_keywords(paragraph):
    """Return the word tokens of ``paragraph`` that are longer than one character.

    Tokenization is delegated to ``pds.tokenize(..., form="word")``; the
    length filter is the only keyword heuristic applied here.
    """
    # NOTE(review): the pyidaungsu README imports the package as
    # `import pyidaungsu as pds`; confirm that the `pds` name used here
    # resolves with this file's import line.
    selected = []
    for token in pds.tokenize(paragraph, form="word"):
        # Single-character tokens are dropped as too short to be keywords.
        if len(token) > 1:
            selected.append(token)
    return selected

# Main processing function
def process_text(text):
    """Split ``text`` into paragraphs and pair each one with its keywords.

    Returns a JSON string (non-ASCII characters kept as-is, 2-space indent)
    encoding a list of objects with the keys "paragraph" and "keywords".
    """
    entries = [
        {"paragraph": chunk, "keywords": extract_keywords(chunk)}
        for chunk in split_text_into_paragraphs(text)
    ]
    # ensure_ascii=False keeps the Burmese text readable instead of \u escapes.
    return json.dumps(entries, ensure_ascii=False, indent=2)

# Create the Gradio interface
# Wire process_text to a plain text-in / text-out Gradio UI.
iface = gr.Interface(
    process_text,
    "text",
    "text",
    description="This app takes large Burmese text without line breaks as input and outputs a JSON of each paragraph and keywords to fetch images related to the paragraph.",
    title="Lazy Reader",
)

# Start serving the interface.
iface.launch()