import json
import re

import gradio as gr
# NOTE(review): original had `from pyidaungsu import pds`, which raises
# ImportError — the library's documented usage is `import pyidaungsu as pds`.
import pyidaungsu as pds


def split_text_into_paragraphs(text):
    """Split Burmese *text* into bite-sized paragraphs.

    Sentences are delimited by the Burmese full stop ("။"); consecutive
    sentences are packed into one paragraph while its length stays under
    1000 characters.

    Returns a list of paragraph strings, each ending with "။".
    """
    # The original pattern r'။|။\s+' was broken: the first alternative always
    # matches first, so the `\s+` branch was unreachable and any whitespace
    # following "။" leaked into the start of the next sentence.
    sentences = re.split(r'။\s*', text)
    paragraphs = []
    paragraph = ""
    for sentence in sentences:
        # Skip empty fragments (e.g. the trailing '' produced when the text
        # ends with "။") so no spurious lone-"။" content is emitted.
        if not sentence.strip():
            continue
        if len(paragraph) + len(sentence) < 1000:  # size limit; adjust as needed
            paragraph += sentence + "။"
        else:
            paragraphs.append(paragraph.strip())
            paragraph = sentence + "။"
    if paragraph:
        paragraphs.append(paragraph.strip())
    return paragraphs


def extract_keywords(paragraph):
    """Extract candidate keywords from *paragraph*.

    Tokenizes with pyidaungsu word segmentation, then keeps every token
    longer than one character — a naive heuristic that could be improved
    with more advanced techniques.
    """
    words = pds.tokenize(paragraph, form="word")
    return [word for word in words if len(word) > 1]


def process_text(text):
    """Split *text* into paragraphs and pair each with its keywords.

    Returns a JSON string (non-ASCII preserved, 2-space indent) of a list
    of {"paragraph": ..., "keywords": [...]} objects.
    """
    result = [
        {"paragraph": para, "keywords": extract_keywords(para)}
        for para in split_text_into_paragraphs(text)
    ]
    return json.dumps(result, ensure_ascii=False, indent=2)


# Gradio UI: plain text in, JSON text out.
iface = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Lazy Reader",
    description="This app takes large Burmese text without line breaks as input and outputs a JSON of each paragraph and keywords to fetch images related to the paragraph."
)

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    iface.launch()