Spaces:
Runtime error
Runtime error
import gradio as gr | |
import re | |
import json | |
from pyidaungsu import pds | |
def split_text_into_paragraphs(text: str, max_length: int = 1000) -> list:
    """Split unbroken Burmese text into bite-sized paragraphs.

    The text is cut into sentences at the Burmese full stop (U+104B),
    swallowing any whitespace that follows it. Sentences are then packed
    greedily, in order, into paragraphs whose length stays within
    ``max_length`` characters.

    Args:
        text: The raw input text.
        max_length: Upper bound on the length of a paragraph, counting the
            full stop re-attached to every sentence. A single sentence that
            is longer than the bound is kept whole as its own paragraph.

    Returns:
        A list of paragraph strings. Every sentence in a paragraph ends
        with the full stop, including a trailing fragment that had none in
        the input. Empty or whitespace-only input yields an empty list.
    """
    # U+104B MYANMAR SIGN SECTION, the Burmese sentence terminator.
    # NOTE(review): confirm this is the intended delimiter for the corpus.
    sentence_end = "\u104b"

    # One pattern that eats the terminator plus trailing whitespace; an
    # alternation of the form "X|X\s+" would never reach its second branch.
    pieces = re.split(re.escape(sentence_end) + r"\s*", text)

    # Drop empty pieces (e.g. the one after the final terminator) so no
    # paragraph ends up with a doubled full stop.
    sentences = [piece.strip() for piece in pieces]
    sentences = [sentence for sentence in sentences if sentence]

    paragraphs = []
    current = []
    current_length = 0
    for sentence in sentences:
        chunk = sentence + sentence_end
        # Flush only when something has been collected, so an oversized
        # first sentence never produces an empty paragraph.
        if current and current_length + len(chunk) > max_length:
            paragraphs.append("".join(current))
            current = []
            current_length = 0
        current.append(chunk)
        current_length += len(chunk)

    if current:
        paragraphs.append("".join(current))
    return paragraphs
def extract_keywords(paragraph):
    """Return the word tokens of ``paragraph`` that are longer than one character.

    Tokenization is delegated to ``pds.tokenize`` with ``form="word"``. The
    length filter is a deliberately simple stand-in for real keyword
    extraction; it only discards single-character tokens.
    """
    selected = []
    for token in pds.tokenize(paragraph, form="word"):
        # Single-character tokens are not treated as keywords.
        if len(token) <= 1:
            continue
        selected.append(token)
    return selected
def process_text(text):
    """Turn raw text into a JSON report of paragraphs and their keywords.

    The text is split with ``split_text_into_paragraphs``; each paragraph is
    paired with the output of ``extract_keywords``. The result is a JSON
    array of ``{"paragraph": ..., "keywords": ...}`` objects, pretty-printed
    with a two-space indent and with non-ASCII characters left unescaped.
    """
    entries = [
        {"paragraph": chunk, "keywords": extract_keywords(chunk)}
        for chunk in split_text_into_paragraphs(text)
    ]
    return json.dumps(entries, ensure_ascii=False, indent=2)
# Wire process_text into a Gradio interface: one text input, one text output.
iface = gr.Interface(
    title="Lazy Reader",
    description="This app takes large Burmese text without line breaks as input and outputs a JSON of each paragraph and keywords to fetch images related to the paragraph.",
    inputs="text",
    outputs="text",
    fn=process_text,
)

# Start serving the interface as soon as this module is executed.
iface.launch()