a-guy-from-burma committed on
Commit
b2f3d99
•
1 Parent(s): a854520

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -0
app.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ import json
4
+ from pyidaungsu import pds
5
+
6
+ # Function to split text into bite-sized paragraphs
7
def split_text_into_paragraphs(text, max_chars=1000):
    """Group Burmese sentences into paragraphs of bounded length.

    The text is cut at every Burmese full stop ("။"). Each fragment is
    stripped of surrounding whitespace, blank fragments are discarded, and
    the remaining sentences are re-terminated with "။" and packed greedily
    into paragraphs. A final fragment that had no terminator in the input
    still receives one.

    Args:
        text: Input text; may be empty.
        max_chars: A sentence joins the current paragraph only while the
            paragraph length plus the sentence length stays below this
            value. A single sentence that reaches the limit on its own is
            still emitted as its own paragraph.

    Returns:
        A list of non-empty paragraph strings. Empty or whitespace-only
        input yields an empty list.
    """
    sentence_end = "။"

    # Dropping blank fragments matters: text that already ends with "။"
    # produces a trailing empty piece, which would otherwise be turned into
    # a stray extra "။" (or a lone "။" paragraph for empty input).
    fragments = (piece.strip() for piece in text.split(sentence_end))
    sentences = [fragment for fragment in fragments if fragment]

    paragraphs = []
    current = []
    current_len = 0
    for sentence in sentences:
        # Only flush when something has been collected, so an oversized
        # first sentence never emits an empty paragraph.
        if current and current_len + len(sentence) >= max_chars:
            paragraphs.append("".join(current))
            current = []
            current_len = 0
        current.append(sentence + sentence_end)
        current_len += len(sentence) + len(sentence_end)

    if current:
        paragraphs.append("".join(current))

    return paragraphs
27
+
28
+ # Function to extract keywords using the pyidaungsu library
29
def extract_keywords(paragraph):
    """Return the word tokens of *paragraph* that are longer than one character.

    The paragraph is tokenized at word level with ``pds.tokenize`` and
    single-character tokens are dropped; this is a deliberately simple
    stand-in for real keyword extraction.
    """
    # NOTE(review): pyidaungsu is usually imported as
    # `import pyidaungsu as pds`; confirm that the `pds` name imported at the
    # top of this file actually resolves to the module exposing `tokenize`.
    tokens = pds.tokenize(paragraph, form="word")

    selected = []
    for token in tokens:
        if len(token) <= 1:
            continue
        selected.append(token)
    return selected
35
+
36
+ # Main processing function
37
def process_text(text):
    """Split *text* into paragraphs and return them with keywords as JSON.

    Returns a pretty-printed JSON array (non-ASCII characters kept as-is) in
    which each element has a "paragraph" string and its "keywords" list.
    """
    entries = [
        {"paragraph": chunk, "keywords": extract_keywords(chunk)}
        for chunk in split_text_into_paragraphs(text)
    ]
    return json.dumps(entries, ensure_ascii=False, indent=2)
44
+
45
+ # Create the Gradio interface
46
# Text-in / text-out UI wired to process_text.
iface = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Lazy Reader",
    description=(
        "This app takes large Burmese text without line breaks as input and "
        "outputs a JSON of each paragraph and keywords to fetch images "
        "related to the paragraph."
    ),
)

# Start serving the interface.
iface.launch()