Dataset pages added

- .gitignore +2 -1
- fonts/TiroDevanagariHindi-Regular.ttf +0 -0
- requirments.txt +5 -334
- src/Demo.py +4 -8
- src/{ModelMethods.py → api/ModelMethods.py} +0 -0
- src/pages/LiteratureReview.py +0 -8
- src/pages/References.py +0 -14
- src/pages/📈DatasetAnalysis.py +540 -0
- src/pages/📊DatasetsPreparation.py +59 -0
.gitignore
CHANGED
@@ -1,5 +1,6 @@
 # Folders to ignore
-
+models/
+datafiles/
 model-local/
 __pycache__/
 src/__pycache__/
fonts/TiroDevanagariHindi-Regular.ttf
ADDED
Binary file (415 kB).
requirments.txt
CHANGED
@@ -1,334 +1,5 @@
-
-
-
-
-
-altgraph==0.17.3
-aniso8601==9.0.1
-annotated-types==0.6.0
-anyio==4.2.0
-appdirs==1.4.4
-argon2-cffi==23.1.0
-argon2-cffi-bindings==21.2.0
-arrow==1.3.0
-asgiref==3.5.0
-asttokens==2.4.1
-astunparse==1.6.3
-async-lru==2.0.4
-async-timeout==4.0.2
-attrs==22.1.0
-autopep8==1.6.0
-Babel==2.14.0
-backports.csv==1.0.7
-backports.entry-points-selectable==1.1.1
-beautifulsoup4==4.10.0
-bitsandbytes==0.42.0
-bleach==6.1.0
-blinker==1.5
-blis==0.7.11
-boto3==1.34.19
-botocore==1.34.19
-branca==0.7.0
-cachetools==5.2.0
-catalogue==2.0.10
-certifi==2022.6.15
-cffi==1.15.1
-chardet==4.0.0
-charset-normalizer==2.1.1
-cheroot==8.6.0
-CherryPy==18.6.1
-click==8.1.3
-cloudpathlib==0.16.0
-colorama==0.4.5
-comm==0.2.0
-commonmark==0.9.1
-confection==0.1.4
-contextualSpellCheck==0.4.4
-contourpy==1.0.6
-cryptography==38.0.1
-cycler==0.11.0
-cymem==2.0.8
-Cython==3.0.6
-datasets==2.16.1
-dateparser==1.1.0
-debugpy==1.8.0
-decorator==5.1.1
-defusedxml==0.7.1
-dill==0.3.7
-distlib==0.3.4
-dj-database-url==0.5.0
-Django==4.0.4
-django-ckeditor==6.2.0
-django-crispy-forms==1.14.0
-django-heroku==0.3.1
-django-js-asset==1.2.2
-docker-pycreds==0.4.0
-docopt==0.6.2
-docutils==0.20.1
-docx2pdf==0.1.8
-editdistance==0.6.2
-einops==0.7.0
-entrypoints==0.4
-et-xmlfile==1.1.0
-evaluate==0.4.0
-exceptiongroup==1.2.0
-executing==2.0.1
-ez-setup==0.9
-fastjsonschema==2.19.1
-feedparser==6.0.8
-filelock==3.4.0
-Flask==2.2.2
-Flask-API==3.0.post1
-Flask-Cors==3.0.10
-Flask-RESTful==0.3.9
-Flask-SQLAlchemy==2.5.1
-flatbuffers==23.5.26
-fonttools==4.38.0
-fqdn==1.5.1
-frozenlist==1.3.3
-fsspec==2023.10.0
-future==0.18.2
-gast==0.5.4
-gitdb==4.0.10
-GitPython==3.1.29
-google==3.0.0
-google-auth==2.25.2
-google-auth-oauthlib==1.2.0
-google-pasta==0.2.0
-GoogleNews==1.6.0
-greenlet==1.1.3
-grpcio==1.60.0
-gunicorn==20.1.0
-h5py==3.10.0
-happytransformer==3.0.0
-heroku==0.1.4
-huggingface-hub==0.20.1
-idna==3.3
-imageio==2.19.3
-imageio-ffmpeg==0.4.7
-importlib-metadata==5.1.0
-instaloader==4.9.6
-ipykernel==6.28.0
-ipyleaflet==0.18.1
-ipython==8.19.0
-ipywidgets==8.1.1
-isoduration==20.11.0
-itsdangerous==2.1.2
-jaraco.classes==3.2.1
-jaraco.collections==3.5.1
-jaraco.context==4.1.1
-jaraco.functools==3.5.0
-jaraco.text==3.7.0
-jedi==0.19.1
-Jinja2==3.1.2
-jmespath==1.0.1
-joblib==1.3.2
-json5==0.9.14
-jsonlines==4.0.0
-jsonpointer==2.4
-jsonschema==4.17.3
-jsonschema-specifications==2023.12.1
-jupyter-events==0.9.0
-jupyter-lsp==2.2.1
-jupyter_client==8.6.0
-jupyter_core==5.5.1
-jupyter_server==2.12.1
-jupyter_server_terminals==0.5.1
-jupyterlab==4.0.10
-jupyterlab-widgets==3.0.9
-jupyterlab_pygments==0.3.0
-jupyterlab_server==2.25.2
-jwt==1.3.1
-keras==2.15.0
-keyring==24.3.0
-kiwisolver==1.4.4
-langcodes==3.3.0
-Levenshtein==0.23.0
-libclang==16.0.6
-loralib==0.1.2
-lxml==4.9.1
-Markdown==3.5.1
-markdown-it-py==3.0.0
-MarkupSafe==2.1.1
-matplotlib==3.7.1
-matplotlib-inline==0.1.6
-mdurl==0.1.2
-mistune==3.0.2
-ml-dtypes==0.2.0
-more-itertools==8.12.0
-moviepy==1.0.3
-mpmath==1.3.0
-multidict==6.0.4
-multiprocess==0.70.15
-multitasking==0.0.11
-murmurhash==1.0.10
-mysql-connector-python==8.0.31
-mysqlclient==2.1.0
-nbclient==0.9.0
-nbconvert==7.13.1
-nbformat==5.9.2
-nest-asyncio==1.5.8
-networkx==3.2.1
-news==1.0
-nh3==0.2.15
-nltk==3.7
-notebook_shim==0.2.3
-numpy==1.23.5
-oauthlib==3.2.2
-openai==0.27.2
-openpyxl==3.1.2
-opt-einsum==3.3.0
-overrides==7.4.0
-packaging==21.3
-pafy==0.5.5
-pandas==1.5.2
-pandocfilters==1.5.0
-parso==0.8.3
-Pattern==3.6
-pdfminer.six==20211012
-pefile==2023.2.7
-peft==0.6.0
-Pillow==9.3.0
-pipreqs==0.4.11
-pkginfo==1.9.6
-platformdirs==4.1.0
-portalocker==2.8.2
-portend==3.1.0
-preshed==3.0.9
-proglog==0.1.10
-prometheus-client==0.19.0
-prompt-toolkit==3.0.43
-protobuf==3.20.1
-psutil==5.9.7
-psycopg2==2.9.3
-pure-eval==0.2.2
-pyarrow==10.0.1
-pyarrow-hotfix==0.6
-pyasn1==0.4.8
-pyasn1-modules==0.3.0
-pycodestyle==2.8.0
-pycparser==2.21
-pydantic==2.5.3
-pydantic_core==2.14.6
-pydeck==0.8.0
-Pygments==2.13.0
-pyinstaller==5.13.0
-pyinstaller-hooks-contrib==2023.6
-PyJWT==2.4.0
-Pympler==1.0.1
-PyMuPDF==1.23.12
-PyMuPDFb==1.23.9
-pyparsing==3.0.9
-PyQt5==5.15.10
-PyQt5-Qt5==5.15.2
-PyQt5-sip==12.13.0
-pyrsistent==0.19.2
-python-dateutil==2.8.2
-python-docx==0.8.11
-python-dotenv==1.0.0
-python-json-logger==2.0.7
-pytorch-pretrained-bert==0.6.2
-pytube==12.1.0
-pytz==2022.2.1
-pytz-deprecation-shim==0.1.0.post0
-pywin32==306
-pywin32-ctypes==0.2.2
-pywinpty==2.0.12
-PyYAML==6.0.1
-pyzmq==25.1.2
-rapidfuzz==3.6.1
-readme-renderer==42.0
-referencing==0.32.0
-regex==2021.11.10
-requests==2.28.1
-requests-oauthlib==1.3.1
-requests-toolbelt==1.0.0
-responses==0.18.0
-rfc3339-validator==0.1.4
-rfc3986==2.0.0
-rfc3986-validator==0.1.1
-rich==12.6.0
-rouge-score==0.1.2
-rpds-py==0.16.2
-rsa==4.8
-s3transfer==0.10.0
-safetensors==0.4.1
-scikit-learn==1.4.0
-scipy==1.8.0
-seaborn==0.13.0
-semver==2.13.0
-Send2Trash==1.8.2
-sentencepiece==0.1.99
-sentry-sdk==1.39.2
-setproctitle==1.3.3
-sgmllib3k==1.0.0
-six==1.16.0
-smart-open==6.4.0
-smmap==5.0.0
-sniffio==1.3.0
-soupsieve==2.3.1
-spacy==3.7.2
-spacy-legacy==3.0.12
-spacy-loggers==1.0.5
-SQLAlchemy==1.4.41
-sqlparse==0.4.2
-srsly==2.4.8
-stack-data==0.6.3
-streamlit==1.15.1
-streamlit-menu==1.0.9
-streamlit-option-menu==0.3.12
-sympy==1.12
-tempora==5.0.1
-tenacity==8.2.3
-tensorboard==2.15.1
-tensorboard-data-server==0.7.2
-tensorflow==2.15.0
-tensorflow-estimator==2.15.0
-tensorflow-intel==2.15.0
-tensorflow-io-gcs-filesystem==0.31.0
-termcolor==2.4.0
-terminado==0.18.0
-test-nep-spell-synthetic-datautils==0.1.0
-thinc==8.2.2
-threadpoolctl==3.2.0
-tinycss2==1.2.1
-tokenizers==0.15.1
-toml==0.10.2
-tomli==2.0.1
-toolz==0.12.0
-torch==1.13.1
-torchdata==0.5.1
-tornado==6.2
-tqdm==4.63.0
-traitlets==5.14.0
-traittypes==0.2.1
-transformers @ git+https://github.com/huggingface/transformers.git@5b5e71dc41734a9798f3535bbd5039ab91883079
-twine==5.0.0
-typer==0.9.0
-types-python-dateutil==2.8.19.14
-typing_extensions==4.4.0
-tzdata==2022.7
-tzlocal==4.2
-uri-template==1.3.0
-urllib3==1.26.12
-validators==0.20.0
-virtualenv==20.10.0
-wandb==0.16.2
-wasabi==1.1.2
-watchdog==2.1.9
-wcwidth==0.2.12
-weasel==0.3.4
-webcolors==1.13
-webencodings==0.5.1
-websocket-client==1.7.0
-Werkzeug==2.2.2
-whitenoise==6.0.0
-widgetsnbextension==4.0.9
-wrapt==1.14.1
-xxhash==3.4.1
-xyzservices==2023.10.1
-yarg==0.1.9
-yarl==1.8.2
-yfinance==0.1.87
-zc.lockfile==2.0
-zipp==3.11.0
+transformers
+streamlit
+wordcloud
+matplotlib
+pandas
src/Demo.py
CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd

-from ModelMethods import generate
+from api.ModelMethods import generate


 st.set_page_config(page_title="DEMO", page_icon="👋", layout="wide")
@@ -27,7 +27,7 @@ def main():

     st.header(appTitle)
     left_column, right_column = st.columns(2)
-    correctedText= None
+    correctedText = None

     with left_column:
         model_options = {"mT5", "mBART", "VartaT5"}
@@ -41,11 +41,7 @@ def main():
         selected_example_text = examples[selected_example_key]

         # Get user input
-        user_input = st.text_area(
-            "Enter a Nepali Sentence: ",
-            selected_example_text,
-            max_chars=512,  # Set the maximum input length to 512 characters
-        )
+        user_input = st.text_area("Enter a Nepali Sentence: ", selected_example_text)
         if st.button("Check Spelling"):
             if user_input:
                 correctedText = generate(selected_model, user_input)
@@ -58,7 +54,7 @@ def main():
     if correctedText is not None:
         st.write("Corrected Text:")
         # st.write([f"{line['score']:.2f}: {line['sequence']}" for line in correctedText])
-        df = pd.DataFrame(correctedText, columns=["score","sequence"])
+        df = pd.DataFrame(correctedText, columns=["score", "sequence"])
         st.table(df)
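For context, Demo.py's only behavioral change here is importing generate from the relocated api/ModelMethods.py; that module is moved without changes, so its body does not appear in this diff. Judging from the call site (generate(selected_model, user_input) feeding a DataFrame with "score" and "sequence" columns), a hedged sketch of the shape such a function could take is below. All names and checkpoint paths in it are hypothetical, assuming Hugging Face seq2seq checkpoints:

# Hypothetical sketch only; the real api/ModelMethods.py is unchanged and not shown in this diff.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Assumed checkpoint locations; the actual paths are not visible in this commit.
_MODEL_PATHS = {"mT5": "models/mt5", "mBART": "models/mbart", "VartaT5": "models/vartat5"}

def generate(model_name, text, num_candidates=3):
    tokenizer = AutoTokenizer.from_pretrained(_MODEL_PATHS[model_name])
    model = AutoModelForSeq2SeqLM.from_pretrained(_MODEL_PATHS[model_name])
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        **inputs,
        num_beams=num_candidates,
        num_return_sequences=num_candidates,
        return_dict_in_generate=True,
        output_scores=True,
    )
    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    scores = outputs.sequences_scores.tolist()  # one beam score per candidate
    # Demo.py turns this into pd.DataFrame(..., columns=["score", "sequence"])
    return [{"score": s, "sequence": seq} for s, seq in zip(scores, sequences)]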
src/{ModelMethods.py → api/ModelMethods.py}
RENAMED
File without changes
src/pages/LiteratureReview.py
DELETED
@@ -1,8 +0,0 @@
-import streamlit as st
-
-st.set_page_config(
-    page_title="Literature Review",
-    page_icon="👋",
-)
-
-st.write("LiteratureReview")
src/pages/References.py
DELETED
@@ -1,14 +0,0 @@
-import streamlit as st
-
-st.set_page_config(
-    page_title="References",
-    page_icon="👋",
-    layout="wide"
-)
-
-
-
-
-st.sidebar.header("Plotting Demo")
-
-st.write("References Here")
src/pages/📈DatasetAnalysis.py
ADDED
@@ -0,0 +1,540 @@
+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+from collections import Counter
+from wordcloud import WordCloud
+from matplotlib.font_manager import FontProperties
+
+st.set_page_config(page_title="Datasets Analysis", page_icon="👋", layout="wide")
+
+
+data100k = "../datafiles/nep_spell_100k.csv"
+
+# Prepare the dataframe
+df = pd.read_csv(data100k)
+
+# Count words
+df["num_words"] = df["Correct"].apply(lambda x: len(x.split()))
+
+# Count the number of sentences for each number of words
+word_counts = df["num_words"].value_counts().sort_index()
+
+# Create a Streamlit app
+st.title("Dataset Analysis")
+
+st.subheader("Word Count Analysis")
+# Display the DataFrame (optional)
+st.write(df)
+st.write("---")
+# Plot the data
+plt.figure(figsize=(10, 6))
+plt.bar(word_counts.index, word_counts.values, color="skyblue")
+plt.xlabel("Number of Words in Sentence")
+plt.ylabel("Number of Sentences")
+plt.title("Number of Words vs. Number of Sentences")
+plt.grid(True)
+
+# Limit the x-axis range to 70 words
+plt.xlim(0, 70)
+
+# Save the plot as an image file (optional)
+# plt.savefig("word_count_plot.png", dpi=300)
+
+# Display the plot in Streamlit
+st.pyplot(plt)
+
+st.write("---")
+
+#########################
+# Concatenate all sentences into a single string
+all_sentences = " ".join(df["Correct"])
+
+# Tokenize the sentences and calculate word frequency
+words = all_sentences.split()
+word_freq = Counter(words)
+
+# Consider the top 1000 most common words
+top_words = word_freq.most_common(1000)
+
+# Generate the corpus for the word cloud
+corpus = {}
+for word, frequency in top_words:
+    corpus[word] = frequency
+
+# Define the font file path
+font1 = "../fonts/TiroDevanagariHindi-Regular.ttf"
+
+# Generate the word cloud
+wordcloud_most_common = WordCloud(
+    width=1000,
+    height=500,
+    background_color="white",
+    min_font_size=10,
+    regexp=r"[\u0900-\u097F]+",
+    font_path=font1,
+).generate_from_frequencies(corpus)
+
+
+# Display the word cloud using Streamlit
+st.subheader("Word Cloud of Most Frequent Words in Correct Sentences")
+st.image(wordcloud_most_common.to_array(), use_column_width=True)
+############################################
+# Word cloud of least common words
+st.write("---")
+
+
+# Consider the 1000 least frequent words
+least_common_words = word_freq.most_common()[: -1000 - 1 : -1]
+
+# Generate the corpus for the word cloud
+corpus = {}
+for word, frequency in least_common_words:
+    corpus[word] = frequency
+
+# Generate the word cloud for least frequent words
+wordcloud_least_frequent = WordCloud(
+    width=1000,
+    height=500,
+    background_color="white",
+    min_font_size=10,
+    regexp=r"[\u0900-\u097F]+",
+    font_path=font1,
+).generate_from_frequencies(corpus)
+
+# Display the word cloud using Streamlit
+st.header("Word Cloud of Least Frequent Words in Correct Sentences")
+st.image(wordcloud_least_frequent.to_array(), use_column_width=True)
+
+
+########################################
+st.write("---")
+
+# Data
+char_seq_in = ["ि", "ी", "ु", "ू", "इ", "ई", "उ", "ऊ", "श", "श", "स", "स",
+               "ष", "ष", "ब", "व", "त", "ट", "द", "ध", "ं", "ँ"]
+char_seq_out = ["ी", "ि", "ू", "ु", "ई", "इ", "ऊ", "उ", "स", "ष", "श", "ष",
+                "श", "स", "व", "ब", "ट", "त", "ध", "द", "ँ", "ं"]
+datapoints_in_percentage = [5, 5, 5, 5, 2.5, 2.5, 2.5, 2.5, 1.5, 0.5, 1.5,
+                            0.5, 0.5, 0.5, 1, 1, 1, 0.6, 0.5, 0.5, 1, 1]
+
+# Plot
+plt.figure(figsize=(10, 6))
+plt.bar(char_seq_in, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character Sequence (Input)")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Substitution Errors")
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(char_seq_in, char_seq_in, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image (optional)
+# plt.savefig("character_substitution.png", dpi=300, bbox_inches="tight")
+# Show plot
+plt.tight_layout()
+# Display the plot in Streamlit
+st.subheader("Character Substitution Error")
+st.pyplot(plt)
+
+##################################
+
+st.write("---")
+
+# Existing data
+characters = [" ", "ा", "ि", "ी", "ु", "ू", "े", "ै", "ो", "ौ", "ृ", "्", "ः",
+              "क", "ख", "ग", "घ", "ङ", "च", "छ", "ज", "झ", "ञ", "ट", "ठ",
+              "ड", "ढ", "ण", "त", "थ", "द", "ध", "न", "प", "फ", "ब", "भ",
+              "म", "य", "र", "ल", "व", "श", "स", "ष", "ह", "अ", "आ", "इ",
+              "ई", "उ", "ऊ", "ऋ", "ए", "ऐ", "ओ", "औ"]
+datapoints_in_percentage = [1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1, 1, 1, 1, 1.2,
+                            1, 0.5] + [0.25] * 44  # remaining consonants and vowels
+
+# Additional data
+additional_characters = ["क्ष", "त्र", "ज्ञ", "अं", "अः"]
+additional_datapoints_in_percentage = [0.15, 0.15, 0.15, 0.15, 0.15]
+
+# Combine the existing and additional data
+characters += additional_characters
+datapoints_in_percentage += additional_datapoints_in_percentage
+
+# Plot
+plt.figure(figsize=(12, 6))
+plt.bar(characters, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Addition Errors")
+plt.xticks(rotation=90)
+
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(characters, characters, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image (optional)
+# plt.savefig("character_addition.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+plt.tight_layout()
+st.subheader("Character Addition Error")
+st.pyplot(plt)
+############################################################
+
+st.write("---")
+
+# Data
+characters = [" ", "ा", "ि", "ी", "ु", "ू", "े", "ै", "ो", "ौ", "ृ", "्", "ः",
+              "क", "ख", "ग", "घ", "ङ", "च", "छ", "ज", "झ", "ञ", "ट", "ठ",
+              "ड", "ढ", "ण", "त", "थ", "द", "ध", "न", "प", "फ", "ब", "भ",
+              "म", "य", "र", "ल", "व", "श", "स", "ष", "ह", "अ", "आ", "इ",
+              "ई", "उ", "ऊ", "ऋ", "ए", "ऐ", "ओ", "औ", "क्ष", "त्र", "ज्ञ",
+              "अं", "अः"]
+datapoints_in_percentage = [1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1, 1, 1, 1, 1,
+                            1.25, 0.5] + [0.25] * 44 + [0.15] * 5
+
+# Plot
+plt.figure(figsize=(10, 6))
+plt.bar(characters, datapoints_in_percentage, color="skyblue")
+plt.xlabel("Character")
+plt.ylabel("Percentage of Datapoints")
+plt.title("Distribution of Character Deletion Errors")
+plt.xticks(rotation=90)
+
+# Specify font properties
+font_prop = FontProperties(fname=font1)
+plt.xticks(characters, characters, fontproperties=font_prop)
+
+plt.grid(axis="y")
+
+# Save the image (optional)
+# plt.savefig("character_deletion.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+plt.tight_layout()
+
+st.subheader("Character Deletion Error")
+st.pyplot(plt)
+############################################
+
+
+st.write("---")
+
+# Data
+error_types = ["Deletion", "Addition", "Substitution", "Double Substitution"]
+error_percentages = [28.5, 28.45, 40.1, 2.95]
+
+# Create horizontal bar graph
+plt.figure(figsize=(10, 6))
+plt.barh(error_types, error_percentages)
+
+# Add labels and title
+plt.xlabel("Error Percentage")
+plt.ylabel("Error Type")
+plt.title("Error Types Distribution")
+
+# Save the image (optional)
+# plt.savefig("error_type_distribution.png", dpi=300, bbox_inches="tight")
+
+# Show plot
+st.subheader("Distribution of Error Types")
+st.pyplot(plt)
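The Devanagari-specific pieces in the page above are font_path (the stock WordCloud/matplotlib fonts carry no Devanagari glyphs, so without the bundled Tiro font every word would render as empty boxes) and regexp, which only takes effect when WordCloud tokenizes raw text itself via generate(); generate_from_frequencies() simply uses whatever tokenization produced the counts. A minimal standalone sketch of the same technique, with an illustrative sample sentence:

# Minimal sketch of the Devanagari word-cloud technique used above.
# Assumes it is run from the repo root so the font added in this commit resolves.
from wordcloud import WordCloud

sample_text = "नेपाली भाषा नेपाली शब्द नेपाली वाक्य नेपाली भाषा"  # illustrative only
wc = WordCloud(
    font_path="fonts/TiroDevanagariHindi-Regular.ttf",  # Devanagari-capable font
    regexp=r"[\u0900-\u097F]+",  # a word = a run of Devanagari code points
    width=1000,
    height=500,
    background_color="white",
).generate(sample_text)  # generate() tokenizes with `regexp`
wc.to_file("sample_cloud.png")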
src/pages/📊DatasetsPreparation.py
ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+import pandas as pd
+
+st.set_page_config(page_title="Datasets Preparation", page_icon="👋", layout="wide")
+
+##########################################
+
+# Read lines from the text file
+with open("../datafiles/sample_nep_corpus.txt") as file:
+    items = file.readlines()
+
+# Load the lines into a single-column DataFrame
+datacorpus = pd.DataFrame(items, columns=["Content"])
+# datacorpus.columns = ["Content"]
+
+
+# st.write(f"{datacorpus}")
+
+datasentences = pd.read_csv("../datafiles/sample_nep_sentences.csv")
+
+data100k = pd.read_csv(
+    r"../datafiles/sample_nep_spell_100k.csv",
+    nrows=50,
+)
+
+
+###########################################
+
+
+st.title("Dataset Preparation")
+
+st.write("---")
+st.header(
+    """
+    A Large Nepali Text Corpus
+    """
+)
+
+st.caption("**Table 1.** A Large Nepali Text Corpus")
+
+st.dataframe(datacorpus, use_container_width=True)
+
+st.write("---")
+st.header(
+    """
+    Sentences extracted from A Large Nepali Text Corpus
+    """
+)
+st.caption("**Table 2.** Extracted sentences")
+st.dataframe(datasentences, use_container_width=True)
+
+st.write("---")
+st.header(
+    """
+    Parallel dataset using extracted sentences
+    """
+)
+st.caption("**Table 3.** 100k Dataset used for training")
+st.dataframe(data100k, use_container_width=True)