Spaces:
Running
Running
duraad
commited on
Commit
·
1ddad36
0
Parent(s):
Initial Commit
Browse files- .gitignore +9 -0
- README.md +15 -0
- requirments.txt +334 -0
- src/Demo.py +66 -0
- src/ModelMethods.py +166 -0
- src/pages/LiteratureReview.py +8 -0
- src/pages/References.py +14 -0
.gitignore
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Folders to ignore
|
2 |
+
model/
|
3 |
+
model-local/
|
4 |
+
__pycache__/
|
5 |
+
src/__pycache__/
|
6 |
+
|
7 |
+
# Files to ignore
|
8 |
+
notes.md
|
9 |
+
*.pyc
|
README.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Nepali Spelling Correction
|
2 |
+
|
3 |
+
## Models used
|
4 |
+
|
5 |
+
- `google/mt5-small`
|
6 |
+
- `facebook/mbart-large-cc25`
|
7 |
+
- `rahular/varta-t5`
|
8 |
+
|
9 |
+
|
10 |
+
## How to set up?
|
11 |
+
1. Clone this repo
|
12 |
+
2. Install the dependencies
|
13 |
+
3. Create a folder `models` inside the repo
|
14 |
+
4. Inside the `models` folder, clone the models from Hugging Face
|
15 |
+
5. Update the model paths in `ModelMethods.py`
|
requirments.txt
ADDED
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.0.0
|
2 |
+
accelerate @ git+https://github.com/huggingface/accelerate.git@162a82164e9bdcc01a173cbee43b686437aaead8
|
3 |
+
aiohttp==3.8.4
|
4 |
+
aiosignal==1.3.1
|
5 |
+
altair==4.2.0
|
6 |
+
altgraph==0.17.3
|
7 |
+
aniso8601==9.0.1
|
8 |
+
annotated-types==0.6.0
|
9 |
+
anyio==4.2.0
|
10 |
+
appdirs==1.4.4
|
11 |
+
argon2-cffi==23.1.0
|
12 |
+
argon2-cffi-bindings==21.2.0
|
13 |
+
arrow==1.3.0
|
14 |
+
asgiref==3.5.0
|
15 |
+
asttokens==2.4.1
|
16 |
+
astunparse==1.6.3
|
17 |
+
async-lru==2.0.4
|
18 |
+
async-timeout==4.0.2
|
19 |
+
attrs==22.1.0
|
20 |
+
autopep8==1.6.0
|
21 |
+
Babel==2.14.0
|
22 |
+
backports.csv==1.0.7
|
23 |
+
backports.entry-points-selectable==1.1.1
|
24 |
+
beautifulsoup4==4.10.0
|
25 |
+
bitsandbytes==0.42.0
|
26 |
+
bleach==6.1.0
|
27 |
+
blinker==1.5
|
28 |
+
blis==0.7.11
|
29 |
+
boto3==1.34.19
|
30 |
+
botocore==1.34.19
|
31 |
+
branca==0.7.0
|
32 |
+
cachetools==5.2.0
|
33 |
+
catalogue==2.0.10
|
34 |
+
certifi==2022.6.15
|
35 |
+
cffi==1.15.1
|
36 |
+
chardet==4.0.0
|
37 |
+
charset-normalizer==2.1.1
|
38 |
+
cheroot==8.6.0
|
39 |
+
CherryPy==18.6.1
|
40 |
+
click==8.1.3
|
41 |
+
cloudpathlib==0.16.0
|
42 |
+
colorama==0.4.5
|
43 |
+
comm==0.2.0
|
44 |
+
commonmark==0.9.1
|
45 |
+
confection==0.1.4
|
46 |
+
contextualSpellCheck==0.4.4
|
47 |
+
contourpy==1.0.6
|
48 |
+
cryptography==38.0.1
|
49 |
+
cycler==0.11.0
|
50 |
+
cymem==2.0.8
|
51 |
+
Cython==3.0.6
|
52 |
+
datasets==2.16.1
|
53 |
+
dateparser==1.1.0
|
54 |
+
debugpy==1.8.0
|
55 |
+
decorator==5.1.1
|
56 |
+
defusedxml==0.7.1
|
57 |
+
dill==0.3.7
|
58 |
+
distlib==0.3.4
|
59 |
+
dj-database-url==0.5.0
|
60 |
+
Django==4.0.4
|
61 |
+
django-ckeditor==6.2.0
|
62 |
+
django-crispy-forms==1.14.0
|
63 |
+
django-heroku==0.3.1
|
64 |
+
django-js-asset==1.2.2
|
65 |
+
docker-pycreds==0.4.0
|
66 |
+
docopt==0.6.2
|
67 |
+
docutils==0.20.1
|
68 |
+
docx2pdf==0.1.8
|
69 |
+
editdistance==0.6.2
|
70 |
+
einops==0.7.0
|
71 |
+
entrypoints==0.4
|
72 |
+
et-xmlfile==1.1.0
|
73 |
+
evaluate==0.4.0
|
74 |
+
exceptiongroup==1.2.0
|
75 |
+
executing==2.0.1
|
76 |
+
ez-setup==0.9
|
77 |
+
fastjsonschema==2.19.1
|
78 |
+
feedparser==6.0.8
|
79 |
+
filelock==3.4.0
|
80 |
+
Flask==2.2.2
|
81 |
+
Flask-API==3.0.post1
|
82 |
+
Flask-Cors==3.0.10
|
83 |
+
Flask-RESTful==0.3.9
|
84 |
+
Flask-SQLAlchemy==2.5.1
|
85 |
+
flatbuffers==23.5.26
|
86 |
+
fonttools==4.38.0
|
87 |
+
fqdn==1.5.1
|
88 |
+
frozenlist==1.3.3
|
89 |
+
fsspec==2023.10.0
|
90 |
+
future==0.18.2
|
91 |
+
gast==0.5.4
|
92 |
+
gitdb==4.0.10
|
93 |
+
GitPython==3.1.29
|
94 |
+
google==3.0.0
|
95 |
+
google-auth==2.25.2
|
96 |
+
google-auth-oauthlib==1.2.0
|
97 |
+
google-pasta==0.2.0
|
98 |
+
GoogleNews==1.6.0
|
99 |
+
greenlet==1.1.3
|
100 |
+
grpcio==1.60.0
|
101 |
+
gunicorn==20.1.0
|
102 |
+
h5py==3.10.0
|
103 |
+
happytransformer==3.0.0
|
104 |
+
heroku==0.1.4
|
105 |
+
huggingface-hub==0.20.1
|
106 |
+
idna==3.3
|
107 |
+
imageio==2.19.3
|
108 |
+
imageio-ffmpeg==0.4.7
|
109 |
+
importlib-metadata==5.1.0
|
110 |
+
instaloader==4.9.6
|
111 |
+
ipykernel==6.28.0
|
112 |
+
ipyleaflet==0.18.1
|
113 |
+
ipython==8.19.0
|
114 |
+
ipywidgets==8.1.1
|
115 |
+
isoduration==20.11.0
|
116 |
+
itsdangerous==2.1.2
|
117 |
+
jaraco.classes==3.2.1
|
118 |
+
jaraco.collections==3.5.1
|
119 |
+
jaraco.context==4.1.1
|
120 |
+
jaraco.functools==3.5.0
|
121 |
+
jaraco.text==3.7.0
|
122 |
+
jedi==0.19.1
|
123 |
+
Jinja2==3.1.2
|
124 |
+
jmespath==1.0.1
|
125 |
+
joblib==1.3.2
|
126 |
+
json5==0.9.14
|
127 |
+
jsonlines==4.0.0
|
128 |
+
jsonpointer==2.4
|
129 |
+
jsonschema==4.17.3
|
130 |
+
jsonschema-specifications==2023.12.1
|
131 |
+
jupyter-events==0.9.0
|
132 |
+
jupyter-lsp==2.2.1
|
133 |
+
jupyter_client==8.6.0
|
134 |
+
jupyter_core==5.5.1
|
135 |
+
jupyter_server==2.12.1
|
136 |
+
jupyter_server_terminals==0.5.1
|
137 |
+
jupyterlab==4.0.10
|
138 |
+
jupyterlab-widgets==3.0.9
|
139 |
+
jupyterlab_pygments==0.3.0
|
140 |
+
jupyterlab_server==2.25.2
|
141 |
+
jwt==1.3.1
|
142 |
+
keras==2.15.0
|
143 |
+
keyring==24.3.0
|
144 |
+
kiwisolver==1.4.4
|
145 |
+
langcodes==3.3.0
|
146 |
+
Levenshtein==0.23.0
|
147 |
+
libclang==16.0.6
|
148 |
+
loralib==0.1.2
|
149 |
+
lxml==4.9.1
|
150 |
+
Markdown==3.5.1
|
151 |
+
markdown-it-py==3.0.0
|
152 |
+
MarkupSafe==2.1.1
|
153 |
+
matplotlib==3.7.1
|
154 |
+
matplotlib-inline==0.1.6
|
155 |
+
mdurl==0.1.2
|
156 |
+
mistune==3.0.2
|
157 |
+
ml-dtypes==0.2.0
|
158 |
+
more-itertools==8.12.0
|
159 |
+
moviepy==1.0.3
|
160 |
+
mpmath==1.3.0
|
161 |
+
multidict==6.0.4
|
162 |
+
multiprocess==0.70.15
|
163 |
+
multitasking==0.0.11
|
164 |
+
murmurhash==1.0.10
|
165 |
+
mysql-connector-python==8.0.31
|
166 |
+
mysqlclient==2.1.0
|
167 |
+
nbclient==0.9.0
|
168 |
+
nbconvert==7.13.1
|
169 |
+
nbformat==5.9.2
|
170 |
+
nest-asyncio==1.5.8
|
171 |
+
networkx==3.2.1
|
172 |
+
news==1.0
|
173 |
+
nh3==0.2.15
|
174 |
+
nltk==3.7
|
175 |
+
notebook_shim==0.2.3
|
176 |
+
numpy==1.23.5
|
177 |
+
oauthlib==3.2.2
|
178 |
+
openai==0.27.2
|
179 |
+
openpyxl==3.1.2
|
180 |
+
opt-einsum==3.3.0
|
181 |
+
overrides==7.4.0
|
182 |
+
packaging==21.3
|
183 |
+
pafy==0.5.5
|
184 |
+
pandas==1.5.2
|
185 |
+
pandocfilters==1.5.0
|
186 |
+
parso==0.8.3
|
187 |
+
Pattern==3.6
|
188 |
+
pdfminer.six==20211012
|
189 |
+
pefile==2023.2.7
|
190 |
+
peft==0.6.0
|
191 |
+
Pillow==9.3.0
|
192 |
+
pipreqs==0.4.11
|
193 |
+
pkginfo==1.9.6
|
194 |
+
platformdirs==4.1.0
|
195 |
+
portalocker==2.8.2
|
196 |
+
portend==3.1.0
|
197 |
+
preshed==3.0.9
|
198 |
+
proglog==0.1.10
|
199 |
+
prometheus-client==0.19.0
|
200 |
+
prompt-toolkit==3.0.43
|
201 |
+
protobuf==3.20.1
|
202 |
+
psutil==5.9.7
|
203 |
+
psycopg2==2.9.3
|
204 |
+
pure-eval==0.2.2
|
205 |
+
pyarrow==10.0.1
|
206 |
+
pyarrow-hotfix==0.6
|
207 |
+
pyasn1==0.4.8
|
208 |
+
pyasn1-modules==0.3.0
|
209 |
+
pycodestyle==2.8.0
|
210 |
+
pycparser==2.21
|
211 |
+
pydantic==2.5.3
|
212 |
+
pydantic_core==2.14.6
|
213 |
+
pydeck==0.8.0
|
214 |
+
Pygments==2.13.0
|
215 |
+
pyinstaller==5.13.0
|
216 |
+
pyinstaller-hooks-contrib==2023.6
|
217 |
+
PyJWT==2.4.0
|
218 |
+
Pympler==1.0.1
|
219 |
+
PyMuPDF==1.23.12
|
220 |
+
PyMuPDFb==1.23.9
|
221 |
+
pyparsing==3.0.9
|
222 |
+
PyQt5==5.15.10
|
223 |
+
PyQt5-Qt5==5.15.2
|
224 |
+
PyQt5-sip==12.13.0
|
225 |
+
pyrsistent==0.19.2
|
226 |
+
python-dateutil==2.8.2
|
227 |
+
python-docx==0.8.11
|
228 |
+
python-dotenv==1.0.0
|
229 |
+
python-json-logger==2.0.7
|
230 |
+
pytorch-pretrained-bert==0.6.2
|
231 |
+
pytube==12.1.0
|
232 |
+
pytz==2022.2.1
|
233 |
+
pytz-deprecation-shim==0.1.0.post0
|
234 |
+
pywin32==306
|
235 |
+
pywin32-ctypes==0.2.2
|
236 |
+
pywinpty==2.0.12
|
237 |
+
PyYAML==6.0.1
|
238 |
+
pyzmq==25.1.2
|
239 |
+
rapidfuzz==3.6.1
|
240 |
+
readme-renderer==42.0
|
241 |
+
referencing==0.32.0
|
242 |
+
regex==2021.11.10
|
243 |
+
requests==2.28.1
|
244 |
+
requests-oauthlib==1.3.1
|
245 |
+
requests-toolbelt==1.0.0
|
246 |
+
responses==0.18.0
|
247 |
+
rfc3339-validator==0.1.4
|
248 |
+
rfc3986==2.0.0
|
249 |
+
rfc3986-validator==0.1.1
|
250 |
+
rich==12.6.0
|
251 |
+
rouge-score==0.1.2
|
252 |
+
rpds-py==0.16.2
|
253 |
+
rsa==4.8
|
254 |
+
s3transfer==0.10.0
|
255 |
+
safetensors==0.4.1
|
256 |
+
scikit-learn==1.4.0
|
257 |
+
scipy==1.8.0
|
258 |
+
seaborn==0.13.0
|
259 |
+
semver==2.13.0
|
260 |
+
Send2Trash==1.8.2
|
261 |
+
sentencepiece==0.1.99
|
262 |
+
sentry-sdk==1.39.2
|
263 |
+
setproctitle==1.3.3
|
264 |
+
sgmllib3k==1.0.0
|
265 |
+
six==1.16.0
|
266 |
+
smart-open==6.4.0
|
267 |
+
smmap==5.0.0
|
268 |
+
sniffio==1.3.0
|
269 |
+
soupsieve==2.3.1
|
270 |
+
spacy==3.7.2
|
271 |
+
spacy-legacy==3.0.12
|
272 |
+
spacy-loggers==1.0.5
|
273 |
+
SQLAlchemy==1.4.41
|
274 |
+
sqlparse==0.4.2
|
275 |
+
srsly==2.4.8
|
276 |
+
stack-data==0.6.3
|
277 |
+
streamlit==1.15.1
|
278 |
+
streamlit-menu==1.0.9
|
279 |
+
streamlit-option-menu==0.3.12
|
280 |
+
sympy==1.12
|
281 |
+
tempora==5.0.1
|
282 |
+
tenacity==8.2.3
|
283 |
+
tensorboard==2.15.1
|
284 |
+
tensorboard-data-server==0.7.2
|
285 |
+
tensorflow==2.15.0
|
286 |
+
tensorflow-estimator==2.15.0
|
287 |
+
tensorflow-intel==2.15.0
|
288 |
+
tensorflow-io-gcs-filesystem==0.31.0
|
289 |
+
termcolor==2.4.0
|
290 |
+
terminado==0.18.0
|
291 |
+
test-nep-spell-synthetic-datautils==0.1.0
|
292 |
+
thinc==8.2.2
|
293 |
+
threadpoolctl==3.2.0
|
294 |
+
tinycss2==1.2.1
|
295 |
+
tokenizers==0.15.1
|
296 |
+
toml==0.10.2
|
297 |
+
tomli==2.0.1
|
298 |
+
toolz==0.12.0
|
299 |
+
torch==1.13.1
|
300 |
+
torchdata==0.5.1
|
301 |
+
tornado==6.2
|
302 |
+
tqdm==4.63.0
|
303 |
+
traitlets==5.14.0
|
304 |
+
traittypes==0.2.1
|
305 |
+
transformers @ git+https://github.com/huggingface/transformers.git@5b5e71dc41734a9798f3535bbd5039ab91883079
|
306 |
+
twine==5.0.0
|
307 |
+
typer==0.9.0
|
308 |
+
types-python-dateutil==2.8.19.14
|
309 |
+
typing_extensions==4.4.0
|
310 |
+
tzdata==2022.7
|
311 |
+
tzlocal==4.2
|
312 |
+
uri-template==1.3.0
|
313 |
+
urllib3==1.26.12
|
314 |
+
validators==0.20.0
|
315 |
+
virtualenv==20.10.0
|
316 |
+
wandb==0.16.2
|
317 |
+
wasabi==1.1.2
|
318 |
+
watchdog==2.1.9
|
319 |
+
wcwidth==0.2.12
|
320 |
+
weasel==0.3.4
|
321 |
+
webcolors==1.13
|
322 |
+
webencodings==0.5.1
|
323 |
+
websocket-client==1.7.0
|
324 |
+
Werkzeug==2.2.2
|
325 |
+
whitenoise==6.0.0
|
326 |
+
widgetsnbextension==4.0.9
|
327 |
+
wrapt==1.14.1
|
328 |
+
xxhash==3.4.1
|
329 |
+
xyzservices==2023.10.1
|
330 |
+
yarg==0.1.9
|
331 |
+
yarl==1.8.2
|
332 |
+
yfinance==0.1.87
|
333 |
+
zc.lockfile==2.0
|
334 |
+
zipp==3.11.0
|
src/Demo.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
from ModelMethods import generate
|
5 |
+
|
6 |
+
|
7 |
+
st.set_page_config(page_title="DEMO", page_icon="👋", layout="wide")
|
8 |
+
|
9 |
+
|
10 |
+
# Make basic configuration for the app
|
11 |
+
appTitle = "Nepali Spell Correction"
|
12 |
+
|
13 |
+
|
14 |
+
# Some test examples here
|
15 |
+
example = (
|
16 |
+
"अबको स्थायी कमिटी ओली सरकारलाई दीएको समर्थन फिर्ताको तयारि रहेको साहले जानकारी दिए।"
|
17 |
+
)
|
18 |
+
examples = {
|
19 |
+
"Examples": "",
|
20 |
+
"अखिलेस झा धेरै दिनदेखि अनुपस्थीत थिए ।": "अखिलेस झा धेरै दिनदेखि अनुपस्थीत थिए ।",
|
21 |
+
"आठौँ तह उपनिर्देषक पदमा दुई जना उत्तीर्ण भएका छन्।": "आठौँ तह उपनिर्देषक पदमा दुई जना उत्तीर्ण भएका छन्।",
|
22 |
+
"उनीहरूमा रोगसँग लड्ने क्षमता मज्जाले बिकसित भइसकेको हुँदैन।": "उनीहरूमा रोगसँग लड्ने क्षमता मज्जाले बिकसित भइसकेको हुँदैन।",
|
23 |
+
}
|
24 |
+
|
25 |
+
|
26 |
+
def main():
    """Render the spell-correction demo page.

    The left column collects the model choice and the sentence to check.
    When the "Check Spelling" button is pressed, the sentence is passed to
    ``generate`` and the returned candidates are shown as a table in the
    right column.
    """
    st.header(appTitle)
    left_column, right_column = st.columns(2)
    correctedText = None

    with left_column:
        # A list (not a set) keeps the option order stable between runs, so
        # index=0 always preselects "mT5".
        model_options = ["mT5", "mBART", "VartaT5"]
        selected_model = st.radio("Select the model", model_options, index=0)

        # Dropdown of canned examples; the chosen one pre-fills the text area.
        selected_example_key = st.selectbox(
            "Select an example", list(examples.keys())
        )
        selected_example_text = examples[selected_example_key]

        user_input = st.text_area(
            "Enter a Nepali Sentence: ",
            selected_example_text,
            max_chars=512,  # Set the maximum input length to 512 characters
        )
        if st.button("Check Spelling"):
            # Whitespace-only input is treated as empty rather than being
            # sent to the model.
            if user_input.strip():
                correctedText = generate(selected_model, user_input)
            else:
                st.warning("Please enter some text to check.")

    with right_column:
        if correctedText is not None:
            st.write("Corrected Text:")
            df = pd.DataFrame(correctedText, columns=["score", "sequence"])
            st.table(df)
|
63 |
+
|
64 |
+
|
65 |
+
if __name__ == "__main__":
|
66 |
+
main()
|
src/ModelMethods.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import HappyTextToText from Happy Transformer
|
2 |
+
from happytransformer import HappyTextToText, TTSettings
|
3 |
+
|
4 |
+
# Huggingface Transformers
|
5 |
+
from transformers import (
|
6 |
+
MT5ForConditionalGeneration,
|
7 |
+
MT5Tokenizer,
|
8 |
+
MBartForConditionalGeneration,
|
9 |
+
MBartTokenizer,
|
10 |
+
T5ForConditionalGeneration,
|
11 |
+
T5TokenizerFast,
|
12 |
+
GenerationConfig,
|
13 |
+
)
|
14 |
+
|
15 |
+
import torch
|
16 |
+
import re
|
17 |
+
|
18 |
+
|
19 |
+
"""
|
20 |
+
Some global variables
|
21 |
+
Add path to the models here
|
22 |
+
"""
|
23 |
+
mt5ModelPath = "../models/nep-spell-hft-23epochs"
|
24 |
+
mbartModelPath = "../models/happytt_mBART_plus_10"
|
25 |
+
vartat5ModelPath = "../models/vartat5-using-100K-plus-1"
|
26 |
+
|
27 |
+
|
28 |
+
"""
|
29 |
+
Function: generate
|
30 |
+
|
31 |
+
This function takes a model name and input text as parameters and
|
32 |
+
returns the output text generated by the specified model.
|
33 |
+
It supports multiple models such as mT5, mBART, and VartaT5.
|
34 |
+
If the specified model is not available,
|
35 |
+
it returns a message indicating the unavailability of the model.
|
36 |
+
|
37 |
+
Parameters:
|
38 |
+
- model (str): Name of the model to use for text generation.
|
39 |
+
- input (str): Input text for the model to generate output from.
|
40 |
+
|
41 |
+
Returns:
|
42 |
+
- list[dict]: Candidate corrections, each a dict with "sequence" and "score" keys; or a str message when the requested model is not available.
|
43 |
+
"""
|
44 |
+
|
45 |
+
|
46 |
+
def generate(model, input):
    """Dispatch ``input`` to the inference function selected by ``model``.

    Parameters:
    - model (str): One of "mT5", "mBART" or "VartaT5".
    - input (str): Text handed to the selected inference function.

    Returns:
    - Whatever the selected inference function returns, or the string
      "Model: <model> not available" when ``model`` is not recognised.
    """
    if model == "mT5":
        return mt5Inference(input)
    if model == "mBART":
        return mbartInference(input)
    if model == "VartaT5":
        return vartat5Inference(input)
    return f"Model: {model} not available"
|
56 |
+
|
57 |
+
# काकाले काकिलाइ माया गर्नू हुन्छ।
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
"""
|
62 |
+
Below are the 3 different models for inference
|
63 |
+
"""
|
64 |
+
# Model and tokenizer for mT5, loaded on first use and then reused so that
# the checkpoint is not re-read from disk on every request.
_MT5_CACHE = {}


def mt5Inference(input):
    """Generate spelling-correction candidates for ``input`` with mT5.

    The checkpoint at ``mt5ModelPath`` is loaded on the first call and kept
    in memory afterwards. The text is prefixed with "grammar: " and decoded
    with 5-beam search, returning 5 sequences.

    Parameters:
    - input (str): Sentence to correct.

    Returns:
    - The result of ``postProcessOutput`` for the decoded sequences and
      their ``sequences_scores``.
    """
    print("Processing mt5")

    if not _MT5_CACHE:
        # Load both pieces before publishing them so a failed load never
        # leaves a half-filled cache behind.
        loaded_model = MT5ForConditionalGeneration.from_pretrained(mt5ModelPath)
        loaded_tokenizer = MT5Tokenizer.from_pretrained(mt5ModelPath)
        _MT5_CACHE.update(model=loaded_model, tokenizer=loaded_tokenizer)
    model = _MT5_CACHE["model"]
    tokenizer = _MT5_CACHE["tokenizer"]

    input_ids = tokenizer("grammar: " + input, return_tensors="pt").input_ids
    outputs = model.generate(
        input_ids=input_ids,
        max_length=512,
        num_beams=5,
        num_return_sequences=5,
        return_dict_in_generate=True,
        output_scores=True,
    )
    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    return postProcessOutput(sequences, outputs["sequences_scores"])
|
80 |
+
|
81 |
+
|
82 |
+
# Model and tokenizer for mBART, loaded on first use and then reused so that
# the checkpoint is not re-read from disk on every request.
_MBART_CACHE = {}


def mbartInference(input):
    """Generate spelling-correction candidates for ``input`` with mBART.

    The checkpoint at ``mbartModelPath`` is loaded on the first call and
    kept in memory afterwards. The tokenizer is configured with "ne_NP" as
    both source and target language, and decoding starts from the "ne_NP"
    language-code token. The text is prefixed with "grammar: " and decoded
    with 5-beam search, returning 5 sequences.

    Parameters:
    - input (str): Sentence to correct.

    Returns:
    - The result of ``postProcessOutput`` for the decoded sequences and
      their ``sequences_scores``.
    """
    print("Processing mbart")

    if not _MBART_CACHE:
        # Load both pieces before publishing them so a failed load never
        # leaves a half-filled cache behind.
        loaded_tokenizer = MBartTokenizer.from_pretrained(
            mbartModelPath, src_lang="ne_NP", tgt_lang="ne_NP"
        )
        loaded_model = MBartForConditionalGeneration.from_pretrained(mbartModelPath)
        _MBART_CACHE.update(model=loaded_model, tokenizer=loaded_tokenizer)
    model = _MBART_CACHE["model"]
    tokenizer = _MBART_CACHE["tokenizer"]

    inputs = tokenizer("grammar: " + input, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        decoder_start_token_id=tokenizer.lang_code_to_id["ne_NP"],
        max_length=512,
        num_beams=5,
        num_return_sequences=5,
        return_dict_in_generate=True,
        output_scores=True,
    )
    sequences = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    return postProcessOutput(sequences, outputs["sequences_scores"])
|
101 |
+
|
102 |
+
|
103 |
+
# Model and tokenizer for VartaT5, loaded on first use and then reused so
# that the checkpoint is not re-read from disk on every request.
_VARTAT5_CACHE = {}


def vartat5Inference(input):
    """Generate spelling-correction candidates for ``input`` with VartaT5.

    The checkpoint at ``vartat5ModelPath`` is loaded on the first call and
    kept in memory afterwards. The text is prefixed with "grammar: " and
    decoded with 5-beam search, returning 5 sequences.

    Parameters:
    - input (str): Sentence to correct.

    Returns:
    - The result of ``postProcessOutput`` for the decoded sequences and
      their ``sequences_scores``.
    """
    print("Processing varta")

    if not _VARTAT5_CACHE:
        # Load both pieces before publishing them so a failed load never
        # leaves a half-filled cache behind.
        loaded_model = T5ForConditionalGeneration.from_pretrained(vartat5ModelPath)
        loaded_tokenizer = T5TokenizerFast.from_pretrained(vartat5ModelPath)
        _VARTAT5_CACHE.update(model=loaded_model, tokenizer=loaded_tokenizer)
    model = _VARTAT5_CACHE["model"]
    tokenizer = _VARTAT5_CACHE["tokenizer"]

    # The full tokenizer output (not just the ids) is unpacked into generate().
    inputs = tokenizer("grammar: " + input, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_length=512,
        num_beams=5,
        num_return_sequences=5,
        return_dict_in_generate=True,
        output_scores=True,
    )
    sequences = tokenizer.batch_decode(outputs["sequences"], skip_special_tokens=True)
    return postProcessOutput(sequences, outputs["sequences_scores"])
|
119 |
+
|
120 |
+
|
121 |
+
|
122 |
+
"""
|
123 |
+
Post processing the model output
|
124 |
+
"""
|
125 |
+
|
126 |
+
def postProcessOutput(sequences, sequences_scores):
    """Pair each distinct decoded sequence with its exponentiated score.

    Parameters:
    - sequences: Decoded output strings.
    - sequences_scores: Tensor of scores aligned with ``sequences``; each
      value is passed through ``torch.exp`` before being reported.

    Returns:
    - list[dict]: One ``{"sequence": ..., "score": ...}`` entry per distinct
      sequence, in order of first appearance. When a sequence repeats, the
      score of its first occurrence is kept.
    """
    first_score_by_text = {}
    for text, value in zip(sequences, torch.exp(sequences_scores).tolist()):
        # setdefault keeps the first score seen and ignores later duplicates.
        first_score_by_text.setdefault(text, value)

    return [
        {"sequence": text, "score": value}
        for text, value in first_score_by_text.items()
    ]
|
142 |
+
|
143 |
+
|
144 |
+
"""
|
145 |
+
For working with paragraph processing
|
146 |
+
"""
|
147 |
+
|
148 |
+
# Whitespace that directly follows a danda (।), question mark, exclamation
# mark or newline marks a sentence boundary. Compiled once at import time.
_NEPALI_SENTENCE_BOUNDARY = re.compile(r"(?<=[।?!\n])\s+")


def split_nepali_paragraph_into_sentences(nepali_text):
    """Split Nepali text into sentences.

    The text is cut at whitespace that follows a danda (।), "?", "!" or a
    newline; the terminating punctuation stays attached to its sentence.
    Surrounding whitespace is stripped from each sentence, and empty
    fragments (from empty input or trailing whitespace) are dropped.

    Parameters:
    - nepali_text (str): Paragraph to split.

    Returns:
    - list[str]: The non-empty sentences, in their original order.
    """
    fragments = _NEPALI_SENTENCE_BOUNDARY.split(nepali_text)
    return [fragment.strip() for fragment in fragments if fragment.strip()]
|
158 |
+
|
159 |
+
|
160 |
+
def process_paragraph(model, paragraph):
    """Correct a paragraph sentence by sentence and join the results.

    ``generate`` returns a list of ``{"sequence", "score"}`` candidates (or
    a plain message string for an unknown model), so the text of the first
    candidate is taken for each sentence before joining.

    Parameters:
    - model (str): Model name forwarded to ``generate``.
    - paragraph (str): Text to split into sentences and correct.

    Returns:
    - str: The chosen output for each sentence, joined with single spaces.
    """
    corrected_sentences = []
    for sentence in split_nepali_paragraph_into_sentences(paragraph):
        candidates = generate(model, sentence)
        if isinstance(candidates, str):
            # Unknown-model message from generate(); pass it through as-is.
            corrected_sentences.append(candidates)
        elif candidates:
            # Presumably the first candidate is the highest-scoring beam;
            # verify against the generation backend's ordering.
            corrected_sentences.append(candidates[0]["sequence"])
        else:
            # No candidates came back; keep the sentence unchanged.
            corrected_sentences.append(sentence)
    return " ".join(corrected_sentences)
|
src/pages/LiteratureReview.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
st.set_page_config(
|
4 |
+
page_title="Literature Review",
|
5 |
+
page_icon="👋",
|
6 |
+
)
|
7 |
+
|
8 |
+
st.write("LiteratureReview")
|
src/pages/References.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
st.set_page_config(
|
4 |
+
page_title="References",
|
5 |
+
page_icon="👋",
|
6 |
+
layout="wide"
|
7 |
+
)
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
st.sidebar.header("Plotting Demo")
|
13 |
+
|
14 |
+
st.write("References Here")
|