wissamantoun commited on
Commit
9c398de
·
1 Parent(s): cfd45f1

first commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,2 +1,9 @@
1
- # Arabic-NLP-app
2
- Arabic NLP app built on streamlit to showcase models
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Arabic GPT2 (AraGPT2)
3
+ emoji: ⌨
4
+ colorFrom: purple
5
+ colorTo: green
6
+ sdk: streamlit
7
+ app_file: app.py
8
+ pinned: false
9
+ ---
app.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import awesome_streamlit as ast
3
+ import pages.home
4
+ import pages.processor
5
+
6
+
7
+ st.set_page_config(
8
+ page_title="TEST", page_icon="📖", initial_sidebar_state="expanded", layout="wide"
9
+ )
10
+
11
+ PAGES = {"Home": pages.home, "Arabic Text Preprocessor": pages.processor}
12
+
13
+
14
def main():
    """Draw the sidebar page selector and render whichever page is chosen."""
    st.sidebar.title("Navigation")
    page_names = list(PAGES)
    chosen_name = st.sidebar.radio("Pages", page_names)

    # Hand the selected page module to awesome_streamlit for rendering.
    ast.shared.components.write_page(PAGES[chosen_name])
21
+
22
+
23
+ if __name__ == "__main__":
24
+ main()
backend.py ADDED
File without changes
images/AraELECTRA.png ADDED
images/AraGPT2.png ADDED
images/arabert_logo.png ADDED
pages/__init__.py ADDED
File without changes
pages/home.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import awesome_streamlit as ast
3
+
4
+
5
+ def write():
6
+ st.markdown(
7
+ """
8
+ # Arabic Natural Language Processing
9
+
10
+
11
+ In this HuggingFace space you will be able to test the different Arabic NLP models that my colleagues at [AUB MIND Lab](https://sites.aub.edu.lb/mindlab/) have built, with some other applications.
12
+
13
+ Check the **Navigation bar** to access the apps:
14
+ - Arabic Text Preprocessor: Test how text input is treated by our preprocessor
15
+ - Arabic Language Generation: Generate Arabic text using our AraGPT2 language models
16
+ - Arabic Sentiment Analysis: Test the sentiment analysis model that won the [Arabic Sentiment Analysis competition @ KAUST](https://www.kaggle.com/c/arabic-sentiment-analysis-2021-kaust)
17
+ - Arabic Masked Language Modeling: Test our AraBERT models MLM capabilities
18
+ """
19
+ )
20
+ st.markdown("#")
21
+ col1, col2, col3 = st.columns(3)
22
+
23
+ col1.write("## **AraBERT**")
24
+ col1.image("images/arabert_logo.png", width=200)
25
+
26
+ col2.write("## **AraGPT2**")
27
+ col2.image("images/AraGPT2.png", width=200)
28
+
29
+ col3.write("## **AraElectra**")
30
+ col3.image("images/AraELECTRA.png", width=200)
31
+
32
+ st.markdown(
33
+ """
34
+
35
+ You can find more details in the source code and paper linked in our repository on GitHub [repo](https://github.com/aub-mind/arabert).
36
+
37
+ ## Dataset
38
+
39
+ The pretraining data used for the new **AraBERT** model is also used for **AraGPT2 and AraELECTRA**.
40
+
41
+ The dataset consists of 77GB or 200,095,961 lines or 8,655,948,860 words or 82,232,988,358 chars (before applying Farasa Segmentation)
42
+
43
+ Our large models were trained on a TPUv3-128 provided by TFRC.
44
+
45
+ For the new dataset we added the unshuffled OSCAR corpus, after we thoroughly filtered it, to the previous dataset used in AraBERTv1 but without the websites that we previously crawled:
46
+ - OSCAR unshuffled and filtered.
47
+ - [Arabic Wikipedia dump](https://archive.org/details/arwiki-20190201) from 2020/09/01
48
+ - [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4)
49
+ - [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619)
50
+ - Assafir news articles. Huge thank you for Assafir for the data
51
+
52
+ ## Models
53
+
54
+ Model | HuggingFace Model Name | Size (MB/Params)| Pre-Segmentation | Hardware | Sequence Length | Batch Size | Num of Steps | Total Time (in Days) |
55
+ ---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:
56
+ AraBERTv0.2-base | [bert-base-arabertv02](https://huggingface.co/aubmindlab/bert-base-arabertv02) | 543MB / 136M | No | TPUv3-8 | 128 /512 | 2560/384 | 1M/ 2M | 36 |
57
+ AraBERTv0.2-large| [bert-large-arabertv02](https://huggingface.co/aubmindlab/bert-large-arabertv02) | 1.38G / 371M | No | TPUv3-128 | 128 /512 | 13440 / 2056 | 250K / 300K | 7 |
58
+ AraBERTv2-base| [bert-base-arabertv2](https://huggingface.co/aubmindlab/bert-base-arabertv2) | 543MB / 136M | Yes | TPUv3-8 |128 /512 | 2560 / 384 | 1M / 2M | 36 |
59
+ AraBERTv2-large| [bert-large-arabertv2](https://huggingface.co/aubmindlab/bert-large-arabertv2) | 1.38G / 371M | Yes | TPUv3-128 |128 /512 | 13440 / 2056| 250K / 300K | 7 |
60
+ AraBERTv0.1-base| [bert-base-arabertv01](https://huggingface.co/aubmindlab/bert-base-arabertv01) | 543MB / 136M | No | TPUv2-8 |128 /512 |128 / 512 | 900K / 300K| 4 |
61
+ AraBERTv1-base| [bert-base-arabert](https://huggingface.co/aubmindlab/bert-base-arabert) | 543MB / 136M | Yes | TPUv2-8 |128 /512 |128 / 512 | 900K / 300K| 4 |
62
+ AraGPT2-base | [aragpt2-base](https://huggingface.co/aubmindlab/aragpt2-base) | 527MB/135M | No | TPUv3-128 | 9.7M | 1792 | 125K | 1.5 |
63
+ AraGPT2-medium | [aragpt2-medium](https://huggingface.co/aubmindlab/aragpt2-medium) | 1.38G/370M | No |TPUv3-8 | 9.7M | 80 | 1M | 15 |
64
+ AraGPT2-large | [aragpt2-large](https://huggingface.co/aubmindlab/aragpt2-large) | 2.98GB/792M | No |TPUv3-128 | 9.7M | 256 | 220k | 3 |
65
+ AraGPT2-mega | [aragpt2-mega](https://huggingface.co/aubmindlab/aragpt2-mega) | 5.5GB/1.46B |No |TPUv3-128 | 9.7M | 256 | 800K | 9 |
66
+ AraELECTRA-base-generator | [araelectra-base-generator](https://huggingface.co/aubmindlab/araelectra-base-generator) | 227MB/60M | No | TPUv3-8 | 512 | 256 | 2M | 24
67
+ AraELECTRA-base-discriminator | [araelectra-base-discriminator](https://huggingface.co/aubmindlab/araelectra-base-discriminator) | 516MB/135M | No | TPUv3-8 | 512 | 256 | 2M | 24
68
+
69
+ All models are available in the `HuggingFace` model page under the [aubmindlab](https://huggingface.co/aubmindlab/) name. Checkpoints are available in PyTorch, TF2 and TF1 formats.
70
+
71
+ # Preprocessing
72
+
73
+ You can test the Arabic Preprocessing pipeline in the Arabic Text Preprocessing page.
74
+
75
+ It is recommended to apply our preprocessing function before training/testing on any dataset.
76
+ **Install farasapy to segment text for AraBERT v1 & v2 `pip install farasapy`**
77
+
78
+ ```python
79
+ from arabert.preprocess import ArabertPreprocessor
80
+
81
+ model_name = "aubmindlab/bert-base-arabertv2"
82
+ arabert_prep = ArabertPreprocessor(model_name=model_name)
83
+
84
+ text = "ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري"
85
+ arabert_prep.preprocess(text)
86
+ >>>"و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"
87
+ ```
88
+
89
+ You can also use the `unpreprocess()` function to reverse the preprocessing changes, by fixing the spacing around non alphabetical characters, and also de-segmenting if the model selected needs pre-segmentation. We highly recommend unpreprocessing the generated content of the `AraGPT2` models, to make it look more natural.
90
+ ```python
91
+ output_text = "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"
92
+ arabert_prep.unpreprocess(output_text)
93
+ >>>"ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري"
94
+ ```
95
+
96
+ # If you used this model please cite us as :
97
+
98
+ ## AraBERT
99
+ Google Scholar has our Bibtex wrong (missing name), use this instead
100
+ ```
101
+ @inproceedings{antoun2020arabert,
102
+ title={AraBERT: Transformer-based Model for Arabic Language Understanding},
103
+ author={Antoun, Wissam and Baly, Fady and Hajj, Hazem},
104
+ booktitle={LREC 2020 Workshop Language Resources and Evaluation Conference 11--16 May 2020},
105
+ pages={9}
106
+ }
107
+ ```
108
+ ## AraGPT2
109
+ ```
110
+ @inproceedings{antoun-etal-2021-aragpt2,
111
+ title = "{A}ra{GPT}2: Pre-Trained Transformer for {A}rabic Language Generation",
112
+ author = "Antoun, Wissam and
113
+ Baly, Fady and
114
+ Hajj, Hazem",
115
+ booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
116
+ month = apr,
117
+ year = "2021",
118
+ address = "Kyiv, Ukraine (Virtual)",
119
+ publisher = "Association for Computational Linguistics",
120
+ url = "https://www.aclweb.org/anthology/2021.wanlp-1.21",
121
+ pages = "196--207",
122
+ }
123
+ ```
124
+
125
+ ## AraELECTRA
126
+ ```
127
+ @inproceedings{antoun-etal-2021-araelectra,
128
+ title = "{A}ra{ELECTRA}: Pre-Training Text Discriminators for {A}rabic Language Understanding",
129
+ author = "Antoun, Wissam and
130
+ Baly, Fady and
131
+ Hajj, Hazem",
132
+ booktitle = "Proceedings of the Sixth Arabic Natural Language Processing Workshop",
133
+ month = apr,
134
+ year = "2021",
135
+ address = "Kyiv, Ukraine (Virtual)",
136
+ publisher = "Association for Computational Linguistics",
137
+ url = "https://www.aclweb.org/anthology/2021.wanlp-1.20",
138
+ pages = "191--195",
139
+ }
140
+ ```
141
+
142
+
143
+ # Acknowledgments
144
+ Thanks to TensorFlow Research Cloud (TFRC) for the free access to Cloud TPUs, couldn't have done it without this program, and to the [AUB MIND Lab](https://sites.aub.edu.lb/mindlab/) Members for the continuous support. Also thanks to [Yakshof](https://www.yakshof.com/#/) and Assafir for data and storage access. Another thanks to Habib Rahal (https://www.behance.net/rahalhabib), for putting a face to AraBERT.
145
+
146
+ # Contacts
147
+ **Wissam Antoun**: [Linkedin](https://www.linkedin.com/in/wissam-antoun-622142b4/) | [Twitter](https://twitter.com/wissam_antoun) | [Github](https://github.com/WissamAntoun) | wfa07 (AT) mail (DOT) aub (DOT) edu | wissam.antoun (AT) gmail (DOT) com
148
+
149
+ **Fady Baly**: [Linkedin](https://www.linkedin.com/in/fadybaly/) | [Twitter](https://twitter.com/fadybaly) | [Github](https://github.com/fadybaly) | fgb06 (AT) mail (DOT) aub (DOT) edu | baly.fady (AT) gmail (DOT) com
150
+
151
+ """
152
+ )
pages/preprocess.py ADDED
@@ -0,0 +1,736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import html
2
+ import logging
3
+ import re
4
+ from typing import List
5
+ from farasa.segmenter import FarasaSegmenter
6
+ import emoji
7
+
8
+ import pyarabic.araby as araby
9
+
10
+ ACCEPTED_MODELS = [
11
+ "bert-base-arabertv01",
12
+ "bert-base-arabert",
13
+ "bert-base-arabertv02",
14
+ "bert-base-arabertv2",
15
+ "bert-large-arabertv02",
16
+ "bert-large-arabertv2",
17
+ "araelectra-base",
18
+ "araelectra-base-discriminator",
19
+ "araelectra-base-generator",
20
+ "araelectra-base-artydiqa",
21
+ "aragpt2-base",
22
+ "aragpt2-medium",
23
+ "aragpt2-large",
24
+ "aragpt2-mega",
25
+ ]
26
+
27
+ SEGMENTED_MODELS = [
28
+ "bert-base-arabert",
29
+ "bert-base-arabertv2",
30
+ "bert-large-arabertv2",
31
+ ]
32
+
33
+ SECOND_GEN_MODELS = [
34
+ "bert-base-arabertv02",
35
+ "bert-base-arabertv2",
36
+ "bert-large-arabertv02",
37
+ "bert-large-arabertv2",
38
+ "araelectra-base",
39
+ "araelectra-base-discriminator",
40
+ "araelectra-base-generator",
41
+ "araelectra-base-artydiqa",
42
+ "aragpt2-base",
43
+ "aragpt2-medium",
44
+ "aragpt2-large",
45
+ "aragpt2-mega",
46
+ ]
47
+
48
+ farasa_segmenter = FarasaSegmenter(interactive=True)
49
+
50
+
51
+ class ArabertPreprocessor:
52
+ """
53
+ A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
54
+ It can also unpreprocess the text output of the generation models
55
+
56
+ Args:
57
+
58
+ model_name (:obj:`str`): model name from the HuggingFace Models page without
59
+ the aubmindlab tag. Will default to a base Arabic preprocessor if model name was not found.
60
+ Current accepted models are:
61
+
62
+ - "bert-base-arabertv01": No farasa segmentation.
63
+ - "bert-base-arabert": with farasa segmentation.
64
+ - "bert-base-arabertv02": No farasa segmentation.
65
+ - "bert-base-arabertv2": with farasa segmentation.
66
+ - "bert-large-arabertv02": No farasa segmentation.
67
+ - "bert-large-arabertv2": with farasa segmentation.
68
+ - "araelectra-base": No farasa segmentation.
69
+ - "araelectra-base-discriminator": No farasa segmentation.
70
+ - "araelectra-base-generator": No farasa segmentation.
71
+ - "aragpt2-base": No farasa segmentation.
72
+ - "aragpt2-medium": No farasa segmentation.
73
+ - "aragpt2-large": No farasa segmentation.
74
+ - "aragpt2-mega": No farasa segmentation.
75
+
76
+
77
+ keep_emojis(:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing.
78
+
79
+ remove_html_markup(:obj: `bool`, `optional`, defaults to :obj:`True`): Whether to remove html artifacts,
80
+ should be set to False when preprocessing TyDi QA.
81
+
82
+ replace_urls_emails_mentions(:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to replace email urls
83
+ and mentions by special tokens.
84
+
85
+ strip_tashkeel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
86
+ KASRA, SUKUN, SHADDA).
87
+
88
+ strip_tatweel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'.
89
+
90
+ insert_white_spaces(:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non Arabic digits
91
+ or English digits or Arabic and English Alphabet or the 2 brackets, then inserts whitespace
92
+ between words and numbers or numbers and words.
93
+
94
+ remove_non_digit_repetition(:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetition of more than 2 non-digit character with
95
+ 2 of this character.
96
+
97
+ replace_slash_with_dash(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in AraBERTv02,
98
+ AraELECTRA and AraGPT2.
99
+ Set to False to force disable, and True to force enable. Replaces the "/" with "-",
100
+ since "/" is missing from AraBERTv2, AraELECTRA and ARAGPT2 vocabulary.
101
+
102
+ map_hindi_numbers_to_arabic(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in
103
+ AraBERTv02, AraELECTRA and AraGPT2.Set to False to force disable, and True to force enable.
104
+ Replaces hindi numbers with the corresponding Arabic one. ex: "١٩٩٥" --> "1995".
105
+ This behavior is present by default in AraBERTv1 and v2 (with pre-segmentation),
106
+ and fixes an issue caused by a bug when inserting white spaces.
107
+
108
+ apply_farasa_segmentation(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in
109
+ AraBERTv2, and AraBERTv1. Set to False to force disable, and True to force enable.
110
+
111
+
112
+
113
+ Returns:
114
+
115
+ ArabertPreprocessor: A preprocessor instance
116
+
117
+ Example:
118
+
119
+ from preprocess import ArabertPreprocessor
120
+
121
+ arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")
122
+
123
+ arabert_prep.preprocess("SOME ARABIC TEXT")
124
+ """
125
+
126
def __init__(
    self,
    model_name: str,
    keep_emojis: bool = False,
    remove_html_markup: bool = True,
    replace_urls_emails_mentions: bool = True,
    strip_tashkeel: bool = True,
    strip_tatweel: bool = True,
    insert_white_spaces: bool = True,
    remove_non_digit_repetition: bool = True,
    replace_slash_with_dash: bool = None,
    map_hindi_numbers_to_arabic: bool = None,
    apply_farasa_segmentation: bool = None,
):
    """
    Store the preprocessing options, resolving every ``None`` option from the model name.

    The "aubmindlab/" and "wissamantoun/" prefixes are stripped from ``model_name``.
    A name that is not in ``ACCEPTED_MODELS`` logs a warning and falls back to
    "bert-base-arabertv02".
    """
    # Drop the organisation prefix so the name can be matched against the model lists.
    short_name = model_name.replace("aubmindlab/", "").replace("wissamantoun/", "")

    if short_name in ACCEPTED_MODELS:
        self.model_name = short_name
    else:
        logging.warning(
            """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
        )
        self.model_name = "bert-base-arabertv02"

    needs_segmentation = self.model_name in SEGMENTED_MODELS
    is_second_gen = self.model_name in SECOND_GEN_MODELS

    if apply_farasa_segmentation is None:
        # Not specified: segment exactly when the model was trained on segmented text.
        self.apply_farasa_segmentation = needs_segmentation
    else:
        if needs_segmentation and apply_farasa_segmentation == False:
            logging.warning(
                "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
            )
        self.apply_farasa_segmentation = apply_farasa_segmentation

    self.keep_emojis = keep_emojis
    self.remove_html_markup = remove_html_markup
    self.replace_urls_emails_mentions = replace_urls_emails_mentions
    self.strip_tashkeel = strip_tashkeel
    self.strip_tatweel = strip_tatweel
    self.insert_white_spaces = insert_white_spaces
    self.remove_non_digit_repetition = remove_non_digit_repetition

    # Both options default to "on" for second generation models only.
    self.replace_slash_with_dash = (
        is_second_gen if replace_slash_with_dash is None else replace_slash_with_dash
    )
    self.map_hindi_numbers_to_arabic = (
        is_second_gen
        if map_hindi_numbers_to_arabic is None
        else map_hindi_numbers_to_arabic
    )
190
+
191
def preprocess(self, text: str) -> str:
    """
    Run the preprocessing pipeline that matches ``self.model_name``.

    Args:

        text (:obj:`str`): input text string

    Returns:

        string: A preprocessed string depending on which model was selected
    """
    first_gen_models = ("bert-base-arabert", "bert-base-arabertv01")
    if self.model_name in first_gen_models:
        return self._preprocess_v1(
            text,
            do_farasa_tokenization=self.apply_farasa_segmentation,
        )

    if self.model_name not in SECOND_GEN_MODELS:
        return self._preprocess_v3(text)

    return self._preprocess_v2(text)
217
+
218
def unpreprocess(self, text: str, desegment: bool = True) -> str:
    """Re-format text so that punctuation, brackets and quotes are not separated by whitespace.

    The objective is to make the generated text of any model appear natural and not preprocessed.

    Args:
        text (:obj:`str`): input text to be un-preprocessed
        desegment (:obj:`bool`, optional): whether to remove the Farasa pre-segmentation
            first. Only has an effect when ``self.apply_farasa_segmentation`` is true.

    Returns:
        str: The unpreprocessed (and possibly Farasa-desegmented) text.
    """

    if self.apply_farasa_segmentation and desegment:
        text = self.desegment(text)

    # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
    # https://stackoverflow.com/a/53436792/5381220
    text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
    text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
    # The replacement must not contain "\`": re.sub leaves unknown non-letter escapes
    # alone, so the backslash would be written into the output text.
    text = re.sub(white_spaced_back_quotation_regex, "`" + r"\1" + "`", text)
    # Em dashes need their own pattern; re-using the back-quote regex here would never
    # match an em dash.
    # NOTE(review): pattern assumed to mirror the quotation regexes defined elsewhere
    # in this module - confirm.
    text = re.sub(r"—\s+([^—]+)\s+—", "—" + r"\1" + "—", text)

    # during generation, sometimes the models don't put a space after the dot, this handles it
    text = text.replace(".", " . ")
    text = " ".join(text.split())

    # handle decimals
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

    text = re.sub(left_and_right_spaced_chars, r"\1", text)
    text = re.sub(left_spaced_chars, r"\1", text)
    text = re.sub(right_spaced_chars, r"\1", text)

    return text
253
+
254
def desegment(self, text: str) -> str:
    """
    Undo Farasa segmentation on a whole sentence.

    Segmented text has a space after the '+' of a prefix and a space before the
    '+' of a suffix. Those spaces are removed first, then every space-separated
    word is passed through ``_desegmentword``.

    Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
    """
    glued = text.replace("+ ", "+").replace(" +", "+")
    return " ".join(map(self._desegmentword, glued.split(" ")))
269
+
270
+ def _desegmentword(self, orig_word: str) -> str:
271
+ """
272
+ Word segmentor that takes a Farasa Segmented Word and removes the '+' signs
273
+
274
+ Example:
275
+ >>> _desegmentword("ال+يومي+ة")
276
+ اليومية
277
+ """
278
+ word = orig_word.replace("ل+ال+", "لل")
279
+ if "ال+ال" not in orig_word:
280
+ word = word.replace("ل+ال", "لل")
281
+ word = word.replace("+", "")
282
+ word = word.replace("للل", "لل")
283
+ return word
284
+
285
def _preprocess_v3(self, text: str) -> str:
    """
    Preprocess ``text`` with the fallback pipeline.

    Each cleaning step runs only when the matching option on ``self`` is enabled.
    Farasa segmentation is applied last, when ``self.apply_farasa_segmentation`` is set.

    Args:
        text (:obj:`str`): input text; it is converted with ``str()`` first.

    Returns:
        str: the preprocessed text.
    """
    text = str(text)
    text = html.unescape(text)
    if self.strip_tashkeel:
        text = araby.strip_tashkeel(text)
    if self.strip_tatweel:
        text = araby.strip_tatweel(text)

    if self.replace_urls_emails_mentions:
        # replace all possible URLs
        for reg in url_regexes:
            text = re.sub(reg, " [رابط] ", text)
        # replace emails with [بريد]
        for reg in email_regexes:
            text = re.sub(reg, " [بريد] ", text)
        # replace mentions with [مستخدم]
        text = re.sub(user_mention_regex, " [مستخدم] ", text)

    if self.remove_html_markup:
        # remove html line breaks
        text = re.sub("<br />", " ", text)
        # remove html markup
        text = re.sub("</?[^>]+>", " ", text)

    if self.map_hindi_numbers_to_arabic:
        text = text.translate(hindi_to_arabic_map)

    # remove repeated characters >2
    if self.remove_non_digit_repetition:
        text = self._remove_non_digit_repetition(text)

    # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
    if self.insert_white_spaces:
        text = re.sub(
            r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])",
            r" \1 ",
            text,
        )

        # re-fix brackets: the substitution above also spaced out the placeholder tokens
        text = text.replace("[ رابط ]", "[رابط]")
        text = text.replace("[ بريد ]", "[بريد]")
        text = text.replace("[ مستخدم ]", "[مستخدم]")

        # insert whitespace between words and numbers or numbers and words
        text = re.sub(
            r"(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)",
            r" \1 \2 ",
            text,
        )
        text = re.sub(
            r"([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)",
            r" \1 \2 ",
            text,
        )

    # remove unwanted characters
    if self.keep_emojis:
        emoji_regex = "".join(emoji.UNICODE_EMOJI["en"])
        rejected_chars_regex2 = "[^%s%s]" % (chars_regexv2, emoji_regex)
        text = re.sub(rejected_chars_regex2, " ", text)
    else:
        text = re.sub(rejected_chars_regexv2, " ", text)

    # remove extra spaces
    text = " ".join(text.replace("\uFE0F", "").split())

    if self.apply_farasa_segmentation:
        if self.keep_emojis:
            # Dict membership is O(1); building a list of every emoji for each word is not.
            known_emojis = emoji.UNICODE_EMOJI["en"]
            text = " ".join(
                word if word in known_emojis else farasa_segmenter.segment(word)
                for word in text.split()
            )
        else:
            text = farasa_segmenter.segment(text)
        return self._farasa_segment(text)

    # All the other models don't require Farasa segmentation
    return text
367
+
368
def _preprocess_v2(self, text: str) -> str:
    """
    Preprocess ``text`` for the second generation models.

    Each cleaning step runs only when the matching option on ``self`` is enabled.
    Farasa segmentation is applied last and is controlled by
    ``self.apply_farasa_segmentation``, so an explicit True/False passed to the
    constructor is honoured. With the default (``None``) the constructor enables it
    for the pre-segmented models only.

    Args:
        text (:obj:`str`): input text; it is converted with ``str()`` first.

    Returns:
        str: the preprocessed text.
    """
    text = str(text)
    text = html.unescape(text)
    if self.strip_tashkeel:
        text = araby.strip_tashkeel(text)
    if self.strip_tatweel:
        text = araby.strip_tatweel(text)

    if self.replace_urls_emails_mentions:
        # replace all possible URLs
        for reg in url_regexes:
            text = re.sub(reg, " [رابط] ", text)
        # replace emails with [بريد]
        for reg in email_regexes:
            text = re.sub(reg, " [بريد] ", text)
        # replace mentions with [مستخدم]
        text = re.sub(user_mention_regex, " [مستخدم] ", text)

    if self.remove_html_markup:
        # remove html line breaks
        text = re.sub("<br />", " ", text)
        # remove html markup
        text = re.sub("</?[^>]+>", " ", text)

    if self.map_hindi_numbers_to_arabic:
        text = text.translate(hindi_to_arabic_map)

    # remove repeated characters >2
    if self.remove_non_digit_repetition:
        text = self._remove_non_digit_repetition(text)

    # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
    if self.insert_white_spaces:
        text = re.sub(
            r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
            r" \1 ",
            text,
        )

        # insert whitespace between words and numbers or numbers and words
        text = re.sub(
            r"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
        )
        text = re.sub(
            r"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
        )

    if self.replace_slash_with_dash:
        text = text.replace("/", "-")

    # remove unwanted characters
    if self.keep_emojis:
        emoji_regex = "".join(emoji.UNICODE_EMOJI["en"])
        rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
        text = re.sub(rejected_chars_regex2, " ", text)
    else:
        text = re.sub(rejected_chars_regex, " ", text)

    # remove extra spaces
    text = " ".join(text.replace("\uFE0F", "").split())

    # Gate on the resolved option instead of hard-coded model names, so that the
    # constructor's force-enable / force-disable setting is respected.
    if self.apply_farasa_segmentation:
        if self.keep_emojis:
            # Dict membership is O(1); building a list of every emoji for each word is not.
            known_emojis = emoji.UNICODE_EMOJI["en"]
            text = " ".join(
                word if word in known_emojis else farasa_segmenter.segment(word)
                for word in text.split()
            )
        else:
            text = farasa_segmenter.segment(text)
        return self._farasa_segment(text)

    # All the other models don't require Farasa segmentation
    return text
447
+
448
def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
    """
    AraBERTv1 preprocessing function.

    Args:
        text (:obj:`str`): input text; it is converted with ``str()`` first.
        do_farasa_tokenization (:obj:`bool`): when true, the cleaned text is passed
            through ``_tokenize_arabic_words_farasa``.

    Returns:
        str: the preprocessed text with whitespace collapsed to single spaces.
    """
    cleaned = str(text)
    if self.strip_tashkeel:
        cleaned = araby.strip_tashkeel(cleaned)

    cleaned = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", cleaned)
    cleaned = re.sub("ـ", "", cleaned)
    cleaned = re.sub("[«»]", ' " ', cleaned)

    substitute_tokens = self.replace_urls_emails_mentions
    if substitute_tokens:
        # replace the [رابط] token with space if you want to clean links
        for url_pattern in (regex_url_step1, regex_url_step2, regex_url):
            cleaned = re.sub(url_pattern, "[رابط]", cleaned)
        cleaned = re.sub(regex_email, "[بريد]", cleaned)
        cleaned = re.sub(regex_mention, "[مستخدم]", cleaned)
    cleaned = re.sub("…", r"\.", cleaned).strip()
    cleaned = self._remove_redundant_punct(cleaned)

    if substitute_tokens:
        # Normalise placeholder tokens that picked up inner spaces.
        cleaned = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", cleaned)
        cleaned = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", cleaned)
        cleaned = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", cleaned)

    if self.remove_non_digit_repetition:
        cleaned = self._remove_non_digit_repetition(cleaned)

    if self.insert_white_spaces:
        cleaned = re.sub(
            "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
            r" \1 ",
            cleaned,
        )
    if do_farasa_tokenization:
        cleaned = self._tokenize_arabic_words_farasa(cleaned)

    return " ".join(cleaned.split())
490
+
491
+ def _farasa_segment(self, text: str) -> str:
492
+ line_farasa = text.split()
493
+ segmented_line = []
494
+ for index, word in enumerate(line_farasa):
495
+ if word in ["[", "]"]:
496
+ continue
497
+ if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [
498
+ "[",
499
+ "]",
500
+ ]:
501
+ segmented_line.append("[" + word + "]")
502
+ continue
503
+ if "+" not in word:
504
+ segmented_line.append(word)
505
+ continue
506
+ segmented_word = self._split_farasa_output(word)
507
+ segmented_line.extend(segmented_word)
508
+
509
+ return " ".join(segmented_line)
510
+
511
def _split_farasa_output(self, word: str) -> list:
    """Split one Farasa-segmented word into affix-marked pieces.

    The word is scanned character by character. Each time a ``+`` is met, the
    characters accumulated since the previous ``+`` are emitted as one piece:

    * a piece found in ``prefix_list`` is emitted as ``piece+``;
    * a piece found in ``suffix_list`` is emitted as ``+piece``;
    * any other piece is emitted unchanged;
    * the single letter KAF, which is both a prefix and a suffix, is
      disambiguated by its position (see the inline comments).

    Whatever remains after the last ``+`` is emitted as ``+piece`` when it is
    in ``suffix_list`` and unchanged otherwise.

    Args:
        word: a single token in which ``+`` separates the segments.

    Returns:
        The list of pieces in order (a list, not a string).

    NOTE(review): ``segmented_word[-1][-1]`` raises ``IndexError`` when the
    previously emitted piece is the empty string, which happens for input that
    starts with ``+`` followed by KAF (e.g. ``"+ك+..."``) - confirm such input
    cannot reach this method.
    """
    segmented_word = []
    temp_token = ""
    for i, c in enumerate(word):
        if c == "+":
            # if the token is KAF, it could be a suffix or prefix
            if temp_token == "ك":
                # if we are at the second token, then KAF is surely a prefix
                # (``i == 1`` means the "+" directly follows a word-initial KAF)
                if i == 1:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                # If the KAF token is between 2 tokens
                elif word[i - 2] == "+":
                    # if the previous token is prefix, then this KAF must be a prefix
                    # (emitted prefixes end with "+", so the last char tells us)
                    if segmented_word[-1][-1] == "+":
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # else it is a suffix, this KAF could not be a second suffix
                    else:
                        segmented_word.append("+" + temp_token)
                        temp_token = ""
                # if Kaf is at the end, this is handled with the statement after the loop
            elif temp_token in prefix_list:
                segmented_word.append(temp_token + "+")
                temp_token = ""
            elif temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
                temp_token = ""
            else:
                segmented_word.append(temp_token)
                temp_token = ""
            # the "+" itself is never added to the accumulated piece
            continue
        temp_token += c
    # flush the piece that follows the last "+" (or the whole word if none)
    if temp_token != "":
        if temp_token in suffix_list:
            segmented_word.append("+" + temp_token)
        else:
            segmented_word.append(temp_token)
    return segmented_word
550
+
551
def _tokenize_arabic_words_farasa(self, line_input: str) -> str:
    """Segment a line with Farasa and mark prefixes and suffixes.

    When ``self.keep_emojis`` is set, the line is segmented word by word so
    that words which are emoji are passed through without being sent to the
    segmenter; otherwise the whole line is segmented in one call.

    Each resulting token is then handled as follows:

    * a lone ``[`` or ``]`` is dropped;
    * a placeholder word (``رابط``, ``بريد`` or ``مستخدم``) that directly
      follows a bracket token is emitted as one ``[word]`` token;
    * any other token is split on ``+``; a piece found in ``prefix_list``
      becomes ``piece+``, a piece found in ``suffix_list`` becomes ``+piece``
      and every other piece is kept unchanged.

    The prefix list is checked first, so a piece present in both lists is
    always marked as a prefix here.

    Args:
        line_input: the text to segment.

    Returns:
        The marked pieces joined by single spaces.
    """
    if self.keep_emojis:
        # Membership is tested directly on the emoji mapping; the previous
        # code rebuilt a list of all its keys for every single word.
        known_emojis = emoji.UNICODE_EMOJI["en"]
        line_farasa = [
            word if word in known_emojis else farasa_segmenter.segment(word)
            for word in line_input.split()
        ]
    else:
        line_farasa = farasa_segmenter.segment(line_input).split()

    segmented_line = []
    for index, word in enumerate(line_farasa):
        if word in ["[", "]"]:
            continue
        # ``index > 0`` prevents ``line_farasa[index - 1]`` from wrapping
        # around to the last token when the placeholder word comes first.
        if (
            index > 0
            and word in ["رابط", "بريد", "مستخدم"]
            and line_farasa[index - 1] in ["[", "]"]
        ):
            segmented_line.append("[" + word + "]")
            continue
        for token in word.split("+"):
            if token in prefix_list:
                segmented_line.append(token + "+")
            elif token in suffix_list:
                segmented_line.append("+" + token)
            else:
                segmented_line.append(token)
    return " ".join(segmented_line)
584
+
585
def _remove_non_digit_repetition(self, text: str) -> str:
    """
    Shorten elongated character runs.

    Every match of the module-level ``multiple_char_pattern`` (a non-digit
    character followed by at least two more copies of itself) is replaced by
    exactly two copies of that character.

    :param text: the input text to remove elongation from
    :return: the text with each such run reduced to two characters
    """
    return multiple_char_pattern.sub(r"\1\1", text)
607
+
608
def _remove_redundant_punct(self, text: str) -> str:
    """Collapse each run matched by ``redundant_punct_pattern``.

    Every matched run is replaced by its distinct characters, kept in order of
    first appearance and surrounded by one space on each side. Finally all
    whitespace runs are reduced to a single space and the result is stripped.

    Two strings are maintained while looping:

    * ``text``  - the output being built, in which runs are replaced;
    * ``text_`` - a working copy from which every processed run is deleted,
      so that the next search only finds runs not handled yet.

    Args:
        text: the input text.

    Returns:
        The text with the matched runs de-duplicated and whitespace normalised.
    """
    text_ = text
    result = re.search(redundant_punct_pattern, text)
    # length difference between ``text`` and ``text_``; added to the match
    # offsets (which are relative to ``text_``) to index into ``text``
    dif = 0
    while result:
        sub = result.group()
        # distinct characters of the run, ordered by first occurrence
        sub = sorted(set(sub), key=sub.index)
        sub = " " + "".join(list(sub)) + " "
        # replace the run inside the output string
        text = "".join(
            (text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :])
        )
        # delete the run from the working copy
        text_ = "".join(
            (text_[: result.span()[0]], text_[result.span()[1] :])
        ).strip()
        dif = abs(len(text) - len(text_))
        result = re.search(redundant_punct_pattern, text_)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
626
+
627
+
628
# Segments that are marked as prefixes ("segment+") by the Farasa helpers.
# NOTE(review): the "\u...." entries look like escaped spellings of the
# literal entries listed just before them - confirm whether the duplication
# is intentional.
prefix_list = [
    "ال",
    "و",
    "ف",
    "ب",
    "ك",
    "ل",
    "لل",
    "\u0627\u0644",
    "\u0648",
    "\u0641",
    "\u0628",
    "\u0643",
    "\u0644",
    "\u0644\u0644",
    "س",
]
# Segments that are marked as suffixes ("+segment") by the Farasa helpers.
# KAF appears in both lists.
suffix_list = [
    "ه",
    "ها",
    "ك",
    "ي",
    "هما",
    "كما",
    "نا",
    "كم",
    "هم",
    "هن",
    "كن",
    "ا",
    "ان",
    "ين",
    "ون",
    "وا",
    "ات",
    "ت",
    "ن",
    "ة",
    "\u0647",
    "\u0647\u0627",
    "\u0643",
    "\u064a",
    "\u0647\u0645\u0627",
    "\u0643\u0645\u0627",
    "\u0646\u0627",
    "\u0643\u0645",
    "\u0647\u0645",
    "\u0647\u0646",
    "\u0643\u0646",
    "\u0627",
    "\u0627\u0646",
    "\u064a\u0646",
    "\u0648\u0646",
    "\u0648\u0627",
    "\u0627\u062a",
    "\u062a",
    "\u0646",
    "\u0629",
]
# Bracketed placeholder tokens (link, user, e-mail).
other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"]

# the never_split list is used with the transformers library
prefix_symbols = [x + "+" for x in prefix_list]
# NOTE(review): "symblos" is a misspelling of "symbols"; the name is kept
# because other modules may import it.
suffix_symblos = ["+" + x for x in suffix_list]
# De-duplicated through a set, so the order of this list is not defined.
never_split_tokens = list(set(prefix_symbols + suffix_symblos + other_tokens))
693
+
694
# Alternative patterns describing URLs.
# NOTE(review): the second entry is wrapped in "@ ... @iS", which looks like
# PHP/PCRE delimiter-and-flags syntax; Python's ``re`` treats those characters
# literally, so the entry presumably never matches a real URL - confirm.
url_regexes = [
    r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
    r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
    r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
    r"www[a-zA-Z0-9_\-?=%&/.~]+",
    r"[a-zA-Z]+\.com",
    r"(?=http)[^\s]+",
    r"(?=www)[^\s]+",
    r"://",
]
# "@" followed by one or more word characters.
user_mention_regex = r"@[\w\d]+"
# Alternative patterns describing e-mail addresses (strict form, then loose form).
email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
# A run of two or more characters taken from the listed punctuation marks and
# whitespace; used by ``_remove_redundant_punct``.
redundant_punct_pattern = (
    r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})"
)

# A non-digit character followed by at least two more copies of itself.
# ``regex_tatweel`` is the plain-string form of the compiled pattern below.
regex_tatweel = r"(\D)\1{2,}"
multiple_char_pattern = re.compile(r"(\D)\1{2,}", re.DOTALL)

# Negated character classes: they match any single character that is NOT in
# the listed set. The v2 variant additionally lists "/" and starts its second
# range at \u0641 instead of \u0640.
rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]"
rejected_chars_regexv2 = r"[^0-9\u0621-\u063A\u0641-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/]"

# Single-pattern counterparts of the lists above (each repeats one of their entries).
regex_url_step1 = r"(?=http)[^\s]+"
regex_url_step2 = r"(?=www)[^\s]+"
regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
regex_mention = r"@[\w\d]+"
regex_email = r"\S+@\S+"

# Character-class bodies without the surrounding "[...]"; the v2 variant
# additionally lists "/".
chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘"
chars_regexv2 = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/"

# A pair of identical delimiters with whitespace padding on the inside; group 1
# captures the text between the padding.
white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"'
white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'"
white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`"
white_spaced_em_dash = r"\—\s+([^—]+)\s+\—"

# One of the listed characters (captured) preceded by a space, followed by a
# space, or surrounded by spaces, respectively.
left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])"
right_spaced_chars = r"([\[\(\{“«‘*\~]) "
left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) "

# ``str.translate`` table mapping each digit of ``hindi_nums`` to the digit at
# the same position in ``arabic_nums``.
hindi_nums = "٠١٢٣٤٥٦٧٨٩"
arabic_nums = "0123456789"
hindi_to_arabic_map = str.maketrans(hindi_nums, arabic_nums)
pages/processor.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import awesome_streamlit as ast
3
+ from .preprocess import (
4
+ ArabertPreprocessor,
5
+ white_spaced_back_quotation_regex,
6
+ white_spaced_double_quotation_regex,
7
+ white_spaced_em_dash,
8
+ white_spaced_single_quotation_regex,
9
+ left_and_right_spaced_chars,
10
+ left_spaced_chars,
11
+ right_spaced_chars,
12
+ )
13
+ import re
14
+
15
# Choices offered by the "Model" select box in ``write``. The special entry
# "None" makes ``write`` show the individual option check-boxes instead of
# passing only a model name to the preprocessor.
MODELS_to_SELECT = [
    "None",
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]
32
+
33
+
34
def unpreprocess(text: str) -> str:
    """Re-format preprocessed text so that it reads like naturally written text.

    Punctuation, brackets and parentheses are no longer separated from the
    neighbouring words by whitespace. The objective is to make the generated
    text of any model appear natural and not preprocessed.

    The text is always passed through ``desegment`` first, so Farasa ``+``
    segmentation markers are removed as well.

    Args:
        text (:obj:`str`): input text to be un-preprocessed

    Returns:
        str: The unpreprocessed and Farasa-desegmented text.
    """

    text = desegment(text)

    # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
    # https://stackoverflow.com/a/53436792/5381220
    # The replacements must not contain "\`" or "\—": ``re.sub`` leaves such
    # unknown escapes untouched, which put a literal backslash in the output.
    text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
    text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
    text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
    # Uses the em-dash pattern; the back-quote pattern was applied here by
    # mistake, so padded em dashes were never tightened.
    text = re.sub(white_spaced_em_dash, r"—\1—", text)

    # during generation, sometimes the models don't put a space after the dot, this handles it
    text = text.replace(".", " . ")
    text = " ".join(text.split())

    # handle decimals: re-join numbers that the step above pulled apart
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

    # drop the spaces on both sides, on the left, and on the right of the
    # characters captured by the respective pattern
    text = re.sub(left_and_right_spaced_chars, r"\1", text)
    text = re.sub(left_spaced_chars, r"\1", text)
    text = re.sub(right_spaced_chars, r"\1", text)

    return text
68
+
69
+
70
def desegment(text: str) -> str:
    """
    Undo the Farasa segmentation of a whole sentence.

    AraBERT segmentation using Farasa leaves a space after the '+' of a
    prefix and a space before the '+' of a suffix. Those spaces are removed
    first, then every space-separated word is passed to `_desegmentword`.

    Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
    """
    glued = text.replace("+ ", "+").replace(" +", "+")
    return " ".join(_desegmentword(piece) for piece in glued.split(" "))
85
+
86
+
87
+ def _desegmentword(orig_word: str) -> str:
88
+ """
89
+ Word segmentor that takes a Farasa Segmented Word and removes the '+' signs
90
+
91
+ Example:
92
+ >>> _desegmentword("ال+يومي+ة")
93
+ اليومية
94
+ """
95
+ word = orig_word.replace("ل+ال+", "لل")
96
+ if "ال+ال" not in orig_word:
97
+ word = word.replace("ل+ال", "لل")
98
+ word = word.replace("+", "")
99
+ word = word.replace("للل", "لل")
100
+ return word
101
+
102
+
103
def write():
    """Render the "Arabic Text Pre-Processor" Streamlit page.

    The page has two parts: a form that runs ``ArabertPreprocessor`` on the
    entered text, and a form that passes text to ``unpreprocess``.
    """
    col1, _ = st.columns(2)

    # NOTE(review): assumes only the title belongs to this ``with`` block -
    # confirm against the intended layout.
    with col1:
        col1.title("Arabic Text Pre-Processor")
    # right-align text elements of the page
    st.markdown(
        """
        <style>
        p, div, input, label {
        text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
    input_text = st.text_input(
        "Text to Pre-Process",
        value="ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري",
    )

    aligning_cols = st.columns(5)
    model_selector = aligning_cols[0].selectbox("Model", options=MODELS_to_SELECT)
    aligning_cols[1].write("#")
    aligning_cols[1].write("Select None to enable further filters")
    # the individual options are only offered when no model is selected
    if model_selector == "None":
        cols = st.columns(5)
        keep_emojis = cols[0].checkbox("Keep emojis", False)
        remove_html_markup = cols[0].checkbox("Remove html markup", True)
        strip_tashkeel = cols[1].checkbox("Strip tashkeel", True)
        replace_urls_emails_mentions = cols[1].checkbox("Replace urls and emails", True)
        strip_tatweel = cols[2].checkbox("Strip tatweel", True)
        insert_white_spaces = cols[2].checkbox("Insert white spaces", True)
        remove_non_digit_repetition = cols[3].checkbox(
            "Remove non-digit repetition", True
        )
        replace_slash_with_dash = cols[3].checkbox("Replace slash with dash", None)
        map_hindi_numbers_to_arabic = cols[4].checkbox(
            "Map hindi numbers to arabic", None
        )
        apply_farasa_segmentation = cols[4].checkbox("Apply farasa segmentation", None)

    run_preprocessor = st.button("Run Pre-Processor")

    prep_text = None
    if run_preprocessor:
        if model_selector == "None":
            # NOTE(review): the options are passed positionally, so their
            # order must match the ``ArabertPreprocessor`` constructor -
            # verify against its signature.
            arabert_preprocessor = ArabertPreprocessor(
                model_selector,
                keep_emojis,
                remove_html_markup,
                replace_urls_emails_mentions,
                strip_tashkeel,
                strip_tatweel,
                insert_white_spaces,
                remove_non_digit_repetition,
                replace_slash_with_dash,
                map_hindi_numbers_to_arabic,
                apply_farasa_segmentation,
            )
        else:
            arabert_preprocessor = ArabertPreprocessor(model_name=model_selector)
        # NOTE(review): this calls the private ``_preprocess_v3`` directly for
        # every model choice - confirm that this is intended.
        prep_text = arabert_preprocessor._preprocess_v3(input_text)
        st.write(prep_text)

    st.write("-----")
    # pre-filled with the result of the run above when there is one
    input_text_unprep = st.text_input(
        "Text to Undo the Pre-Processing",
        value=prep_text
        if prep_text
        else "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري",
    )
    run_unpreprocessor = st.button("Run Un-Pre-Processor")

    if run_unpreprocessor:
        st.write(unpreprocess(input_text_unprep))
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit==0.88
2
+ arabic-reshaper==2.1.3
3
+ python-bidi==0.4.2
4
+ PyArabic
5
+ farasapy==0.0.14
6
+ emoji==1.4.2