Update query_preprocessing
naive_chatbot/naive_chatbot.py
CHANGED
@@ -2,6 +2,9 @@
 """Naive Chatbot"""
 import logging
 import pickle
+import string
+import re
+
 import numpy as np
 import tensorflow as tf
 from camel_tools.utils.normalize import normalize_unicode
@@ -34,7 +37,20 @@ max_length = 32
 oov_tok = '<OOV>' # Out of Vocabulary
 training_portion = 1
 previous_reply = 'احنا لسه في بداية الكلام'
-
+arabic_punctuations = '''«»`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
+english_punctuations = string.punctuation
+punctuations_list = arabic_punctuations + english_punctuations
+arabic_diacritics = re.compile("""
+    ّ    | # Tashdid
+    َ    | # Fatha
+    ً    | # Tanwin Fath
+    ُ    | # Damma
+    ٌ    | # Tanwin Damm
+    ِ    | # Kasra
+    ٍ    | # Tanwin Kasr
+    ْ    | # Sukun
+    ـ      # Tatwil/Kashida
+""", re.VERBOSE)

 def load_pickle_data(filepath):
     with open(filepath, 'rb') as pickle_file:
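The constants added in this hunk can be tried on their own. The snippet below is a small illustrative sketch and not part of the commit: it reuses the same punctuation table, swaps the verbose diacritics pattern for an equivalent compact character class, and the sample string is invented for demonstration.

import re
import string

# Same idea as the module-level constants above; the character class is a
# compact stand-in for the verbose Tashdid/Fatha/.../Tatwil pattern in the diff.
arabic_punctuations = '''«»`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
punctuations_list = arabic_punctuations + string.punctuation
arabic_diacritics = re.compile('[\u064B-\u0652\u0640]')  # tanween, harakat, shadda, sukun, tatwil

sample = 'مرحباً، كيفَ الحال؟'  # made-up example query
no_punct = sample.translate(str.maketrans('', '', punctuations_list))
clean = arabic_diacritics.sub('', no_punct)
print(clean)  # punctuation and diacritics removed: 'مرحبا كيف الحال'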
@@ -114,6 +130,17 @@ class NaiveChatbot:
         pass
 
     def preprocess_query(self, query):
+        text = query.translate(str.maketrans('', '', punctuations_list))  # remove punctuation
+        # remove diacritics
+        text = re.sub(arabic_diacritics, '', text)
+        # remove emoji
+        regex_pattern = re.compile("["
+                                   u"\U0001F600-\U0001F64F"  # emoticons
+                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+                                   "]+", flags=re.UNICODE)
+        query = regex_pattern.sub('', text)
         norm = normalize_unicode(query)
         # Normalize alef variants to 'ا'
         norm = normalize_alef_ar(norm)
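Taken together, preprocess_query now strips punctuation, diacritics, and emoji before the existing Unicode and alef normalization. The standalone sketch below is illustrative only and not part of the commit: the preprocess helper name and the sample query are made up, the diacritics class is again a compact equivalent of the verbose pattern, and it assumes normalize_alef_ar can be imported from camel_tools.utils.normalize alongside normalize_unicode.

import re
import string

from camel_tools.utils.normalize import normalize_unicode, normalize_alef_ar

arabic_punctuations = '''«»`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
punctuations_list = arabic_punctuations + string.punctuation
arabic_diacritics = re.compile('[\u064B-\u0652\u0640]')
emoji_pattern = re.compile(
    '['
    '\U0001F600-\U0001F64F'   # emoticons
    '\U0001F300-\U0001F5FF'   # symbols & pictographs
    '\U0001F680-\U0001F6FF'   # transport & map symbols
    '\U0001F1E0-\U0001F1FF'   # flags (iOS)
    ']+',
    flags=re.UNICODE)


def preprocess(query):
    """Mirror of the cleanup steps in NaiveChatbot.preprocess_query (illustrative)."""
    text = query.translate(str.maketrans('', '', punctuations_list))  # drop punctuation
    text = arabic_diacritics.sub('', text)                            # drop diacritics
    text = emoji_pattern.sub('', text)                                # drop emoji
    norm = normalize_unicode(text)                                    # canonical Unicode form
    return normalize_alef_ar(norm)                                    # unify alef variants to 'ا'


print(preprocess('أهلاً! 😀 إزيك؟'))  # punctuation, emoji and diacritics removed, alef variants unified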