Spaces:

cis-lmu
/

glotlid-space

Running

kargaranamir committed on Oct 27, 2023

Commit

b21d736

1 Parent(s): 44dbf52

add preprocess.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,7 +6,7 @@
 # This space is built based on AMR-KELEG/ALDi space.
 # GlotLID Space
 import constants
 import pandas as pd
 import streamlit as st
@@ -19,6 +19,7 @@ from altair import X, Y, Scale
 import base64
 import json
 import os
 @st.cache_resource
 def load_sp():
@@ -45,9 +46,42 @@ def get_script(text):
     else:
         all_scripts = 'Zyyy'
     return main_script, all_scripts
 @st.cache_data
 def language_names(json_path):
     with open(json_path, 'r') as json_file:
@@ -161,6 +195,8 @@ def compute(sentences, version = 'v2'):
     probs = []
     labels = []
     for index, sent in enumerate(sentences):
         output = model_choice.predict(sent)
@@ -227,7 +263,6 @@ with tab1:
     clicked = st.button("Submit")
     if sent:
-        sent = sent.replace('\n', ' ')
         probs, labels = compute([sent], version=version)
         prob = probs[0]

 # This space is built based on AMR-KELEG/ALDi space.
 # GlotLID Space
+import string
 import constants
 import pandas as pd
 import streamlit as st
 import base64
 import json
 import os
+import re
 @st.cache_resource
 def load_sp():
     else:
         all_scripts = 'Zyyy'
+    for ws in all_scripts:
+        if ws in ['Kana', 'Hrkt', 'Hani', 'Hira']:
+            all_scripts.append('Jpan')
+    all_scripts = list(set(all_scripts))
     return main_script, all_scripts
+def preprocess_text(text):
+    """Apply preprocessing to the given text.
+    Args:
+        text: Thetext to be preprocessed.
+    Returns:
+        The preprocessed text.
+    """
+    # remove \n
+    text = text.replace('\n', ' ')
+    # get rid of characters that are ubiquitous
+    replace_by = " "
+    replacement_map = {
+        ord(c): replace_by
+        for c in ':•#{|}' + string.digits
+    }
+    text = text.translate(replacement_map)
+    # make multiple space one space
+    text = re.sub(r'\s+', ' ', text)
+    # strip the text
+    text = text.strip()
+    return text
 @st.cache_data
 def language_names(json_path):
     with open(json_path, 'r') as json_file:
     probs = []
     labels = []
+    sentences = [preprocess_text(sent) for sent in sentences]
     for index, sent in enumerate(sentences):
         output = model_choice.predict(sent)
     clicked = st.button("Submit")
     if sent:
         probs, labels = compute([sent], version=version)
         prob = probs[0]