Spaces: Sleeping
Commit: ultima versao ("latest version")
Changed files:
- __pycache__/util.cpython-310.pyc  +0 -0
- app.py  +105 -57
- flagged/log.csv  +2 -0
- requirements.txt  +1 -0
- util.py  +60 -0

__pycache__/util.cpython-310.pyc
ADDED
Binary file (2.54 kB).
app.py
CHANGED

@@ -4,6 +4,13 @@ from textblob import TextBlob
 import textstat
 from huggingface_hub import hf_hub_download
 from joblib import load
+from util import escape_tags_and_content, escape_tags, escape_strings, escape_links, escape_hex_character_codes, escape_punctuation_boundaries, escape_odd_spaces
+
+import nltk
+import nltk
+from nltk.corpus import stopwords
+
+nltk.download('stopwords')
 
 titulo1 = """CLONE - Studio Dashboard: "default" and "Default Project" does not give clear information about Alloy and Project unless description is read."""
 descricao1 = """Steps To Reproduce: 1. On dashboard on studio 3.0, navigate to Develop tab. 2. Notice "default" and "Default Project" & "two-tabbed" and "Tabbed Application" names. Actual: User does not get clear information from names that one is alloy project and another one is Titanium project unless he reads the description below. Expected: Naming convention or icon corresponding must suggest type"""

@@ -20,104 +27,144 @@ descricao4 = """During the compile process Alloy will attempt to remove files fr
 titulo5 = """Resolve suboptimal compression from uglify-js v2 update"""
 descricao5 = """The v2 update of uglify-js in Alloy, specifically version 2.2.5, has some suboptimal compressions, which are causing the optimizer.js test spec to fail in certain cases. Specifically the issues are around booleans and cascading of variables in assignments. These issues have been logged with the Uglifyjs2 project in the following links: * https://github.com/mishoo/UglifyJS2/issues/137 * https://github.com/mishoo/UglifyJS2/issues/138 When these issues are resolved and distributed in an npm release, we need to revisit these compressions and testing to ensure that the fixes are in place, and that new uglify-js version has no regressions that impact alloy."""
 
-def calcula_MbR(titulo, descricao):
+def calcula_MbR(titulo, descricao, nome_projeto):
     context = titulo + descricao
-    d = {"
-    df = pd.DataFrame(data=d, columns=["
-    model = load(hf_hub_download("model_effort_tawos", "model_tawos_aloy_mbr.joblib"))
-    story_points_MbR = model.predict(df["
+    d = {"context_": [context]}
+    df = pd.DataFrame(data=d, columns=["context_"])
+    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_mbr.joblib", force_download=False))
+    story_points_MbR = model.predict(df["context_"])
     return story_points_MbR
 
-def calcula_Median(titulo, descricao):
+def calcula_Median(titulo, descricao, nome_projeto):
     context = titulo + descricao
-    d = {"
-    df = pd.DataFrame(data=d, columns=["
-    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_median.joblib"))
-    story_points_MbR = model.predict(df["
+    d = {"context_": [context]}
+    df = pd.DataFrame(data=d, columns=["context_"])
+    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_median.joblib", force_download=False))
+    story_points_MbR = model.predict(df["context_"])
     return story_points_MbR
 
-def calcula_NEOSP_SVR(titulo, descricao):
-    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_neosp_svr.joblib"))
+def calcula_NEOSP_SVR(titulo, descricao, nome_projeto):
+    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_neosp_svr.joblib", force_download=False))
+
+    # criação de uma nova coluna
     context = titulo + descricao
     d = {"context": [context]}
     df = pd.DataFrame(data=d, columns=["context"])
 
+    # pré-processamento
+    df["context"] = df["context"].apply(lambda x: escape_tags_and_content(x))
+    df["context"] = df["context"].apply(lambda x: escape_tags(x))
+    df["context"] = df["context"].apply(lambda x: escape_strings(x))
+    df["context"] = df["context"].apply(lambda x: escape_links(x))
+    df["context"] = df["context"].apply(lambda x: escape_hex_character_codes(x))
+    df["context"] = df["context"].apply(lambda x: escape_punctuation_boundaries(x))
+    df["context"] = df["context"].apply(lambda x: escape_odd_spaces(x))
+
+    # removendo stop-words
+    stop = stopwords.words('english')
+    df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
+
+    # renomeando as colunas porque senão dá um problema com a extração de features do NEOSP
+    df = df.rename(columns={ "context": "context_"})
+
     # features de legibilidade
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
+    df["gunning_fog_"] = df['context_'].apply(textstat.gunning_fog)#
+    df["flesch_reading_ease_"] = df['context_'].apply(textstat.flesch_reading_ease)#
+    df["flesch_kincaid_grade_"] = df['context_'].apply(textstat.flesch_kincaid_grade)#
+    df["smog_index_"] = df['context_'].apply(textstat.smog_index)
+    df["coleman_liau_index_"] = df['context_'].apply(textstat.coleman_liau_index)#
+    df["automated_readability_index_"] = df['context_'].apply(textstat.automated_readability_index) #
+    df["dale_chall_readability_score_"] = df['context_'].apply(textstat.dale_chall_readability_score)#
+    df["difficult_words_"] = df['context_'].apply(textstat.difficult_words)
+    df["linsear_write_formula_"] = df['context_'].apply(textstat.linsear_write_formula)#
 
     # feature de sentimento
-    df["
-    df["
+    df["polarity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.polarity)
+    df["subjectivity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.subjectivity)
 
-    X = df[["
-    "
-    "
+    X = df[["gunning_fog_", "flesch_reading_ease_", "flesch_kincaid_grade_", "smog_index_", "coleman_liau_index_",
+            "automated_readability_index_", "dale_chall_readability_score_", "difficult_words_", "linsear_write_formula_",
+            "polarity_", "subjectivity_"]]
 
     story_points = model.predict(X)
     return story_points
 
-def calcula_NEOSP_Linear(titulo, descricao):
-    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_neosp_linear.joblib"))
+def calcula_NEOSP_Linear(titulo, descricao, nome_projeto):
+    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_neosp_linear.joblib", force_download=False))
+    # criação de uma nova coluna
     context = titulo + descricao
     d = {"context": [context]}
    df = pd.DataFrame(data=d, columns=["context"])
 
+    # pré-processamento
+    df["context"] = df["context"].apply(lambda x: escape_tags_and_content(x))
+    df["context"] = df["context"].apply(lambda x: escape_tags(x))
+    df["context"] = df["context"].apply(lambda x: escape_strings(x))
+    df["context"] = df["context"].apply(lambda x: escape_links(x))
+    df["context"] = df["context"].apply(lambda x: escape_hex_character_codes(x))
+    df["context"] = df["context"].apply(lambda x: escape_punctuation_boundaries(x))
+    df["context"] = df["context"].apply(lambda x: escape_odd_spaces(x))
+
+    # removendo stop-words
+    stop = stopwords.words('english')
+    df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
+
+    # renomeando as colunas porque senão dá um problema com a extração de features do NEOSP
+    df = df.rename(columns={ "context": "context_"})
+
     # features de legibilidade
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
-    df["
+    df["gunning_fog_"] = df['context_'].apply(textstat.gunning_fog)#
+    df["flesch_reading_ease_"] = df['context_'].apply(textstat.flesch_reading_ease)#
+    df["flesch_kincaid_grade_"] = df['context_'].apply(textstat.flesch_kincaid_grade)#
+    df["smog_index_"] = df['context_'].apply(textstat.smog_index)
+    df["coleman_liau_index_"] = df['context_'].apply(textstat.coleman_liau_index)#
+    df["automated_readability_index_"] = df['context_'].apply(textstat.automated_readability_index) #
+    df["dale_chall_readability_score_"] = df['context_'].apply(textstat.dale_chall_readability_score)#
+    df["difficult_words_"] = df['context_'].apply(textstat.difficult_words)
+    df["linsear_write_formula_"] = df['context_'].apply(textstat.linsear_write_formula)#
 
     # feature de sentimento
-    df["
-    df["
+    df["polarity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.polarity)
+    df["subjectivity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.subjectivity)
 
-    X = df[["
-    "
-    "
+    X = df[["gunning_fog_", "flesch_reading_ease_", "flesch_kincaid_grade_", "smog_index_", "coleman_liau_index_",
+            "automated_readability_index_", "dale_chall_readability_score_", "difficult_words_", "linsear_write_formula_",
+            "polarity_", "subjectivity_"]]
 
     story_points = model.predict(X)
     return story_points
 
-def calcula_TFIDF_SVR(titulo, descricao):
-    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_tfidf_svr.joblib"))
+def calcula_TFIDF_SVR(titulo, descricao, nome_projeto):
+    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_tfidf_svr.joblib", force_download=False))
     context = titulo + descricao
-    d = {"
-    df = pd.DataFrame(data=d, columns=["
-    vectorizer = load(hf_hub_download("giseldo/model_effort_tawos", "vectorizer_tfidf.joblib"))
-
+    d = {"context_": [context]}
+    df = pd.DataFrame(data=d, columns=["context_"])
+    vectorizer = load(hf_hub_download("giseldo/model_effort_tawos", "vectorizer_tfidf.joblib", force_download=False))
+    X_vec = vectorizer.transform(df["context_"])
+    df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())
+    X = df_vec
     story_points = model.predict(X)
     return story_points
 
-def calcula_TFIDF_Linear(titulo, descricao):
-    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_tfidf_linear.joblib"))
+def calcula_TFIDF_Linear(titulo, descricao, nome_projeto):
+    model = load(hf_hub_download("giseldo/model_effort_tawos", "model_tawos_aloy_tfidf_linear.joblib", force_download=False))
     context = titulo + descricao
-    d = {"
-    df = pd.DataFrame(data=d, columns=["
-    vectorizer = load(hf_hub_download("giseldo/model_effort_tawos", "vectorizer_tfidf.joblib"))
-
+    d = {"context_": [context]}
+    df = pd.DataFrame(data=d, columns=["context_"])
+    vectorizer = load(hf_hub_download("giseldo/model_effort_tawos", "vectorizer_tfidf.joblib", force_download=False))
+    X_vec = vectorizer.transform(df["context_"])
+    df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())
+    X = df_vec
    story_points = model.predict(X)
    return story_points
 
-def calcula(titulo, descricao):
-    return calcula_MbR(titulo, descricao), calcula_Median(titulo, descricao), calcula_NEOSP_SVR(titulo, descricao), calcula_NEOSP_Linear(titulo, descricao),
+def calcula(titulo, descricao, nome_projeto):
+    return calcula_MbR(titulo, descricao, nome_projeto), calcula_Median(titulo, descricao, nome_projeto), calcula_NEOSP_SVR(titulo, descricao, nome_projeto), calcula_NEOSP_Linear(titulo, descricao, nome_projeto), calcula_TFIDF_SVR(titulo, descricao, nome_projeto), calcula_TFIDF_Linear(titulo, descricao, nome_projeto)
 
 demo = gr.Interface(fn=calcula,
                     inputs=[gr.Textbox(placeholder="Título", label="Título"),
-                            gr.Textbox(lines=10, placeholder="Descrição", label="Descrição")
+                            gr.Textbox(lines=10, placeholder="Descrição", label="Descrição"),
+                            gr.Dropdown(["ALOY", "XD", "TIMOB"], label="Projeto", value= "ALOY", interactive= False)], # info="Nome do projeto!"
                     outputs=[gr.Textbox(label="Story Points Estimado Média"),
                              gr.Textbox(label="Story Points Estimado Mediana"),
                              gr.Textbox(label="Story Points Estimado NEOSP-SVR"),

@@ -125,6 +172,7 @@ demo = gr.Interface(fn=calcula,
                              gr.Textbox(label="Story Points Estimado TFIDF-SVR"),
                              gr.Textbox(label="Story Points Estimado TFIDF-Linear")],
                     title="Agile Task Story Point Estimator",
+                    #interpretation="default",
                     examples=[[titulo1, descricao1], [titulo2, descricao2], [titulo3, descricao3], [titulo4, descricao4], [titulo5, descricao5]])
 
 demo.launch()
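For reference, a minimal sketch of how the new calcula_NEOSP_* functions assemble the 11 readability and sentiment features that are passed to model.predict. It assumes pandas, textstat and textblob are installed; the sample text is shortened and the downloaded joblib model is left out.

# Sketch only: mirrors the feature extraction added to app.py in this commit.
import pandas as pd
import textstat
from textblob import TextBlob

titulo = "Resolve suboptimal compression from uglify-js v2 update"
descricao = "The v2 update of uglify-js in Alloy has some suboptimal compressions..."
df = pd.DataFrame({"context_": [titulo + descricao]})

# readability features ("features de legibilidade")
readability = {
    "gunning_fog_": textstat.gunning_fog,
    "flesch_reading_ease_": textstat.flesch_reading_ease,
    "flesch_kincaid_grade_": textstat.flesch_kincaid_grade,
    "smog_index_": textstat.smog_index,
    "coleman_liau_index_": textstat.coleman_liau_index,
    "automated_readability_index_": textstat.automated_readability_index,
    "dale_chall_readability_score_": textstat.dale_chall_readability_score,
    "difficult_words_": textstat.difficult_words,
    "linsear_write_formula_": textstat.linsear_write_formula,
}
for name, fn in readability.items():
    df[name] = df["context_"].apply(fn)

# sentiment features ("feature de sentimento")
df["polarity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.polarity)
df["subjectivity_"] = df["context_"].apply(lambda x: TextBlob(x).sentiment.subjectivity)

X = df[list(readability) + ["polarity_", "subjectivity_"]]
print(X.iloc[0])  # the 11-dimensional feature row fed to the SVR / Linear models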
flagged/log.csv
ADDED

@@ -0,0 +1,2 @@
+Título,Descrição,Projeto,Story Points Estimado Média,Story Points Estimado Mediana,Story Points Estimado NEOSP-SVR,Story Points Estimado NEOSP-Linear,Story Points Estimado TFIDF-SVR,Story Points Estimado TFIDF-Linear,flag,username,timestamp
+"CLONE - Studio Dashboard: ""default"" and ""Default Project"" does not give clear information about Alloy and Project unless description is read.","Steps To Reproduce: 1. On dashboard on studio 3.0, navigate to Develop tab. 2. Notice ""default"" and ""Default Project"" & ""two-tabbed"" and ""Tabbed Application"" names. Actual: User does not get clear information from names that one is alloy project and another one is Titanium project unless he reads the description below. Expected: Naming convention or icon corresponding must suggest type",ALOY,[3.70539419],[3.],[3.15496615],[3.84946518],[3.75963544],[6.99007244],,,2023-08-31 23:02:15.199700
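The flagging log above follows the default CSV layout written by the Gradio app. A minimal sketch for inspecting it offline, assuming only pandas and the flagged/log.csv path from this commit:

# Sketch only: load the flagged predictions for review.
import pandas as pd

log = pd.read_csv("flagged/log.csv")
print(log[["Projeto", "Story Points Estimado Média", "timestamp"]])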
requirements.txt
CHANGED

@@ -2,3 +2,4 @@ scikit-learn
 gradio
 textblob
 textstat
+nltk
util.py
ADDED

@@ -0,0 +1,60 @@
+import re
+from string import punctuation
+
+def escape_tags_and_content(text):
+    """Escape tags and their content containing text, which is not written in natural language, such as code snippets"""
+
+    NO_TEXT_TAGS = "code", "noformat"
+    for tag in NO_TEXT_TAGS:
+        regex_matching_tag = re.compile("\{%s(.*?)\}(.*?)\{%s\}" % (tag, tag), re.DOTALL)
+        text = re.sub(regex_matching_tag, "", text)
+
+    return text
+
+def escape_tags(text):
+    """Escape markup tags, but retain their content"""
+
+    ESCAPE_TAGS = "color", "quote", "anchor", "panel"
+    for tag in ESCAPE_TAGS:
+        text = re.sub("\{%s(.*?)\}" % tag, "", text)
+
+    return text
+
+def escape_strings(text):
+    """Escape line breaks, tabulators, slashes and JIRA heading markup symbols"""
+
+    ESCAPE_STRINGS = "\\r", "\\n", "\\t", "\\f", "\\v", "\"", "\\\\", "h1. ", "h2. ", "h3. ", "h4. ", "h5. ", "h6. "
+    for escape_string in ESCAPE_STRINGS:
+        text = text.replace(escape_string, " ")
+
+    return text
+
+def escape_links(text):
+    """Escape external and internal links, recognized by JIRA markup or leading 'http://' or 'https://' """
+
+    LINK_STARTERS = r"\#", r"\^", r"http\:\/\/", r"https\:\/\/", r"malto\:", r"file\:", r"\~"
+    for link_starter in LINK_STARTERS:
+        text = re.sub("\[(.*?\\|)?%s(.*?)\]" % link_starter, "", text)
+    text = re.sub(r"\bhttps?://\S+", "", text)
+
+    return text
+
+def escape_hex_character_codes(text):
+    """Escape characters outside the latin alphabet which are converted to hex code representation"""
+
+    return re.sub(r"\\x\w\w", "", text)
+
+def escape_punctuation_boundaries(text):
+    """Remove all punctuation marks from the beginning and end of words,
+    except for trailing period at the end of words"""
+
+    return " ".join([word.strip(punctuation.replace(".", "")).lstrip(".") for word in text.split()])
+
+def escape_odd_spaces(text):
+    """Replace several consequent spaces with one space
+    and remove spaces from string start and end"""
+
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+
+    return text
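A minimal usage sketch for these helpers, applying them in the same order that app.py does before feature extraction; the sample JIRA-style string and the commented result are illustrative only and assume util.py is on the import path.

# Sketch only: run an issue text through the preprocessing chain from util.py.
from util import (escape_tags_and_content, escape_tags, escape_strings, escape_links,
                  escape_hex_character_codes, escape_punctuation_boundaries, escape_odd_spaces)

text = 'h1. Crash {code}alloy compile{code} fails, see [TIMOB|http://example.com/TIMOB-1234]   !!'
for step in (escape_tags_and_content, escape_tags, escape_strings, escape_links,
             escape_hex_character_codes, escape_punctuation_boundaries, escape_odd_spaces):
    text = step(text)
print(text)  # -> "Crash fails see": heading markup, code block, link and stray punctuation removed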