AIdeaText committed on
Commit
fe00a07
·
verified ·
1 Parent(s): 9efeb1f

Update modules/text_analysis/morpho_analysis.py

Browse files
Files changed (1) hide show
  1. modules/text_analysis/morpho_analysis.py +160 -157
modules/text_analysis/morpho_analysis.py CHANGED
@@ -1,158 +1,161 @@
1
- import spacy
2
- from spacy import displacy
3
- from streamlit.components.v1 import html
4
- import base64
5
-
6
- from collections import Counter
7
- import re
8
- from ..utils.widget_utils import generate_unique_key
9
-
10
- import logging
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
# Background colour (hex) assigned to each coarse part-of-speech tag.
# Keys are the tag strings; the trailing comment names the CSS colour.
POS_COLORS = {
    "ADJ": "#FFA07A",    # Light Salmon
    "ADP": "#98FB98",    # Pale Green
    "ADV": "#87CEFA",    # Light Sky Blue
    "AUX": "#DDA0DD",    # Plum
    "CCONJ": "#F0E68C",  # Khaki
    "DET": "#FFB6C1",    # Light Pink
    "INTJ": "#FF6347",   # Tomato
    "NOUN": "#90EE90",   # Light Green
    "NUM": "#FAFAD2",    # Light Goldenrod Yellow
    "PART": "#D3D3D3",   # Light Gray
    "PRON": "#FFA500",   # Orange
    "PROPN": "#20B2AA",  # Light Sea Green
    "SCONJ": "#DEB887",  # Burlywood
    "SYM": "#7B68EE",    # Medium Slate Blue
    "VERB": "#FF69B4",   # Hot Pink
    "X": "#A9A9A9",      # Dark Gray
}
33
-
34
# Human-readable name of each part-of-speech tag, per interface language.
# Outer key: language code ("es", "en", "fr"); inner key: tag string.
POS_TRANSLATIONS = {
    # Spanish
    "es": {
        "ADJ": "Adjetivo",
        "ADP": "Preposición",
        "ADV": "Adverbio",
        "AUX": "Auxiliar",
        "CCONJ": "Conjunción Coordinante",
        "DET": "Determinante",
        "INTJ": "Interjección",
        "NOUN": "Sustantivo",
        "NUM": "Número",
        "PART": "Partícula",
        "PRON": "Pronombre",
        "PROPN": "Nombre Propio",
        "SCONJ": "Conjunción Subordinante",
        "SYM": "Símbolo",
        "VERB": "Verbo",
        "X": "Otro",
    },
    # English
    "en": {
        "ADJ": "Adjective",
        "ADP": "Preposition",
        "ADV": "Adverb",
        "AUX": "Auxiliary",
        "CCONJ": "Coordinating Conjunction",
        "DET": "Determiner",
        "INTJ": "Interjection",
        "NOUN": "Noun",
        "NUM": "Number",
        "PART": "Particle",
        "PRON": "Pronoun",
        "PROPN": "Proper Noun",
        "SCONJ": "Subordinating Conjunction",
        "SYM": "Symbol",
        "VERB": "Verb",
        "X": "Other",
    },
    # French
    "fr": {
        "ADJ": "Adjectif",
        "ADP": "Préposition",
        "ADV": "Adverbe",
        "AUX": "Auxiliaire",
        "CCONJ": "Conjonction de Coordination",
        "DET": "Déterminant",
        "INTJ": "Interjection",
        "NOUN": "Nom",
        "NUM": "Nombre",
        "PART": "Particule",
        "PRON": "Pronom",
        "PROPN": "Nom Propre",
        "SCONJ": "Conjonction de Subordination",
        "SYM": "Symbole",
        "VERB": "Verbe",
        "X": "Autre",
    },
}
90
-
91
def _fit_arc_svg(svg, svg_width, svg_height):
    """Resize one displaCy SVG string and apply the label/offset patches.

    Every step is a plain text substitution on the markup, so a step only
    has an effect when the rendered SVG contains the exact pattern it
    targets; otherwise the markup passes through unchanged.

    Args:
        svg: SVG markup string returned by ``displacy.render``.
        svg_width: Width (px) written into every ``width="<digits>"``.
        svg_height: Height (px) written into every ``height="<digits>"``.

    Returns:
        The patched SVG markup string.
    """
    # Force the requested size and add a viewBox of the same dimensions.
    svg = re.sub(r'width="(\d+)"', f'width="{svg_width}"', svg)
    svg = re.sub(r'height="(\d+)"', f'height="{svg_height}"', svg)
    svg = svg.replace('<svg', f'<svg viewBox="0 0 {svg_width} {svg_height}"')

    # Pull translated groups up to y=10 to remove empty space at the top;
    # the x offset of each group is kept.
    svg = re.sub(
        r'<g transform="translate\((\d+),(\d+)\)"',
        lambda m: f'<g transform="translate({m.group(1)},10)"',
        svg,
    )

    # Shift word labels and POS-tag labels, and enlarge the POS-tag font.
    svg = svg.replace('dy="1em"', 'dy="-1em"')
    svg = svg.replace('dy="0.25em"', 'dy="-3em"')
    svg = svg.replace('.displacy-tag {', '.displacy-tag { font-size: 14px;')
    return svg


def generate_arc_diagram(doc):
    """Render one dependency-arc diagram (SVG markup) per sentence of *doc*.

    Args:
        doc: Parsed document exposing ``sents``; each sentence is rendered
            with ``displacy.render(..., style="dep")``.

    Returns:
        list[str]: One patched SVG string per sentence, in sentence order.
        Empty when the document has no sentences.
    """
    # Built once: the options do not depend on the sentence being rendered.
    # ("compact" used to be listed twice; only one entry is kept.)
    render_options = {
        "add_lemma": False,         # no extra row of lemmas under the tokens
        "arrow_spacing": 12,        # px between arrows, to avoid overlaps
        "arrow_width": 2,           # px width of the arrow head
        "arrow_stroke": 2,          # px width of the arrow path
        "collapse_punct": True,     # attach punctuation to the tokens
        "collapse_phrases": False,  # do not merge noun phrases into one token
        "compact": False,           # regular arcs, not square "compact mode"
        "color": "#ffffff",
        "bg": "#0d6efd",
        "distance": 100,            # larger distance between words
        "fine_grained": False,      # coarse tags (pos_), not fine ones (tag_)
        "offset_x": 55,             # px of spacing on the left of the SVG
        "word_spacing": 25,         # px of vertical space between words and arcs
    }
    svg_height = 350  # fixed height for every sentence

    arc_diagrams = []
    for sent in doc.sents:
        # Width grows with the sentence: 120 px per token, at least 600 px.
        svg_width = max(600, len(sent) * 120)

        # jupyter=False makes displaCy return the markup even when a notebook
        # is detected; in notebook mode it displays the figure and returns
        # None, which would break the text patches below.
        # The result is named `svg` so it does not shadow the module-level
        # `html` imported from streamlit.components.v1.
        svg = displacy.render(
            sent, style="dep", jupyter=False, options=render_options
        )
        arc_diagrams.append(_fit_arc_svg(svg, svg_width, svg_height))
    return arc_diagrams
148
- ##################################################################################################################################
149
-
150
-
151
def perform_advanced_morphosyntactic_analysis(text, nlp):
    """Parse *text* with *nlp* and collect its dependency-arc diagrams.

    Args:
        text: Text to analyse.
        nlp: Callable that turns ``text`` into a parsed document.

    Returns:
        dict: ``{'arc_diagrams': ...}`` holding the value returned by
        ``generate_arc_diagram`` for the parsed document.
    """
    parsed = nlp(text)
    return {'arc_diagrams': generate_arc_diagram(parsed)}
157
-
 
 
 
158
  __all__ = ['perform_advanced_morphosyntactic_analysis']
 
1
+ import spacy
2
+ from spacy import displacy
3
+ from streamlit.components.v1 import html
4
+ import base64
5
+
6
+ from collections import Counter
7
+ import re
8
+ from ..utils.widget_utils import generate_unique_key
9
+
10
+ import logging
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
# Background colour (hex) assigned to each coarse part-of-speech tag.
# Keys are the tag strings; the trailing comment names the CSS colour.
POS_COLORS = {
    "ADJ": "#FFA07A",    # Light Salmon
    "ADP": "#98FB98",    # Pale Green
    "ADV": "#87CEFA",    # Light Sky Blue
    "AUX": "#DDA0DD",    # Plum
    "CCONJ": "#F0E68C",  # Khaki
    "DET": "#FFB6C1",    # Light Pink
    "INTJ": "#FF6347",   # Tomato
    "NOUN": "#90EE90",   # Light Green
    "NUM": "#FAFAD2",    # Light Goldenrod Yellow
    "PART": "#D3D3D3",   # Light Gray
    "PRON": "#FFA500",   # Orange
    "PROPN": "#20B2AA",  # Light Sea Green
    "SCONJ": "#DEB887",  # Burlywood
    "SYM": "#7B68EE",    # Medium Slate Blue
    "VERB": "#FF69B4",   # Hot Pink
    "X": "#A9A9A9",      # Dark Gray
}
33
+
34
# Human-readable name of each part-of-speech tag, per interface language.
# Outer key: language code ("es", "en", "fr"); inner key: tag string.
POS_TRANSLATIONS = {
    # Spanish
    "es": {
        "ADJ": "Adjetivo",
        "ADP": "Preposición",
        "ADV": "Adverbio",
        "AUX": "Auxiliar",
        "CCONJ": "Conjunción Coordinante",
        "DET": "Determinante",
        "INTJ": "Interjección",
        "NOUN": "Sustantivo",
        "NUM": "Número",
        "PART": "Partícula",
        "PRON": "Pronombre",
        "PROPN": "Nombre Propio",
        "SCONJ": "Conjunción Subordinante",
        "SYM": "Símbolo",
        "VERB": "Verbo",
        "X": "Otro",
    },
    # English
    "en": {
        "ADJ": "Adjective",
        "ADP": "Preposition",
        "ADV": "Adverb",
        "AUX": "Auxiliary",
        "CCONJ": "Coordinating Conjunction",
        "DET": "Determiner",
        "INTJ": "Interjection",
        "NOUN": "Noun",
        "NUM": "Number",
        "PART": "Particle",
        "PRON": "Pronoun",
        "PROPN": "Proper Noun",
        "SCONJ": "Subordinating Conjunction",
        "SYM": "Symbol",
        "VERB": "Verb",
        "X": "Other",
    },
    # French
    "fr": {
        "ADJ": "Adjectif",
        "ADP": "Préposition",
        "ADV": "Adverbe",
        "AUX": "Auxiliaire",
        "CCONJ": "Conjonction de Coordination",
        "DET": "Déterminant",
        "INTJ": "Interjection",
        "NOUN": "Nom",
        "NUM": "Nombre",
        "PART": "Particule",
        "PRON": "Pronom",
        "PROPN": "Nom Propre",
        "SCONJ": "Conjonction de Subordination",
        "SYM": "Symbole",
        "VERB": "Verbe",
        "X": "Autre",
    },
}
90
+
91
def _fit_arc_svg(svg, svg_width, svg_height):
    """Resize one displaCy SVG string and apply the label/offset patches.

    Every step is a plain text substitution on the markup, so a step only
    has an effect when the rendered SVG contains the exact pattern it
    targets; otherwise the markup passes through unchanged.

    Args:
        svg: SVG markup string returned by ``displacy.render``.
        svg_width: Width (px) written into every ``width="<digits>"``.
        svg_height: Height (px) written into every ``height="<digits>"``.

    Returns:
        The patched SVG markup string.
    """
    # Force the requested size and add a viewBox of the same dimensions.
    svg = re.sub(r'width="(\d+)"', f'width="{svg_width}"', svg)
    svg = re.sub(r'height="(\d+)"', f'height="{svg_height}"', svg)
    svg = svg.replace('<svg', f'<svg viewBox="0 0 {svg_width} {svg_height}"')

    # Pull translated groups up to y=10 to remove empty space at the top;
    # the x offset of each group is kept.
    svg = re.sub(
        r'<g transform="translate\((\d+),(\d+)\)"',
        lambda m: f'<g transform="translate({m.group(1)},10)"',
        svg,
    )

    # Shift word labels and POS-tag labels, and enlarge the POS-tag font.
    svg = svg.replace('dy="1em"', 'dy="-1em"')
    svg = svg.replace('dy="0.25em"', 'dy="-3em"')
    svg = svg.replace('.displacy-tag {', '.displacy-tag { font-size: 14px;')
    return svg


def generate_arc_diagram(doc):
    """Render one dependency-arc diagram (SVG markup) per sentence of *doc*.

    Args:
        doc: Parsed document exposing ``sents``; each sentence is rendered
            with ``displacy.render(..., style="dep")``.

    Returns:
        list[str]: One patched SVG string per sentence, in sentence order.
        Empty when the document has no sentences.
    """
    # Built once: the options do not depend on the sentence being rendered.
    # ("compact" used to be listed twice; only one entry is kept.)
    render_options = {
        "add_lemma": False,         # no extra row of lemmas under the tokens
        "arrow_spacing": 12,        # px between arrows, to avoid overlaps
        "arrow_width": 2,           # px width of the arrow head
        "arrow_stroke": 2,          # px width of the arrow path
        "collapse_punct": True,     # attach punctuation to the tokens
        "collapse_phrases": False,  # do not merge noun phrases into one token
        "compact": False,           # regular arcs, not square "compact mode"
        "color": "#ffffff",
        "bg": "#0d6efd",
        "distance": 100,            # larger distance between words
        "fine_grained": False,      # coarse tags (pos_), not fine ones (tag_)
        "offset_x": 55,             # px of spacing on the left of the SVG
        "word_spacing": 25,         # px of vertical space between words and arcs
    }
    svg_height = 350  # fixed height for every sentence

    arc_diagrams = []
    for sent in doc.sents:
        # Width grows with the sentence: 120 px per token, at least 600 px.
        svg_width = max(600, len(sent) * 120)

        # jupyter=False makes displaCy return the markup even when a notebook
        # is detected; in notebook mode it displays the figure and returns
        # None, which would break the text patches below.
        # The result is named `svg` so it does not shadow the module-level
        # `html` imported from streamlit.components.v1.
        svg = displacy.render(
            sent, style="dep", jupyter=False, options=render_options
        )
        arc_diagrams.append(_fit_arc_svg(svg, svg_width, svg_height))
    return arc_diagrams
148
+ ##################################################################################################################################
149
+
150
+
151
# NOTE(review): perform_advanced_morphosyntactic_analysis() called the four
# get_* helpers below, but none of them was defined or imported anywhere in
# this module, so every call raised NameError. They are supplied here so the
# entry point works; the exact result shapes are a proposal and should be
# confirmed against the UI code that consumes them.

def get_detailed_pos_analysis(doc):
    """Summarise how often each part-of-speech tag occurs in *doc*.

    Args:
        doc: Iterable of tokens exposing ``pos_`` and ``text``.

    Returns:
        list[dict]: One entry per tag, most frequent first, with keys
        ``'pos'``, ``'count'``, ``'percentage'`` (of all tokens, rounded to
        two decimals) and ``'examples'`` (up to five token texts, in
        document order). Empty when *doc* has no tokens.
    """
    counts = Counter(token.pos_ for token in doc)
    total = sum(counts.values())

    examples = {}
    for token in doc:
        bucket = examples.setdefault(token.pos_, [])
        if len(bucket) < 5:
            bucket.append(token.text)

    # The division only runs when at least one tag was counted, so an empty
    # document cannot trigger a ZeroDivisionError.
    return [
        {
            'pos': pos,
            'count': count,
            'percentage': round(count / total * 100, 2),
            'examples': examples[pos],
        }
        for pos, count in counts.most_common()
    ]


def get_morphological_analysis(doc):
    """Collect per-token details for nouns, verbs, adjectives and adverbs.

    Args:
        doc: Iterable of tokens exposing ``text``, ``lemma_``, ``pos_``,
            ``tag_``, ``dep_``, ``shape_``, ``is_alpha`` and ``is_stop``.

    Returns:
        list[dict]: One entry per matching token, in document order.
    """
    content_tags = {'NOUN', 'VERB', 'ADJ', 'ADV'}
    return [
        {
            'text': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'dep': token.dep_,
            'shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
            # `morph` is read defensively in case the token type lacks it.
            'morph': str(getattr(token, 'morph', '')),
        }
        for token in doc
        if token.pos_ in content_tags
    ]


def get_sentence_structure_analysis(doc):
    """Describe the basic structure of every sentence in *doc*.

    Args:
        doc: Parsed document exposing ``sents``; each sentence exposes
            ``text``, ``root`` and iterates over its tokens.

    Returns:
        list[dict]: One entry per sentence with its text, root token and
        root tag, token and alphabetic-word counts, and the texts of the
        tokens whose dependency label contains ``subj`` / ``obj`` or whose
        tag is ``VERB``.
    """
    return [
        {
            'text': sent.text,
            'root': sent.root.text,
            'root_pos': sent.root.pos_,
            'num_tokens': len(sent),
            'num_words': sum(1 for token in sent if token.is_alpha),
            'subjects': [token.text for token in sent if 'subj' in token.dep_],
            'objects': [token.text for token in sent if 'obj' in token.dep_],
            'verbs': [token.text for token in sent if token.pos_ == 'VERB'],
        }
        for sent in doc.sents
    ]


def get_repeated_words_colors(doc):
    """Map every repeated word of *doc* to the colour of its POS tag.

    Words are compared case-insensitively and punctuation tokens are
    ignored. The colour comes from ``POS_COLORS`` using the tag of the
    word's first occurrence; tags missing from the table get ``'#FFFFFF'``.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_``.

    Returns:
        dict[str, str]: Lower-cased word -> hex colour, for words that occur
        more than once.
    """
    counts = Counter(
        token.text.lower() for token in doc if token.pos_ != 'PUNCT'
    )
    colors = {}
    for token in doc:
        word = token.text.lower()
        # Punctuation was never counted, so counts[word] is 0 for it.
        if counts[word] > 1 and word not in colors:
            colors[word] = POS_COLORS.get(token.pos_, '#FFFFFF')
    return colors


def perform_advanced_morphosyntactic_analysis(text, nlp):
    """Run the full morphosyntactic analysis of *text*.

    Args:
        text: Text to analyse.
        nlp: Callable that turns ``text`` into a parsed document.

    Returns:
        dict: Keys ``'pos_analysis'``, ``'morphological_analysis'``,
        ``'sentence_structure'``, ``'arc_diagrams'`` and
        ``'repeated_words'``, each holding the result of the corresponding
        helper applied to the parsed document.
    """
    doc = nlp(text)
    return {
        'pos_analysis': get_detailed_pos_analysis(doc),
        'morphological_analysis': get_morphological_analysis(doc),
        'sentence_structure': get_sentence_structure_analysis(doc),
        'arc_diagrams': generate_arc_diagram(doc),
        'repeated_words': get_repeated_words_colors(doc),
    }
160
+
161
  __all__ = ['perform_advanced_morphosyntactic_analysis']