|
import spacy |
|
from spacy import displacy |
|
from streamlit.components.v1 import html |
|
import base64 |
|
|
|
from collections import Counter |
|
import re |
|
from ..utils.widget_utils import generate_unique_key |
|
|
|
import logging |
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
# Hex colour assigned to each part-of-speech tag key.
# Insertion order is kept as-is because dict iteration order is observable.
POS_COLORS = {
    'ADJ': '#FFA07A',
    'ADP': '#98FB98',
    'ADV': '#87CEFA',
    'AUX': '#DDA0DD',
    'CCONJ': '#F0E68C',
    'DET': '#FFB6C1',
    'INTJ': '#FF6347',
    'NOUN': '#90EE90',
    'NUM': '#FAFAD2',
    'PART': '#D3D3D3',
    'PRON': '#FFA500',
    'PROPN': '#20B2AA',
    'SCONJ': '#DEB887',
    'SYM': '#7B68EE',
    'VERB': '#FF69B4',
    'X': '#A9A9A9',
}
|
|
|
# Display label for each part-of-speech tag key, grouped by language code
# ('es', 'en', 'fr'). Every language maps the same 16 tag keys.
POS_TRANSLATIONS = {
    # Spanish labels
    'es': {
        'ADJ': 'Adjetivo',
        'ADP': 'Preposición',
        'ADV': 'Adverbio',
        'AUX': 'Auxiliar',
        'CCONJ': 'Conjunción Coordinante',
        'DET': 'Determinante',
        'INTJ': 'Interjección',
        'NOUN': 'Sustantivo',
        'NUM': 'Número',
        'PART': 'Partícula',
        'PRON': 'Pronombre',
        'PROPN': 'Nombre Propio',
        'SCONJ': 'Conjunción Subordinante',
        'SYM': 'Símbolo',
        'VERB': 'Verbo',
        'X': 'Otro',
    },
    # English labels
    'en': {
        'ADJ': 'Adjective',
        'ADP': 'Preposition',
        'ADV': 'Adverb',
        'AUX': 'Auxiliary',
        'CCONJ': 'Coordinating Conjunction',
        'DET': 'Determiner',
        'INTJ': 'Interjection',
        'NOUN': 'Noun',
        'NUM': 'Number',
        'PART': 'Particle',
        'PRON': 'Pronoun',
        'PROPN': 'Proper Noun',
        'SCONJ': 'Subordinating Conjunction',
        'SYM': 'Symbol',
        'VERB': 'Verb',
        'X': 'Other',
    },
    # French labels
    'fr': {
        'ADJ': 'Adjectif',
        'ADP': 'Préposition',
        'ADV': 'Adverbe',
        'AUX': 'Auxiliaire',
        'CCONJ': 'Conjonction de Coordination',
        'DET': 'Déterminant',
        'INTJ': 'Interjection',
        'NOUN': 'Nom',
        'NUM': 'Nombre',
        'PART': 'Particule',
        'PRON': 'Pronom',
        'PROPN': 'Nom Propre',
        'SCONJ': 'Conjonction de Subordination',
        'SYM': 'Symbole',
        'VERB': 'Verbe',
        'X': 'Autre',
    },
}
|
|
|
def generate_arc_diagram(doc):
    """Render one dependency-arc diagram per sentence of a parsed document.

    Each sentence in ``doc.sents`` is drawn with
    ``displacy.render(..., style="dep")``. The returned markup is then patched
    with plain string/regex substitutions: the ``width``/``height`` attributes
    are overwritten, a ``viewBox`` is injected into the ``<svg`` tag, and a
    few ``translate``/``dy``/CSS fragments are rewritten.

    Args:
        doc: Parsed document exposing ``.sents``. Each sentence must support
            ``len()`` and be renderable by ``displacy.render`` (presumably a
            spaCy ``Doc`` whose sentences are ``Span`` objects; confirm
            against callers).

    Returns:
        list: One markup string per sentence, in sentence order. Empty when
        the document has no sentences.
    """
    # The render options are the same for every sentence, so build them once.
    # The original literal listed "compact" twice; a single entry is equivalent.
    render_options = {
        "add_lemma": False,
        "arrow_spacing": 12,
        "arrow_width": 2,
        "arrow_stroke": 2,
        "collapse_punct": True,
        "collapse_phrases": False,
        "compact": False,
        "color": "#ffffff",
        "bg": "#0d6efd",
        "distance": 100,
        "fine_grained": False,
        "offset_x": 55,
        "word_spacing": 25,
    }

    # Fixed canvas height; the width is derived per sentence below.
    svg_height = 350

    # NOTE(review): displaCy appears to emit fractional sizes (its docs show
    # e.g. height="399.5"). A digits-only pattern would leave such attributes
    # unpatched, so an optional fractional part is accepted here.
    width_attr = re.compile(r'width="\d+(?:\.\d+)?"')
    height_attr = re.compile(r'height="\d+(?:\.\d+)?"')
    group_translate = re.compile(r'<g transform="translate\((\d+),(\d+)\)"')

    arc_diagrams = []
    for sent in doc.sents:
        # 120 px per token, never narrower than 600 px.
        svg_width = max(600, len(sent) * 120)

        # Named ``svg_markup`` so the module-level ``html`` import from
        # streamlit is not shadowed inside this function.
        svg_markup = displacy.render(sent, style="dep", options=render_options)

        # Force the computed canvas size and add a matching viewBox.
        svg_markup = width_attr.sub(f'width="{svg_width}"', svg_markup)
        svg_markup = height_attr.sub(f'height="{svg_height}"', svg_markup)
        svg_markup = svg_markup.replace(
            '<svg', f'<svg viewBox="0 0 {svg_width} {svg_height}"'
        )

        # Keep each translated group's x offset but set its y offset to 10.
        svg_markup = group_translate.sub(
            lambda m: f'<g transform="translate({m.group(1)},10)"', svg_markup
        )

        # Rewrite the vertical text offsets wherever these exact fragments occur.
        svg_markup = svg_markup.replace('dy="1em"', 'dy="-1em"')
        svg_markup = svg_markup.replace('dy="0.25em"', 'dy="-3em"')

        # Prepend a font size to the tag CSS rule when the markup contains one.
        svg_markup = svg_markup.replace(
            '.displacy-tag {', '.displacy-tag { font-size: 14px;'
        )

        arc_diagrams.append(svg_markup)

    return arc_diagrams
|
|
|
|
|
|
|
def perform_advanced_morphosyntactic_analysis(text, nlp):
    """Run ``nlp`` over ``text`` and bundle every analysis into one dict.

    Args:
        text: Input handed to ``nlp``.
        nlp: Callable that turns ``text`` into the document object passed to
            each analysis helper (presumably a loaded spaCy pipeline;
            confirm against callers).

    Returns:
        dict: The helper results under the keys ``'pos_analysis'``,
        ``'morphological_analysis'``, ``'sentence_structure'``,
        ``'arc_diagrams'`` and ``'repeated_words'``.

    NOTE(review): ``get_detailed_pos_analysis``, ``get_morphological_analysis``,
    ``get_sentence_structure_analysis`` and ``get_repeated_words_colors`` are
    neither defined nor imported in the visible part of this module. Confirm
    they are in scope, otherwise the first call raises ``NameError``.
    """
    parsed = nlp(text)

    # The helpers run in the same order as the keys of the returned dict.
    pos_analysis = get_detailed_pos_analysis(parsed)
    morphological_analysis = get_morphological_analysis(parsed)
    sentence_structure = get_sentence_structure_analysis(parsed)
    arc_diagrams = generate_arc_diagram(parsed)
    repeated_words = get_repeated_words_colors(parsed)

    return {
        'pos_analysis': pos_analysis,
        'morphological_analysis': morphological_analysis,
        'sentence_structure': sentence_structure,
        'arc_diagrams': arc_diagrams,
        'repeated_words': repeated_words,
    }
|
|
|
# Public API of this module: only the top-level analysis entry point.
__all__ = [
    'perform_advanced_morphosyntactic_analysis',
]