|
|
|
|
|
import spacy |
|
from collections import Counter |
|
from spacy import displacy |
|
import re |
|
from streamlit.components.v1 import html |
|
import base64 |
|
|
|
from collections import Counter |
|
import re |
|
from ..utils.widget_utils import generate_unique_key |
|
|
|
import logging |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
# Highlight colour (hex string) for each coarse part-of-speech tag.
# Looked up with a token's ``pos_`` value; tags missing from this table
# are given a fallback colour by the code that performs the lookup.
POS_COLORS = {
    "ADJ": "#FFA07A",
    "ADP": "#98FB98",
    "ADV": "#87CEFA",
    "AUX": "#DDA0DD",
    "CCONJ": "#F0E68C",
    "DET": "#FFB6C1",
    "INTJ": "#FF6347",
    "NOUN": "#90EE90",
    "NUM": "#FAFAD2",
    "PART": "#D3D3D3",
    "PRON": "#FFA500",
    "PROPN": "#20B2AA",
    "SCONJ": "#DEB887",
    "SYM": "#7B68EE",
    "VERB": "#FF69B4",
    "X": "#A9A9A9",
}
|
|
|
# Human-readable names of the part-of-speech tags, per interface language.
# Outer key: language code ("es", "en", "fr"); inner key: POS tag.
POS_TRANSLATIONS = {
    # Spanish
    "es": {
        "ADJ": "Adjetivo",
        "ADP": "Preposición",
        "ADV": "Adverbio",
        "AUX": "Auxiliar",
        "CCONJ": "Conjunción Coordinante",
        "DET": "Determinante",
        "INTJ": "Interjección",
        "NOUN": "Sustantivo",
        "NUM": "Número",
        "PART": "Partícula",
        "PRON": "Pronombre",
        "PROPN": "Nombre Propio",
        "SCONJ": "Conjunción Subordinante",
        "SYM": "Símbolo",
        "VERB": "Verbo",
        "X": "Otro",
    },
    # English
    "en": {
        "ADJ": "Adjective",
        "ADP": "Preposition",
        "ADV": "Adverb",
        "AUX": "Auxiliary",
        "CCONJ": "Coordinating Conjunction",
        "DET": "Determiner",
        "INTJ": "Interjection",
        "NOUN": "Noun",
        "NUM": "Number",
        "PART": "Particle",
        "PRON": "Pronoun",
        "PROPN": "Proper Noun",
        "SCONJ": "Subordinating Conjunction",
        "SYM": "Symbol",
        "VERB": "Verb",
        "X": "Other",
    },
    # French
    "fr": {
        "ADJ": "Adjectif",
        "ADP": "Préposition",
        "ADV": "Adverbe",
        "AUX": "Auxiliaire",
        "CCONJ": "Conjonction de Coordination",
        "DET": "Déterminant",
        "INTJ": "Interjection",
        "NOUN": "Nom",
        "NUM": "Nombre",
        "PART": "Particule",
        "PRON": "Pronom",
        "PROPN": "Nom Propre",
        "SCONJ": "Conjonction de Subordination",
        "SYM": "Symbole",
        "VERB": "Verbe",
        "X": "Autre",
    },
}
|
|
|
|
|
def get_repeated_words_colors(doc):
    """
    Map every repeated word of ``doc`` to a highlight colour.

    A "word" is the lower-cased ``token.text``. It is considered repeated
    when it occurs more than once among the tokens whose ``pos_`` is not
    ``'PUNCT'``.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_``.

    Returns:
        dict: lower-cased word -> colour taken from ``POS_COLORS`` for the
        POS of the word's last non-punctuation occurrence, or ``'#FFFFFF'``
        when that POS has no entry in ``POS_COLORS``.
    """
    occurrences = Counter()
    last_pos = {}
    for token in doc:
        # Punctuation is excluded from both the counting and the colouring,
        # so a punctuation-tagged occurrence can never replace a word's
        # colour with the fallback.
        if token.pos_ == 'PUNCT':
            continue
        word = token.text.lower()
        occurrences[word] += 1
        last_pos[word] = token.pos_

    return {
        word: POS_COLORS.get(last_pos[word], '#FFFFFF')
        for word, times in occurrences.items()
        if times > 1
    }
|
|
|
|
|
def highlight_repeated_words(doc, word_colors):
    """
    Build an HTML string of ``doc`` with selected words highlighted.

    Each token whose lower-cased text is a key of ``word_colors`` is wrapped
    in a ``<span>`` with that colour as background; every other token is
    emitted as plain text. Token text is HTML-escaped in both cases so that
    characters such as ``<`` or ``&`` in the analysed text cannot inject
    markup into the result.

    Args:
        doc: Iterable of tokens exposing ``text``.
        word_colors: Mapping of lower-cased word -> CSS colour value.

    Returns:
        str: The rendered tokens joined by single spaces.
    """
    # Imported locally: at module level the name ``html`` is bound to the
    # Streamlit component helper, not to the standard-library module.
    from html import escape

    pieces = []
    for token in doc:
        safe_text = escape(token.text)
        word = token.text.lower()
        if word in word_colors:
            color = word_colors[word]
            pieces.append(f'<span style="background-color: {color};">{safe_text}</span>')
        else:
            pieces.append(safe_text)
    return ' '.join(pieces)
|
|
|
|
|
|
|
def generate_arc_diagram(doc):
    """
    Render one displaCy dependency ("arc") diagram per sentence of ``doc``.

    Args:
        doc: Document processed by spaCy; its ``sents`` are iterated.

    Returns:
        list | None: The markup strings returned by ``displacy.render``, one
        per sentence that rendered successfully, in sentence order. A
        sentence whose rendering raises is logged and skipped. ``None`` is
        returned when an error occurs outside the rendering of a single
        sentence (for example while iterating ``doc.sents``).
    """
    options = {
        "compact": False,
        "color": "#ffffff",
        "bg": "#0d6efd",
        "font": "Arial",
        "offset_x": 50,
        "distance": 100,
        "arrow_spacing": 12,
        "arrow_width": 2,
        "arrow_stroke": 2,
        "word_spacing": 25,
        # NOTE(review): "maxZoom" does not appear among displaCy's documented
        # dependency-visualizer options; presumably ignored - confirm.
        "maxZoom": 2
    }

    arc_diagrams = []
    try:
        for sent in doc.sents:
            try:
                # jupyter=False makes render() always return the markup
                # instead of displaying it when a notebook is detected.
                markup = displacy.render(
                    sent, style="dep", jupyter=False, options=options
                )
            except Exception as e:
                # Best effort: one failing sentence must not discard the rest.
                logger.error("Error al renderizar oración: %s", e)
            else:
                arc_diagrams.append(markup)
    except Exception as e:
        logger.error("Error general en generate_arc_diagram: %s", e)
        return None

    return arc_diagrams
|
|
|
|
|
|
|
def get_detailed_pos_analysis(doc):
    """
    Summarise how often each part-of-speech (POS) tag occurs in ``doc``.

    Args:
        doc: Sized iterable of tokens exposing ``pos_`` and ``text``.

    Returns:
        list[dict]: One entry per POS tag found, ordered by descending count
        (tags with equal counts keep their order of first appearance). Each
        entry has the keys:
            'pos'        - the POS tag,
            'count'      - number of tokens carrying the tag,
            'percentage' - share of all tokens, in percent, rounded to 2 decimals,
            'examples'   - texts of the first (up to) five tokens with the tag.
        An empty document yields an empty list.
    """
    total_tokens = len(doc)

    # Single pass over the document: count each tag and collect its first
    # five example texts, instead of rescanning every token once per tag.
    pos_counts = Counter()
    examples = {}
    for token in doc:
        pos = token.pos_
        pos_counts[pos] += 1
        samples = examples.setdefault(pos, [])
        if len(samples) < 5:
            samples.append(token.text)

    pos_analysis = [
        {
            'pos': pos,
            'count': count,
            'percentage': round((count / total_tokens) * 100, 2),
            'examples': examples[pos],
        }
        for pos, count in pos_counts.items()
    ]
    return sorted(pos_analysis, key=lambda entry: entry['count'], reverse=True)
|
|
|
|
|
def get_morphological_analysis(doc):
    """
    Collect morphological details for the nouns, verbs, adjectives and
    adverbs of ``doc``.

    Args:
        doc: Iterable of tokens exposing the attributes read below.

    Returns:
        list[dict]: One dict per token whose ``pos_`` is NOUN, VERB, ADJ or
        ADV, in document order, holding the token's text, lemma, POS, tag,
        dependency label, shape, alpha/stop flags and the string form of
        its ``morph`` attribute.
    """
    selected_pos = ['NOUN', 'VERB', 'ADJ', 'ADV']
    return [
        {
            'text': tok.text,
            'lemma': tok.lemma_,
            'pos': tok.pos_,
            'tag': tok.tag_,
            'dep': tok.dep_,
            'shape': tok.shape_,
            'is_alpha': tok.is_alpha,
            'is_stop': tok.is_stop,
            'morph': str(tok.morph),
        }
        for tok in doc
        if tok.pos_ in selected_pos
    ]
|
|
|
|
|
def get_sentence_structure_analysis(doc):
    """
    Describe the structure of every sentence in ``doc``.

    Args:
        doc: Object whose ``sents`` yields sentences; each sentence is sized,
            iterable over tokens, and exposes ``text`` and ``root``.

    Returns:
        list[dict]: One dict per sentence, in order, with the sentence text,
        the text and POS of its root token, the token count, the number of
        alphabetic tokens, and the texts of the tokens whose dependency
        label contains "subj", whose label contains "obj", and whose POS is
        "VERB".
    """
    report = []
    for sentence in doc.sents:
        subjects, objects, verbs = [], [], []
        alpha_total = 0

        # Classify each token once; a token may land in several lists.
        for tok in sentence:
            if tok.is_alpha:
                alpha_total += 1
            label = tok.dep_
            if "subj" in label:
                subjects.append(tok.text)
            if "obj" in label:
                objects.append(tok.text)
            if tok.pos_ == "VERB":
                verbs.append(tok.text)

        head = sentence.root
        report.append({
            'text': sentence.text,
            'root': head.text,
            'root_pos': head.pos_,
            'num_tokens': len(sentence),
            'num_words': alpha_total,
            'subjects': subjects,
            'objects': objects,
            'verbs': verbs,
        })
    return report
|
|
|
|
|
def perform_advanced_morphosyntactic_analysis(text, nlp):
    """
    Run the full morphosyntactic analysis of ``text``.

    Args:
        text: Text to analyse; passed to ``nlp`` unchanged.
        nlp: Callable language pipeline exposing a ``lang`` attribute; calling
            it with ``text`` must return the document handed to the helpers.

    Returns:
        dict | None: On success a dict with the keys
            'doc'                    - the document returned by ``nlp(text)``,
            'pos_analysis'           - result of ``get_detailed_pos_analysis``,
            'morphological_analysis' - result of ``get_morphological_analysis``,
            'sentence_structure'     - result of ``get_sentence_structure_analysis``,
            'arc_diagrams'           - result of ``generate_arc_diagram``,
            'repeated_words'         - result of ``get_repeated_words_colors``,
            'highlighted_text'       - result of ``highlight_repeated_words``.
        ``None`` when any step raises; the error is logged.
    """
    try:
        model_lang = nlp.lang
        logger.info("Realizando análisis con modelo de idioma: %s", model_lang)

        doc = nlp(text)

        # Computed once and shared by 'repeated_words' and
        # 'highlighted_text' instead of scanning the document twice.
        word_colors = get_repeated_words_colors(doc)

        return {
            'doc': doc,
            'pos_analysis': get_detailed_pos_analysis(doc),
            'morphological_analysis': get_morphological_analysis(doc),
            'sentence_structure': get_sentence_structure_analysis(doc),
            'arc_diagrams': generate_arc_diagram(doc),
            'repeated_words': word_colors,
            'highlighted_text': highlight_repeated_words(doc, word_colors)
        }
    except Exception as e:
        logger.error("Error en análisis morfosintáctico: %s", e)
        return None
|
|
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = [
    # Entry point
    "perform_advanced_morphosyntactic_analysis",
    # Individual analysis helpers
    "get_repeated_words_colors",
    "highlight_repeated_words",
    "generate_arc_diagram",
    "get_detailed_pos_analysis",
    "get_morphological_analysis",
    "get_sentence_structure_analysis",
    # Lookup tables
    "POS_COLORS",
    "POS_TRANSLATIONS",
]
|
|