# semantic_analysis.py
# Stdlib imports first, then third-party (PEP 8 grouping).
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import spacy
import streamlit as st
# NOTE: no global nlp model is loaded here — callers pass a loaded pipeline in.
# Display colors for universal POS tags (hex), used to color graph nodes
# and legend entries. Tags without an entry fall back to '#CCCCCC' at the
# call sites via POS_COLORS.get(tag, '#CCCCCC').
POS_COLORS = {
    'ADJ': '#FFA07A',    # Light Salmon
    'ADP': '#98FB98',    # Pale Green
    'ADV': '#87CEFA',    # Light Sky Blue
    'AUX': '#DDA0DD',    # Plum
    'CCONJ': '#F0E68C',  # Khaki
    'DET': '#FFB6C1',    # Light Pink
    'INTJ': '#FF6347',   # Tomato
    'NOUN': '#90EE90',   # Light Green
    'NUM': '#FAFAD2',    # Light Goldenrod Yellow
    'PART': '#D3D3D3',   # Light Gray
    'PRON': '#FFA500',   # Orange
    'PROPN': '#20B2AA',  # Light Sea Green
    'SCONJ': '#DEB887',  # Burlywood
    'SYM': '#7B68EE',    # Medium Slate Blue
    'VERB': '#FF69B4',   # Hot Pink
    'X': '#A9A9A9',      # Dark Gray
}
# Human-readable names for universal POS tags, keyed by language code
# ('es', 'en', 'fr'). Used to build legend labels; unknown tags fall back
# to the raw tag via POS_TRANSLATIONS[lang].get(tag, tag) at call sites.
POS_TRANSLATIONS = {
    'es': {
        'ADJ': 'Adjetivo',
        'ADP': 'Adposición',
        'ADV': 'Adverbio',
        'AUX': 'Auxiliar',
        'CCONJ': 'Conjunción Coordinante',
        'DET': 'Determinante',
        'INTJ': 'Interjección',
        'NOUN': 'Sustantivo',
        'NUM': 'Número',
        'PART': 'Partícula',
        'PRON': 'Pronombre',
        'PROPN': 'Nombre Propio',
        'SCONJ': 'Conjunción Subordinante',
        'SYM': 'Símbolo',
        'VERB': 'Verbo',
        'X': 'Otro',
    },
    'en': {
        'ADJ': 'Adjective',
        'ADP': 'Adposition',
        'ADV': 'Adverb',
        'AUX': 'Auxiliary',
        'CCONJ': 'Coordinating Conjunction',
        'DET': 'Determiner',
        'INTJ': 'Interjection',
        'NOUN': 'Noun',
        'NUM': 'Number',
        'PART': 'Particle',
        'PRON': 'Pronoun',
        'PROPN': 'Proper Noun',
        'SCONJ': 'Subordinating Conjunction',
        'SYM': 'Symbol',
        'VERB': 'Verb',
        'X': 'Other',
    },
    'fr': {
        'ADJ': 'Adjectif',
        'ADP': 'Adposition',
        'ADV': 'Adverbe',
        'AUX': 'Auxiliaire',
        'CCONJ': 'Conjonction de Coordination',
        'DET': 'Déterminant',
        'INTJ': 'Interjection',
        'NOUN': 'Nom',
        'NUM': 'Nombre',
        'PART': 'Particule',
        'PRON': 'Pronom',
        'PROPN': 'Nom Propre',
        'SCONJ': 'Conjonction de Subordination',
        'SYM': 'Symbole',
        'VERB': 'Verbe',
        'X': 'Autre',
    }
}
########################################################################################################################################
# Entity labels and node colors for each supported language
# Entity-category display names (localized) mapped to node colors,
# keyed by language code ('es', 'en', 'fr').
ENTITY_LABELS = {
    'es': {
        "Personas": "lightblue",
        "Conceptos": "lightgreen",
        "Lugares": "lightcoral",
        "Fechas": "lightyellow"
    },
    'en': {
        "People": "lightblue",
        "Concepts": "lightgreen",
        "Places": "lightcoral",
        "Dates": "lightyellow"
    },
    'fr': {
        "Personnes": "lightblue",
        "Concepts": "lightgreen",
        "Lieux": "lightcoral",
        "Dates": "lightyellow"
    }
}
#########################################################################################################
def count_pos(doc):
    """Count part-of-speech tags in *doc*, excluding punctuation.

    Args:
        doc: an iterable of spaCy-like tokens exposing a ``pos_`` attribute.

    Returns:
        Counter mapping POS tag (e.g. 'NOUN') to its frequency.
    """
    return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')
#####################################################################################################################
def create_semantic_graph(doc, lang):
    """Build an undirected graph of the 20 most frequent noun/verb lemmas in *doc*.

    Nodes are the surface word forms chosen for each lemma (annotated with a
    ``pos`` attribute) and edges connect words whose tokens are linked by a
    dependency relation, labelled with the dependency tag.

    Args:
        doc: a processed spaCy Doc.
        lang: language code — currently unused here, kept for interface parity
              with the other analysis functions.

    Returns:
        Tuple ``(G, word_freq)`` where ``G`` is an ``nx.Graph`` and
        ``word_freq`` maps each noun/verb lemma to its frequency.
    """
    G = nx.Graph()
    word_freq = defaultdict(int)
    lemma_to_word = {}
    lemma_to_pos = {}
    # Count lemma frequencies; prefer the surface form that equals the lemma,
    # otherwise keep the first form seen.
    for token in doc:
        if token.pos_ in ['NOUN', 'VERB']:
            lemma = token.lemma_.lower()
            word_freq[lemma] += 1
            if lemma not in lemma_to_word or token.text.lower() == lemma:
                lemma_to_word[lemma] = token.text
                lemma_to_pos[lemma] = token.pos_
    # Top 20 most frequent lemmas (set gives O(1) membership in the edge pass).
    top_lemmas = [lemma for lemma, _ in sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]]
    top_lemma_set = set(top_lemmas)
    # Add one node per top lemma, keyed by its chosen surface form.
    for lemma in top_lemmas:
        word = lemma_to_word[lemma]
        G.add_node(word, pos=lemma_to_pos[lemma])
    # Add dependency edges between top lemmas (skip self-loops).
    for token in doc:
        lemma = token.lemma_.lower()
        head_lemma = token.head.lemma_.lower()
        if lemma in top_lemma_set and head_lemma in top_lemma_set:
            source = lemma_to_word[lemma]
            target = lemma_to_word[head_lemma]
            if source != target:
                G.add_edge(source, target, label=token.dep_)
    return G, word_freq
############################################################################################################################################
def visualize_semantic_relations(doc, lang):
    """Render the semantic-relations graph of *doc* as a matplotlib figure.

    Args:
        doc: a processed spaCy Doc.
        lang: language code ('es', 'en' or 'fr') selecting title and legend text.

    Returns:
        A ``matplotlib.figure.Figure`` with the drawn graph and legend.
    """
    # Reuse the shared builder instead of duplicating its graph-construction logic.
    G, _ = create_semantic_graph(doc, lang)
    fig, ax = plt.subplots(figsize=(36, 27))
    layout = nx.spring_layout(G, k=0.7, iterations=50)
    node_colors = [POS_COLORS.get(G.nodes[node]['pos'], '#CCCCCC') for node in G.nodes()]
    nx.draw(G, layout, node_color=node_colors, with_labels=True,
            node_size=10000,
            font_size=16,
            font_weight='bold',
            arrows=True,  # NOTE(review): G is an undirected nx.Graph, so arrows are a no-op
            arrowsize=30,
            width=3,
            edge_color='gray',
            ax=ax)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_labels, font_size=14, ax=ax)
    title = {
        'es': "Relaciones Semánticas Relevantes",
        'en': "Relevant Semantic Relations",
        'fr': "Relations Sémantiques Pertinentes"
    }
    ax.set_title(title[lang], fontsize=24, fontweight='bold')
    ax.axis('off')
    # Legend: only NOUN/VERB appear as nodes, so only those two are shown.
    legend_elements = [plt.Rectangle((0, 0), 1, 1, fc=POS_COLORS.get(pos_tag, '#CCCCCC'), edgecolor='none',
                                     label=f"{POS_TRANSLATIONS[lang].get(pos_tag, pos_tag)}")
                       for pos_tag in ['NOUN', 'VERB']]
    ax.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=16)
    return fig
############################################################################################################################################
def perform_semantic_analysis(text, nlp, lang):
    """Run semantic analysis on *text* and return the relations figure.

    Args:
        text: raw input text to analyse.
        nlp: a loaded spaCy pipeline (supplied by the caller; no global model).
        lang: language code ('es', 'en' or 'fr').

    Returns:
        The matplotlib figure from ``visualize_semantic_relations`` — a single
        graph is returned now (earlier versions returned several).
    """
    doc = nlp(text)
    # Print detected entities for debugging.
    print(f"Entidades encontradas ({lang}):")
    for ent in doc.ents:
        print(f"{ent.text} - {ent.label_}")
    return visualize_semantic_relations(doc, lang)
__all__ = ['visualize_semantic_relations', 'create_semantic_graph', 'POS_COLORS', 'POS_TRANSLATIONS'] |