#semantic_analysis.py
import streamlit as st
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
# Remove the global nlp model loading
# Define colors for grammatical categories
POS_COLORS = {
    'ADJ': '#FFA07A',    # Light Salmon
    'ADP': '#98FB98',    # Pale Green
    'ADV': '#87CEFA',    # Light Sky Blue
    'AUX': '#DDA0DD',    # Plum
    'CCONJ': '#F0E68C',  # Khaki
    'DET': '#FFB6C1',    # Light Pink
    'INTJ': '#FF6347',   # Tomato
    'NOUN': '#90EE90',   # Light Green
    'NUM': '#FAFAD2',    # Light Goldenrod Yellow
    'PART': '#D3D3D3',   # Light Gray
    'PRON': '#FFA500',   # Orange
    'PROPN': '#20B2AA',  # Light Sea Green
    'SCONJ': '#DEB887',  # Burlywood
    'SYM': '#7B68EE',    # Medium Slate Blue
    'VERB': '#FF69B4',   # Hot Pink
    'X': '#A9A9A9',      # Dark Gray
}
POS_TRANSLATIONS = {
    'es': {
        'ADJ': 'Adjetivo',
        'ADP': 'Adposición',
        'ADV': 'Adverbio',
        'AUX': 'Auxiliar',
        'CCONJ': 'Conjunción Coordinante',
        'DET': 'Determinante',
        'INTJ': 'Interjección',
        'NOUN': 'Sustantivo',
        'NUM': 'Número',
        'PART': 'Partícula',
        'PRON': 'Pronombre',
        'PROPN': 'Nombre Propio',
        'SCONJ': 'Conjunción Subordinante',
        'SYM': 'Símbolo',
        'VERB': 'Verbo',
        'X': 'Otro',
    },
    'en': {
        'ADJ': 'Adjective',
        'ADP': 'Adposition',
        'ADV': 'Adverb',
        'AUX': 'Auxiliary',
        'CCONJ': 'Coordinating Conjunction',
        'DET': 'Determiner',
        'INTJ': 'Interjection',
        'NOUN': 'Noun',
        'NUM': 'Number',
        'PART': 'Particle',
        'PRON': 'Pronoun',
        'PROPN': 'Proper Noun',
        'SCONJ': 'Subordinating Conjunction',
        'SYM': 'Symbol',
        'VERB': 'Verb',
        'X': 'Other',
    },
    'fr': {
        'ADJ': 'Adjectif',
        'ADP': 'Adposition',
        'ADV': 'Adverbe',
        'AUX': 'Auxiliaire',
        'CCONJ': 'Conjonction de Coordination',
        'DET': 'Déterminant',
        'INTJ': 'Interjection',
        'NOUN': 'Nom',
        'NUM': 'Nombre',
        'PART': 'Particule',
        'PRON': 'Pronom',
        'PROPN': 'Nom Propre',
        'SCONJ': 'Conjonction de Subordination',
        'SYM': 'Symbole',
        'VERB': 'Verbe',
        'X': 'Autre',
    }
}
########################################################################################################################################
# Entity category labels and node colors for each language
ENTITY_LABELS = {
    'es': {
        "Personas": "lightblue",
        "Conceptos": "lightgreen",
        "Lugares": "lightcoral",
        "Fechas": "lightyellow"
    },
    'en': {
        "People": "lightblue",
        "Concepts": "lightgreen",
        "Places": "lightcoral",
        "Dates": "lightyellow"
    },
    'fr': {
        "Personnes": "lightblue",
        "Concepts": "lightgreen",
        "Lieux": "lightcoral",
        "Dates": "lightyellow"
    }
}
#########################################################################################################
def count_pos(doc):
    # Count part-of-speech tags across the doc, ignoring punctuation.
    return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')
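# Example (hypothetical): with doc = nlp("El gato duerme."), count_pos(doc)
# might return Counter({'DET': 1, 'NOUN': 1, 'VERB': 1}); the exact tags
# depend on the loaded model.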
#############################################################################################################################
def extract_entities(doc, lang):
    # The four categories are unpacked in ENTITY_LABELS insertion order:
    # people, concepts, places, dates.
    people, concepts, places, dates = ENTITY_LABELS[lang].keys()
    entities = {label: [] for label in ENTITY_LABELS[lang].keys()}
    for ent in doc.ents:
        # English models tag persons as PERSON; the Spanish and French models use PER.
        if ent.label_ in ("PERSON", "PER"):
            entities[people].append(ent.text)
        elif ent.label_ in ("LOC", "GPE"):
            entities[places].append(ent.text)
        elif ent.label_ == "DATE":
            entities[dates].append(ent.text)
        else:
            entities[concepts].append(ent.text)
    return entities
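# Example (hypothetical): for an English doc mentioning "Marie Curie" and
# "Paris", extract_entities(doc, 'en') might return
# {'People': ['Marie Curie'], 'Concepts': [], 'Places': ['Paris'], 'Dates': []}.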
#####################################################################################################################
# def visualize_context_graph(doc, lang):
#     G = nx.Graph()
#     entities = extract_entities(doc, lang)
#     color_map = ENTITY_LABELS[lang]
#     # Add nodes
#     for category, items in entities.items():
#         for item in items:
#             G.add_node(item, category=category)
#     # Add edges
#     for sent in doc.sents:
#         sent_entities = [ent for ent in sent.ents if ent.text in G.nodes()]
#         for i in range(len(sent_entities)):
#             for j in range(i+1, len(sent_entities)):
#                 G.add_edge(sent_entities[i].text, sent_entities[j].text)
#     # Visualize
#     plt.figure(figsize=(30, 22))  # Increased figure size
#     pos = nx.spring_layout(G, k=0.7, iterations=50)  # Adjusted layout
#     node_colors = [color_map[G.nodes[node]['category']] for node in G.nodes()]
#     nx.draw(G, pos, node_color=node_colors, with_labels=True,
#             node_size=10000,   # Increased node size
#             font_size=18,      # Increased font size
#             font_weight='bold',
#             width=2,           # Increased edge width
#             arrowsize=30)      # Increased arrow size
#     # Add a legend
#     legend_elements = [plt.Rectangle((0, 0), 1, 1, fc=color, edgecolor='none', label=category)
#                        for category, color in color_map.items()]
#     plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1, 1), fontsize=16)
#     plt.title("Análisis del Contexto" if lang == 'es' else "Context Analysis" if lang == 'en' else "Analyse du Contexte", fontsize=24)
#     plt.axis('off')
#     return plt
############################################################################################################################################
def visualize_semantic_relations(doc, lang):
    G = nx.Graph()
    word_freq = defaultdict(int)
    lemma_to_word = {}
    lemma_to_pos = {}
    # Count lemma frequencies and map each lemma to its most common surface form and POS
    for token in doc:
        if token.pos_ in ['NOUN', 'VERB']:
            lemma = token.lemma_.lower()
            word_freq[lemma] += 1
            if lemma not in lemma_to_word or token.text.lower() == lemma:
                lemma_to_word[lemma] = token.text
                lemma_to_pos[lemma] = token.pos_
    # Get the 20 most frequent lemmas
    top_lemmas = [lemma for lemma, _ in sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]]
    # Add nodes
    for lemma in top_lemmas:
        word = lemma_to_word[lemma]
        G.add_node(word, pos=lemma_to_pos[lemma])
    # Add edges from dependency relations between top lemmas
    for token in doc:
        if token.lemma_.lower() in top_lemmas:
            if token.head.lemma_.lower() in top_lemmas:
                source = lemma_to_word[token.lemma_.lower()]
                target = lemma_to_word[token.head.lemma_.lower()]
                if source != target:  # Avoid self-loops
                    G.add_edge(source, target, label=token.dep_)
    fig, ax = plt.subplots(figsize=(36, 27))
    pos = nx.spring_layout(G, k=0.7, iterations=50)
    node_colors = [POS_COLORS.get(G.nodes[node]['pos'], '#CCCCCC') for node in G.nodes()]
    nx.draw(G, pos, node_color=node_colors, with_labels=True,
            node_size=10000,
            font_size=16,
            font_weight='bold',
            arrows=True,
            arrowsize=30,
            width=3,
            edge_color='gray',
            ax=ax)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=14, ax=ax)
    title = {
        'es': "Relaciones Semánticas Relevantes",
        'en': "Relevant Semantic Relations",
        'fr': "Relations Sémantiques Pertinentes"
    }
    ax.set_title(title[lang], fontsize=24, fontweight='bold')
    ax.axis('off')
    # pos_tag avoids shadowing the layout dict `pos` above
    legend_elements = [plt.Rectangle((0, 0), 1, 1, fc=POS_COLORS.get(pos_tag, '#CCCCCC'), edgecolor='none',
                                     label=f"{POS_TRANSLATIONS[lang].get(pos_tag, pos_tag)}")
                       for pos_tag in ['NOUN', 'VERB']]
    ax.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=16)
    return fig
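# Usage sketch (assumption: this module is called from a Streamlit page,
# which is why streamlit is imported at the top). The returned Matplotlib
# figure can be rendered directly:
#
#     fig = visualize_semantic_relations(doc, lang)
#     st.pyplot(fig)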
############################################################################################################################################
def perform_semantic_analysis(text, nlp, lang):
    doc = nlp(text)
    # Print detected entities for debugging
    print(f"Entities found ({lang}):")
    for ent in doc.ents:
        print(f"{ent.text} - {ent.label_}")
    relations_graph = visualize_semantic_relations(doc, lang)
    return relations_graph  # Now returns only a single figure
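# Minimal smoke-test sketch (assumptions: spaCy plus the es_core_news_sm
# model are installed; an 'en' or 'fr' model works the same way with the
# matching lang code). Running this file directly saves the relations graph.
if __name__ == "__main__":
    nlp_es = spacy.load("es_core_news_sm")
    sample = ("Marie Curie estudió física en París. "
              "La científica investigó la radiactividad y ganó el Premio Nobel.")
    fig = perform_semantic_analysis(sample, nlp_es, 'es')
    fig.savefig("semantic_relations.png", bbox_inches="tight")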