File size: 4,720 Bytes
e031c18 355f3ac 6bf8d03 e031c18 355f3ac e031c18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# syntax_analysis.py
import html
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import spacy
import streamlit as st
@st.cache_resource
def load_spacy_model():
    """Load and return the ``es_core_news_lg`` spaCy pipeline.

    The function is decorated with ``st.cache_resource`` so that Streamlit
    can hand back the already-loaded pipeline on later calls.
    """
    model = spacy.load("es_core_news_lg")
    return model
# Module-level spaCy pipeline. It is obtained through load_spacy_model() so
# the st.cache_resource-decorated loader is actually used, instead of loading
# the model a second time with an uncached spacy.load() call.
nlp = load_spacy_model()
# Fill colour (hex) for each Universal POS tag drawn in the graph and legend.
# Tags missing from this table are given '#FFFFFF' by the lookups in this file.
POS_COLORS = {
    "ADJ": "#FFA07A",    # Light Salmon
    "ADP": "#98FB98",    # Pale Green
    "ADV": "#87CEFA",    # Light Sky Blue
    "AUX": "#DDA0DD",    # Plum
    "CCONJ": "#F0E68C",  # Khaki
    "DET": "#FFB6C1",    # Light Pink
    "INTJ": "#FF6347",   # Tomato
    "NOUN": "#90EE90",   # Light Green
    "NUM": "#FAFAD2",    # Light Goldenrod Yellow
    "PART": "#D3D3D3",   # Light Gray
    "PRON": "#FFA500",   # Orange
    "PROPN": "#20B2AA",  # Light Sea Green
    "SCONJ": "#DEB887",  # Burlywood
    "SYM": "#7B68EE",    # Medium Slate Blue
    "VERB": "#FF69B4",   # Hot Pink
    "X": "#A9A9A9",      # Dark Gray
}
# Spanish display names for the Universal POS tags, used in node labels and
# in the legend. The accented characters had been corrupted by a wrong-codec
# round trip (e.g. "ó" shown as a CJK glyph); they are restored here, and the
# ADP label is corrected to the actual Spanish term "Adposición".
POS_TRANSLATIONS = {
    'ADJ': 'Adjetivo',
    'ADP': 'Adposición',
    'ADV': 'Adverbio',
    'AUX': 'Auxiliar',
    'CCONJ': 'Conjunción Coordinante',
    'DET': 'Determinante',
    'INTJ': 'Interjección',
    'NOUN': 'Sustantivo',
    'NUM': 'Número',
    'PART': 'Partícula',
    'PRON': 'Pronombre',
    'PROPN': 'Nombre Propio',
    'SCONJ': 'Conjunción Subordinante',
    'SYM': 'Símbolo',
    'VERB': 'Verbo',
    'X': 'Otro',
}
def count_pos(doc):
    """Count how many tokens carry each part-of-speech tag.

    Args:
        doc: Iterable of tokens exposing a ``pos_`` attribute.

    Returns:
        A ``Counter`` mapping each ``pos_`` value to its number of
        occurrences; tokens tagged ``'PUNCT'`` are left out.
    """
    tally = Counter()
    for token in doc:
        tag = token.pos_
        if tag == 'PUNCT':
            continue
        tally[tag] += 1
    return tally
def create_syntax_graph(doc):
    """Build a directed dependency graph with one node per distinct word form.

    Tokens are merged by their lowercased text, so repeated words share a
    node. Punctuation tokens get no node and take part in no edge.

    Args:
        doc: Iterable of spaCy tokens (``text``, ``pos_``, ``dep_`` and
            ``head`` are read).

    Returns:
        A ``(G, word_colors)`` tuple. ``G`` is an ``nx.DiGraph`` whose nodes
        carry ``label``, ``pos``, ``size`` and ``color`` attributes and whose
        edges run head -> dependent with the dependency relation as
        ``label``. ``word_colors`` maps each lowercased word form to the
        colour assigned from the first occurrence's POS tag.
    """
    G = nx.DiGraph()
    pos_counts = count_pos(doc)
    word_nodes = {}
    word_colors = {}

    # Pass 1: create every node first. Node attributes come from the first
    # occurrence of each lowercased word form.
    for token in doc:
        if token.pos_ == 'PUNCT':
            continue
        lower_text = token.text.lower()
        if lower_text in word_nodes:
            continue
        node_id = len(word_nodes)
        word_nodes[lower_text] = node_id
        color = POS_COLORS.get(token.pos_, '#FFFFFF')
        word_colors[lower_text] = color
        G.add_node(node_id,
                   label=f"{token.text}\n[{POS_TRANSLATIONS.get(token.pos_, token.pos_)}]",
                   pos=token.pos_,
                   # Node size scales with how frequent the POS tag is.
                   size=pos_counts[token.pos_] * 500,
                   color=color)

    # Pass 2: add the edges. Doing this after all nodes exist means a
    # dependent that appears *before* its head (e.g. a determiner preceding
    # its noun) still gets its arc; in a single pass the head lookup failed
    # for such tokens and the edge was silently dropped.
    for token in doc:
        if token.pos_ == 'PUNCT':
            continue
        if token.dep_ == "ROOT" or token.head.pos_ == 'PUNCT':
            continue
        # The head may still be unknown (e.g. it lies outside the tokens
        # that were passed in); skip the edge in that case.
        head_id = word_nodes.get(token.head.text.lower())
        if head_id is not None:
            G.add_edge(head_id, word_nodes[token.text.lower()], label=token.dep_)

    return G, word_colors
def visualize_syntax_graph(doc):
    """Draw the dependency graph of ``doc`` on a new matplotlib figure.

    Args:
        doc: Iterable of spaCy tokens, forwarded to ``create_syntax_graph``
            and ``count_pos``.

    Returns:
        The ``matplotlib.pyplot`` module, with the freshly drawn figure left
        as the current figure. The figure is not closed here.
    """
    # Only the graph is needed; the colour map is already stored on the nodes.
    G, _ = create_syntax_graph(doc)

    plt.figure(figsize=(20, 15))
    layout = nx.spring_layout(G, k=2, iterations=100)

    node_colors = [data['color'] for _, data in G.nodes(data=True)]
    node_sizes = [data['size'] for _, data in G.nodes(data=True)]
    node_labels = {node: data['label'] for node, data in G.nodes(data=True)}

    nx.draw(G, layout, with_labels=False, node_color=node_colors, node_size=node_sizes, arrows=True)
    nx.draw_networkx_labels(G, layout, node_labels, font_size=8)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_labels, font_size=8)

    plt.title("Análisis Sintáctico")
    plt.axis('off')

    # Compute the per-tag counts and the set of tags present in the graph
    # once, rather than once per legend entry.
    pos_counts = count_pos(doc)
    present_tags = set(nx.get_node_attributes(G, 'pos').values())
    legend_elements = [
        plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='none',
                      label=f"{POS_TRANSLATIONS[tag]} ({pos_counts[tag]})")
        for tag, color in POS_COLORS.items()
        if tag in present_tags
    ]
    plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))
    return plt
def visualize_syntax(text, max_tokens=5000):
    """Parse ``text`` with the module-level pipeline and draw its syntax graph.

    Args:
        text: Raw input text.
        max_tokens: Maximum number of tokens to visualize; longer inputs are
            truncated to this many tokens and a warning is printed.

    Returns:
        Whatever ``visualize_syntax_graph`` returns for the (possibly
        truncated) parse.
    """
    doc = nlp(text)
    if len(doc) > max_tokens:
        # Truncate by *token*, matching both the length check above and the
        # warning text. Slicing the raw string would cut by characters and
        # require running the pipeline a second time. The slice is a spaCy
        # Span, which iterates over tokens just like the full Doc.
        doc = doc[:max_tokens]
        print(f"Warning: The input text is too long. Only the first {max_tokens} tokens will be visualized.")
    return visualize_syntax_graph(doc)
# Repeated words colors
def get_repeated_words_colors(doc):
    """Map every repeated word form to a POS-based colour.

    A word form is the lowercased token text; it counts as repeated when it
    occurs more than once among the non-punctuation tokens.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_``.

    Returns:
        A dict from lowercased word form to colour. When a form occurs with
        several POS tags, the colour of its last occurrence is kept.
    """
    frequency = Counter()
    for token in doc:
        if token.pos_ != 'PUNCT':
            frequency[token.text.lower()] += 1

    repeated = {form for form, seen in frequency.items() if seen > 1}

    # Later occurrences overwrite earlier ones, so the last POS tag wins.
    return {
        token.text.lower(): POS_COLORS.get(token.pos_, '#FFFFFF')
        for token in doc
        if token.text.lower() in repeated
    }
def highlight_repeated_words(doc, word_colors):
    """Render ``doc`` as an HTML string with repeated words highlighted.

    Args:
        doc: Iterable of tokens exposing a ``text`` attribute.
        word_colors: Mapping from lowercased word form to a CSS colour.

    Returns:
        The token texts joined by single spaces. Tokens whose lowercased
        text is a key of ``word_colors`` are wrapped in a ``<span>`` with
        that background colour. All token text is HTML-escaped, so
        characters such as ``<`` or ``&`` in the input cannot break or
        inject markup.
    """
    highlighted_text = []
    for token in doc:
        # Escape before interpolating: the token text comes from the
        # analysed input and ends up inside HTML.
        safe_text = html.escape(token.text)
        key = token.text.lower()
        if key in word_colors:
            color = html.escape(str(word_colors[key]))
            highlighted_text.append(f'<span style="background-color: {color};">{safe_text}</span>')
        else:
            highlighted_text.append(safe_text)
    return ' '.join(highlighted_text)