# syntax_analysis.py
import streamlit as st
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
# The spaCy model is not loaded globally here; the caller passes an `nlp` object in.
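# A minimal sketch (not part of the original app) of how the caller could load and
# cache the pipeline instead; the helper name and model names below are assumptions.
@st.cache_resource  # requires Streamlit >= 1.18
def load_nlp_model(lang_code):
    """Hypothetical helper: load and cache a spaCy pipeline for 'es', 'en' or 'fr'."""
    model_names = {
        'es': 'es_core_news_sm',
        'en': 'en_core_web_sm',
        'fr': 'fr_core_news_sm',
    }
    return spacy.load(model_names[lang_code])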
# Define colors for grammatical categories
POS_COLORS = {
    'ADJ': '#FFA07A',    # Light Salmon
    'ADP': '#98FB98',    # Pale Green
    'ADV': '#87CEFA',    # Light Sky Blue
    'AUX': '#DDA0DD',    # Plum
    'CCONJ': '#F0E68C',  # Khaki
    'DET': '#FFB6C1',    # Light Pink
    'INTJ': '#FF6347',   # Tomato
    'NOUN': '#90EE90',   # Light Green
    'NUM': '#FAFAD2',    # Light Goldenrod Yellow
    'PART': '#D3D3D3',   # Light Gray
    'PRON': '#FFA500',   # Orange
    'PROPN': '#20B2AA',  # Light Sea Green
    'SCONJ': '#DEB887',  # Burlywood
    'SYM': '#7B68EE',    # Medium Slate Blue
    'VERB': '#FF69B4',   # Hot Pink
    'X': '#A9A9A9',      # Dark Gray
}
POS_TRANSLATIONS = {
    'es': {
        'ADJ': 'Adjetivo',
        'ADP': 'Adposición',
        'ADV': 'Adverbio',
        'AUX': 'Auxiliar',
        'CCONJ': 'Conjunción Coordinante',
        'DET': 'Determinante',
        'INTJ': 'Interjección',
        'NOUN': 'Sustantivo',
        'NUM': 'Número',
        'PART': 'Partícula',
        'PRON': 'Pronombre',
        'PROPN': 'Nombre Propio',
        'SCONJ': 'Conjunción Subordinante',
        'SYM': 'Símbolo',
        'VERB': 'Verbo',
        'X': 'Otro',
    },
    'en': {
        'ADJ': 'Adjective',
        'ADP': 'Adposition',
        'ADV': 'Adverb',
        'AUX': 'Auxiliary',
        'CCONJ': 'Coordinating Conjunction',
        'DET': 'Determiner',
        'INTJ': 'Interjection',
        'NOUN': 'Noun',
        'NUM': 'Number',
        'PART': 'Particle',
        'PRON': 'Pronoun',
        'PROPN': 'Proper Noun',
        'SCONJ': 'Subordinating Conjunction',
        'SYM': 'Symbol',
        'VERB': 'Verb',
        'X': 'Other',
    },
    'fr': {
        'ADJ': 'Adjectif',
        'ADP': 'Adposition',
        'ADV': 'Adverbe',
        'AUX': 'Auxiliaire',
        'CCONJ': 'Conjonction de Coordination',
        'DET': 'Déterminant',
        'INTJ': 'Interjection',
        'NOUN': 'Nom',
        'NUM': 'Nombre',
        'PART': 'Particule',
        'PRON': 'Pronom',
        'PROPN': 'Nom Propre',
        'SCONJ': 'Conjonction de Subordination',
        'SYM': 'Symbole',
        'VERB': 'Verbe',
        'X': 'Autre',
    }
}
def count_pos(doc):
    return Counter(token.pos_ for token in doc if token.pos_ != 'PUNCT')
def create_syntax_graph(doc, lang):
    """Build a directed dependency graph with one node per distinct (lowercased) word."""
    G = nx.DiGraph()
    pos_counts = count_pos(doc)
    word_nodes = {}   # lowercased word -> node id
    word_colors = {}  # lowercased word -> POS color
    for token in doc:
        if token.pos_ != 'PUNCT':
            lower_text = token.text.lower()
            if lower_text not in word_nodes:
                node_id = len(word_nodes)
                word_nodes[lower_text] = node_id
                color = POS_COLORS.get(token.pos_, '#FFFFFF')
                word_colors[lower_text] = color
                # Node size scales with how frequent the word's POS is in the document.
                G.add_node(node_id,
                           label=f"{token.text}\n[{POS_TRANSLATIONS[lang].get(token.pos_, token.pos_)}]",
                           pos=token.pos_,
                           size=pos_counts[token.pos_] * 500,
                           color=color)
            # Link each non-root token to its syntactic head, if the head already has a node.
            if token.dep_ != "ROOT" and token.head.pos_ != 'PUNCT':
                head_id = word_nodes.get(token.head.text.lower())
                if head_id is not None:
                    G.add_edge(head_id, word_nodes[lower_text], label=token.dep_)
    return G, word_colors
def visualize_syntax_graph(doc, lang):
    G, word_colors = create_syntax_graph(doc, lang)
    plt.figure(figsize=(20, 15))
    pos = nx.spring_layout(G, k=2, iterations=100)
    node_colors = [data['color'] for _, data in G.nodes(data=True)]
    node_sizes = [data['size'] for _, data in G.nodes(data=True)]
    nx.draw(G, pos, with_labels=False, node_color=node_colors, node_size=node_sizes, arrows=True)
    nx.draw_networkx_labels(G, pos, {node: data['label'] for node, data in G.nodes(data=True)}, font_size=8)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)
    plt.title("Syntactic Analysis" if lang == 'en' else "Analyse Syntaxique" if lang == 'fr' else "Análisis Sintáctico")
    plt.axis('off')
    # One legend entry per POS tag that actually occurs in the graph, with its document count.
    legend_elements = [plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='none',
                                     label=f"{POS_TRANSLATIONS[lang][pos]} ({count_pos(doc)[pos]})")
                       for pos, color in POS_COLORS.items()
                       if pos in set(nx.get_node_attributes(G, 'pos').values())]
    plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5))
    return plt
def visualize_syntax(text, nlp, lang):
    max_tokens = 5000
    doc = nlp(text)
    if len(doc) > max_tokens:
        # Truncate by tokens (not characters) so the cap matches the warning below.
        doc = doc[:max_tokens].as_doc()
        st.warning(f"The input text is too long. Only the first {max_tokens} tokens will be visualized.")
    return visualize_syntax_graph(doc, lang)
def get_repeated_words_colors(doc):
    word_counts = Counter(token.text.lower() for token in doc if token.pos_ != 'PUNCT')
    repeated_words = {word: count for word, count in word_counts.items() if count > 1}
    word_colors = {}
    for token in doc:
        if token.text.lower() in repeated_words:
            word_colors[token.text.lower()] = POS_COLORS.get(token.pos_, '#FFFFFF')
    return word_colors
def highlight_repeated_words(doc, word_colors):
    highlighted_text = []
    for token in doc:
        if token.text.lower() in word_colors:
            color = word_colors[token.text.lower()]
            highlighted_text.append(f'<span style="background-color: {color};">{token.text}</span>')
        else:
            highlighted_text.append(token.text)
    return ' '.join(highlighted_text)
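# Usage sketch (assumption): one way these helpers could be wired into a Streamlit page.
# The widget label and the load_nlp_model helper above are illustrative, not confirmed
# parts of the original app.
def demo_syntax_page(lang='en'):
    nlp = load_nlp_model(lang)
    text = st.text_area("Text to analyze")
    if text:
        visualize_syntax(text, nlp, lang)   # draws onto the current matplotlib figure
        st.pyplot(plt.gcf())                # render that figure in the app
        doc = nlp(text)
        word_colors = get_repeated_words_colors(doc)
        st.markdown(highlight_repeated_words(doc, word_colors), unsafe_allow_html=True)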