|
|
|
|
|
import streamlit as st |
|
import matplotlib.pyplot as plt |
|
import networkx as nx |
|
import seaborn as sns |
|
from collections import Counter |
|
from itertools import combinations |
|
import numpy as np |
|
import matplotlib.patches as patches |
|
import logging |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def analyze_text_dimensions(doc):
    """
    Run every text-dimension analysis on a document and bundle the results.

    Args:
        doc: Document processed by spaCy.

    Returns:
        dict: The values returned by the individual analyses, stored under
        the keys 'clarity', 'vocabulary', 'cohesion', 'structure',
        'sentence_graphs', 'word_connections' and 'connection_paths'.

    Raises:
        Exception: Any error raised by an analysis step is logged and then
        re-raised unchanged.
    """
    try:
        # A dict display evaluates its values top to bottom, so the analyses
        # run in exactly the order in which they are listed here.
        return {
            'clarity': analyze_clarity(doc),
            'vocabulary': analyze_vocabulary_diversity(doc),
            'cohesion': analyze_cohesion(doc),
            'structure': analyze_structure(doc),
            'sentence_graphs': generate_sentence_graphs(doc),
            'word_connections': generate_word_connections(doc),
            'connection_paths': generate_connection_paths(doc),
        }
    except Exception as exc:
        logger.error(f"Error en analyze_text_dimensions: {str(exc)}")
        raise
|
|
|
def analyze_clarity(doc):
    """
    Score clarity from the average sentence length.

    Args:
        doc: Document processed by spaCy; its ``sents`` are iterated and the
            ``len()`` of each sentence is used as its length.

    Returns:
        float: ``normalize_score`` of the average sentence length, with 20
        as the optimal length. A document without sentences is scored as
        having an average length of 0.
    """
    sentences = list(doc.sents)
    # Guard the division: a document with no sentences would otherwise raise
    # ZeroDivisionError.
    if sentences:
        avg_length = sum(len(sent) for sent in sentences) / len(sentences)
    else:
        avg_length = 0
    # normalize_score names its target parameter ``optimal_value``; any other
    # keyword is rejected with a TypeError.
    return normalize_score(avg_length, optimal_value=20)
|
|
|
def analyze_vocabulary_diversity(doc):
    """
    Measure vocabulary diversity as distinct lemmas per alphabetic token.

    Args:
        doc: Iterable of tokens exposing ``is_alpha`` and ``lemma_``.

    Returns:
        The number of distinct lemmas among alphabetic tokens divided by the
        number of alphabetic tokens, or 0 when there are no alphabetic tokens.
    """
    # Collect the lemma of every alphabetic token once; the distinct count
    # and the total count both derive from this single list.
    alpha_lemmas = [tok.lemma_ for tok in doc if tok.is_alpha]
    if not alpha_lemmas:
        return 0
    return len(set(alpha_lemmas)) / len(alpha_lemmas)
|
|
|
def analyze_cohesion(doc):
    """
    Score textual cohesion from lemmas shared by consecutive sentences.

    Args:
        doc: Document processed by spaCy; its ``sents`` are iterated and the
            ``lemma_`` of every token in each sentence is collected.

    Returns:
        float: ``normalize_score`` of the total number of lemmas shared
        between each sentence and the one that follows it, with 5 as the
        optimal total. Documents with fewer than two sentences have a total
        of 0.
    """
    # Build each sentence's lemma set once instead of twice per adjacent pair.
    lemma_sets = [{token.lemma_ for token in sent} for sent in doc.sents]
    connections = sum(
        len(current & following)
        for current, following in zip(lemma_sets, lemma_sets[1:])
    )
    # normalize_score names its target parameter ``optimal_value``; any other
    # keyword is rejected with a TypeError.
    return normalize_score(connections, optimal_value=5)
|
|
|
def analyze_structure(doc):
    """
    Score structural complexity from dependency-tree depths.

    Args:
        doc: Document processed by spaCy; tokens whose ``dep_`` is ``'ROOT'``
            are used as the starting points of the depth traversal.

    Returns:
        float: ``normalize_score`` of the average depth over every token
        reachable from a ROOT token (the root itself counts with depth 0),
        with 3 as the optimal depth. If no ROOT token is found the average
        is taken as 0.
    """
    depths = []
    for token in doc:
        if token.dep_ == 'ROOT':
            depths.extend(get_dependency_depths(token))

    avg_depth = sum(depths) / len(depths) if depths else 0
    # normalize_score names its target parameter ``optimal_value``; any other
    # keyword is rejected with a TypeError.
    return normalize_score(avg_depth, optimal_value=3)
|
|
|
|
|
|
|
def get_dependency_depths(token, depth=0):
    """
    List the depth of a token and of every token below it.

    Args:
        token: Node exposing an iterable ``children`` attribute.
        depth: Depth assigned to ``token`` itself.

    Returns:
        list: Depths in pre-order (a node first, then each child's subtree
        in the order the children are yielded).
    """
    collected = []
    pending = [(token, depth)]
    while pending:
        node, level = pending.pop()
        collected.append(level)
        # Push children in reverse so the first child is popped, and hence
        # visited, first, giving the same order as a recursive descent.
        for child in reversed(list(node.children)):
            pending.append((child, level + 1))
    return collected
|
|
|
def normalize_score(value, optimal_value=1.0, range_factor=2.0):
    """
    Map a value to a score that peaks at 1 when it equals the optimum.

    Args:
        value: Measured value.
        optimal_value: Value that yields the maximum score of 1.
        range_factor: Divisor applied to the distance from the optimum;
            larger values make the score fall off more slowly.

    Returns:
        ``1 / (1 + |value - optimal_value| / range_factor)``.
    """
    scaled_distance = abs(value - optimal_value) / range_factor
    return 1 / (1 + scaled_distance)
|
|
|
|
|
|
|
def generate_sentence_graphs(doc):
    """
    Create the figure for the sentence-structure visualization.

    Nothing is drawn yet: ``doc`` is currently unused and the returned
    figure is a blank 10x6 canvas.

    Args:
        doc: Document to visualize (currently unused).

    Returns:
        The newly created matplotlib figure, already closed in pyplot.
    """
    figure, _axes = plt.subplots(figsize=(10, 6))

    # The figure just created is pyplot's current figure, so closing it by
    # reference is the same as the argument-less close().
    plt.close(figure)
    return figure
|
|
|
def generate_word_connections(doc):
    """
    Create the figure for the word-connection network.

    Nothing is drawn yet: ``doc`` is currently unused and the returned
    figure is a blank 10x6 canvas.

    Args:
        doc: Document to visualize (currently unused).

    Returns:
        The newly created matplotlib figure, already closed in pyplot.
    """
    figure, _axes = plt.subplots(figsize=(10, 6))

    # The figure just created is pyplot's current figure, so closing it by
    # reference is the same as the argument-less close().
    plt.close(figure)
    return figure
|
|
|
def generate_connection_paths(doc):
    """
    Create the figure for the connection-pattern visualization.

    Nothing is drawn yet: ``doc`` is currently unused and the returned
    figure is a blank 10x6 canvas.

    Args:
        doc: Document to visualize (currently unused).

    Returns:
        The newly created matplotlib figure, already closed in pyplot.
    """
    figure, _axes = plt.subplots(figsize=(10, 6))

    # The figure just created is pyplot's current figure, so closing it by
    # reference is the same as the argument-less close().
    plt.close(figure)
    return figure
|
|
|
def create_vocabulary_network(doc, window_size=5):
    """
    Build and draw the vocabulary co-occurrence network of a document.

    Nodes are the lower-cased alphabetic, non-stop-word tokens, sized by
    frequency. Two words are linked when they appear together inside a
    sliding window of ``window_size`` consecutive kept words; the edge
    weight counts the windows in which the pair co-occurs.

    Args:
        doc: Iterable of tokens exposing ``text``, ``is_alpha`` and
            ``is_stop`` (a spaCy document).
        window_size: Number of consecutive kept words per co-occurrence
            window. Defaults to 5.

    Returns:
        The matplotlib figure containing the drawn network.
    """
    G = nx.Graph()

    words = [token.text.lower() for token in doc
             if token.is_alpha and not token.is_stop]

    # One node per distinct word, carrying its frequency for node sizing.
    for word, freq in Counter(words).items():
        G.add_node(word, size=freq)

    # There are len(words) - window_size + 1 full windows; the "+ 1" keeps
    # the last one. max(..., 1) lets a text shorter than one window still
    # contribute a single (shorter) window instead of producing no edges.
    n_windows = max(len(words) - window_size + 1, 1)
    for i in range(n_windows):
        window = words[i:i + window_size]
        # set() so a word repeated inside the window is paired only once.
        for w1, w2 in combinations(set(window), 2):
            if G.has_edge(w1, w2):
                G[w1][w2]['weight'] += 1
            else:
                G.add_edge(w1, w2, weight=1)

    fig, ax = plt.subplots(figsize=(12, 8))
    pos = nx.spring_layout(G)

    # Draw explicitly on this figure's axes rather than on whatever pyplot
    # considers current.
    nx.draw_networkx_nodes(
        G, pos,
        node_size=[G.nodes[node]['size'] * 100 for node in G.nodes],
        node_color='lightblue',
        alpha=0.7,
        ax=ax,
    )
    nx.draw_networkx_edges(
        G, pos,
        width=[G[u][v]['weight'] * 0.5 for u, v in G.edges],
        alpha=0.5,
        ax=ax,
    )
    nx.draw_networkx_labels(G, pos, ax=ax)

    ax.set_title("Red de Vocabulario")
    ax.axis('off')
    return fig
|
|
|
def create_syntax_complexity_graph(doc):
    """
    Draw an arc diagram of the syntactic dependencies of each sentence.

    Every sentence is laid out on its own horizontal line, with its words
    annotated below it. Each non-ROOT token is joined to its head by a
    half-ellipse arc whose colour encodes how many head links separate the
    token from the top of its tree.

    Args:
        doc: Document processed by spaCy.

    Returns:
        The matplotlib figure, or None when the document has no sentences
        or when any error occurs while building the diagram (the error is
        logged).
    """
    fig = None
    try:
        sentences = list(doc.sents)
        if not sentences:
            return None

        # Two inches of height per sentence row.
        fig, ax = plt.subplots(figsize=(12, len(sentences) * 2))

        # Six colours; depths beyond the last index reuse the final colour.
        depth_colors = plt.cm.viridis(np.linspace(0, 1, 6))
        last_color_index = len(depth_colors) - 1

        y_offset = 0
        max_x = 0

        for sent in sentences:
            words = [token.text for token in sent]
            x_positions = range(len(words))
            max_x = max(max_x, len(words))

            # Faint baseline plus fully transparent points at each word slot.
            baseline = [y_offset] * len(words)
            ax.plot(x_positions, baseline, 'k-', alpha=0.2)
            ax.scatter(x_positions, baseline, alpha=0)

            for i, word in enumerate(words):
                ax.annotate(word, (i, y_offset), xytext=(0, -10),
                            textcoords='offset points', ha='center')

            for token in sent:
                if token.dep_ == "ROOT":
                    continue

                depth = _head_chain_length(token)

                # Positions relative to the first token of the sentence.
                start = token.i - sent[0].i
                end = token.head.i - sent[0].i
                span = abs(end - start)

                # Upper half (0-180 degrees) of an ellipse centred midway
                # between the token and its head; longer links arch higher.
                arc = patches.Arc(
                    (min(start, end) + span / 2, y_offset),
                    width=span,
                    height=0.5 * span,
                    angle=0,
                    theta1=0,
                    theta2=180,
                    color=depth_colors[min(depth, last_color_index)],
                    alpha=0.6,
                )
                ax.add_patch(arc)

            # Next sentence goes on the row below.
            y_offset -= 2

        ax.set_xlim(-1, max_x)
        ax.set_ylim(y_offset - 1, 1)
        ax.axis('off')
        ax.set_title("Complejidad Sintáctica")

        return fig

    except Exception as e:
        logger.error(f"Error en create_syntax_complexity_graph: {str(e)}")
        # Do not leave a half-built figure registered in pyplot.
        if fig is not None:
            plt.close(fig)
        return None


def _head_chain_length(token):
    """Count the head links followed from ``token`` until a token that is its own head."""
    depth = 0
    current = token
    while current.head != current:
        depth += 1
        current = current.head
    return depth
|
|
|
|
|
def create_cohesion_heatmap(doc):
    """
    Draw a heat map of lexical cohesion between every pair of sentences.

    Cohesion between two sentences is the Jaccard similarity of their lemma
    sets (shared lemmas divided by the lemmas present in either), counting
    only alphabetic, non-stop-word tokens. Pairs in which either sentence
    has no such lemmas are left at 0.

    Args:
        doc: Document processed by spaCy.

    Returns:
        The matplotlib figure, or None when the document has fewer than two
        sentences or when any error occurs while building the heat map (the
        error is logged).
    """
    fig = None
    try:
        sentences = list(doc.sents)
        n_sentences = len(sentences)

        if n_sentences < 2:
            return None

        # Build each sentence's lemma set once rather than inside the
        # n x n comparison loop.
        lemma_sets = [
            {token.lemma_ for token in sent
             if token.is_alpha and not token.is_stop}
            for sent in sentences
        ]

        similarity_matrix = np.zeros((n_sentences, n_sentences))
        for i, lemmas_i in enumerate(lemma_sets):
            if not lemmas_i:
                continue
            for j, lemmas_j in enumerate(lemma_sets):
                if lemmas_j:
                    # Both sets are non-empty here, so the union is too.
                    shared = len(lemmas_i & lemmas_j)
                    combined = len(lemmas_i | lemmas_j)
                    similarity_matrix[i, j] = shared / combined

        fig, ax = plt.subplots(figsize=(10, 8))

        sns.heatmap(similarity_matrix,
                    cmap='YlOrRd',
                    square=True,
                    xticklabels=False,
                    yticklabels=False,
                    cbar_kws={'label': 'Cohesión'},
                    ax=ax)

        ax.set_title("Mapa de Cohesión Textual")
        ax.set_xlabel("Oraciones")
        ax.set_ylabel("Oraciones")

        fig.tight_layout()
        return fig

    except Exception as e:
        logger.error(f"Error en create_cohesion_heatmap: {str(e)}")
        # Do not leave a half-built figure registered in pyplot.
        if fig is not None:
            plt.close(fig)
        return None
|
|