# Nubedepalabras / app.py
# (Hugging Face Spaces upload metadata: dexttttrees — "Upload 2 files" — commit 03b1bed, verified)
import gradio as gr
import nltk
from nltk.corpus import cess_esp, conll2002
from nltk.tokenize import word_tokenize
import stylecloud
import matplotlib.pyplot as plt
from fpdf import FPDF
import re
from collections import Counter
import spacy
import random
import csv
# Download the nltk resources the app needs: the 'punkt' tokenizer,
# the Spanish stopword list, and the two Spanish corpora used to look
# up example sentences.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cess_esp')
nltk.download('conll2002')
# Load the medium Spanish spaCy model (provides lemmas and POS tags).
nlp = spacy.load('es_core_news_md')
# Extra stopwords merged with nltk's Spanish list in preprocess_text.
additional_stopwords = [
    # Add more stopwords here if desired
]
def preprocess_text(text):
    """Normalize *text* and return the lemmas of its content words.

    The text is lower-cased, every non-word character is replaced by a
    space, and the result is run through spaCy. Only verbs, adjectives
    and nouns survive; anything in the Spanish stopword list (plus the
    module-level ``additional_stopwords``) is dropped.
    """
    cleaned = re.sub(r'\W', ' ', text.lower())
    doc = nlp(cleaned)

    stop_words = set(nltk.corpus.stopwords.words('spanish'))
    stop_words.update(additional_stopwords)
    content_pos = {'VERB', 'ADJ', 'NOUN'}

    return [
        token.lemma_
        for token in doc
        if token.text not in stop_words and token.pos_ in content_pos
    ]
def get_example_sentences(word, num_sentences=1):
    """Collect up to *num_sentences* corpus sentences containing *word*.

    Searches the cess_esp and conll2002 Spanish corpora in order and
    returns each matching sentence joined back into a single string.
    Returns fewer sentences (possibly none) when the corpora contain
    fewer matches.
    """
    sentences = []
    # The original checked `len(word) > 1` inside the inner loop, so a
    # 0/1-character word still scanned BOTH corpora end-to-end only to
    # return []. Hoist the invariant check into an early return.
    if len(word) <= 1:
        return sentences
    for corpus in (cess_esp, conll2002):
        for sent in corpus.sents():
            if word in sent:
                sentences.append(' '.join(sent))
                if len(sentences) >= num_sentences:
                    return sentences
    return sentences
def generate_random_style_cloud(words, filename):
    """Render *words* as a word cloud shaped by a random Font Awesome icon.

    The cloud image is written to *filename* and then previewed with
    matplotlib.
    """
    text = ' '.join(words)
    icons = ['fas fa-cloud', 'fas fa-star', 'fas fa-heart', 'fas fa-tree', 'fas fa-sun', 'fas fa-moon']
    random_icon = random.choice(icons)
    stylecloud.gen_stylecloud(text=text, icon_name=random_icon, output_name=filename)
    # Preview the generated image. Close the figure afterwards: without
    # plt.close() each request leaks an open matplotlib figure in this
    # long-running Gradio process.
    img = plt.imread(filename)
    plt.imshow(img)
    plt.axis('off')
    plt.show()
    plt.close()
class PDF(FPDF):
    """PDF layout with light-blue bands along both page edges and a
    centered page-number footer."""

    def header(self):
        # Draw a 10 mm filled band down the left and right edges of an
        # A4 page (297 mm tall).
        self.set_fill_color(200, 220, 255)
        for band_x in (0, 200):
            self.rect(band_x, 0, 10, 297, 'F')

    def footer(self):
        # Centered italic page number, 15 mm up from the bottom edge.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
def _save_word_frequencies(word_freq, title):
    """Write the word/frequency table to word_freq_{title}.csv and return its path."""
    word_freq_file = f"word_freq_{title}.csv"
    # newline='' is required by the csv module (otherwise blank rows
    # appear on Windows); utf-8 keeps accented Spanish words intact.
    with open(word_freq_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'frequency'])
        for word, freq in word_freq.items():
            writer.writerow([word, freq])
    return word_freq_file


def add_text_to_pdf(pdf, text, title):
    """Append two pages to *pdf* for the given Spanish *text*.

    Page 1: *title* plus a word-cloud image of the text's content words.
    Page 2: a table with example corpus sentences for the 20 most
    frequent words.

    Side effects: writes word_freq_{title}.csv and wordcloud_{title}.png
    in the working directory.
    """
    filtered_words = preprocess_text(text)
    word_freq = Counter(filtered_words)
    _save_word_frequencies(word_freq, title)

    cloud_filename = f'wordcloud_{title}.png'
    generate_random_style_cloud(filtered_words, cloud_filename)

    # --- Page 1: title and word cloud ---
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, title, ln=True, align='C')
    pdf.set_draw_color(0, 0, 0)
    pdf.set_line_width(0.5)
    pdf.line(10, 25, 200, 25)
    pdf.image(cloud_filename, x=15, y=30, w=180)

    # --- Page 2: example sentences for the most frequent words ---
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, "Oraciones de ejemplo", ln=True, align='C')

    high_freq_words = sorted([word.upper() for word, freq in word_freq.most_common(20)])

    # Table header row.
    pdf.set_font('Arial', 'B', 12)
    pdf.set_fill_color(200, 200, 200)
    pdf.cell(90, 10, 'PALABRA', 1, fill=True)
    pdf.cell(0, 10, 'ORACIÓN DE EJEMPLO', 1, fill=True)
    pdf.ln()
    pdf.set_font('Arial', '', 12)
    pdf.set_line_width(0.1)
    for word in high_freq_words:
        example_sent = get_example_sentences(word.lower())
        if not example_sent:
            # No example found in the corpora: skip the row entirely.
            continue
        # Mark the target word (lower- and upper-case occurrences) with
        # ** delimiters in the example sentence.
        example_sentence = example_sent[0].replace(word.lower(), f'**{word}**').replace(word, f'**{word}**')
        pdf.cell(90, 10, word, 1)
        pdf.set_font('Arial', '', 10)
        pdf.multi_cell(0, 10, example_sentence, 1)
        pdf.set_font('Arial', 'I', 8)
        pdf.cell(90, 10, '', 0)
        pdf.cell(0, 10, 'Fuente: NLTK', 0)
        pdf.set_font('Arial', '', 12)
        pdf.ln()
def create_pdf_from_text(text, title):
    """Build the word-cloud PDF for *text* and return its filename.

    Entry point for the Gradio interface. *title* comes straight from
    untrusted web input, so it is sanitized before being used as part
    of the output filename (path separators etc. are replaced).
    """
    pdf = PDF()
    add_text_to_pdf(pdf, text, title)
    # Keep only word characters, hyphens and spaces in the filename;
    # fall back to a default when nothing safe remains.
    safe_title = re.sub(r'[^\w\- ]', '_', title).strip() or 'documento'
    pdf_filename = f'{safe_title}.pdf'
    pdf.output(pdf_filename)
    return pdf_filename
# Gradio interface: Spanish text + a title in, a downloadable PDF out.
# NOTE: the gr.inputs / gr.outputs namespaces were removed in Gradio 3.x;
# the top-level components (gr.Textbox, gr.File) are the supported API.
iface = gr.Interface(
    fn=create_pdf_from_text,
    inputs=[
        gr.Textbox(lines=10, label="Texto en Español"),
        gr.Textbox(label="Título"),
    ],
    outputs=gr.File(label="Descargar PDF"),
    title="Generador de PDFs con Nubes de Palabras",
)

if __name__ == "__main__":
    iface.launch()