Spaces:
Build error
Build error
File size: 4,682 Bytes
03b1bed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import gradio as gr
import nltk
from nltk.corpus import cess_esp, conll2002
from nltk.tokenize import word_tokenize
import stylecloud
import matplotlib.pyplot as plt
from fpdf import FPDF
import re
from collections import Counter
import spacy
import random
import csv
# Download the NLTK resources the app needs (tokenizer models, Spanish
# stopwords, and the two Spanish corpora used for example sentences).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cess_esp')
nltk.download('conll2002')
# Load the medium Spanish spaCy model (used for lemmatization and POS tagging).
nlp = spacy.load('es_core_news_md')
# Extra stopwords to filter in addition to NLTK's Spanish list.
additional_stopwords = [
# You can add more stopwords here if desired
]
# Función de preprocesamiento
def preprocess_text(text):
    """Normalize Spanish text and return the lemmas of its content words.

    Lowercases the input, replaces non-word characters with spaces, then
    keeps the lemma of every verb, adjective and noun whose surface form
    is not a stopword.
    """
    normalized = re.sub(r'\W', ' ', text.lower())
    stopword_set = set(nltk.corpus.stopwords.words('spanish'))
    stopword_set.update(additional_stopwords)
    content_pos = {'VERB', 'ADJ', 'NOUN'}
    lemmas = []
    for token in nlp(normalized):
        if token.text in stopword_set:
            continue
        if token.pos_ in content_pos:
            lemmas.append(token.lemma_)
    return lemmas
# Obtener oraciones de ejemplo de múltiples corpus de nltk
def get_example_sentences(word, num_sentences=1):
    """Collect up to ``num_sentences`` corpus sentences containing ``word``.

    Searches the cess_esp and conll2002 NLTK corpora in order and joins each
    matching tokenized sentence back into a plain string.  Single-character
    words never match (same effect as the original inline check) and return
    an empty list immediately.
    """
    # The original tested `len(word) > 1` inside the inner loop although it
    # is loop-invariant; hoist it so short words never trigger a corpus scan.
    if len(word) <= 1:
        return []
    sentences = []
    for corpus in (cess_esp, conll2002):
        for sent in corpus.sents():
            if word in sent:
                sentences.append(' '.join(sent))
                # Early return replaces the original nested double-break.
                if len(sentences) >= num_sentences:
                    return sentences
    return sentences
# Función para generar la nube de palabras con estilo aleatorio
def generate_random_style_cloud(words, filename):
    """Render a word cloud for ``words`` into ``filename`` and preview it.

    A Font Awesome icon is picked at random to shape the cloud; the PNG is
    then displayed with matplotlib (axes hidden).
    """
    icon_choices = (
        'fas fa-cloud', 'fas fa-star', 'fas fa-heart',
        'fas fa-tree', 'fas fa-sun', 'fas fa-moon',
    )
    joined_text = ' '.join(words)
    stylecloud.gen_stylecloud(
        text=joined_text,
        icon_name=random.choice(icon_choices),
        output_name=filename,
    )
    # Show the freshly written image so the user gets immediate feedback.
    rendered = plt.imread(filename)
    plt.imshow(rendered)
    plt.axis('off')
    plt.show()
# Crear el documento PDF
class PDF(FPDF):
    """FPDF subclass with decorative side bars and a page-number footer."""

    def header(self):
        # Light-blue filled strips (10 mm wide, full A4 height) on both edges.
        self.set_fill_color(200, 220, 255)
        for x_pos in (0, 200):
            self.rect(x_pos, 0, 10, 297, 'F')

    def footer(self):
        # Centered page number 15 mm above the bottom edge.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
def add_text_to_pdf(pdf, text, title):
    """Append a word-cloud page and an example-sentence table to ``pdf``.

    Side effects: writes ``word_freq_<title>.csv`` (word frequencies) and
    ``wordcloud_<title>.png`` (the cloud image) in the working directory.

    :param pdf: a PDF/FPDF instance to add pages to (mutated in place)
    :param text: raw Spanish text to analyze
    :param title: title shown on the first page; also used in file names
    """
    filtered_words = preprocess_text(text)
    word_freq = Counter(filtered_words)

    # Persist the frequency table.  newline='' is required by the csv module
    # (otherwise blank rows appear on Windows); utf-8 keeps accented Spanish
    # words intact regardless of the platform's default encoding.
    word_freq_file = f"word_freq_{title}.csv"
    with open(word_freq_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'frequency'])
        writer.writerows(word_freq.items())

    # Page 1: title, separator rule, and the word-cloud image.
    cloud_filename = f'wordcloud_{title}.png'
    generate_random_style_cloud(filtered_words, cloud_filename)
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, title, ln=True, align='C')
    pdf.set_draw_color(0, 0, 0)
    pdf.set_line_width(0.5)
    pdf.line(10, 25, 200, 25)
    pdf.image(cloud_filename, x=15, y=30, w=180)

    # Page 2: the 20 most frequent words (alphabetical, uppercased) with one
    # example sentence each, laid out as a two-column table.
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, "Oraciones de ejemplo", ln=True, align='C')
    high_freq_words = sorted(word.upper() for word, _ in word_freq.most_common(20))
    pdf.set_font('Arial', 'B', 12)
    pdf.set_fill_color(200, 200, 200)
    pdf.cell(90, 10, 'PALABRA', 1, fill=True)
    pdf.cell(0, 10, 'ORACIÓN DE EJEMPLO', 1, fill=True)
    pdf.ln()
    pdf.set_font('Arial', '', 12)
    pdf.set_line_width(0.1)
    for word in high_freq_words:
        example_sent = get_example_sentences(word.lower())
        # Guard clause replaces the original dead `else: continue` branch;
        # words with no corpus hit produce no table row at all.
        if not example_sent:
            continue
        # NOTE(review): FPDF does not render Markdown, so the **...** markers
        # appear literally in the PDF; kept for output compatibility.
        example_sentence = example_sent[0].replace(word.lower(), f'**{word}**').replace(word, f'**{word}**')
        pdf.cell(90, 10, word, 1)
        pdf.set_font('Arial', '', 10)
        pdf.multi_cell(0, 10, example_sentence, 1)
        pdf.set_font('Arial', 'I', 8)
        pdf.cell(90, 10, '', 0)
        pdf.cell(0, 10, 'Fuente: NLTK', 0)
        pdf.set_font('Arial', '', 12)
        pdf.ln()
# Función principal para la interfaz de Gradio
def create_pdf_from_text(text, title):
    """Gradio callback: build the PDF for ``text`` and return its file name."""
    document = PDF()
    add_text_to_pdf(document, text, title)
    output_path = f'{title}.pdf'
    document.output(output_path)
    return output_path
# Gradio interface: Spanish text + title in, downloadable PDF out.
# The gr.inputs / gr.outputs namespaces were deprecated and then removed in
# Gradio 3.x; the top-level component classes are the supported spelling and
# the likely cause of the Space's build error.
iface = gr.Interface(
    fn=create_pdf_from_text,
    inputs=[
        gr.Textbox(lines=10, label="Texto en Español"),
        gr.Textbox(label="Título"),
    ],
    outputs=gr.File(label="Descargar PDF"),
    title="Generador de PDFs con Nubes de Palabras",
)

if __name__ == "__main__":
    iface.launch()
|