# Source extraction artifact removed (file-size header; commit 03b1bed).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import nltk
from nltk.corpus import cess_esp, conll2002
from nltk.tokenize import word_tokenize
import stylecloud
import matplotlib.pyplot as plt
from fpdf import FPDF
import re
from collections import Counter
import spacy
import random
import csv

# Fetch the NLTK data this script relies on (tokenizer, stopwords, corpora).
for _resource in ('punkt', 'stopwords', 'cess_esp', 'conll2002'):
    nltk.download(_resource)

# Spanish spaCy pipeline used for lemmatization and POS tagging.
nlp = spacy.load('es_core_news_md')

# Extra stopwords filtered out in addition to NLTK's Spanish list.
additional_stopwords = []

# Only content words are kept when building the word cloud / frequency table.
_CONTENT_POS = frozenset({'VERB', 'ADJ', 'NOUN'})

def preprocess_text(text):
    """Lowercase *text*, strip non-word characters, lemmatize with spaCy,
    and return the lemmas of content words (verbs, adjectives, nouns)
    that are not Spanish stopwords.

    Args:
        text: Raw Spanish input text.

    Returns:
        list[str]: Filtered lemmas, in order of appearance.
    """
    text = re.sub(r'\W', ' ', text.lower())
    doc = nlp(text)
    # Stopword set combines NLTK's list with the module-level extras.
    stop_words = set(nltk.corpus.stopwords.words('spanish')).union(additional_stopwords)
    return [
        token.lemma_
        for token in doc
        if token.text not in stop_words and token.pos_ in _CONTENT_POS
    ]

# Obtain example sentences from multiple NLTK corpora
def get_example_sentences(word, num_sentences=1):
    """Search the cess_esp and conll2002 corpora for sentences containing
    *word* and return up to *num_sentences* of them, each joined into a
    single string.

    Args:
        word: Token to look for (exact match against corpus tokens).
        num_sentences: Maximum number of sentences to return.

    Returns:
        list[str]: Matching sentences (possibly empty).
    """
    sentences = []
    # Guard clauses: the original re-tested len(word) > 1 for every
    # sentence even though it is loop-invariant; num_sentences <= 0
    # short-circuits exactly as the original's post-iteration check did.
    if num_sentences <= 0 or len(word) <= 1:
        return sentences
    for corpus in (cess_esp, conll2002):
        for sent in corpus.sents():
            if word in sent:
                sentences.append(' '.join(sent))
                if len(sentences) >= num_sentences:
                    return sentences
    return sentences

# Generate a word cloud with a randomly chosen shape
def generate_random_style_cloud(words, filename):
    """Render *words* as a stylecloud PNG saved to *filename*, using a
    randomly picked Font Awesome icon as the cloud shape, then display
    the resulting image with matplotlib."""
    icon_choices = (
        'fas fa-cloud', 'fas fa-star', 'fas fa-heart',
        'fas fa-tree', 'fas fa-sun', 'fas fa-moon',
    )
    stylecloud.gen_stylecloud(
        text=' '.join(words),
        icon_name=random.choice(icon_choices),
        output_name=filename,
    )
    # Load the generated file and show it (useful when running locally).
    rendered = plt.imread(filename)
    plt.imshow(rendered)
    plt.axis('off')
    plt.show()

# PDF document class
class PDF(FPDF):
    """FPDF subclass that draws light-blue bands down both page edges
    and a centered page number in the footer."""

    def header(self):
        # Vertical decorative bands at the left and right edges (A4: 297mm tall).
        self.set_fill_color(200, 220, 255)
        for band_x in (0, 200):
            self.rect(band_x, 0, 10, 297, 'F')

    def footer(self):
        # Page number, 15mm from the bottom, centered.
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

def add_text_to_pdf(pdf, text, title):
    """Add two pages to *pdf*: one with a word cloud built from the
    preprocessed *text*, and one with a table of example sentences for
    the 20 most frequent words.

    Side effects: writes ``word_freq_<title>.csv`` and
    ``wordcloud_<title>.png`` into the working directory.

    Args:
        pdf: An open PDF/FPDF instance to append pages to.
        text: Raw Spanish input text.
        title: Section title; also embedded in the artifact filenames.
    """
    filtered_words = preprocess_text(text)
    word_freq = Counter(filtered_words)

    # Dump the frequency table. newline='' is required by the csv module
    # (avoids blank rows on Windows); utf-8 handles accented Spanish words.
    word_freq_file = f"word_freq_{title}.csv"
    with open(word_freq_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'frequency'])
        writer.writerows(word_freq.items())

    cloud_filename = f'wordcloud_{title}.png'
    generate_random_style_cloud(filtered_words, cloud_filename)

    # Page 1: title, separator line, word-cloud image.
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, title, ln=True, align='C')
    pdf.set_draw_color(0, 0, 0)
    pdf.set_line_width(0.5)
    pdf.line(10, 25, 200, 25)
    pdf.image(cloud_filename, x=15, y=30, w=180)

    # Page 2: example-sentence table for the most frequent words.
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, "Oraciones de ejemplo", ln=True, align='C')

    high_freq_words = sorted(word.upper() for word, _freq in word_freq.most_common(20))

    pdf.set_font('Arial', 'B', 12)
    pdf.set_fill_color(200, 200, 200)
    pdf.cell(90, 10, 'PALABRA', 1, fill=True)
    pdf.cell(0, 10, 'ORACIÓN DE EJEMPLO', 1, fill=True)
    pdf.ln()

    pdf.set_font('Arial', '', 12)
    pdf.set_line_width(0.1)
    for word in high_freq_words:
        example_sent = get_example_sentences(word.lower())
        # Guard clause replaces the original's redundant `else: continue`.
        if not example_sent:
            continue
        # Mark the matched word in both its lowercase and upper-cased forms.
        example_sentence = (
            example_sent[0]
            .replace(word.lower(), f'**{word}**')
            .replace(word, f'**{word}**')
        )
        pdf.cell(90, 10, word, 1)
        pdf.set_font('Arial', '', 10)
        pdf.multi_cell(0, 10, example_sentence, 1)
        pdf.set_font('Arial', 'I', 8)
        pdf.cell(90, 10, '', 0)
        pdf.cell(0, 10, 'Fuente: NLTK', 0)
        pdf.set_font('Arial', '', 12)
        pdf.ln()

# Main entry point for the Gradio interface
def create_pdf_from_text(text, title):
    """Build the PDF for *text*/*title* and return its filename so
    Gradio can serve the file back to the user.

    The title is sanitized before being used as the output filename so
    path separators or other unsafe characters cannot escape the working
    directory or produce an invalid path.
    """
    pdf = PDF()
    add_text_to_pdf(pdf, text, title)
    # Keep word chars, hyphens and spaces; everything else becomes '_'.
    safe_title = re.sub(r'[^\w\- ]', '_', title).strip() or 'documento'
    pdf_filename = f'{safe_title}.pdf'
    pdf.output(pdf_filename)
    return pdf_filename

# Gradio UI: text + title in, downloadable PDF out.
# NOTE: gr.inputs.* / gr.outputs.* were removed in Gradio 3.x; components
# are now exposed at the top level (gr.Textbox, gr.File).
iface = gr.Interface(
    fn=create_pdf_from_text,
    inputs=[
        gr.Textbox(lines=10, label="Texto en Español"),
        gr.Textbox(label="Título"),
    ],
    outputs=gr.File(label="Descargar PDF"),
    title="Generador de PDFs con Nubes de Palabras",
)

if __name__ == "__main__":
    iface.launch()