File size: 4,470 Bytes
d697844
 
 
f22332c
 
4d7fa61
d697844
 
 
 
 
 
f22332c
 
 
 
 
 
 
 
 
d697844
f22332c
d697844
 
 
 
 
 
 
 
 
f22332c
d697844
 
f22332c
 
 
 
 
 
 
d697844
f22332c
 
 
 
 
 
d697844
f22332c
d697844
 
 
 
f22332c
d697844
f22332c
d697844
 
f22332c
d697844
 
 
 
 
f22332c
 
 
 
 
 
 
d697844
f22332c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d697844
f22332c
 
d697844
f22332c
 
 
 
d697844
 
f22332c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d697844
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import shutil
import tempfile
import zipfile

import gradio as gr
from docx import Document

def split_by_headers(file_path, headers_per_chunk=1):
    """Split a DOCX file into new documents at heading boundaries.

    Paragraphs are copied in order into a fresh ``Document``. A paragraph
    whose style name starts with ``"Heading"`` counts as a header; once the
    current chunk already holds ``headers_per_chunk`` headers, the next
    header starts a new chunk. Paragraphs that come before the first header
    stay in the first chunk.

    Only ``paragraph.text`` and the paragraph style name are copied, so
    anything not reachable through ``doc.paragraphs`` (and any run-level
    formatting) is not carried over.

    Args:
        file_path: Value passed straight to ``Document(...)``.
        headers_per_chunk: Maximum number of headers per output document.

    Returns:
        A list of ``Document`` objects, one per chunk. The list is empty
        when the source has no paragraphs.
    """
    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    header_count = 0

    for paragraph in source.paragraphs:
        style = paragraph.style
        # Guard against a missing style object or an unnamed style.
        style_name = (style.name if style is not None else None) or ""

        if style_name.startswith('Heading'):
            header_count += 1
            if header_count > headers_per_chunk:
                chunks.append(current_chunk)
                current_chunk = Document()
                header_count = 1

        # Add the paragraph first and apply the style afterwards: looking up
        # a style that the blank template does not define raises KeyError,
        # and doing it inside add_paragraph() would abort the whole split.
        copied = current_chunk.add_paragraph(paragraph.text)
        if style_name:
            try:
                copied.style = style_name
            except (KeyError, ValueError):
                # Style unknown to the new document (or not a paragraph
                # style): keep the text with the default style.
                pass

    if current_chunk.paragraphs:
        chunks.append(current_chunk)

    return chunks

def split_by_pages(file_path, pages_per_chunk=1, chars_per_page=3000):
    """Split a DOCX file into new documents by estimated page count.

    The page count is a heuristic: characters of ``paragraph.text`` are
    accumulated, and every time the running total reaches ``chars_per_page``
    one page is counted and the total is reset to zero. When
    ``pages_per_chunk`` pages have been counted, the current chunk is closed
    and the paragraph that crossed the threshold becomes the first paragraph
    of the next chunk.

    Only ``paragraph.text`` and the paragraph style name are copied, so
    anything not reachable through ``doc.paragraphs`` (and any run-level
    formatting) is not carried over.

    Args:
        file_path: Value passed straight to ``Document(...)``.
        pages_per_chunk: Estimated pages per output document.
        chars_per_page: Number of characters treated as one page.

    Returns:
        A list of non-empty ``Document`` objects, one per chunk. The list is
        empty when the source has no paragraphs.

    Raises:
        ValueError: If ``chars_per_page`` is less than 1.
    """
    if chars_per_page < 1:
        raise ValueError("chars_per_page must be at least 1")

    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    chunk_has_content = False
    page_count = 0
    char_count = 0

    for paragraph in source.paragraphs:
        text = paragraph.text
        char_count += len(text)

        if char_count >= chars_per_page:
            page_count += 1
            char_count = 0

            if page_count >= pages_per_chunk:
                # Never emit an empty document: this happens when the
                # threshold is crossed before anything was copied, e.g. a
                # very long first paragraph.
                if chunk_has_content:
                    chunks.append(current_chunk)
                    current_chunk = Document()
                    chunk_has_content = False
                page_count = 0

        # Add the paragraph first and apply the style afterwards: looking up
        # a style that the blank template does not define raises KeyError,
        # and doing it inside add_paragraph() would abort the whole split.
        copied = current_chunk.add_paragraph(text)
        chunk_has_content = True

        style = paragraph.style
        style_name = (style.name if style is not None else None) or ""
        if style_name:
            try:
                copied.style = style_name
            except (KeyError, ValueError):
                # Style unknown to the new document (or not a paragraph
                # style): keep the text with the default style.
                pass

    if chunk_has_content:
        chunks.append(current_chunk)

    return chunks

def save_chunks(chunks, original_filename, temp_dir):
    """Write each chunk to ``temp_dir`` and return the paths that were written.

    Output files are named ``<stem>_part<N>.docx``, where ``<stem>`` is the
    base name of ``original_filename`` without its extension and ``N`` counts
    up from 1 in iteration order.

    Args:
        chunks: Iterable of objects exposing ``save(path)``.
        original_filename: Name or path the output file names derive from.
        temp_dir: Directory that receives the files.

    Returns:
        List of output paths, in the same order as ``chunks``.
    """
    stem, _extension = os.path.splitext(os.path.basename(original_filename))
    written = []
    part_number = 0

    for piece in chunks:
        part_number += 1
        target = os.path.join(temp_dir, f"{stem}_part{part_number}.docx")
        piece.save(target)
        written.append(target)

    return written

def create_zip_file(file_paths, zip_path):
    """Bundle the given files into a new ZIP archive at ``zip_path``.

    Each file is stored under its base name only, so the directory part of
    every input path is dropped inside the archive.

    Args:
        file_paths: Iterable of paths to add to the archive.
        zip_path: Destination path of the archive; opened in write mode.

    Returns:
        ``zip_path``, unchanged.
    """
    with zipfile.ZipFile(zip_path, mode='w') as archive:
        for source in file_paths:
            member_name = os.path.basename(source)
            archive.write(source, arcname=member_name)
    return zip_path

def process_document(file, split_type, headers_or_pages, download_type):
    """Split the uploaded DOCX and return the result for download.

    Args:
        file: Uploaded file; either an object exposing ``.name`` (the path
            on disk) or a plain path string. ``None`` when nothing was
            uploaded.
        split_type: ``"Encabezados"`` to split by headings; any other value
            splits by estimated pages.
        headers_or_pages: Headings or pages per chunk; must be a number
            greater than or equal to 1.
        download_type: ``"ZIP"`` to return a single archive; any other value
            returns the list of individual files.

    Returns:
        A ``(output, status_message)`` tuple. ``output`` is the ZIP path, the
        list of chunk paths, or ``None`` when validation or processing
        failed.
    """
    # The number field yields None when cleared; comparing None with an int
    # would raise TypeError, so reject non-numeric values explicitly.
    if not isinstance(headers_or_pages, (int, float)) or headers_or_pages < 1:
        return None, "Por favor, especifique un número positivo de encabezados o páginas por fragmento."

    if file is None:
        return None, "Por favor, seleccione un archivo DOCX."

    # Accept both upload objects exposing ``.name`` and plain path strings.
    source_path = getattr(file, "name", file)

    temp_dir = None
    try:
        # The directory is deliberately kept on success: the returned paths
        # live inside it and must still exist when they are downloaded.
        temp_dir = tempfile.mkdtemp()

        if split_type == "Encabezados":
            chunks = split_by_headers(source_path, headers_or_pages)
        else:  # Pages
            chunks = split_by_pages(source_path, headers_or_pages)

        saved_files = save_chunks(chunks, source_path, temp_dir)

        if download_type == "ZIP":
            zip_path = os.path.join(temp_dir, "documentos_divididos.zip")
            create_zip_file(saved_files, zip_path)
            return zip_path, f"Documento dividido en {len(saved_files)} partes y comprimido en ZIP."

        # Individual files
        return saved_files, f"Documento dividido en {len(saved_files)} partes."

    except Exception as e:
        # Top-level UI boundary: report the failure as a status message and
        # remove the partially filled directory, since nothing in it will be
        # returned to the caller.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, f"Error al procesar el documento: {str(e)}"

# Gradio interface: inputs on two rows, then the action button, the status
# text and the download area.
with gr.Blocks() as iface:
    gr.Markdown("# Divisor de Documentos DOCX")

    with gr.Row():
        # Restrict uploads to .docx, the only format the splitters open.
        file_input = gr.File(
            label="Seleccione el archivo DOCX",
            file_types=[".docx"]
        )
        split_type = gr.Radio(
            ["Encabezados", "Páginas"],
            label="Método de división",
            value="Encabezados"
        )

    with gr.Row():
        # precision=0 makes the component deliver whole numbers, so a
        # fractional "per chunk" count cannot be submitted.
        headers_pages = gr.Number(
            value=1,
            label="Número de encabezados/páginas por fragmento",
            minimum=1,
            precision=0
        )
        download_type = gr.Radio(
            ["Individual", "ZIP"],
            label="Tipo de descarga",
            value="ZIP"
        )

    process_btn = gr.Button("Procesar Documento")
    output_text = gr.Text(label="Estado")

    file_output = gr.File(label="Archivos Procesados")

    # process_document returns (files, status), matching the outputs order.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, split_type, headers_pages, download_type],
        outputs=[file_output, output_text]
    )

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()