# NOTE: page-capture residue from the hosting site, kept only as a comment
# (it is not part of the program).
# Spaces: Sleeping
# File size: 4,470 Bytes
# Commit hashes shown alongside the listing: d697844, f22332c, 4d7fa61
import gradio as gr
from docx import Document
import os
import zipfile
import tempfile
def split_by_headers(file_path, headers_per_chunk=1):
    """Split a DOCX file into new documents holding a fixed number of headings.

    Paragraphs are copied in order (text and paragraph style name only). A
    paragraph counts as a heading when its style name starts with
    ``'Heading'``. Once the current chunk already holds ``headers_per_chunk``
    headings, the next heading opens a new chunk. Paragraphs that appear
    before the first heading stay in the first chunk.

    Args:
        file_path: Value passed straight to ``Document`` to open the source.
        headers_per_chunk: Maximum number of headings per output document.

    Returns:
        A list of new ``Document`` objects in source order; empty when the
        source has no paragraphs.
    """
    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    header_count = 0

    for paragraph in source.paragraphs:
        style_name = paragraph.style.name
        if style_name.startswith('Heading'):
            header_count += 1
            if header_count > headers_per_chunk:
                # Never emit a chunk that has no content yet.
                if current_chunk.paragraphs:
                    chunks.append(current_chunk)
                    current_chunk = Document()
                header_count = 1

        copied = current_chunk.add_paragraph(paragraph.text)
        try:
            copied.style = style_name
        except (KeyError, ValueError):
            # The fresh document only knows the default template's styles;
            # a custom style from the source cannot be resolved there, so
            # the paragraph deliberately keeps the default style instead of
            # aborting the whole split.
            pass

    if current_chunk.paragraphs:
        chunks.append(current_chunk)
    return chunks
def split_by_pages(file_path, pages_per_chunk=1, estimated_chars_per_page=3000):
    """Split a DOCX file into new documents of roughly N pages each.

    There is no real pagination here: a "page" is counted every time the
    running character total of the paragraph texts reaches
    ``estimated_chars_per_page``. The paragraph that completes the last page
    of a chunk is written to the *next* chunk. Only paragraph text and
    paragraph style name are copied.

    Args:
        file_path: Value passed straight to ``Document`` to open the source.
        pages_per_chunk: Number of estimated pages per output document.
        estimated_chars_per_page: Character count treated as one page.

    Returns:
        A list of new ``Document`` objects in source order; empty when the
        source has no paragraphs.
    """
    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    page_count = 0
    char_count = 0

    for paragraph in source.paragraphs:
        text = paragraph.text
        char_count += len(text)
        if char_count >= estimated_chars_per_page:
            page_count += 1
            char_count = 0
            if page_count >= pages_per_chunk:
                # A very long first paragraph would otherwise flush a chunk
                # with nothing in it and yield an empty output file.
                if current_chunk.paragraphs:
                    chunks.append(current_chunk)
                    current_chunk = Document()
                page_count = 0

        copied = current_chunk.add_paragraph(text)
        try:
            copied.style = paragraph.style.name
        except (KeyError, ValueError):
            # The fresh document only knows the default template's styles;
            # keep the default style rather than failing on a custom one.
            pass

    if current_chunk.paragraphs:
        chunks.append(current_chunk)
    return chunks
def save_chunks(chunks, original_filename, temp_dir):
    """Write each chunk to ``temp_dir`` and return the written paths.

    Output files are named ``<stem>_part<N>.docx``, where ``<stem>`` is the
    base name of ``original_filename`` without its last extension and ``N``
    counts from 1 in chunk order.

    Args:
        chunks: Iterable of objects exposing ``save(path)``.
        original_filename: Path or name of the source document.
        temp_dir: Directory the parts are written to.

    Returns:
        List of output paths, in the same order as ``chunks``.
    """
    stem, _extension = os.path.splitext(os.path.basename(original_filename))
    written = []
    part_number = 0
    for part in chunks:
        part_number += 1
        target = os.path.join(temp_dir, f"{stem}_part{part_number}.docx")
        part.save(target)
        written.append(target)
    return written
def create_zip_file(file_paths, zip_path):
    """Bundle the given files into a new ZIP archive at ``zip_path``.

    Each file is stored under its base name only (no directory components),
    using the ``zipfile`` default storage method.

    Args:
        file_paths: Iterable of paths to add to the archive.
        zip_path: Destination path of the archive; overwritten if present.

    Returns:
        ``zip_path``, unchanged.
    """
    archive = zipfile.ZipFile(zip_path, mode='w')
    with archive:
        for source in file_paths:
            entry_name = os.path.basename(source)
            archive.write(source, arcname=entry_name)
    return zip_path
def process_document(file, split_type, headers_or_pages, download_type):
    """Split an uploaded DOCX file and return the result for download.

    Args:
        file: The uploaded file; either an object whose ``name`` attribute is
            the path on disk, or the path itself as a string.
        split_type: ``"Encabezados"`` splits by headings; any other value
            splits by estimated pages.
        headers_or_pages: Headings or pages per output part; must be >= 1.
        download_type: ``"ZIP"`` returns a single archive; any other value
            returns the individual part files.

    Returns:
        A ``(output, status)`` tuple. ``output`` is the ZIP path, the list of
        part paths, or ``None`` on invalid input or failure. ``status`` is the
        user-facing message.
    """
    import shutil  # local import: only needed for cleanup on failure

    # The number field yields None when it is cleared; comparing None with an
    # int would raise outside the try block below.
    if headers_or_pages is None or headers_or_pages < 1:
        return None, "Por favor, especifique un número positivo de encabezados o páginas por fragmento."
    if file is None:
        return None, "Error al procesar el documento: no se ha seleccionado ningún archivo."

    temp_dir = None
    try:
        # Create the temporary directory that will hold the output files.
        temp_dir = tempfile.mkdtemp()
        source_path = getattr(file, "name", file)

        # Split the document with the selected method.
        if split_type == "Encabezados":
            chunks = split_by_headers(source_path, headers_or_pages)
        else:  # pages
            chunks = split_by_pages(source_path, headers_or_pages)

        # Write the chunks into the temporary directory.
        saved_files = save_chunks(chunks, source_path, temp_dir)

        if download_type == "ZIP":
            zip_path = os.path.join(temp_dir, "documentos_divididos.zip")
            create_zip_file(saved_files, zip_path)
            return zip_path, f"Documento dividido en {len(saved_files)} partes y comprimido en ZIP."
        # Individual files.
        return saved_files, f"Documento dividido en {len(saved_files)} partes."
    except Exception as e:
        # Nothing will be served from the directory after a failure, so
        # remove it here instead of leaking it. On success it must stay in
        # place because the returned paths point into it.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, f"Error al procesar el documento: {str(e)}"
# Gradio interface: widgets are created in display order, so the statement
# order below defines the page layout.
with gr.Blocks() as iface:
    gr.Markdown("# Divisor de Documentos DOCX")

    # First row: source file and split method.
    with gr.Row():
        file_input = gr.File(label="Seleccione el archivo DOCX")
        split_type = gr.Radio(
            ["Encabezados", "Páginas"],
            label="Método de división",
            value="Encabezados"
        )

    # Second row: chunk size and download format.
    with gr.Row():
        headers_pages = gr.Number(
            value=1,
            label="Número de encabezados/páginas por fragmento",
            minimum=1
        )
        download_type = gr.Radio(
            ["Individual", "ZIP"],
            label="Tipo de descarga",
            value="ZIP"
        )

    process_btn = gr.Button("Procesar Documento")
    output_text = gr.Text(label="Estado")
    file_output = gr.File(label="Archivos Procesados")

    # process_document returns (file output, status message), matching the
    # order of the outputs list.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, split_type, headers_pages, download_type],
        outputs=[file_output, output_text]
    )

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()