Spaces:

lik07
/

docx-spliter

Sleeping

App Files Community

lik07 committed on Oct 5, 2024

Commit

f22332c

verified ·

1 Parent(s): 4d7fa61

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -53

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import gradio as gr
 from docx import Document
 import os
-from docx import Document
-from docx.oxml import CT_P
 def split_by_headers(file_path, headers_per_chunk=1):
     doc = Document(file_path)
@@ -11,19 +10,17 @@ def split_by_headers(file_path, headers_per_chunk=1):
     current_chunk = Document()
     header_count = 0
-    for element in doc.element.body:
-        if isinstance(element, CT_P):
-            paragraph = element
-            if any(style.val.startswith('Heading') for style in paragraph.xpath('.//w:pStyle')):
-                header_count += 1
-                if header_count > headers_per_chunk:
-                    chunks.append(current_chunk)
-                    current_chunk = Document()
-                    header_count = 1
-            current_chunk.element.body.append(element)
-    if len(current_chunk.element.body):
         chunks.append(current_chunk)
     return chunks
@@ -33,65 +30,113 @@ def split_by_pages(file_path, pages_per_chunk=1):
     chunks = []
     current_chunk = Document()
     page_count = 0
-    estimated_chars_per_page = 3000  # This is an estimation
     char_count = 0
-    for element in doc.element.body:
-        if element.tag.endswith('p'):
-            text = element.text
-            char_count += len(text)
-            if char_count >= estimated_chars_per_page:
-                page_count += 1
-                char_count = 0
-                if page_count >= pages_per_chunk:
-                    chunks.append(current_chunk)
-                    current_chunk = Document()
-                    page_count = 0
-            current_chunk.element.body.append(element)
-    if len(current_chunk.element.body):
         chunks.append(current_chunk)
     return chunks
-def save_chunks(chunks, original_filename):
     saved_files = []
-    base_name = os.path.splitext(original_filename)[0]
     for i, chunk in enumerate(chunks, 1):
-        output_path = f"{base_name}_part{i}.docx"
         chunk.save(output_path)
         saved_files.append(output_path)
     return saved_files
-def process_document(file, split_type, headers_or_pages):
     if headers_or_pages < 1:
-        return "Por favor, especifique un número positivo de encabezados o páginas por fragmento."
-    if split_type == "Encabezados":
-        chunks = split_by_headers(file.name, headers_or_pages)
-    else:  # Páginas
-        chunks = split_by_pages(file.name, headers_or_pages)
-    saved_files = save_chunks(chunks, os.path.basename(file.name))
-    return f"Documento dividido en {len(saved_files)} partes: {', '.join(saved_files)}"
 # Interfaz Gradio
-iface = gr.Interface(
-    fn=process_document,
-    inputs=[
-        gr.File(label="Seleccione el archivo DOCX"),
-        gr.Radio(["Encabezados", "Páginas"], label="Método de división"),
-        gr.Number(value=1, label="Número de encabezados/páginas por fragmento", minimum=1)
-    ],
-    outputs=gr.Text(label="Resultado"),
-    title="Divisor de Documentos DOCX",
-    description="Divida documentos DOCX por encabezados o páginas estimadas"
-)
 if __name__ == "__main__":
     iface.launch()

 import gradio as gr
 from docx import Document
 import os
+import zipfile
+import tempfile
 def split_by_headers(file_path, headers_per_chunk=1):
     doc = Document(file_path)
     current_chunk = Document()
     header_count = 0
+    for paragraph in doc.paragraphs:
+        if paragraph.style.name.startswith('Heading'):
+            header_count += 1
+            if header_count > headers_per_chunk:
+                chunks.append(current_chunk)
+                current_chunk = Document()
+                header_count = 1
+        current_chunk.add_paragraph(paragraph.text, style=paragraph.style.name)
+    if len(current_chunk.paragraphs):
         chunks.append(current_chunk)
     return chunks
     chunks = []
     current_chunk = Document()
     page_count = 0
+    estimated_chars_per_page = 3000
     char_count = 0
+    for paragraph in doc.paragraphs:
+        text = paragraph.text
+        char_count += len(text)
+        if char_count >= estimated_chars_per_page:
+            page_count += 1
+            char_count = 0
+            if page_count >= pages_per_chunk:
+                chunks.append(current_chunk)
+                current_chunk = Document()
+                page_count = 0
+        current_chunk.add_paragraph(text, style=paragraph.style.name)
+    if len(current_chunk.paragraphs):
         chunks.append(current_chunk)
     return chunks
+def save_chunks(chunks, original_filename, temp_dir):
     saved_files = []
+    base_name = os.path.splitext(os.path.basename(original_filename))[0]
     for i, chunk in enumerate(chunks, 1):
+        output_path = os.path.join(temp_dir, f"{base_name}_part{i}.docx")
         chunk.save(output_path)
         saved_files.append(output_path)
     return saved_files
+def create_zip_file(file_paths, zip_path):
+    with zipfile.ZipFile(zip_path, 'w') as zipf:
+        for file_path in file_paths:
+            zipf.write(file_path, os.path.basename(file_path))
+    return zip_path
+def process_document(file, split_type, headers_or_pages, download_type):
     if headers_or_pages < 1:
+        return None, "Por favor, especifique un número positivo de encabezados o páginas por fragmento."
+    try:
+        # Crear directorio temporal
+        temp_dir = tempfile.mkdtemp()
+        # Procesar el documento
+        if split_type == "Encabezados":
+            chunks = split_by_headers(file.name, headers_or_pages)
+        else:  # Páginas
+            chunks = split_by_pages(file.name, headers_or_pages)
+        # Guardar chunks en el directorio temporal
+        saved_files = save_chunks(chunks, file.name, temp_dir)
+        if download_type == "ZIP":
+            # Crear archivo ZIP
+            zip_path = os.path.join(temp_dir, "documentos_divididos.zip")
+            create_zip_file(saved_files, zip_path)
+            return zip_path, f"Documento dividido en {len(saved_files)} partes y comprimido en ZIP."
+        else:  # Archivos individuales
+            return saved_files, f"Documento dividido en {len(saved_files)} partes."
+    except Exception as e:
+        return None, f"Error al procesar el documento: {str(e)}"
+    finally:
+        # Programar la limpieza del directorio temporal
+        # (Gradio se encargará de esto después de la descarga)
+        pass
 # Interfaz Gradio
+with gr.Blocks() as iface:
+    gr.Markdown("# Divisor de Documentos DOCX")
+    with gr.Row():
+        file_input = gr.File(label="Seleccione el archivo DOCX")
+        split_type = gr.Radio(
+            ["Encabezados", "Páginas"],
+            label="Método de división",
+            value="Encabezados"
+        )
+    with gr.Row():
+        headers_pages = gr.Number(
+            value=1,
+            label="Número de encabezados/páginas por fragmento",
+            minimum=1
+        )
+        download_type = gr.Radio(
+            ["Individual", "ZIP"],
+            label="Tipo de descarga",
+            value="ZIP"
+        )
+    process_btn = gr.Button("Procesar Documento")
+    output_text = gr.Text(label="Estado")
+    file_output = gr.File(label="Archivos Procesados")
+    process_btn.click(
+        fn=process_document,
+        inputs=[file_input, split_type, headers_pages, download_type],
+        outputs=[file_output, output_text]
+    )
 if __name__ == "__main__":
     iface.launch()