lik07 committed on
Commit
f22332c
·
verified ·
1 Parent(s): 4d7fa61

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -53
app.py CHANGED
@@ -1,9 +1,8 @@
1
  import gradio as gr
2
  from docx import Document
3
  import os
4
-
5
- from docx import Document
6
- from docx.oxml import CT_P
7
 
8
  def split_by_headers(file_path, headers_per_chunk=1):
9
  doc = Document(file_path)
@@ -11,19 +10,17 @@ def split_by_headers(file_path, headers_per_chunk=1):
11
  current_chunk = Document()
12
  header_count = 0
13
 
14
- for element in doc.element.body:
15
- if isinstance(element, CT_P):
16
- paragraph = element
17
- if any(style.val.startswith('Heading') for style in paragraph.xpath('.//w:pStyle')):
18
- header_count += 1
19
- if header_count > headers_per_chunk:
20
- chunks.append(current_chunk)
21
- current_chunk = Document()
22
- header_count = 1
23
-
24
- current_chunk.element.body.append(element)
25
 
26
- if len(current_chunk.element.body):
27
  chunks.append(current_chunk)
28
 
29
  return chunks
@@ -33,65 +30,113 @@ def split_by_pages(file_path, pages_per_chunk=1):
33
  chunks = []
34
  current_chunk = Document()
35
  page_count = 0
36
- estimated_chars_per_page = 3000 # This is an estimation
37
  char_count = 0
38
 
39
- for element in doc.element.body:
40
- if element.tag.endswith('p'):
41
- text = element.text
42
- char_count += len(text)
 
 
 
43
 
44
- if char_count >= estimated_chars_per_page:
45
- page_count += 1
46
- char_count = 0
47
-
48
- if page_count >= pages_per_chunk:
49
- chunks.append(current_chunk)
50
- current_chunk = Document()
51
- page_count = 0
52
-
53
- current_chunk.element.body.append(element)
54
 
55
- if len(current_chunk.element.body):
56
  chunks.append(current_chunk)
57
 
58
  return chunks
59
 
60
- def save_chunks(chunks, original_filename):
61
  saved_files = []
62
- base_name = os.path.splitext(original_filename)[0]
63
 
64
  for i, chunk in enumerate(chunks, 1):
65
- output_path = f"{base_name}_part{i}.docx"
66
  chunk.save(output_path)
67
  saved_files.append(output_path)
68
 
69
  return saved_files
70
 
71
- def process_document(file, split_type, headers_or_pages):
 
 
 
 
 
 
72
  if headers_or_pages < 1:
73
- return "Por favor, especifique un número positivo de encabezados o páginas por fragmento."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- if split_type == "Encabezados":
76
- chunks = split_by_headers(file.name, headers_or_pages)
77
- else: # Páginas
78
- chunks = split_by_pages(file.name, headers_or_pages)
79
 
80
- saved_files = save_chunks(chunks, os.path.basename(file.name))
81
- return f"Documento dividido en {len(saved_files)} partes: {', '.join(saved_files)}"
 
 
82
 
83
  # Interfaz Gradio
84
- iface = gr.Interface(
85
- fn=process_document,
86
- inputs=[
87
- gr.File(label="Seleccione el archivo DOCX"),
88
- gr.Radio(["Encabezados", "Páginas"], label="Método de división"),
89
- gr.Number(value=1, label="Número de encabezados/páginas por fragmento", minimum=1)
90
- ],
91
- outputs=gr.Text(label="Resultado"),
92
- title="Divisor de Documentos DOCX",
93
- description="Divida documentos DOCX por encabezados o páginas estimadas"
94
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  if __name__ == "__main__":
97
  iface.launch()
 
1
  import gradio as gr
2
  from docx import Document
3
  import os
4
+ import zipfile
5
+ import tempfile
 
6
 
7
  def split_by_headers(file_path, headers_per_chunk=1):
8
  doc = Document(file_path)
 
10
  current_chunk = Document()
11
  header_count = 0
12
 
13
+ for paragraph in doc.paragraphs:
14
+ if paragraph.style.name.startswith('Heading'):
15
+ header_count += 1
16
+ if header_count > headers_per_chunk:
17
+ chunks.append(current_chunk)
18
+ current_chunk = Document()
19
+ header_count = 1
20
+
21
+ current_chunk.add_paragraph(paragraph.text, style=paragraph.style.name)
 
 
22
 
23
+ if len(current_chunk.paragraphs):
24
  chunks.append(current_chunk)
25
 
26
  return chunks
 
30
  chunks = []
31
  current_chunk = Document()
32
  page_count = 0
33
+ estimated_chars_per_page = 3000
34
  char_count = 0
35
 
36
+ for paragraph in doc.paragraphs:
37
+ text = paragraph.text
38
+ char_count += len(text)
39
+
40
+ if char_count >= estimated_chars_per_page:
41
+ page_count += 1
42
+ char_count = 0
43
 
44
+ if page_count >= pages_per_chunk:
45
+ chunks.append(current_chunk)
46
+ current_chunk = Document()
47
+ page_count = 0
48
+
49
+ current_chunk.add_paragraph(text, style=paragraph.style.name)
 
 
 
 
50
 
51
+ if len(current_chunk.paragraphs):
52
  chunks.append(current_chunk)
53
 
54
  return chunks
55
 
56
def save_chunks(chunks, original_filename, temp_dir):
    """Write every chunk to ``<base>_part<N>.docx`` inside *temp_dir*.

    Args:
        chunks: Iterable of objects exposing ``save(path)``; the callers in
            this file pass python-docx ``Document`` instances.
        original_filename: Name or path of the source document. Only its
            base name, without directory and extension, is used to build
            the output file names.
        temp_dir: Directory that receives the files. It is created when it
            does not exist yet.

    Returns:
        The list of written paths in chunk order; part numbers start at 1.
    """
    base_name = os.path.splitext(os.path.basename(original_filename))[0]

    # Saving into a missing directory would fail on the first chunk, so make
    # sure the target exists before writing anything.
    os.makedirs(temp_dir, exist_ok=True)

    saved_files = []
    for i, chunk in enumerate(chunks, 1):
        output_path = os.path.join(temp_dir, f"{base_name}_part{i}.docx")
        chunk.save(output_path)
        saved_files.append(output_path)

    return saved_files
66
 
67
def create_zip_file(file_paths, zip_path):
    """Bundle *file_paths* into a compressed ZIP archive at *zip_path*.

    Each file is stored under its base name only, so the archive is flat
    regardless of where the inputs live on disk.

    Args:
        file_paths: Iterable of paths of existing files to add.
        zip_path: Destination path of the archive; an existing file at that
            path is overwritten.

    Returns:
        ``zip_path``, unchanged, for convenient chaining.
    """
    # ZipFile defaults to ZIP_STORED (no compression); request DEFLATE
    # explicitly so the archive is actually compressed.
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
        for file_path in file_paths:
            zipf.write(file_path, arcname=os.path.basename(file_path))
    return zip_path
72
+
73
def process_document(file, split_type, headers_or_pages, download_type):
    """Split an uploaded DOCX file and return the result for download.

    Args:
        file: The uploaded document, either an object exposing the path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.
        split_type: ``"Encabezados"`` splits by headings; any other value
            splits by estimated pages.
        headers_or_pages: Number of headings/pages per chunk; must be >= 1.
        download_type: ``"ZIP"`` returns a single archive; any other value
            returns the individual files.

    Returns:
        A ``(files, status)`` tuple matching the ``[file_output,
        output_text]`` outputs of the UI. ``files`` is the ZIP path, the
        list of chunk paths, or ``None`` when validation or processing
        failed; ``status`` is the message shown to the user.
    """
    import shutil  # local import: only needed to clean up after a failure

    # An emptied number field arrives as None; comparing None < 1 would raise
    # before any error handling, so treat it as invalid input.
    if headers_or_pages is None or headers_or_pages < 1:
        return None, "Por favor, especifique un número positivo de encabezados o páginas por fragmento."

    if file is None:
        return None, "Por favor, seleccione un archivo DOCX."

    # Accept both upload representations: a path string or an object whose
    # ``.name`` holds the path.
    file_path = file if isinstance(file, str) else file.name

    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp()

        if split_type == "Encabezados":
            chunks = split_by_headers(file_path, headers_or_pages)
        else:  # pages
            chunks = split_by_pages(file_path, headers_or_pages)

        saved_files = save_chunks(chunks, file_path, temp_dir)

        if download_type == "ZIP":
            zip_path = os.path.join(temp_dir, "documentos_divididos.zip")
            create_zip_file(saved_files, zip_path)
            return zip_path, f"Documento dividido en {len(saved_files)} partes y comprimido en ZIP."

        # Individual files. On success the directory is kept on purpose: the
        # returned paths point into it.
        return saved_files, f"Documento dividido en {len(saved_files)} partes."

    except Exception as e:
        # Nothing will be downloaded, so do not leave the partial output behind.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, f"Error al procesar el documento: {str(e)}"
105
 
106
# Gradio interface: two rows of inputs, a trigger button, then the status
# text and the downloadable result.
with gr.Blocks() as iface:
    gr.Markdown("# Divisor de Documentos DOCX")

    # Row 1: document upload and splitting strategy.
    with gr.Row():
        file_input = gr.File(label="Seleccione el archivo DOCX")
        split_type = gr.Radio(
            choices=["Encabezados", "Páginas"],
            value="Encabezados",
            label="Método de división",
        )

    # Row 2: chunk size and how the result is delivered.
    with gr.Row():
        headers_pages = gr.Number(
            label="Número de encabezados/páginas por fragmento",
            minimum=1,
            value=1,
        )
        download_type = gr.Radio(
            choices=["Individual", "ZIP"],
            value="ZIP",
            label="Tipo de descarga",
        )

    process_btn = gr.Button("Procesar Documento")
    output_text = gr.Text(label="Estado")

    file_output = gr.File(label="Archivos Procesados")

    # process_document returns (files, status), in the same order as `outputs`.
    process_btn.click(
        fn=process_document,
        inputs=[file_input, split_type, headers_pages, download_type],
        outputs=[file_output, output_text],
    )


if __name__ == "__main__":
    iface.launch()