lik07 committed on
Commit c9954a1 (verified)
1 Parent(s): 02010f9

Update app.py

Files changed (1)
  1. app.py +187 -106
app.py CHANGED
@@ -2,116 +2,197 @@ import gradio as gr
  import spacy
  import pandas as pd
  from docx import Document
- from io import BytesIO
  import tempfile
  import os
  import multiprocessing as mp
  import psutil
- import time
  from datetime import datetime

- # Load the SpaCy model (note: 'zh_core_web_trf' is the Chinese transformer pipeline, not Spanish)
- nlp = spacy.load('zh_core_web_trf')
-
- def get_system_status():
-     cpu_usage = psutil.cpu_percent()
-     memory = psutil.virtual_memory()
-     return f"CPU: {cpu_usage}% | RAM: {memory.percent}% | Último update: {datetime.now().strftime('%H:%M:%S')}"
-
- def extract_names_from_text(text):
-     print(f'{len(text)}/n/n')
-     doc = nlp(text)
-     persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
-     return persons
-
- def split_text(text, max_length=100000):
-     result = []
-     current_chunk = []
-     current_length = 0
-     paragraphs = text.split('\n')
-
-     for paragraph in paragraphs:
-         paragraph_length = len(paragraph) + 1
-         if current_length + paragraph_length <= max_length:
-             current_chunk.append(paragraph)
-             current_length += paragraph_length
-         else:
              result.append('\n'.join(current_chunk))
-             current_chunk = [paragraph]
-             current_length = paragraph_length
-
-     if current_chunk:
-         result.append('\n'.join(current_chunk))
-
-     return result
-
- def extract_names_from_fragments(fragments):
-     with mp.Pool(processes=4) as pool:
-         results = pool.map(extract_names_from_text, fragments)
-     return results
-
- def extract_names_from_docx(docx_file, progress=gr.Progress()):
-     # Initialize progress variables
-     progress(0, desc="Iniciando procesamiento...")
-
-     # Load the DOCX file
-     document = Document(docx_file)
-     full_text = []
-     for para in document.paragraphs:
-         full_text.append(para.text)
-
-     progress(0.2, desc="Documento cargado, preparando texto...")
-
-     # Join all the text
-     text = ' '.join(full_text)
-
-     # Split the text into fragments
-     text_fragments = split_text(text)
-     progress(0.3, desc=f"Texto dividido en {len(text_fragments)} fragmentos...")
-
-     # Extract the names from each fragment in parallel
-     all_persons = []
-     for i, fragment_results in enumerate(extract_names_from_fragments(text_fragments)):
-         all_persons.extend(fragment_results)
-         progress((0.3 + (0.5 * (i+1)/len(text_fragments))),
-                  desc=f"Procesando fragmento {i+1} de {len(text_fragments)}...")
-
-     # Remove duplicates
-     all_persons = list(set(all_persons))
-     progress(0.9, desc="Preparando resultados...")
-
-     # Build a DataFrame
-     df = pd.DataFrame(all_persons, columns=['Nombres'])
-
-     # Create a temporary file for the Excel output
-     temp_dir = tempfile.mkdtemp()
-     temp_file_path = os.path.join(temp_dir, "nombres_personas.xlsx")
-
-     # Save the DataFrame to an Excel file
-     with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
-         df.to_excel(writer, index=False)
-
-     progress(1.0, desc="¡Procesamiento completado!")
-     return temp_file_path
-
- # Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("# Extractor de Nombres")
-     gr.Markdown("Sube un archivo .docx y extrae los nombres de las personas usando NLP con SpaCy.")
-
-     # System status component (keepalive)
-     system_status = gr.Textbox(label="Estado del Sistema", value="Inicializando...")
-
-     # Main components
-     file_input = gr.File(file_types=[".docx"])
-     output_file = gr.File(label="Archivo de resultados")
-
-     # Process button
-     process_btn = gr.Button("Procesar Documento")
-     process_btn.click(fn=extract_names_from_docx, inputs=file_input, outputs=output_file)
-
-     # Periodic system status refresh
-     demo.load(get_system_status, None, system_status, every=5)
-
- # Launch the application
- demo.launch()
+ from typing import List, Dict
+ from dataclasses import dataclass
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ @dataclass
+ class ProcessingResult:
+     filename: str
+     names: List[str]
+     status: str
+     error: str = None
+
+ class SystemMonitor:
+     @staticmethod
+     def get_status() -> str:
+         cpu_usage = psutil.cpu_percent()
+         memory = psutil.virtual_memory()
+         return f"CPU: {cpu_usage}% | RAM: {memory.percent}% | Último update: {datetime.now().strftime('%H:%M:%S')}"
+
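
Note: Dict is imported from typing but never used, and error: str = None is more precisely typed as Optional[str]. A minimal sketch of the stricter typing, purely illustrative and not part of the commit:

    from typing import List, Optional
    from dataclasses import dataclass

    @dataclass
    class ProcessingResult:
        filename: str
        names: List[str]
        status: str
        error: Optional[str] = None  # stays None unless an exception was recorded
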
+ class TextProcessor:
+     def __init__(self, model_name: str = 'zh_core_web_trf'):
+         self.nlp = spacy.load(model_name)
+         self.max_chunk_size = 100000
+
+     def extract_names(self, text: str) -> List[str]:
+         doc = self.nlp(text)
+         return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
+
+     def split_text(self, text: str) -> List[str]:
+         result = []
+         current_chunk = []
+         current_length = 0
+
+         for paragraph in text.split('\n'):
+             paragraph_length = len(paragraph) + 1
+             if current_length + paragraph_length <= self.max_chunk_size:
+                 current_chunk.append(paragraph)
+                 current_length += paragraph_length
+             else:
+                 result.append('\n'.join(current_chunk))
+                 current_chunk = [paragraph]
+                 current_length = paragraph_length
+
+         if current_chunk:
              result.append('\n'.join(current_chunk))
+
+         return result
+
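
Note: split_text caps chunks at 100,000 characters, comfortably below spaCy's default nlp.max_length of 1,000,000, but a single paragraph longer than the cap is still emitted as an oversized chunk (and, if it is the very first paragraph, an empty chunk gets appended ahead of it). Independently of the multiprocessing below, spaCy can also batch the chunks itself via nlp.pipe; a sketch of that variant (extract_names_batched is a hypothetical helper and batch_size=4 an arbitrary assumption):

    def extract_names_batched(self, fragments: List[str]) -> List[str]:
        # nlp.pipe streams Doc objects and batches the work internally
        names = []
        for doc in self.nlp.pipe(fragments, batch_size=4):
            names.extend(ent.text for ent in doc.ents if ent.label_ == 'PERSON')
        return names
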
+ class DocumentProcessor:
+     def __init__(self, text_processor: TextProcessor):
+         self.text_processor = text_processor
+         self.num_processes = mp.cpu_count()
+
+     def process_document(self, file_path: str, progress=None) -> ProcessingResult:
+         try:
+             if progress:
+                 progress(0.1, desc=f"Procesando {os.path.basename(file_path)}...")
+
+             # Load the document
+             document = Document(file_path)
+             text = ' '.join(para.text for para in document.paragraphs)
+
+             if progress:
+                 progress(0.3, desc="Dividiendo texto en fragmentos...")
+
+             # Split the text into fragments
+             fragments = self.text_processor.split_text(text)
+
+             if progress:
+                 progress(0.5, desc="Extrayendo nombres...")
+
+             # Process the fragments in parallel
+             with mp.Pool(processes=self.num_processes) as pool:
+                 all_names = []
+                 for names in pool.imap(self.text_processor.extract_names, fragments):
+                     all_names.extend(names)
+
+             if progress:
+                 progress(0.8, desc="Finalizando procesamiento...")
+
+             return ProcessingResult(
+                 filename=os.path.basename(file_path),
+                 names=list(set(all_names)),
+                 status="success"
+             )
+
+         except Exception as e:
+             return ProcessingResult(
+                 filename=os.path.basename(file_path),
+                 names=[],
+                 status="error",
+                 error=str(e)
+             )
+
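
Note: pool.imap(self.text_processor.extract_names, fragments) has to pickle the bound method, and with it the TextProcessor holding the loaded zh_core_web_trf pipeline, to ship work to the worker processes; with a transformer pipeline that is a heavy transfer, and a fresh pool is spun up for every document. One common alternative (a sketch only, not what this commit does; _worker_nlp, _init_worker and _extract_names_worker are hypothetical module-level helpers) is to load the model once per worker through a pool initializer:

    _worker_nlp = None

    def _init_worker(model_name: str) -> None:
        # Runs once in each worker process and loads the model there
        global _worker_nlp
        _worker_nlp = spacy.load(model_name)

    def _extract_names_worker(text: str) -> List[str]:
        doc = _worker_nlp(text)
        return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']

    # Inside process_document, the pool block would then become:
    # with mp.Pool(processes=self.num_processes,
    #              initializer=_init_worker,
    #              initargs=('zh_core_web_trf',)) as pool:
    #     for names in pool.imap(_extract_names_worker, fragments):
    #         all_names.extend(names)
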
+ class ResultsExporter:
+     @staticmethod
+     def export_to_excel(results: List[ProcessingResult]) -> str:
+         # Build a DataFrame with all the results
+         data = []
+         for result in results:
+             for name in result.names:
+                 data.append({
+                     'Archivo': result.filename,
+                     'Nombre': name,
+                     'Estado': result.status,
+                     'Error': result.error
+                 })
+
+         df = pd.DataFrame(data)
+
+         # Save to a temporary file
+         temp_dir = tempfile.mkdtemp()
+         temp_file_path = os.path.join(temp_dir, "nombres_extraidos.xlsx")
+
+         with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
+             df.to_excel(writer, index=False)
+
+         return temp_file_path
+
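
Note: rows are only appended per extracted name, so a document that failed (or simply produced no names) leaves no row at all in the Excel output, even though its ProcessingResult carries the status and error. If that information should survive export, a small variant (a sketch; build_rows is a hypothetical helper) is to emit a placeholder row for empty results:

    def build_rows(results: List[ProcessingResult]) -> List[Dict]:
        rows = []
        for result in results:
            if result.names:
                for name in result.names:
                    rows.append({'Archivo': result.filename, 'Nombre': name,
                                 'Estado': result.status, 'Error': result.error})
            else:
                # Keep failed or empty documents visible in the report
                rows.append({'Archivo': result.filename, 'Nombre': None,
                             'Estado': result.status, 'Error': result.error})
        return rows
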
+ class NameExtractorApp:
+     def __init__(self):
+         self.text_processor = TextProcessor()
+         self.document_processor = DocumentProcessor(self.text_processor)
+         self.system_monitor = SystemMonitor()
+         self.results_exporter = ResultsExporter()
+
+     def process_files(self, files: List[tempfile._TemporaryFileWrapper], progress=None) -> str:
+         if progress:
+             progress(0, desc="Iniciando procesamiento...")
+
+         results = []
+         total_files = len(files)
+
+         # Process the files in parallel with a ThreadPoolExecutor
+         with ThreadPoolExecutor(max_workers=min(total_files, os.cpu_count() * 2)) as executor:
+             future_to_file = {
+                 executor.submit(self.document_processor.process_document, file.name): file
+                 for file in files
+             }
+
+             for i, future in enumerate(as_completed(future_to_file)):
+                 result = future.result()
+                 results.append(result)
+                 if progress:
+                     progress((i + 1) / total_files,
+                              desc=f"Procesado {i + 1} de {total_files} archivos...")
+
+         if progress:
+             progress(0.9, desc="Generando archivo de resultados...")
+
+         # Export the results
+         output_file = self.results_exporter.export_to_excel(results)
+
+         if progress:
+             progress(1.0, desc="¡Procesamiento completado!")
+
+         return output_file
+
+     def create_interface(self):
+         with gr.Blocks() as demo:
+             gr.Markdown("# Extractor de Nombres - Procesamiento Paralelo")
+             gr.Markdown("Sube uno o varios archivos .docx para extraer nombres de personas usando NLP.")
+
+             # System status
+             system_status = gr.Textbox(label="Estado del Sistema", value="Inicializando...")
+
+             # Input and output
+             file_input = gr.File(file_types=[".docx"], multiple=True)
+             output_file = gr.File(label="Archivo de resultados")
+
+             # Process button
+             process_btn = gr.Button("Procesar Documentos")
+             process_btn.click(
+                 fn=self.process_files,
+                 inputs=file_input,
+                 outputs=output_file
+             )
+
+             # System status refresh
+             demo.load(self.system_monitor.get_status, None, system_status, every=5)
+
+         return demo
+
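
Note: multiple= is not among gr.File's documented parameters in the Gradio releases I am aware of; multi-file upload is selected with file_count="multiple", after which the click handler receives a list of file objects, which is what process_files already expects. Depending on the Gradio version, the every=5 refresh on demo.load may also need the queue enabled (demo.queue()) before launch. A sketch of the adjusted component, keeping the same file_types as the commit:

    file_input = gr.File(file_types=[".docx"], file_count="multiple")
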
+ def main():
+     app = NameExtractorApp()
+     demo = app.create_interface()
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
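
Note: both process_document and process_files now default progress=None, so Gradio never injects a tracker and the guarded progress(...) calls never fire from the UI; the previous version used progress=gr.Progress() as the default, which is how Gradio detects that it should pass one in. A minimal sketch of that pattern (a standalone illustration, not the committed method):

    import gradio as gr

    def process_files(files, progress=gr.Progress()):
        # Gradio sees the gr.Progress() default and injects a live tracker
        total = len(files)
        for i, _ in enumerate(files, start=1):
            progress(i / total, desc=f"Procesado {i} de {total} archivos...")
        return None
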