import gradio as gr
import spacy
import pandas as pd
from docx import Document
import tempfile
import os
import multiprocessing as mp
import psutil
from datetime import datetime
from typing import List, Optional
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed


@dataclass
class ProcessingResult:
    """Outcome of processing a single .docx file."""
    filename: str
    names: List[str]
    status: str
    error: Optional[str] = None


class SystemMonitor:
    """Reports current CPU and RAM usage for display in the UI."""

    @staticmethod
    def get_status() -> str:
        cpu_usage = psutil.cpu_percent()
        memory = psutil.virtual_memory()
        return f"CPU: {cpu_usage}% | RAM: {memory.percent}% | Last update: {datetime.now().strftime('%H:%M:%S')}"


class TextProcessor:
    """Wraps a spaCy pipeline (Chinese transformer model by default) for
    person-name extraction."""

    def __init__(self, model_name: str = 'zh_core_web_trf'):
        self.nlp = spacy.load(model_name)
        self.max_chunk_size = 100000  # characters per chunk fed to spaCy

    def extract_names(self, text: str) -> List[str]:
        doc = self.nlp(text)
        return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    def split_text(self, text: str) -> List[str]:
        """Split text into chunks of at most max_chunk_size characters,
        breaking only on paragraph boundaries."""
        result = []
        current_chunk = []
        current_length = 0
        for paragraph in text.split('\n'):
            paragraph_length = len(paragraph) + 1  # +1 for the newline
            if current_length + paragraph_length <= self.max_chunk_size:
                current_chunk.append(paragraph)
                current_length += paragraph_length
            else:
                result.append('\n'.join(current_chunk))
                current_chunk = [paragraph]
                current_length = paragraph_length
        if current_chunk:
            result.append('\n'.join(current_chunk))
        return result
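
# Usage sketch for TextProcessor on its own (assumes the zh_core_web_trf
# model is installed via `python -m spacy download zh_core_web_trf`;
# `long_text` is a hypothetical input string):
#
#   processor = TextProcessor()
#   chunks = processor.split_text(long_text)
#   names = [n for chunk in chunks for n in processor.extract_names(chunk)]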


class DocumentProcessor:
    """Extracts person names from a .docx file, fanning text chunks out to a
    process pool."""

    def __init__(self, text_processor: TextProcessor):
        self.text_processor = text_processor
        self.num_processes = mp.cpu_count()

    def process_document(self, file_path: str, progress=None) -> ProcessingResult:
        try:
            if progress:
                progress(0.1, desc=f"Processing {os.path.basename(file_path)}...")

            # Load the document
            document = Document(file_path)
            text = ' '.join(para.text for para in document.paragraphs)

            if progress:
                progress(0.3, desc="Splitting text into chunks...")

            # Split the text into chunks
            fragments = self.text_processor.split_text(text)

            if progress:
                progress(0.5, desc="Extracting names...")

            # Process the chunks in parallel
            with mp.Pool(processes=self.num_processes) as pool:
                all_names = []
                for names in pool.imap(self.text_processor.extract_names, fragments):
                    all_names.extend(names)

            if progress:
                progress(0.8, desc="Finishing up...")

            return ProcessingResult(
                filename=os.path.basename(file_path),
                names=list(set(all_names)),
                status="success"
            )
        except Exception as e:
            return ProcessingResult(
                filename=os.path.basename(file_path),
                names=[],
                status="error",
                error=str(e)
            )
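
# Note: pool.imap must pickle the bound method, and with it the loaded spaCy
# pipeline, to ship it to each worker, which is expensive for a transformer
# model. A lighter alternative (a sketch, not a drop-in guarantee) is spaCy's
# built-in multiprocessing support in Language.pipe:
#
#   docs = self.text_processor.nlp.pipe(fragments, n_process=self.num_processes)
#   all_names = [ent.text for doc in docs
#                for ent in doc.ents if ent.label_ == 'PERSON']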


class ResultsExporter:
    """Flattens processing results into an Excel workbook."""

    @staticmethod
    def export_to_excel(results: List[ProcessingResult]) -> str:
        # Build a DataFrame with one row per (file, name) pair
        data = []
        for result in results:
            if result.names:
                for name in result.names:
                    data.append({
                        'File': result.filename,
                        'Name': name,
                        'Status': result.status,
                        'Error': result.error
                    })
            else:
                # Keep failed (or empty) files visible in the report;
                # they would otherwise be dropped silently
                data.append({
                    'File': result.filename,
                    'Name': None,
                    'Status': result.status,
                    'Error': result.error
                })
        df = pd.DataFrame(data)

        # Write to a temporary file
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, "extracted_names.xlsx")
        with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
            df.to_excel(writer, index=False)
        return temp_file_path
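
# Usage sketch (hypothetical result values):
#
#   path = ResultsExporter.export_to_excel(
#       [ProcessingResult("report.docx", ["张伟"], "success")])
#   # -> .../extracted_names.xlsx with columns File, Name, Status, Error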


class NameExtractorApp:
    def __init__(self):
        self.text_processor = TextProcessor()
        self.document_processor = DocumentProcessor(self.text_processor)
        self.system_monitor = SystemMonitor()
        self.results_exporter = ResultsExporter()

    def process_files(self, files: List[tempfile._TemporaryFileWrapper],
                      progress=gr.Progress()) -> str:
        # gr.Progress() as the default lets Gradio inject a live progress bar
        if progress:
            progress(0, desc="Starting processing...")

        results = []
        total_files = len(files)

        # Process files in parallel using a ThreadPoolExecutor
        # (max(1, ...) avoids max_workers=0 when the file list is empty)
        with ThreadPoolExecutor(max_workers=max(1, min(total_files, os.cpu_count() * 2))) as executor:
            future_to_file = {
                executor.submit(self.document_processor.process_document, file.name): file
                for file in files
            }
            for i, future in enumerate(as_completed(future_to_file)):
                result = future.result()
                results.append(result)
                if progress:
                    progress((i + 1) / total_files,
                             desc=f"Processed {i + 1} of {total_files} files...")

        if progress:
            progress(0.9, desc="Generating results file...")

        # Export the results
        output_file = self.results_exporter.export_to_excel(results)

        if progress:
            progress(1.0, desc="Processing complete!")
        return output_file
    def create_interface(self):
        with gr.Blocks() as demo:
            gr.Markdown("# Name Extractor - Parallel Processing")
            gr.Markdown("Upload one or more .docx files to extract person names using NLP.")

            # System status
            system_status = gr.Textbox(label="System Status", value="Initializing...")

            # Input and output
            file_input = gr.File(file_types=[".docx"], file_count="multiple")
            output_file = gr.File(label="Results file")

            # Process button
            process_btn = gr.Button("Process Documents")
            process_btn.click(
                fn=self.process_files,
                inputs=file_input,
                outputs=output_file
            )

            # Refresh the system status every 5 seconds
            demo.load(self.system_monitor.get_status, None, system_status, every=5)
        return demo


def main():
    app = NameExtractorApp()
    demo = app.create_interface()
    demo.launch()


if __name__ == "__main__":
    main()
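
# To run this app (a sketch; these are the usual PyPI package names and the
# standard spaCy model download command):
#
#   pip install gradio spacy pandas python-docx psutil openpyxl
#   python -m spacy download zh_core_web_trf
#   python app.py   # or whatever this script is saved as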