File size: 3,842 Bytes
d61bee5 37944ec 02010f9 d61bee5 bfa2e5d d61bee5 02010f9 09a8de9 710722a 09a8de9 ace795f 63eb0e1 5f302e9 02010f9 ace795f 02010f9 ace795f 63eb0e1 ace795f 63eb0e1 09a8de9 2615d12 944732b 2615d12 02010f9 d61bee5 02010f9 d61bee5 02010f9 09a8de9 02010f9 d61bee5 2615d12 09a8de9 02010f9 d61bee5 09a8de9 02010f9 d61bee5 09a8de9 d61bee5 37944ec d61bee5 02010f9 d61bee5 02010f9 d61bee5 02010f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import gradio as gr
import spacy
import pandas as pd
from docx import Document
from io import BytesIO
import tempfile
import os
import multiprocessing as mp
import psutil
import time
from datetime import datetime
# Load the spaCy pipeline once at import time so every call shares it.
# NOTE(review): the original comment described this as the Spanish model, but
# 'zh_core_web_trf' is the name of spaCy's Chinese transformer pipeline --
# confirm which language is actually intended.
nlp = spacy.load('zh_core_web_trf')
def get_system_status():
    """Return a one-line, human-readable snapshot of host resource usage.

    Returns:
        str: CPU usage percentage, RAM usage percentage and the local
        wall-clock time (HH:MM:SS) at which the reading was taken.
    """
    cpu_usage = psutil.cpu_percent()
    memory = psutil.virtual_memory()
    timestamp = datetime.now().strftime('%H:%M:%S')
    # The label previously contained mojibake (UTF-8 bytes decoded with the
    # wrong codec); it is restored to the intended Spanish word "Último".
    return f"CPU: {cpu_usage}% | RAM: {memory.percent}% | Último update: {timestamp}"
def extract_names_from_text(text):
    """Return the person names spaCy finds in *text*.

    Args:
        text: Plain text to run through the module-level ``nlp`` pipeline.

    Returns:
        list[str]: The text of every entity labelled ``PERSON``, in the order
        the pipeline reports them. Duplicates are kept.
    """
    # Debug trace of the fragment size. The original wrote '/n/n' (literal
    # slashes) where the newline escape '\n\n' was intended.
    print(f'{len(text)}\n\n')
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
def split_text(text, max_length=100000):
    """Split *text* into chunks of at most *max_length* characters.

    Chunks are built from whole paragraphs (lines separated by ``'\\n'``)
    whenever possible; paragraphs sharing a chunk are re-joined with
    ``'\\n'``. A single paragraph longer than *max_length* is hard-cut into
    slices of *max_length* characters so that no chunk ever exceeds the limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk. Must be >= 1.

    Returns:
        list[str]: The chunks, in original order. An empty *text* yields
        ``['']``.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    if max_length < 1:
        # A non-positive limit would make the hard-cut step below loop forever.
        raise ValueError("max_length must be a positive integer")

    chunks = []
    current_chunk = []
    current_length = 0

    for paragraph in text.split('\n'):
        if len(paragraph) > max_length:
            # Oversized paragraph: cut it so downstream consumers never
            # receive a chunk above the limit.
            pieces = [
                paragraph[start:start + max_length]
                for start in range(0, len(paragraph), max_length)
            ]
        else:
            pieces = [paragraph]

        for piece in pieces:
            # +1 accounts for the newline used when joining pieces.
            piece_length = len(piece) + 1
            # Only flush when something has been accumulated; flushing an
            # empty accumulator used to emit a phantom '' chunk.
            if current_chunk and current_length + piece_length > max_length:
                chunks.append('\n'.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(piece)
            current_length += piece_length

    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    return chunks
def extract_names_from_fragments(fragments, processes=4):
    """Run name extraction over every fragment in a pool of worker processes.

    Args:
        fragments: Sequence of text fragments to analyse.
        processes: Upper bound on the number of worker processes. Defaults to
            4, the value that was previously hard-coded.

    Returns:
        list[list[str]]: One list of names per fragment, in the same order as
        *fragments*. Empty when *fragments* is empty.
    """
    if not fragments:
        # Nothing to do: avoid paying the cost of spawning a pool.
        return []

    # Never start more workers than there are fragments to process.
    worker_count = min(processes, len(fragments))
    with mp.Pool(processes=worker_count) as pool:
        return pool.map(extract_names_from_text, fragments)
def extract_names_from_docx(docx_file, progress=gr.Progress()):
    """Extract person names from a .docx file and write them to an Excel file.

    Args:
        docx_file: The uploaded document, passed straight to
            ``docx.Document``.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            kept as-is so the signature stays unchanged for Gradio.

    Returns:
        str: Path of a freshly created ``nombres_personas.xlsx`` inside a new
        temporary directory. The sheet has a single ``Nombres`` column with
        the unique names in order of first appearance.
    """
    progress(0, desc="Iniciando procesamiento...")

    # Load the DOCX file and collect the text of every paragraph.
    document = Document(docx_file)
    paragraphs = [para.text for para in document.paragraphs]
    progress(0.2, desc="Documento cargado, preparando texto...")

    # Join with newlines, not spaces: split_text() cuts on '\n', so a
    # space-joined document reached it as one giant paragraph and could not
    # be chunked at paragraph boundaries.
    text = '\n'.join(paragraphs)

    # Split the text into fragments.
    text_fragments = split_text(text)
    total_fragments = len(text_fragments)
    progress(0.3, desc=f"Texto dividido en {total_fragments} fragmentos...")

    # Extract the names from the fragments in parallel. The call returns only
    # once every fragment is done, so the loop below merges results and
    # advances the progress bar after the fact.
    all_persons = []
    for i, fragment_results in enumerate(extract_names_from_fragments(text_fragments)):
        all_persons.extend(fragment_results)
        progress(
            0.3 + 0.5 * (i + 1) / total_fragments,
            desc=f"Procesando fragmento {i+1} de {total_fragments}...",
        )

    # Remove duplicates while keeping first-appearance order; going through a
    # set() made the row order of the output vary between runs.
    all_persons = list(dict.fromkeys(all_persons))
    progress(0.9, desc="Preparando resultados...")

    # Build the DataFrame and save it as an Excel file in a temp directory.
    df = pd.DataFrame(all_persons, columns=['Nombres'])
    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "nombres_personas.xlsx")
    with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)

    # The leading character of this message was mojibake; restored to "¡".
    progress(1.0, desc="¡Procesamiento completado!")
    return temp_file_path
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Extractor de Nombres")
    gr.Markdown("Sube un archivo .docx y extrae los nombres de las personas usando NLP con SpaCy.")

    # System status component (keepalive)
    system_status = gr.Textbox(label="Estado del Sistema", value="Inicializando...")

    # Main components
    file_input = gr.File(file_types=[".docx"])
    output_file = gr.File(label="Archivo de resultados")

    # Process button
    process_btn = gr.Button("Procesar Documento")
    process_btn.click(fn=extract_names_from_docx, inputs=file_input, outputs=output_file)

    # Periodic refresh of the system status (every 5 seconds)
    demo.load(get_system_status, None, system_status, every=5)

# Start the application. The guard matters because this file uses
# multiprocessing: under the "spawn" start method each worker re-imports the
# main module, and an unguarded launch() would run again in every worker.
if __name__ == "__main__":
    demo.launch()