Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
app.py
CHANGED
@@ -1 +1,104 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import fitz
|
3 |
+
import re
|
4 |
+
import gradio as gr
|
5 |
+
import pandas as pd
|
6 |
+
import tempfile
|
7 |
+
import pytesseract
|
8 |
+
from PIL import Image
|
9 |
+
import io
|
10 |
+
|
11 |
+
# Reference intervals (min, max) used to flag a result as low or high.
# Keys are the exam labels passed to ``classificar``. Units are not recorded
# here -- NOTE(review): confirm they match the units printed in the reports.
faixas = {
    "HB": (12, 17), "HT": (36, 50), "GLI": (70, 99), "UREIA": (10, 50),
    "CR": (0.6, 1.3), "K+": (3.5, 5.5), "NA+": (135, 145), "TGO": (0, 40),
    "TGP": (0, 40), "ALB": (3.5, 5.0), "INR": (0.8, 1.2), "TAP": (10, 14),
    "TTP": (25, 35), "LAC": (0.5, 2.2), "PLT": (150000, 450000),
    "LEUCO": (4000, 11000),
}


def _para_numero(valor):
    """Convert a textual result such as ``"< 0.01"`` or ``"4.5K"`` to a float.

    Comparison signs are dropped and a trailing ``K``/``k`` multiplies the
    number by 1000.

    Raises:
        AttributeError, TypeError, ValueError: when *valor* is not a string
            or does not contain a parseable number.
    """
    texto = valor.replace(">", "").replace("<", "").strip()
    multiplicador = 1
    if texto[-1:] in ("K", "k"):
        # Scale numerically instead of appending "000": textual substitution
        # turns "4.5K" into "4.5000", i.e. 4.5 rather than 4500.
        multiplicador = 1000
        texto = texto[:-1].strip()
    return float(texto) * multiplicador


def classificar(nome, valor):
    """Annotate *valor* with an arrow when it falls outside the range for *nome*.

    Args:
        nome: Exam label; looked up in ``faixas``.
        valor: Result as text (e.g. ``"13.5"``, ``"< 0.01"``, ``"150K"``).

    Returns:
        ``f"{valor} ↓"`` when below the minimum, ``f"{valor} ↑"`` when above
        the maximum, and *valor* unchanged when it is inside the range, when
        *nome* has no entry in ``faixas``, or when *valor* cannot be parsed
        as a number.
    """
    try:
        numero = _para_numero(valor)
    except (AttributeError, TypeError, ValueError):
        # Best effort: an unparseable value is shown as-is, never classified.
        return valor

    limites = faixas.get(nome)
    if limites is None:
        return valor

    minimo, maximo = limites
    if numero < minimo:
        return f"{valor} ↓"
    if numero > maximo:
        return f"{valor} ↑"
    return valor
|
31 |
+
|
32 |
+
def extrair_texto_pdf(pdf_file):
    """Extract the text of a PDF twice: from its text layer and via OCR.

    Args:
        pdf_file: Either a filesystem path (``str``) or an object exposing the
            path in its ``.name`` attribute (the uploaded-file wrapper the
            original code expected from Gradio).

    Returns:
        A ``(texto_fitz, texto_ocr)`` tuple. ``texto_fitz`` is the
        concatenation of ``page.get_text()`` for every page; ``texto_ocr`` is
        the space-joined Tesseract output of every page rendered at 300 dpi.
        In both strings ``\\n`` and ``\\r`` are replaced by spaces.
    """
    # NOTE(review): newer Gradio releases appear to pass the upload as a plain
    # path string instead of a file wrapper -- accept both shapes.
    caminho = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    partes_fitz = []
    partes_ocr = []

    with fitz.open(caminho) as doc:
        for page in doc:
            partes_fitz.append(page.get_text())

            # OCR each page as soon as it is rendered so that only one
            # 300-dpi bitmap is alive at a time.
            pix = page.get_pixmap(dpi=300)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                partes_ocr.append(pytesseract.image_to_string(img))

    texto_fitz = "".join(partes_fitz).replace('\n', ' ').replace('\r', ' ')
    texto_ocr = " ".join(partes_ocr).replace('\n', ' ').replace('\r', ' ')

    return texto_fitz, texto_ocr
|
49 |
+
|
50 |
+
def _normalizar_valor(bruto):
    """Clean one regex capture; return the value or ``None`` if unusable.

    Decimal commas become dots and trailing dots are dropped. Captures
    without any digit, or with more than five characters once dots and
    comparison signs are removed, are rejected.
    """
    valor = bruto.strip().replace(",", ".")
    # A capture class like ``[\d.,]+`` also swallows sentence punctuation
    # glued to the number ("13,5." / "13,5,"); it is not part of the value.
    valor = valor.rstrip(".")
    if not any(c.isdigit() for c in valor):
        return None
    # NOTE(review): the five-character cap presumably filters out IDs and
    # other long numbers, but it also rejects six-digit results such as a
    # platelet count written as "250000" -- confirm this is intended.
    if len(valor.replace(".", "").replace(">", "").replace("<", "")) > 5:
        return None
    return valor


def buscar_exame(texto_primario, texto_ocr, padrao):
    """Return the first plausible value matching *padrao*, or ``None``.

    *texto_primario* is searched first; *texto_ocr* is only consulted when
    the primary text yields no acceptable match. Matching is
    case-insensitive.

    Args:
        texto_primario: Preferred text to search (may be empty or ``None``).
        texto_ocr: Fallback text to search (may be empty or ``None``).
        padrao: Regular expression with one capture group holding the value.

    Returns:
        The normalized value as a string (decimal comma converted to a dot),
        or ``None`` when neither text contains an acceptable match.
    """
    for texto in (texto_primario, texto_ocr):
        if not texto:
            continue
        for bruto in re.findall(padrao, texto, re.IGNORECASE):
            valor = _normalizar_valor(bruto)
            if valor is not None:
                return valor
    return None
|
63 |
+
|
64 |
+
def extrair_exames_formatado(pdf_file):
    """Extract the configured exams from a PDF and classify each value.

    Args:
        pdf_file: The uploaded PDF as received from the UI, or ``None`` when
            nothing was uploaded.

    Returns:
        A ``(texto, caminho_csv)`` tuple. ``texto`` has one ``"EXAME: valor"``
        line per configured exam (``"—"`` when no value was found);
        ``caminho_csv`` is the path of a temporary CSV file with the same data
        in the columns ``Exame`` and ``Valor``. When *pdf_file* is ``None``
        the result is ``("Nenhum arquivo enviado.", None)``.
    """
    if pdf_file is None:
        return "Nenhum arquivo enviado.", None

    texto_fitz, texto_ocr = extrair_texto_pdf(pdf_file)

    # Exam label -> regex whose single capture group holds the value.
    campos = {
        "HB": r"hemoglobina[^:\d]{0,10}[:=]?\s*([\d.,]+)",
        "HT": r"hemat[óo]crito[^:\d]{0,10}[:=]?\s*([\d.,]+)",
        "PLT": r"plaquetas[^:\d]{0,10}[:=]?\s*([\d.,]+)",
        "INR": r"INR[^:\d]{0,10}[:=]?\s*([\d.,]+)",
        # NOTE(review): unlike the patterns above, ``.*?`` puts no bound on
        # the distance between the label and the number it captures.
        "TROPO": r"troponina.*?[:=]?\s*([<>]?\s*[\d.,]+)"
        # More fields can be added here
    }

    resultados = []
    for rotulo, padrao in campos.items():
        val = buscar_exame(texto_fitz, texto_ocr, padrao)
        resultados.append((rotulo, classificar(rotulo, val) if val else "—"))

    df = pd.DataFrame(resultados, columns=["Exame", "Valor"])

    # Use the temporary file only to reserve a unique path and close the
    # handle right away: leaving it open leaks a descriptor per request and
    # prevents pandas from reopening the file by name on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
        caminho_csv = temp_file.name
    df.to_csv(caminho_csv, index=False)

    texto_final = "\n".join(f"{e}: {v}" for e, v in resultados)

    return texto_final, caminho_csv
|
94 |
+
|
95 |
+
# Gradio interface: a PDF upload, a trigger button, a text box for the
# classified results and a file slot for the generated CSV.
with gr.Blocks() as demo:
    gr.Markdown("## 🧪 Extrator de Exames com OCR")
    pdf_file = gr.File(label="📄 PDF de exames", file_types=[".pdf"])
    extract_button = gr.Button("🔍 Extrair Exames")
    output_text = gr.Textbox(label="📋 Resultado Classificado", lines=25)
    download_button = gr.File(label="📥 Baixar CSV")

    # The handler returns (text, csv_path), mapped onto the two outputs
    # in that order.
    extract_button.click(
        fn=extrair_exames_formatado,
        inputs=pdf_file,
        outputs=[output_text, download_button],
    )

demo.launch()
|