Spaces:

GABRIELSZK
/

EXAMES

Sleeping

App Files Files Community

GABRIELSZK committed on May 20

Commit

e444dfe

verified ·

1 Parent(s): 07c89f0

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -86

app.py CHANGED Viewed

@@ -6,52 +6,43 @@ import tempfile
 import pytesseract
 from PIL import Image, ImageEnhance, ImageFilter
 import io
-# 🎯 Faixas de referência (valores de referência mínimos e máximos)
 faixas = {
-    "LEUCO": (4000, 11000),
-    "B": (0, 1), "SS": (45, 59), "EOS": (1, 6), "LINF": (30, 50), "MONO": (1, 8),
-    "HB": (12, 17), "HT": (36, 50), "PLT": (150, 450),
-    "AMIL": (25, 125), "ÁC UR": (3.5, 7.2),
-    "BT": (0.3, 1.2), "BD": (0.0, 0.3), "BI": (0.1, 0.8),
-    "CAI": (1.1, 1.3), "CA TOTAL": (8.5, 10.2),
-    "CL-": (98, 107), "CR": (0.6, 1.3), "UREIA": (17, 49),
-    "FAL": (44, 147), "FÓS": (2.5, 4.5), "GGT": (8, 61),
-    "GLI": (70, 99), "LIP": (10, 140), "MG++": (1.7, 2.2),
-    "PCR": (0, 5), "K+": (3.5, 5.1), "NA+": (135, 145),
-    "PTN": (6.0, 8.3), "ALB": (3.5, 5.0), "GLOB": (2.3, 3.5),
-    "RELAÇÃO": (1.0, 2.2),
-    "TGO": (0, 40), "TGP": (0, 40),
-    "TAP": (10, 14), "INR": (0.8, 1.2), "TTP": (25, 35),
-    "DIMERO D": (0, 500), "LAC": (0.5, 2.2),
-    "CKMB": (0, 25), "CPK": (20, 200),
-    "TROPONINA": (0, 0.5)
 }
 def classificar(nome, valor):
     try:
-        v = float(valor.replace(">", "").replace("<", "").strip())
         if nome in faixas:
             lo, hi = faixas[nome]
-            if v < lo:
-                return f"{valor} ↓"
-            if v > hi:
-                return f"{valor} ↑"
         return valor
     except:
         return valor
-# Ajustes para melhorar OCR
 def melhorar_imagem(img: Image.Image) -> Image.Image:
     img = img.convert("L")
     img = ImageEnhance.Contrast(img).enhance(2)
     return img.filter(ImageFilter.SHARPEN)
-# Extrai texto nativo + OCR do PDF
 def extrair_texto_pdf(pdf_input):
     if isinstance(pdf_input, dict):
         pdf_path = pdf_input.get("name") or pdf_input.get("file_path")
-    elif hasattr(pdf_input, "name") and isinstance(pdf_input.name, str):
         pdf_path = pdf_input.name
     else:
         pdf_path = str(pdf_input)
@@ -59,67 +50,57 @@ def extrair_texto_pdf(pdf_input):
     texto_nativo, ocr_imgs = [], []
     with fitz.open(pdf_path) as doc:
         for page in doc:
-            texto_nativo.append(page.get_text())
-            pix = page.get_pixmap(dpi=300)
-            img = Image.open(io.BytesIO(pix.tobytes("png")))
-            ocr_imgs.append(melhorar_imagem(img))
     tn = re.sub(r"\s+", " ", "".join(texto_nativo))
-    tocr = re.sub(r"\s+", " ", " ".join(pytesseract.image_to_string(im) for im in ocr_imgs))
     return tn, tocr
-# Padrões de extração com word boundaries e unidades obrigatórias
 exames = {
     "LEUCO": r"\bleuc[óo]citos\b.*?([\d.,]+)\s*/u?l",
-    "B":    r"\bbastonetes\b.*?([\d.,]+)\s?%",
-    "SS":   r"\bsegmentados\b.*?([\d.,]+)\s?%",
-    "EOS":  r"\beosin[óo]filos\b.*?([\d.,]+)\s?%",
     "LINF": r"\blinf[oó]citos\b.*?([\d.,]+)\s?%",
     "MONO": r"\bmon[óo]citos\b.*?([\d.,]+)\s?%",
-    "HB":   r"\bhemoglobina\b.*?([\d.,]+)\s?g/dl",
-    "HT":   r"\bhemat[óo]crito\b.*?([\d.,]+)\s?%",
-    "PLT":  r"\bplaquetas\b.*?([\d.,]+)\s*/u?l",
     "AMIL": r"\bamilase\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
-    "BT":   r"\bbilirrubina total\b.*?([\d.,]+)\s?mg/dl",
-    "BD":   r"\bbilirrubina direta\b.*?([\d.,]+)\s?mg/dl",
-    "BI":   r"\bbilirrubina indireta\b.*?([\d.,]+)\s?mg/dl",
-    "CR":   r"\bcreatinina\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
-    "UREIA":r"\bureia\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
-    "FAL":  r"\bfosfatase alcalina\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
-    "GGT":  r"\bggt\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
-    "TGO":  r"\btgo\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
-    "TGP":  r"\btgp\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
-    "GLI":  r"\bglicose\b(?! qualitativa).*?resultado[:\s]*([\d.,]+)\s?mg/dl",
-    "LIP":  r"\blipase\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
     "MG++": r"\bmagn[eé]sio\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
-    "TAP":      r"\btempo de protrombina\b.*?resultado[:\s]*([\d.,]+)",
-    "INR":      r"\binr\b.*?([\d.,]+)",
-    "TTP":      r"\bttpa\b.*?resultado[:\s]*([\d.,]+)",
     "DIMERO D": r"\bd[ií]mero d\b.*?resultado[:\s]*([\d.,]+)",
-    "PCR":       r"\bpcr\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
-    "CKMB":      r"\bck[- ]?mb\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
-    "CPK":       r"\bcpk\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
-    "TROPONINA": r"troponina(?! qualitativa).*?resultado[:\s]*([><\d.,]+)(?=\s*ng\/m[lL])",
-    "TROPONINA QUAL": r"troponina qualitativa.*?resultado[:\s]*(positivo|negativo)",
-    "PROTEINA UR":    r"\bprote[ií]na\b.*?\b(ausente|positivo|negativo)",
-    "GLI UR":         r"\bglicose\b.*?\b(ausente|positivo|negativo)",
-    "CETONAS UR":     r"\bcorpos cet[oô]nicos\b.*?\b(ausente|positivo|negativo)",
-    "SANGUE UR":      r"\bsangue\b.*?\b(ausente|positivo|negativo)",
-    "LEUC ESTERASE":  r"\bleuc[óo]citos? esterase\b.*?\b(ausente|positivo|negativo)",
-    "NITRITO UR":     r"\bnitrito\b.*?\b(ausente|positivo|negativo)",
-    "LEUCO EAS":      r"\bleuc[óo]citos?\b\s*([\d]+[-\/–][\d]+)",
-    "HEMA EAS":       r"\bhem[áa]cias?\b\s*([\d]+[-\/–][\d]+)",
-    "BACTERIAS UR":   r"\bbact[ée]rias?\b.*?\b(raras|ausentes|positivas|negativas)"
 }
-# Ordem de exibição
-ordem = [
-    "LEUCO","B","SS","EOS","LINF","MONO",
-    "HB","HT","PLT","AMIL","BT","BD","BI",
-    "CR","UREIA","FAL","GGT","TGO","TGP","GLI","LIP","MG++",
-    "PCR","CKMB","CPK","TROPONINA","TROPONINA QUAL",
-    "TAP","INR","TTP","DIMERO D",
-    "PROTEINA UR","GLI UR","CETONAS UR","SANGUE UR","LEUC ESTERASE","NITRITO UR","LEUCO EAS","HEMA EAS","BACTERIAS UR"
-]
 def extrair_exames_formatado(pdf_file):
     if not pdf_file:
@@ -127,8 +108,8 @@ def extrair_exames_formatado(pdf_file):
     tn, tocr = extrair_texto_pdf(pdf_file)
     texto = (tn + " " + tocr).lower()
     resultados = {}
-    for nome, pat in exames.items():
-        m = re.search(pat, texto, re.IGNORECASE)
         if not m:
             continue
         raw = m.group(1).strip().upper()
@@ -137,22 +118,16 @@ def extrair_exames_formatado(pdf_file):
         else:
             resultados[nome] = classificar(nome, raw.replace(",", "."))
-    eas_fields = [f"{k}: {resultados[k]}" for k in ordem if k in resultados and (k.endswith("UR") or k.endswith("EAS"))]
-    main_fields = [f"{r}: {resultados[r]}" for r in ordem if r in resultados and not (r.endswith("UR") or r.endswith("EAS"))]
-    line_eas = f"🟤 EAS → {' / '.join(eas_fields)}" if eas_fields else ""
-    line_main = ' / '.join(main_fields)
-    final = '\n'.join([l for l in (line_eas, line_main) if l])
-    # Gera CSV
     df = pd.DataFrame([[k, resultados[k]] for k in resultados], columns=["Exame", "Valor"])
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
     df.to_csv(tmp.name, index=False)
     return final, tmp.name
-# Interface Gradio
 demo = gr.Blocks()
 with demo:
-    gr.Markdown("## 🧪 Extrator Avançado com OCR + EAS + Troponina (Quant. e Qual.)")
     pdf_input = gr.File(file_types=[".pdf"], label="📄 PDF de exames")
     btn = gr.Button("🔍 Extrair")
     out_txt = gr.Textbox(lines=15, label="📋 Resultados")

 import pytesseract
 from PIL import Image, ImageEnhance, ImageFilter
 import io
+from concurrent.futures import ThreadPoolExecutor
 # Reference ranges: exam label -> (lower bound, upper bound).
 # A value equal to either bound is treated as within range by classificar().
 faixas = {
     "LEUCO": (4000, 11000), "B": (0, 1), "SS": (45, 59), "EOS": (1, 6),
     "LINF": (30, 50), "MONO": (1, 8), "HB": (12, 17), "HT": (36, 50), "PLT": (150, 450),
     "AMIL": (25, 125), "ÁC UR": (3.5, 7.2), "BT": (0.3, 1.2), "BD": (0.0, 0.3), "BI": (0.1, 0.8),
     "CAI": (1.1, 1.3), "CA TOTAL": (8.5, 10.2), "CL-": (98, 107), "CR": (0.6, 1.3), "UREIA": (17, 49),
     "FAL": (44, 147), "FÓS": (2.5, 4.5), "GGT": (8, 61), "GLI": (70, 99), "LIP": (10, 140),
     "MG++": (1.7, 2.2), "PCR": (0, 5), "K+": (3.5, 5.1), "NA+": (135, 145), "PTN": (6.0, 8.3),
     "ALB": (3.5, 5.0), "GLOB": (2.3, 3.5), "RELAÇÃO": (1.0, 2.2), "TGO": (0, 40), "TGP": (0, 40),
     "TAP": (10, 14), "INR": (0.8, 1.2), "TTP": (25, 35), "DIMERO D": (0, 500), "LAC": (0.5, 2.2),
     "CKMB": (0, 25), "CPK": (20, 200), "TROPONINA": (0, 0.5),
 }
 def classificar(nome, valor):
     """Flag ``valor`` with an arrow when it falls outside the range for ``nome``.

     Args:
         nome: Exam label, looked up in ``faixas``.
         valor: Value as text. "<" and ">" markers are ignored for the
             comparison but kept in the returned string.

     Returns:
         ``f"{valor} ↓"`` when below the lower bound, ``f"{valor} ↑"`` when
         above the upper bound, otherwise ``valor`` unchanged. ``valor`` is
         also returned unchanged when ``nome`` has no known range or when
         ``valor`` cannot be parsed as a number.
     """
     limites = faixas.get(nome)
     if limites is None:
         return valor
     try:
         v = float(valor.replace("<", "").replace(">", "").strip())
     except (AttributeError, TypeError, ValueError):
         # Only conversion problems are tolerated (non-numeric text, or a
         # non-string value); anything else, e.g. KeyboardInterrupt, propagates.
         return valor
     # NOTE(review): text such as "7.500" parses as 7.5 here - confirm that
     # callers strip thousands separators before calling.
     lo, hi = limites
     if v < lo:
         return f"{valor} ↓"
     if v > hi:
         return f"{valor} ↑"
     return valor
 def melhorar_imagem(img: Image.Image) -> Image.Image:
     """Prepare a page image for OCR.

     The image is converted to mode "L", contrast-enhanced with a factor
     of 2, and finally passed through the SHARPEN filter.
     """
     cinza = img.convert("L")
     contrastada = ImageEnhance.Contrast(cinza).enhance(2)
     return contrastada.filter(ImageFilter.SHARPEN)
+def ocr_image(img):
+    return pytesseract.image_to_string(img)
 def extrair_texto_pdf(pdf_input):
     """Extract the native text and the OCR text of a PDF.

     Args:
         pdf_input: One of
             * a dict carrying the path under "name" or "file_path";
             * an ``os.PathLike`` object (e.g. ``pathlib.Path``);
             * any object exposing the path as a ``name`` attribute;
             * anything else, which is converted with ``str()``.

     Returns:
         A ``(tn, tocr)`` tuple. ``tn`` is the text layer of every page and
         ``tocr`` is the OCR output of the pages whose text layer is empty;
         both have runs of whitespace collapsed to a single space.
     """
     import os  # function-scope import keeps this block self-contained
     if isinstance(pdf_input, dict):
         # NOTE(review): pdf_path is None when neither key is present -
         # confirm how fitz.open should behave in that case.
         pdf_path = pdf_input.get("name") or pdf_input.get("file_path")
     elif isinstance(pdf_input, os.PathLike):
         # Path-like objects also have a ``name`` attribute, but it is only
         # the final path component; use the full path instead.
         pdf_path = os.fspath(pdf_input)
     elif hasattr(pdf_input, "name"):
         pdf_path = pdf_input.name
     else:
         pdf_path = str(pdf_input)
     texto_nativo, ocr_imgs = [], []
     with fitz.open(pdf_path) as doc:
         for page in doc:
             t = page.get_text()
             texto_nativo.append(t)
             # OCR is only needed for pages without a usable text layer.
             if not t.strip():
                 pix = page.get_pixmap(dpi=200)
                 img = Image.open(io.BytesIO(pix.tobytes("png")))
                 ocr_imgs.append(melhorar_imagem(img))
     tn = re.sub(r"\s+", " ", "".join(texto_nativo))
     tocr = ""
     if ocr_imgs:
         # executor.map yields results in page order.
         with ThreadPoolExecutor() as executor:
             tocr = " ".join(executor.map(ocr_image, ocr_imgs))
     tocr = re.sub(r"\s+", " ", tocr)
     return tn, tocr
 # Exam label -> regex source used to locate that exam in the extracted text.
 # Capture group 1 of every pattern holds the raw result (a number, or
 # "positivo"/"negativo" for the qualitative troponin entry).
 exames = {
     "LEUCO": r"\bleuc[óo]citos\b.*?([\d.,]+)\s*/u?l",
+    "B": r"\bbastonetes\b.*?([\d.,]+)\s?%",
+    "SS": r"\bsegmentados\b.*?([\d.,]+)\s?%",
+    "EOS": r"\beosin[óo]filos\b.*?([\d.,]+)\s?%",
     "LINF": r"\blinf[oó]citos\b.*?([\d.,]+)\s?%",
     "MONO": r"\bmon[óo]citos\b.*?([\d.,]+)\s?%",
+    "HB": r"\bhemoglobina\b.*?([\d.,]+)\s?g/dl",
+    "HT": r"\bhemat[óo]crito\b.*?([\d.,]+)\s?%",
+    "PLT": r"\bplaquetas\b.*?([\d.,]+)\s*/u?l",
     "AMIL": r"\bamilase\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
+    "BT": r"\bbilirrubina total\b.*?([\d.,]+)\s?mg/dl",
+    "BD": r"\bbilirrubina direta\b.*?([\d.,]+)\s?mg/dl",
+    "BI": r"\bbilirrubina indireta\b.*?([\d.,]+)\s?mg/dl",
+    "CR": r"\bcreatinina\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
+    "UREIA": r"\bureia\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
+    "FAL": r"\bfosfatase alcalina\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
+    "GGT": r"\bggt\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
+    "TGO": r"\btgo\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
+    "TGP": r"\btgp\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
+    "GLI": r"\bglicose\b(?! qualitativa).*?resultado[:\s]*([\d.,]+)\s?mg/dl",
+    "LIP": r"\blipase\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
     "MG++": r"\bmagn[eé]sio\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
+    "TAP": r"\btempo de protrombina\b.*?resultado[:\s]*([\d.,]+)",
     # INR has neither a "resultado" nor a unit anchor: the first run of
     # digits, dots or commas after the word "inr" is captured.
+    "INR": r"\binr\b.*?([\d.,]+)",
+    "TTP": r"\bttpa\b.*?resultado[:\s]*([\d.,]+)",
     "DIMERO D": r"\bd[ií]mero d\b.*?resultado[:\s]*([\d.,]+)",
+    "PCR": r"\bpcr\b.*?resultado[:\s]*([\d.,]+)\s?mg/dl",
+    "CKMB": r"\bck[- ]?mb\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
+    "CPK": r"\bcpk\b.*?resultado[:\s]*([\d.,]+)\s?u/l",
     # The quantitative troponin value may carry a leading "<" or ">" and must
     # be followed by the ng/ml unit, which is not part of the capture.
+    "TROPONINA": r"troponina(?! qualitativa).*?resultado[:\s]*([><\d.,]+)(?=\s*ng/m[lL])",
+    "TROPONINA QUAL": r"troponina qualitativa.*?resultado[:\s]*(positivo|negativo)"
 }
 # Patterns compiled once at import time, case-insensitively.
+regex_compilado = {k: re.compile(v, re.IGNORECASE) for k, v in exames.items()}
 # Output order follows the insertion order of ``exames``.
+ordem = list(exames.keys())
 def extrair_exames_formatado(pdf_file):
     if not pdf_file:
     tn, tocr = extrair_texto_pdf(pdf_file)
     texto = (tn + " " + tocr).lower()
     resultados = {}
+    for nome, pat in regex_compilado.items():
+        m = pat.search(texto)
         if not m:
             continue
         raw = m.group(1).strip().upper()
         else:
             resultados[nome] = classificar(nome, raw.replace(",", "."))
+    linhas = [f"{nome}: {resultados[nome]}" for nome in ordem if nome in resultados]
+    final = " / ".join(linhas)
     df = pd.DataFrame([[k, resultados[k]] for k in resultados], columns=["Exame", "Valor"])
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
     df.to_csv(tmp.name, index=False)
     return final, tmp.name
 demo = gr.Blocks()
 with demo:
+    gr.Markdown("## 🧪 Extrator Rápido com OCR seletivo")
     pdf_input = gr.File(file_types=[".pdf"], label="📄 PDF de exames")
     btn = gr.Button("🔍 Extrair")
     out_txt = gr.Textbox(lines=15, label="📋 Resultados")