Spaces:

GABRIELSZK
/

EXAMES

Sleeping

App Files Community

GABRIELSZK committed on Apr 27

Commit

e70ba90

verified ·

1 Parent(s): b98e0d2

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -88

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
-# Código completo atualizado para omitir exames ausentes no PDF
 import fitz
 import re
@@ -9,58 +10,53 @@ import pytesseract
 from PIL import Image, ImageEnhance, ImageFilter
 import io
-# Faixas de referência originais; exames sem faixa definida retornam valor bruto
 faixas = {
-    "HB": (12, 17), "HT": (36, 50), "LEUCO": (4, 11), "PLT": (150, 450),
     "K+": (3.5, 5.5), "NA+": (135, 145), "UREIA": (10, 50), "CR": (0.6, 1.3),
     "TGO": (0, 40), "TGP": (0, 40), "ALB": (3.5, 5.0), "INR": (0.8, 1.2),
     "TAP": (10, 14), "TTP": (25, 35)
 }
 def classificar(nome, valor):
-    """
-    Retorna valor com setas se fora da faixa.
-    """
     try:
-        raw = valor.replace("K", "").replace(">", "").replace("<", "").strip()
         val = float(raw)
         if nome in faixas:
-            min_v, max_v = faixas[nome]
-            if val < min_v:
-                return f"{valor} ↓"
-            if val > max_v:
-                return f"{valor} ↑"
         return valor
     except:
         return valor
-# Pré-processamento de imagem para OCR
-def melhorar_imagem(img):
     img = img.convert('L')
     img = ImageEnhance.Contrast(img).enhance(2)
-    img = img.filter(ImageFilter.SHARPEN)
-    return img
-# Extrai texto nativo + OCR das páginas do PDF
 def extrair_texto_pdf(pdf_file):
-    texto_fitz = []
     ocr_imgs = []
     with fitz.open(pdf_file.name) as doc:
         for page in doc:
-            texto_fitz.append(page.get_text())
             pix = page.get_pixmap(dpi=300)
             img = Image.open(io.BytesIO(pix.tobytes("png")))
             ocr_imgs.append(melhorar_imagem(img))
-    texto_fitz = " ".join(texto_fitz)
-    texto_fitz = re.sub(r'\s+', ' ', texto_fitz)
-    texto_ocr = " ".join(pytesseract.image_to_string(im) for im in ocr_imgs)
-    texto_ocr = re.sub(r'(\b[A-Z])\s+(?=[A-Z]\b)', r'\1', texto_ocr)
-    texto_ocr = re.sub(r'\s+', ' ', texto_ocr)
-    return texto_fitz, texto_ocr
-# Padrões regex para extração de cada exame, incluindo EAS
 exames = {
-    "LEUCO":   r"leuc[óo]citos.*?([\d.,]+)\s?(?:10\^3)?/u?l",
     "B":       r"bas[óo]filos.*?([\d.,]+)\s?%",
     "SS":      r"segmentados.*?([\d.,]+)\s?%",
     "EOS":     r"eosin[óo]filos.*?([\d.,]+)\s?%",
@@ -68,105 +64,108 @@ exames = {
     "MONO":    r"mon[óo]citos.*?([\d.,]+)\s?%",
     "HB":      r"hemoglobina.*?([\d.,]+)\s?g/dl",
     "HT":      r"hemat[óo]crito.*?([\d.,]+)\s?%",
-    "PLT":     r"plaquetas.*?([\d.,]+)\s?(?:10\^3)?/u?l",
     "AMIL":    r"amilase.*?([\d.,]+)\s?u/l",
     "ÁC UR":   r"[áa]cido ur[íi]co.*?([\d.,]+)\s?mg/dl",
     "BT":      r"bilirrubina total.*?([\d.,]+)\s?mg/dl",
     "BD":      r"bilirrubina direta.*?([\d.,]+)\s?mg/dl",
     "BI":      r"bilirrubina indireta.*?([\d.,]+)\s?mg/dl",
-    "CAI":     r"c[áa]lcio ioniza(?:do)?[a-z]*.*?([\d.,]+)\s?mmol/l",
     "CA TOTAL":r"c[áa]lcio total.*?([\d.,]+)\s?mg/dl",
     "CL-":     r"cloro.*?([\d.,]+)\s?mmol/l",
-    "CR":      r"creatinina.*?([\d.,]+)\s?mg/dl",
-    "UREIA":   r"ureia.*?([\d.,]+)\s?mg/dl",
-    "FAL":     r"fosfatase alcalina.*?([\d.,]+)\s?u/l",
     "FÓS":     r"f[oó]sforo.*?([\d.,]+)\s?mg/dl",
     "GGT":     r"ggt.*?([\d.,]+)\s?u/l",
-    "GLI":     r"glicose.*?([\d.,]+)\s?mg/dl",
-    "LIP":     r"lipase.*?([\d.,]+)\s?u/l",
-    "MG++":    r"magn[eé]sio.*?([\d.,]+)\s?mg/dl",
-    "PCR":     r"pcr.*?\bresultado\b\s*([\d]+,[\d]+)",
-    "K+":      r"pot[áa]ssio.*?([\d.,]+)\s?mmol/l",
-    "NA+":     r"s[óo]dio.*?([\d.,]+)\s?mmol/l",    "PTN":     r"prote[íi]na total.*?([\d.,]+)\s?g/dl",
     "ALB":     r"albumina.*?([\d.,]+)\s?g/dl",
     "GLOB":    r"globulina.*?([\d.,]+)\s?g/dl",
     "RELAÇÃO": r"rela[cç][ãa]o\s+a\/g.*?([\d.,]+)",
-    "TGO":     r"tgo.*?([\d.,]+)\s?u/l",
-    "TGP":     r"tgp.*?([\d.,]+)\s?u/l",
-    "TAP":     r"tempo de protrombina.*?\bresultado\b\s*([\d]+,[\d]+)",
-    "INR":     r"I\s*N\s*R\s+([\d]+,[\d]+)",
     "TTP":     r"ttpa.*?([\d.,]+)\s?seg",
-    "DIMERO D":r"d[ií]mero d.*?resultado\s*([\d.,]+)\s?ng/ml",
-    "LAC":     r"lactato.*?([\d.,]+)\s?mmol/l",
     "CKMB":    r"ck[- ]?mb.*?([\d.,]+)\s?u/l",
-    "CPK":     r"cpk.*?\bresultado\b\s*([\d.,]+)",
-    "TROPO":   r"troponina.*?([<>]?[\d.,]+)\s?ng/ml",
-    # Padrões para EAS
-    "LEUC ESTERASE": r"Leuc[óo]cito esterase\s*[:\-]?\s*([A-Za-z0-9\+\-]+)",
-    "LEUCO EAS":     r"Leuc[óo]citos?\s*[:\-]?\s*([\d]+\s*[-\/]\s*\d+)",
-    "HEMA EAS":      r"Hem[áa]cias?\s*[:\-]?\s*([\d]+\s*[-\/]\s*\d+)",
-    "BACTERIAS":     r"Bact[ée]rias?\s*[:\-]?\s*([A-Za-z]+)"
 }
-# Ordem de saída das chaves (sem EAS)
 ordem = [
     "LEUCO","B","SS","EOS","LINF","MONO",
-    "HB","HT","PLT","AMIL","ÁC UR","BT","BD","BI",
-    "CAI","CA TOTAL","CL-","CR","UREIA",
-    "FAL","FÓS","GGT","GLI","LIP","MG++",
-    "PCR","K+","NA+","PTN","ALB","GLOB","RELAÇÃO",
-    "TGO","TGP","TAP","INR","TTP","DIMERO D","LAC",
-    "CKMB","CPK","TROPO"
 ]
 def extrair_exames_formatado(pdf_file):
     if not pdf_file:
         return "Nenhum arquivo enviado.", None
-    texto_fitz, texto_ocr = extrair_texto_pdf(pdf_file)
-    textos = [texto_fitz, texto_ocr]
     resultados = {}
-    for rotulo, padrao in exames.items():
-        val = None
-        for txt in textos:
-            m = re.search(padrao, txt, re.IGNORECASE)
-            if m:
-                val = m.group(1).replace(',', '.')
-                break
-        if val:
-            resultados[rotulo] = classificar(rotulo, val)
-    # EAS (se presente)
     eas_chaves = ["LEUC ESTERASE","LEUCO EAS","HEMA EAS","BACTERIAS"]
     partes_eas = [f"{k}: {resultados[k]}" for k in eas_chaves if k in resultados]
     texto_eas = ""
     if partes_eas:
-        texto_eas = "EAS: " + " / ".join(partes_eas)
-    # Parte principal (omite ausentes)
     partes_main = [f"{r}: {resultados[r]}" for r in ordem if r in resultados]
     texto_main = " / ".join(partes_main)
-    # Concatena
-    texto_final = " / ".join([t for t in (texto_eas, texto_main) if t])
-    # Gera CSV apenas com presentes
-    df = pd.DataFrame(
-        [(k, resultados[k]) for k in resultados],
-        columns=["Exame","Valor"]
-    )
     temp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
     df.to_csv(temp.name, index=False)
     return texto_final, temp.name
-# Interface Gradio
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧪 Extrator Avançado com OCR e EAS")
-    pdf_file = gr.File(label="📄 PDF de exames", file_types=[".pdf"])
     btn = gr.Button("🔍 Extrair Exames")
-    out_txt = gr.Textbox(label="📋 Exames Classificados", lines=12)
-    dl = gr.File(label="📥 CSV")
-    btn.click(extrair_exames_formatado, inputs=pdf_file, outputs=[out_txt, dl])
-if __name__ == '__main__':
     demo.launch()

+# Instalações necessárias
+!pip install pdfplumber gradio pandas pytesseract --quiet
 import fitz
 import re
 from PIL import Image, ImageEnhance, ImageFilter
 import io
+# Faixas de referência para classificação
 faixas = {
+    "HB": (11.5, 16.5), "HT": (36, 50), "LEUCO": (4000, 11000), "PLT": (150000, 450000),
     "K+": (3.5, 5.5), "NA+": (135, 145), "UREIA": (10, 50), "CR": (0.6, 1.3),
     "TGO": (0, 40), "TGP": (0, 40), "ALB": (3.5, 5.0), "INR": (0.8, 1.2),
     "TAP": (10, 14), "TTP": (25, 35)
 }
 def classificar(nome, valor):
+    """Adiciona setas se valor numérico estiver fora da faixa de referência."""
     try:
+        raw = valor.replace(">", "").replace("<", "").strip()
         val = float(raw)
         if nome in faixas:
+            mn, mx = faixas[nome]
+            if val < mn: return f"{valor} ↓"
+            if val > mx: return f"{valor} ↑"
         return valor
     except:
         return valor
+def melhorar_imagem(img: Image.Image) -> Image.Image:
+    """Aumenta contraste e nitidez para OCR."""
     img = img.convert('L')
     img = ImageEnhance.Contrast(img).enhance(2)
+    return img.filter(ImageFilter.SHARPEN)
 def extrair_texto_pdf(pdf_file):
+    """Extrai texto nativo e via OCR de cada página."""
+    texto_nativo = []
     ocr_imgs = []
     with fitz.open(pdf_file.name) as doc:
         for page in doc:
+            texto_nativo.append(page.get_text())
             pix = page.get_pixmap(dpi=300)
             img = Image.open(io.BytesIO(pix.tobytes("png")))
             ocr_imgs.append(melhorar_imagem(img))
+    tn = " ".join(texto_nativo)
+    tn = re.sub(r'\s+', ' ', tn)
+    tocr = " ".join(pytesseract.image_to_string(im) for im in ocr_imgs)
+    tocr = re.sub(r'\s+', ' ', tocr)
+    return tn, tocr
+# Padrões regex (case-insensitive) para todos os exames, incluindo Troponina Qualitativa
 exames = {
+    # Hemograma e diferenciais
+    "LEUCO":   r"leuc[óo]citos.*?([\d.,]+)\s?/u?l",
     "B":       r"bas[óo]filos.*?([\d.,]+)\s?%",
     "SS":      r"segmentados.*?([\d.,]+)\s?%",
     "EOS":     r"eosin[óo]filos.*?([\d.,]+)\s?%",
     "MONO":    r"mon[óo]citos.*?([\d.,]+)\s?%",
     "HB":      r"hemoglobina.*?([\d.,]+)\s?g/dl",
     "HT":      r"hemat[óo]crito.*?([\d.,]+)\s?%",
+    "PLT":     r"plaquetas.*?([\d.,]+).?/u?l",
+    # Bioquímica
     "AMIL":    r"amilase.*?([\d.,]+)\s?u/l",
+    "LIP":     r"lipase.*?([\d.,]+)\s?u/l",
+    "GLI":     r"glicose.*?([\d.,]+)\s?mg/dl",
+    "LACTATO": r"lactato.*?([\d.,]+)\s?mmol/l",
     "ÁC UR":   r"[áa]cido ur[íi]co.*?([\d.,]+)\s?mg/dl",
     "BT":      r"bilirrubina total.*?([\d.,]+)\s?mg/dl",
     "BD":      r"bilirrubina direta.*?([\d.,]+)\s?mg/dl",
     "BI":      r"bilirrubina indireta.*?([\d.,]+)\s?mg/dl",
+    "CAI":     r"c[áa]lcio ioniza(?:do)?.*?([\d.,]+)\s?mmol/l",
     "CA TOTAL":r"c[áa]lcio total.*?([\d.,]+)\s?mg/dl",
     "CL-":     r"cloro.*?([\d.,]+)\s?mmol/l",
+    "MG++":    r"magn[ée]sio.*?([\d.,]+)\s?mg/dl",
     "FÓS":     r"f[oó]sforo.*?([\d.,]+)\s?mg/dl",
+    "UREIA":   r"ureia.*?([\d.,]+)\s?mg/dl",
+    "CR":      r"creatinina.*?([\d.,]+)\s?mg/dl",
+    # Hepática e proteínas
+    "TGO":     r"tgo.*?([\d.,]+)\s?u/l",
+    "TGP":     r"tgp.*?([\d.,]+)\s?u/l",
     "GGT":     r"ggt.*?([\d.,]+)\s?u/l",
+    "FAL":     r"fosfatase alcalina.*?([\d.,]+)\s?u/l",
     "ALB":     r"albumina.*?([\d.,]+)\s?g/dl",
+    "PTN TOTAL":r"prote[ií]na total.*?([\d.,]+)\s?g/dl",
     "GLOB":    r"globulina.*?([\d.,]+)\s?g/dl",
     "RELAÇÃO": r"rela[cç][ãa]o\s+a\/g.*?([\d.,]+)",
+    # Coagulação
+    "TAP":     r"tempo de protrombina.*?resultado\s*([\d.,]+)",
+    "INR":     r"inr\s*([\d.,]+)",
     "TTP":     r"ttpa.*?([\d.,]+)\s?seg",
+    # Inflamatório
+    "PCR":     r"pcr.*?resultado\s*([\d.,]+)",
+    # Cardíacos
     "CKMB":    r"ck[- ]?mb.*?([\d.,]+)\s?u/l",
+    "CPK":     r"cpk.*?resultado\s*([\d.,]+)",
+    "TROPO":   r"troponina\s*(?!qual).*?([<>]?[\d.,]+)\s?ng/ml",
+    "TROPONINA QUAL": r"troponina qualitativa.*?resultado\s*([A-Za-z]+)",
+    # EAS (urina)
+    "LEUC ESTERASE": r"leuc[óo]cito esterase.*?([A-Za-z\+\-]+)",
+    "LEUCO EAS":     r"leuc[óo]citos?.*?([\d]+\s*[-\/]\s*\d+)",
+    "HEMA EAS":      r"hem[áa]cias?.*?([\d]+\s*[-\/]\s*\d+)",
+    "BACTERIAS":     r"bact[ée]rias?.*?([A-Za-z]+)"
 }
+# Ordem preferencial de exibição (numéricos e qualitativos)
 ordem = [
     "LEUCO","B","SS","EOS","LINF","MONO",
+    "HB","HT","PLT","AMIL","LIP","GLI","LACTATO",
+    "ÁC UR","BT","BD","BI","CAI","CA TOTAL","CL-","MG++","FÓS","UREIA","CR",
+    "TGO","TGP","GGT","FAL","ALB","PTN TOTAL","GLOB","RELAÇÃO",
+    "TAP","INR","TTP","PCR","DIMERO D",
+    "CKMB","CPK","TROPO","TROPONINA QUAL"
 ]
 def extrair_exames_formatado(pdf_file):
     if not pdf_file:
         return "Nenhum arquivo enviado.", None
+    # extrai texto
+    tn, tocr = extrair_texto_pdf(pdf_file)
+    textos = tn + " " + tocr
     resultados = {}
+    # varre todos os padrões
+    for nome, pat in exames.items():
+        m = re.search(pat, textos, re.IGNORECASE)
+        if m:
+            val = m.group(1).strip().replace(",", ".")
+            # normaliza QUAL como uppercase
+            if nome == "TROPONINA QUAL":
+                val = val.upper()
+            resultados[nome] = classificar(nome, val)
+    # monta string de EAS
     eas_chaves = ["LEUC ESTERASE","LEUCO EAS","HEMA EAS","BACTERIAS"]
     partes_eas = [f"{k}: {resultados[k]}" for k in eas_chaves if k in resultados]
     texto_eas = ""
     if partes_eas:
+        texto_eas = "🟤 EAS (Urinálise) → " + " / ".join(partes_eas)
+    # monta string principal
     partes_main = [f"{r}: {resultados[r]}" for r in ordem if r in resultados]
     texto_main = " / ".join(partes_main)
+    # concatena só as partes não vazias
+    texto_final = "\n".join([t for t in (texto_eas, texto_main) if t])
+    # gera CSV
+    df = pd.DataFrame([[k, resultados[k]] for k in resultados], columns=["Exame","Valor"])
     temp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
     df.to_csv(temp.name, index=False)
     return texto_final, temp.name
+# interface Gradio
 with gr.Blocks() as demo:
+    gr.Markdown("## 🧪 Extrator Avançado com OCR + EAS + Troponina Qualitativa")
+    pdf_input = gr.File(label="📄 PDF de exames", file_types=[".pdf"])
     btn = gr.Button("🔍 Extrair Exames")
+    out_txt = gr.Textbox(label="📋 Resultados", lines=8)
+    dl = gr.File(label="📥 Baixar CSV")
+    btn.click(extrair_exames_formatado, inputs=pdf_input, outputs=[out_txt, dl])
+if __name__ == "__main__":
     demo.launch()