File size: 4,165 Bytes
ca69a0e
5244794
8b18b7b
ed4ebee
8b18b7b
 
ae7d660
8b18b7b
 
 
 
 
 
fcfc162
8b18b7b
 
 
 
5244794
 
8b18b7b
5244794
8b18b7b
 
 
 
 
 
5244794
ca69a0e
5244794
 
8b18b7b
5244794
 
 
 
 
ca69a0e
8b18b7b
 
 
 
 
 
 
 
ca69a0e
8b18b7b
 
 
 
 
 
 
ca69a0e
8b18b7b
ddb299c
8b18b7b
5244794
8b18b7b
 
5244794
 
8b18b7b
 
5244794
8b18b7b
 
 
 
 
5244794
8b18b7b
 
 
 
5244794
8b18b7b
 
 
 
5244794
8b18b7b
 
 
5244794
8b18b7b
5244794
 
 
8b18b7b
 
5244794
8b18b7b
 
5244794
8b18b7b
 
 
 
5244794
8b18b7b
 
5244794
8b18b7b
 
ae7d660
 
5244794
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import logging
import textwrap
from concurrent.futures import ThreadPoolExecutor

import fitz  # PyMuPDF for PDF processing
import streamlit as st
from PIL import Image
from transformers import pipeline

# Setup logging
def setup_logging():
    """Configure root logging at INFO level with a timestamped line format."""
    # Each record renders as "<time> - <level> - <message>".
    record_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=record_format, level=logging.INFO)

# Load models globally for faster performance
@st.cache_resource
def load_models():
    """Build the Hugging Face pipelines used by the app.

    The function is wrapped in ``st.cache_resource`` so the constructed
    pipelines can be reused instead of being rebuilt on every call.

    Returns:
        A 4-tuple ``(image_to_text, translator_hi, translator_ur,
        summarizer)`` of pipelines, in that order.
    """
    logging.info("Loading Hugging Face models...")

    # (task, checkpoint) pairs, listed in the order they are returned.
    model_specs = (
        # OCR model that turns an image into text
        ("image-to-text", "microsoft/trocr-large-printed"),
        # English -> Hindi and English -> Urdu translation models
        ("translation", "Helsinki-NLP/opus-mt-en-hi"),
        ("translation", "Helsinki-NLP/opus-mt-en-ur"),
        # Summarization model
        ("summarization", "facebook/bart-large-cnn"),
    )
    return tuple(pipeline(task, model=checkpoint) for task, checkpoint in model_specs)

# Function to extract text from images
def extract_text_from_image(image):
    """Run the image-to-text pipeline on *image* and return the joined text.

    Args:
        image: The image to pass to the image-to-text pipeline (``main``
            supplies a PIL image).

    Returns:
        The ``generated_text`` of every result, joined with single spaces.
    """
    logging.info("Extracting text from image...")
    # The image-to-text pipeline is the first element returned by load_models().
    ocr_pipeline, *_ = load_models()
    predictions = ocr_pipeline(image)
    fragments = (prediction['generated_text'] for prediction in predictions)
    return " ".join(fragments)

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Either a filesystem path to a PDF, or a binary file-like
            object exposing ``read()`` (such as the upload object that
            ``main`` passes in).

    Returns:
        The text of all pages joined together; an empty string when no page
        yields any text.
    """
    logging.info("Extracting text from PDF...")
    if hasattr(pdf_file, "read"):
        # An uploaded file lives in memory rather than on disk, so it must be
        # handed to PyMuPDF as a byte stream instead of as a filename.
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        doc = fitz.open(pdf_file)
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document even if text extraction fails part-way.
        doc.close()

# Function to process text in chunks for better performance
def process_chunks(text, model, chunk_size=500):
    """Translate *text* chunk by chunk and return the joined translation.

    The text is split on whitespace into chunks of at most ``chunk_size``
    characters so that words are not cut in half (only a single word longer
    than ``chunk_size`` is ever broken). Runs of whitespace, including
    newlines, are collapsed to single spaces by the splitting step.

    Args:
        text: The text to translate.
        model: A callable invoked as ``model(chunk, max_length=200)`` that
            returns a sequence whose first item is a mapping containing a
            ``"translation_text"`` entry.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        The translated chunks joined with single spaces, in their original
        order; an empty string when *text* contains nothing to translate.
    """
    # break_on_hyphens=False keeps hyphenated terms in one piece; otherwise
    # re-joining the chunks with spaces would turn "a-b" into "a- b".
    chunks = textwrap.wrap(text, width=chunk_size, break_on_hyphens=False)
    if not chunks:
        return ""

    def translate(chunk):
        return model(chunk, max_length=200)[0]["translation_text"]

    # executor.map yields results in input order, so the chunks stay in sequence.
    with ThreadPoolExecutor() as executor:
        translated = list(executor.map(translate, chunks))
    return " ".join(translated)

# Main app logic
def main():
    """Render the Streamlit page: upload a report, summarize and translate it.

    Accepts an image, PDF, or plain-text upload, extracts its text, then
    shows the original text, an English summary, and Hindi and Urdu
    translations. Any exception raised while processing the upload is logged
    and shown to the user as an error message.
    """
    setup_logging()
    st.title("Advanced Lab Report Analyzer")
    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")

    # Load all models; the image-to-text pipeline is used inside
    # extract_text_from_image rather than here, so it is discarded.
    _, translator_hi, translator_ur, summarizer = load_models()

    file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])

    if not file:
        st.info("Please upload a file to begin.")
        return

    text = ""
    try:
        if file.type in ["image/jpeg", "image/png", "image/jpg"]:
            # Normalize to RGB so palette, grayscale, or RGBA uploads reach
            # the OCR model in one consistent 3-channel format.
            image = Image.open(file).convert("RGB")
            text = extract_text_from_image(image)
        elif file.type == "application/pdf":
            text = extract_text_from_pdf(file)
        elif file.type == "text/plain":
            text = file.read().decode("utf-8")

        # Whitespace-only extractions carry nothing to analyze.
        if text.strip():
            with st.spinner("Analyzing the report..."):
                # Generate summary. truncation=True clips input that exceeds
                # the model's maximum input length instead of failing, so for
                # a very long report only the leading part is summarized.
                summary = summarizer(
                    text, max_length=130, min_length=30, truncation=True
                )[0]["summary_text"]

                # Generate translations
                hindi_translation = process_chunks(text, translator_hi)
                urdu_translation = process_chunks(text, translator_ur)

                # Display results
                st.subheader("Original Text:")
                st.write(text)

                st.subheader("Analysis Summary (English):")
                st.write(summary)

                st.subheader("Hindi Translation:")
                st.write(hindi_translation)

                st.subheader("Urdu Translation:")
                st.write(urdu_translation)
        else:
            st.warning("No text could be extracted. Please check the file and try again.")

    except Exception as e:
        # Top-level UI boundary: record the traceback for diagnosis and
        # surface the failure to the user instead of crashing the page.
        logging.exception("Error processing the file: %s", e)
        st.error(f"An error occurred while processing the file: {e}")

# Script entry point: run the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()