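"""Advanced Lab Report Analyzer (Streamlit app).

Extracts text from an uploaded image, PDF, or plain-text file, summarizes it
with a Hugging Face summarization pipeline, and translates the report into
Hindi and Urdu.
"""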
import streamlit as st
from transformers import pipeline
from PIL import Image
import fitz  # PyMuPDF for PDF processing
import logging
from concurrent.futures import ThreadPoolExecutor

# Setup logging
def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

# Load models once and cache them for faster performance
@st.cache_resource
def load_models():
    logging.info("Loading Hugging Face models...")
    # OCR model for printed text (TrOCR)
    image_to_text = pipeline("image-to-text", model="microsoft/trocr-large-printed")
    # Translation models
    translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
    # Summarization model
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return image_to_text, translator_hi, translator_ur, summarizer

# Function to extract text from images
def extract_text_from_image(image, image_to_text):
    logging.info("Extracting text from image...")
    # Use the TrOCR pipeline (loaded once and passed in) for text extraction
    results = image_to_text(image)
    # Combine all detected text
    return " ".join([result["generated_text"] for result in results])

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    logging.info("Extracting text from PDF...")
    # Streamlit provides a file-like object, so open it as an in-memory stream
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Function to process text in chunks for better performance
def process_chunks(text, model, chunk_size=500):
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda chunk: model(chunk, max_length=200), chunks))
    return " ".join([result[0]["translation_text"] for result in results])

# Main app logic
def main():
    setup_logging()
    st.title("Advanced Lab Report Analyzer")
    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")
    # Load all models
    image_to_text, translator_hi, translator_ur, summarizer = load_models()
    file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])
    if file:
        text = ""
        try:
            if file.type in ["image/jpeg", "image/png", "image/jpg"]:
                image = Image.open(file)
                text = extract_text_from_image(image, image_to_text)
            elif file.type == "application/pdf":
                text = extract_text_from_pdf(file)
            elif file.type == "text/plain":
                text = file.read().decode("utf-8")
            if text:
                with st.spinner("Analyzing the report..."):
                    # Generate summary (truncate inputs that exceed the model's limit)
                    summary = summarizer(text, max_length=130, min_length=30, truncation=True)[0]["summary_text"]
                    # Generate translations
                    hindi_translation = process_chunks(text, translator_hi)
                    urdu_translation = process_chunks(text, translator_ur)
                    # Display results
                    st.subheader("Original Text:")
                    st.write(text)
                    st.subheader("Analysis Summary (English):")
                    st.write(summary)
                    st.subheader("Hindi Translation:")
                    st.write(hindi_translation)
                    st.subheader("Urdu Translation:")
                    st.write(urdu_translation)
            else:
                st.warning("No text could be extracted. Please check the file and try again.")
        except Exception as e:
            logging.error(f"Error processing the file: {e}")
            st.error(f"An error occurred while processing the file: {e}")
    else:
        st.info("Please upload a file to begin.")


if __name__ == "__main__":
    main()
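
# Usage sketch (assumptions: the file is saved as app.py and the packages below
# are installed locally; package names are the usual PyPI ones, adjust to your setup):
#   pip install streamlit transformers torch pillow pymupdf
#   streamlit run app.py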