mdasad3617 committed on
Commit
ca69a0e
·
verified ·
1 Parent(s): 51a6db9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -79
app.py CHANGED
@@ -1,94 +1,60 @@
1
- import streamlit as st
2
- from transformers import pipeline
3
  import pytesseract
4
  from PIL import Image
5
- import fitz # PyMuPDF for PDF processing
6
- import logging
7
- from concurrent.futures import ThreadPoolExecutor
8
-
9
- # Setup logging
10
- def setup_logging():
11
- logging.basicConfig(
12
- level=logging.INFO,
13
- format="%(asctime)s - %(levelname)s - %(message)s",
14
- )
15
 
16
- # Load models globally for faster performance
17
- @st.cache_resource
18
- def load_models():
19
- logging.info("Loading Hugging Face models...")
20
- translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
21
- translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
22
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
23
- return translator_hi, translator_ur, summarizer
24
 
25
- # Function to extract text from images
26
  def extract_text_from_image(image):
27
- logging.info("Extracting text from image...")
28
- return pytesseract.image_to_string(image)
29
-
30
- # Function to extract text from PDFs
31
- def extract_text_from_pdf(pdf_file):
32
- logging.info("Extracting text from PDF...")
33
- doc = fitz.open(pdf_file)
34
- text = ""
35
- for page in doc:
36
- text += page.get_text()
37
- return text
38
-
39
- # Function to process text in chunks for better performance
40
- def process_chunks(text, model, chunk_size=500):
41
- chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
42
- results = []
43
- with ThreadPoolExecutor() as executor:
44
- results = list(executor.map(lambda chunk: model(chunk, max_length=200), chunks))
45
- return " ".join([result[0]["translation_text"] for result in results])
46
-
47
- # Main app logic
48
  def main():
49
- setup_logging()
50
  st.title("Lab Report Analyzer")
51
- st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")
52
-
53
- translator_hi, translator_ur, summarizer = load_models()
54
-
55
- file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])
56
- if file:
57
- text = ""
58
- try:
59
- if file.type in ["image/jpeg", "image/png", "image/jpg"]:
60
- image = Image.open(file)
61
- text = extract_text_from_image(image)
62
- elif file.type == "application/pdf":
63
- text = extract_text_from_pdf(file)
64
- elif file.type == "text/plain":
65
- text = file.read().decode("utf-8")
66
 
67
- if text:
68
- with st.spinner("Analyzing the report..."):
69
- # Generate summary
70
- summary = summarizer(text, max_length=130, min_length=30)[0]["summary_text"]
71
 
72
- # Generate translations
73
- hindi_translation = process_chunks(text, translator_hi)
74
- urdu_translation = process_chunks(text, translator_ur)
75
 
76
- # Display results
77
- st.subheader("Analysis Summary (English):")
78
- st.write(summary)
 
 
 
 
 
 
 
79
 
80
- st.subheader("Hindi Translation:")
81
- st.write(hindi_translation)
 
82
 
83
- st.subheader("Urdu Translation:")
84
- st.write(urdu_translation)
85
- else:
86
- st.warning("No text could be extracted. Please check the file and try again.")
87
- except Exception as e:
88
- logging.error(f"Error processing the file: {e}")
89
- st.error("An error occurred while processing the file. Please try again.")
90
- else:
91
- st.info("Please upload a file to begin.")
92
 
93
  if __name__ == "__main__":
94
  main()
 
1
+ import logging
 
2
  import pytesseract
3
  from PIL import Image
4
+ import os
5
+ import streamlit as st
 
 
 
 
 
 
 
 
6
 
7
+ # Configure logging to display debug information
8
+ logging.basicConfig(level=logging.DEBUG)
 
 
 
 
 
 
9
 
10
# Function to extract text from an image
def extract_text_from_image(image):
    """Run OCR on an opened PIL image and return the recognised text.

    The image is fully decoded first so that truncated or corrupt data is
    reported before OCR starts. It is then downscaled to half its size and
    passed to ``pytesseract.image_to_string``.

    Args:
        image: An opened ``PIL.Image.Image`` instance.

    Returns:
        The text produced by Tesseract, or a string of the form
        ``"Error: <message>"`` if any step raises.
    """
    try:
        logging.info("Starting text extraction from image...")

        # Force a full decode to detect corrupt data. Image.verify() is not
        # suitable here: Pillow documents that the file must be reopened
        # after verify(), so resizing the same object afterwards fails.
        image.load()
        logging.info("Image opened and verified successfully.")

        # Halve the resolution to speed up OCR, never dropping below 1 px so
        # that very small images do not produce a zero-sized target.
        half_size = (max(1, image.width // 2), max(1, image.height // 2))
        image = image.resize(half_size)

        # Extract text using pytesseract
        text = pytesseract.image_to_string(image)

        logging.info("Text extraction completed successfully.")
        return text

    except Exception as e:
        # Callers display the returned string, so the failure is reported
        # in-band rather than re-raised.
        logging.error(f"An error occurred while processing the image: {str(e)}")
        return f"Error: {str(e)}"
31
+
32
# Streamlit web application
def main():
    """Render the Lab Report Analyzer page and OCR an uploaded image.

    Shows a title and a file uploader restricted to JPG/PNG/JPEG files. When
    a file is uploaded it is opened with PIL, passed to
    ``extract_text_from_image``, and the result is displayed under an
    "Extracted Text" heading.
    """
    st.title("Lab Report Analyzer")
    st.markdown("Upload an image file to extract text from it.")

    # File uploader widget
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])
    if uploaded_file is None:
        return

    # Decode straight from the in-memory upload instead of writing it to a
    # fixed "temp_image.jpg" path: a shared filename is overwritten by
    # concurrent sessions and is left behind if processing raises.
    try:
        image = Image.open(uploaded_file)
    except OSError as e:
        # PIL raises an OSError subclass when the data is not a readable image.
        logging.error(f"Could not open the uploaded image: {e}")
        st.error("The uploaded file could not be read as an image.")
        return

    # The context manager closes the image once OCR is finished.
    with image:
        extracted_text = extract_text_from_image(image)

    # Display extracted text
    st.subheader("Extracted Text")
    st.text(extracted_text)
 
 
 
 
 
 
 
58
 
59
  if __name__ == "__main__":
60
  main()