mdasad3617 committed on
Commit
fcfc162
·
verified ·
1 Parent(s): 82535dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -78
app.py CHANGED
@@ -1,105 +1,94 @@
1
  import streamlit as st
2
  from transformers import pipeline
3
- from PIL import Image
4
  import pytesseract
 
 
5
  import logging
6
- import PyPDF2
7
 
8
  # Setup logging
9
def setup_logging():
    """Configure root logging: INFO level, timestamped format, stream handler."""
    console = logging.StreamHandler()
    options = {
        "level": logging.INFO,
        "format": "%(asctime)s - %(levelname)s - %(message)s",
        "handlers": [console],
    }
    logging.basicConfig(**options)
15
 
16
- # Text extraction from image
 
 
 
 
 
 
 
 
 
17
def extract_text_from_image(image):
    """Run OCR on ``image`` and return the recognised text.

    Args:
        image: The image passed straight to ``pytesseract.image_to_string``.

    Returns:
        The OCR text, or the fixed message
        ``"Error occurred during text extraction."`` if OCR raised.
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as e:
        # logging.exception keeps the traceback, which logging.error discarded.
        logging.exception("Error during OCR: %s", e)
        return "Error occurred during text extraction."
24
 
25
- # Text extraction from PDF
26
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The object handed to ``PyPDF2.PdfReader``.

    Returns:
        The concatenated page text, or the fixed message
        ``"Error occurred during text extraction."`` if reading failed.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page yielding no text object, which would
        # otherwise abort the whole document with a TypeError.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        logging.exception("Error during PDF text extraction: %s", e)
        return "Error occurred during text extraction."
36
 
37
- # Main function
 
 
 
 
 
 
 
 
38
def main():
    """Streamlit entry point: extract text, summarize it, translate the summary.

    Text comes from an uploaded image/PDF or from the pasted text area. The
    English summary is shown first, followed by Hindi and Urdu translations
    of that summary.
    """
    setup_logging()
    st.title("Lab Report Analyzer")
    st.write("Analyze lab reports from images, PDFs, or text and get summaries in English, Hindi, and Urdu.")

    # Hugging Face pipelines
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # Summarization model
    translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")  # English to Hindi
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")  # English to Urdu

    # File upload section
    uploaded_file = st.file_uploader("Upload a file (Image or PDF):", type=["png", "jpg", "jpeg", "pdf"])
    text_input = st.text_area("Or paste your text here:")

    if not st.button("Analyze"):
        return

    # Extract text based on file type
    if uploaded_file:
        # Compare case-insensitively so "REPORT.PDF" is not treated as an image.
        if uploaded_file.name.lower().endswith(".pdf"):
            st.info("Extracting text from PDF...")
            extracted_text = extract_text_from_pdf(uploaded_file)
        else:
            st.info("Extracting text from image...")
            image = Image.open(uploaded_file)
            extracted_text = extract_text_from_image(image)
    elif text_input:
        extracted_text = text_input
    else:
        st.warning("Please upload a file or enter text.")
        return

    # Display extracted text
    st.subheader("Extracted Text")
    st.text_area("Extracted Text:", extracted_text, height=200)

    # Summarize the text
    try:
        st.info("Summarizing text...")
        # truncation=True keeps long reports within the model's input limit.
        summary = summarizer(
            extracted_text,
            max_length=150,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        st.subheader("Summary (English)")
        st.write(summary)
    except Exception as e:
        logging.exception("Error during summarization: %s", e)
        st.error("An error occurred during summarization.")
        # Without a summary there is nothing to translate; stopping here
        # avoids a misleading NameError in the translation steps.
        return

    # Translate the summary; each language fails independently.
    for language, translator in (("Hindi", translator_hi), ("Urdu", translator_ur)):
        try:
            st.info(f"Translating summary to {language}...")
            translated = translator(summary)[0]["translation_text"]
            st.subheader(f"Summary ({language})")
            st.write(translated)
        except Exception as e:
            logging.exception("Error during %s translation: %s", language, e)
            st.error(f"An error occurred during {language} translation.")


if __name__ == "__main__":
    main()
 
1
  import streamlit as st
2
  from transformers import pipeline
 
3
  import pytesseract
4
+ from PIL import Image
5
+ import fitz # PyMuPDF for PDF processing
6
  import logging
7
+ from concurrent.futures import ThreadPoolExecutor
8
 
9
  # Setup logging
10
def setup_logging():
    """Configure root logging at INFO level with a timestamped format."""
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
15
 
16
+ # Load models globally for faster performance
17
@st.cache_resource
def load_models():
    """Build the three Hugging Face pipelines used by the app.

    Returns:
        A tuple ``(translator_hi, translator_ur, summarizer)``.
    """
    logging.info("Loading Hugging Face models...")
    # (task, model id) in the order the pipelines are created and returned.
    specs = (
        ("translation", "Helsinki-NLP/opus-mt-en-hi"),
        ("translation", "Helsinki-NLP/opus-mt-en-ur"),
        ("summarization", "facebook/bart-large-cnn"),
    )
    to_hindi, to_urdu, summarize = [pipeline(task, model=name) for task, name in specs]
    return to_hindi, to_urdu, summarize
24
+
25
+ # Function to extract text from images
26
def extract_text_from_image(image):
    """Log the OCR step and return the text pytesseract finds in ``image``."""
    logging.info("Extracting text from image...")
    recognised = pytesseract.image_to_string(image)
    return recognised
 
 
 
 
29
 
30
+ # Function to extract text from PDFs
31
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF with PyMuPDF.

    Args:
        pdf_file: Either a file-like object exposing ``read()`` (such as the
            upload passed in by ``main``) or a filesystem path.

    Returns:
        The concatenated text of all pages.
    """
    logging.info("Extracting text from PDF...")
    if hasattr(pdf_file, "read"):
        # The positional argument of fitz.open is a filename; an in-memory
        # upload has to be opened from its bytes with an explicit file type.
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        doc = fitz.open(pdf_file)
    # The context manager closes the document once the text is collected.
    with doc:
        return "".join(page.get_text() for page in doc)
 
 
 
38
 
39
+ # Function to process text in chunks for better performance
40
def _split_on_whitespace(text, chunk_size):
    """Pack whole words into chunks of at most ``chunk_size`` characters."""
    chunks = []
    current = []
    length = 0
    for word in text.split():
        # A single word longer than the limit cannot be kept whole: flush
        # what is pending, then hard-split the word.
        while len(word) > chunk_size:
            if current:
                chunks.append(" ".join(current))
                current, length = [], 0
            chunks.append(word[:chunk_size])
            word = word[chunk_size:]
        extra = len(word) + (1 if current else 0)  # +1 for the joining space
        if current and length + extra > chunk_size:
            chunks.append(" ".join(current))
            current, length = [], 0
            extra = len(word)
        current.append(word)
        length += extra
    if current:
        chunks.append(" ".join(current))
    return chunks


def process_chunks(text, model, chunk_size=500, max_length=200):
    """Translate ``text`` chunk by chunk and join the results.

    Chunks are built on word boundaries so no word is cut in half before it
    reaches the model; only a word longer than ``chunk_size`` is split.

    Args:
        text: The text to process.
        model: Callable invoked as ``model(chunk, max_length=...)`` that
            returns a list whose first item has a ``"translation_text"`` key.
        chunk_size: Maximum number of characters per chunk; must be positive.
        max_length: Forwarded to ``model`` for every chunk.

    Returns:
        The per-chunk translations joined with single spaces, or ``""`` when
        ``text`` is empty or whitespace only.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    chunks = _split_on_whitespace(text, chunk_size)
    if not chunks:
        return ""
    # executor.map yields results in input order, so chunk order is kept.
    with ThreadPoolExecutor() as executor:
        results = list(
            executor.map(lambda chunk: model(chunk, max_length=max_length), chunks)
        )
    return " ".join(result[0]["translation_text"] for result in results)
46
+
47
+ # Main app logic
48
def main():
    """Streamlit entry point: read an upload, summarize it and translate it.

    Supported uploads are JPEG/PNG images (OCR), PDFs and plain-text files.
    The English summary and the Hindi and Urdu translations of the extracted
    text are rendered on the page.
    """
    setup_logging()
    st.title("Lab Report Analyzer")
    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")

    translator_hi, translator_ur, summarizer = load_models()

    file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])
    if not file:
        st.info("Please upload a file to begin.")
        return

    try:
        text = ""
        if file.type in ["image/jpeg", "image/png", "image/jpg"]:
            image = Image.open(file)
            text = extract_text_from_image(image)
        elif file.type == "application/pdf":
            text = extract_text_from_pdf(file)
        elif file.type == "text/plain":
            text = file.read().decode("utf-8")

        # Whitespace-only output is truthy, so test the stripped text to keep
        # blank scans away from the models.
        if not text.strip():
            st.warning("No text could be extracted. Please check the file and try again.")
            return

        with st.spinner("Analyzing the report..."):
            # Generate summary; truncation=True keeps long reports within
            # the model's input limit.
            summary = summarizer(
                text, max_length=130, min_length=30, truncation=True
            )[0]["summary_text"]

            # Generate translations
            hindi_translation = process_chunks(text, translator_hi)
            urdu_translation = process_chunks(text, translator_ur)

            # Display results
            st.subheader("Analysis Summary (English):")
            st.write(summary)

            st.subheader("Hindi Translation:")
            st.write(hindi_translation)

            st.subheader("Urdu Translation:")
            st.write(urdu_translation)
    except Exception as e:
        # logging.exception records the traceback as well as the message.
        logging.exception("Error processing the file: %s", e)
        st.error("An error occurred while processing the file. Please try again.")


if __name__ == "__main__":
    main()