CR7CAD committed on
Commit
9753cc9
·
verified ·
1 Parent(s): 294af95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -18
app.py CHANGED
@@ -1,13 +1,11 @@
1
  import os
2
  import re
3
- import torch # Explicitly imported if you want to use torch directly
4
- from io import BytesIO
5
-
6
  import streamlit as st
7
  from PIL import Image
8
  from transformers import pipeline
 
9
 
10
- # Use st.cache_resource (Streamlit 1.18+) to load and cache the OCR pipeline once
11
  @st.cache_resource(show_spinner=False)
12
  def load_ocr_pipeline():
13
  try:
@@ -18,22 +16,30 @@ def load_ocr_pipeline():
18
  st.error(f"Error loading model: {e}")
19
  st.stop()
20
 
21
- # Load the model at startup
22
  ocr_pipeline = load_ocr_pipeline()
23
  st.write("Model loaded successfully!")
24
 
25
  #####################################
26
- # Pipeline: Extract Text with OCR Pipeline
27
  #####################################
28
  def extract_text_from_file(file_obj):
29
  full_text = ""
30
- try:
31
- img = Image.open(file_obj)
32
- result = ocr_pipeline(img)
33
- if isinstance(result, list) and "text" in result[0]:
34
- full_text = result[0]["text"]
35
- except Exception as e:
36
- full_text = f"Error processing image: {e}"
 
 
 
 
 
 
 
 
 
37
  return full_text
38
 
39
  #####################################
@@ -94,9 +100,9 @@ def process_resume(file_obj):
94
  if file_obj is None:
95
  return None, None
96
 
97
- # Extract text using only the image-based OCR pipeline
98
  resume_text = extract_text_from_file(file_obj)
99
- # Parse basic resume info
100
  resume_info = extract_basic_resume_info(resume_text)
101
  return resume_text, resume_info
102
 
@@ -105,14 +111,14 @@ def process_resume(file_obj):
105
  #####################################
106
  st.title("Resume Extraction and Basic Info Parsing")
107
  st.markdown("""
108
- Upload an image file (PNG, JPG, or JPEG) to extract basic text and candidate information.
109
  """)
110
 
111
- uploaded_file = st.file_uploader("Upload Resume (Image Only)", type=["png", "jpg", "jpeg"])
112
 
113
  if st.button("Extract Info"):
114
  if uploaded_file is None:
115
- st.error("Please upload an image file first.")
116
  else:
117
  with st.spinner("Processing..."):
118
  resume_text, resume_info = process_resume(uploaded_file)
 
1
  import os
2
  import re
 
 
 
3
  import streamlit as st
4
  from PIL import Image
5
  from transformers import pipeline
6
+ from pdfminer.high_level import extract_text
7
 
8
+ # Load and cache the OCR model once at startup
9
  @st.cache_resource(show_spinner=False)
10
  def load_ocr_pipeline():
11
  try:
 
16
  st.error(f"Error loading model: {e}")
17
  st.stop()
18
 
 
19
  ocr_pipeline = load_ocr_pipeline()
20
  st.write("Model loaded successfully!")
21
 
22
  #####################################
23
+ # Extract Text from File Function
24
  #####################################
25
  def extract_text_from_file(file_obj):
26
  full_text = ""
27
+ file_extension = os.path.splitext(file_obj.name)[1].lower()
28
+
29
+ if file_extension == ".pdf":
30
+ try:
31
+ # Use pdfminer.six to extract text from PDF files.
32
+ full_text = extract_text(file_obj)
33
+ except Exception as e:
34
+ full_text = f"Error processing PDF: {e}"
35
+ else:
36
+ try:
37
+ img = Image.open(file_obj)
38
+ result = ocr_pipeline(img)
39
+ if isinstance(result, list) and "text" in result[0]:
40
+ full_text = result[0]["text"]
41
+ except Exception as e:
42
+ full_text = f"Error processing image: {e}"
43
  return full_text
44
 
45
  #####################################
 
100
  if file_obj is None:
101
  return None, None
102
 
103
+ # Extract text based on file type (PDF or image)
104
  resume_text = extract_text_from_file(file_obj)
105
+ # Parse basic resume info using heuristics
106
  resume_info = extract_basic_resume_info(resume_text)
107
  return resume_text, resume_info
108
 
 
111
  #####################################
112
  st.title("Resume Extraction and Basic Info Parsing")
113
  st.markdown("""
114
+ Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
115
  """)
116
 
117
+ uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])
118
 
119
  if st.button("Extract Info"):
120
  if uploaded_file is None:
121
+ st.error("Please upload a file first.")
122
  else:
123
  with st.spinner("Processing..."):
124
  resume_text, resume_info = process_resume(uploaded_file)