CR7CAD committed on
Commit
92f45fe
·
verified ·
1 Parent(s): 08361f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -93
app.py CHANGED
@@ -1,135 +1,121 @@
1
  import os
2
- import re
3
  import streamlit as st
4
- from PIL import Image
5
  from transformers import pipeline
6
- from pdfminer.high_level import extract_text
 
7
 
8
- # Load and cache the OCR model once at startup.
 
 
9
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Create the image-to-text pipeline used for OCR.

    The result is cached by ``st.cache_resource``. If construction fails,
    the error is reported in the UI and the current script run is stopped.
    """
    model_name = "YouLiXiya/tinyllava-v1.0-1.1b-hf"
    try:
        return pipeline("image-to-text", model=model_name)
    except Exception as load_error:
        st.error(f"Error loading model: {load_error}")
        st.stop()


# Instantiate once at import time so the functions below can use it.
ocr_pipeline = load_ocr_pipeline()
st.write("Model loaded successfully!")
21
 
22
  #####################################
23
- # Text Extraction Function
24
  #####################################
25
def extract_text_from_file(file_obj):
    """Extract text from an uploaded PDF or image file.

    Files whose name ends in ``.pdf`` are handed to pdfminer's
    ``extract_text``; every other file is opened as an image and passed to
    the OCR pipeline.

    Args:
        file_obj: File-like object exposing a ``name`` attribute.

    Returns:
        The extracted text, an empty string when the pipeline produces no
        usable output, or an ``"Error processing ..."`` message on failure.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    try:
        result = ocr_pipeline(Image.open(file_obj))
    except Exception as e:
        return f"Error processing image: {e}"

    # An empty or unexpectedly shaped result is "no text", not an error.
    if not isinstance(result, list) or not result:
        return ""
    first = result[0]
    if not isinstance(first, dict):
        return ""

    # The transformers image-to-text pipeline reports its output under
    # "generated_text"; "text" is still honoured for backward compatibility.
    for key in ("text", "generated_text"):
        if key in first:
            return first[key]
    return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  #####################################
46
- # Resume Information Extraction Functions
47
  #####################################
48
def extract_basic_resume_info(text):
    """Extract basic resume fields from plain text using regex heuristics.

    Args:
        text: Resume text. ``None`` or an empty string yields all-``None``
            fields.

    Returns:
        A dict with the keys ``"Name"``, ``"Age"``, ``"Job Experience"``,
        ``"Skills"`` and ``"Expected Industry/Direction"``. ``"Skills"`` is a
        list of strings when found; the other fields are strings. Any field
        that cannot be found is ``None``.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Name: a "Name:" / "name-" trigger. The captured value is limited to
    # letters, spaces and tabs so it cannot spill onto the next line, and the
    # leading \b keeps the trigger from matching inside e.g. "Username:".
    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: first run of two or more capitalized words.
        potential_names = re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b", text)
        if potential_names:
            info["Name"] = potential_names[0]

    # Age: \b prevents matching inside words such as "Page:".
    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Job experience: prefer an explicit "<n> years of experience" phrase,
    # otherwise fall back to the rest of an "Experience:" / "Background:" line.
    exp_match = re.search(
        r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        exp_line = re.search(r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE)
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Skills, e.g. "Skills: Python, Java, SQL" -> ["Python", "Java", "SQL"].
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        info["Skills"] = [
            skill.strip()
            for skill in re.split(r",|\n", skills_match.group(1))
            if skill.strip()
        ]

    # Expected industry/direction, e.g. "Interest: Software Development".
    industry_match = re.search(
        r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
102
 
103
  #####################################
104
- # Main Resume Processing Logic
105
  #####################################
106
def process_resume(file_obj):
    """Run text extraction and basic-info parsing on an uploaded resume.

    Returns a ``(resume_text, resume_info)`` pair, or ``(None, None)`` when
    no file was supplied.
    """
    if file_obj is not None:
        # Text first (PDF or image), then parse the fields out of that text.
        extracted = extract_text_from_file(file_obj)
        return extracted, extract_basic_resume_info(extracted)
    return None, None
115
 
116
  #####################################
117
  # Streamlit Interface
118
  #####################################
119
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
""")

uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # Streamlit discourages empty widget labels; give the widget a real
        # label and hide it, since the subheader already serves as the heading.
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)
 
1
  import os
2
+ import tempfile
3
  import streamlit as st
 
4
  from transformers import pipeline
5
+ import docx
6
+ import textract
7
 
8
+ #####################################
9
+ # Summarization Pipeline Setup
10
+ #####################################
11
@st.cache_resource(show_spinner=False)
def load_summarization_pipeline():
    """Create the summarization pipeline.

    The result is cached by ``st.cache_resource``. If construction fails,
    the error is reported in the UI and the current script run is stopped.
    """
    model_name = "recogna-nlp/ptt5-base-summ-xlsum"
    try:
        return pipeline("summarization", model=model_name)
    except Exception as load_error:
        st.error(f"Error loading summarization model: {load_error}")
        st.stop()


# Instantiate once at import time so summarize_text can use it.
summarizer = load_summarization_pipeline()
st.write("Summarization model loaded successfully!")
22
 
23
  #####################################
24
+ # Function to Extract Text from File
25
  #####################################
26
def extract_text_from_file(file_obj):
    """Extract text from an uploaded .txt, .docx, or .doc file.

    Args:
        file_obj: Binary file-like object exposing ``name`` and ``read()``.

    Returns:
        The extracted text. On failure an ``"Error ..."`` message is returned
        instead of raising, and ``"Unsupported file type."`` is returned for
        any other extension.
    """
    ext = os.path.splitext(file_obj.name)[1].lower()

    # Rewind in case the stream was already consumed by an earlier read.
    if hasattr(file_obj, "seek"):
        file_obj.seek(0)

    if ext == ".txt":
        try:
            # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM,
            # which would otherwise end up in the extracted text.
            return file_obj.read().decode("utf-8-sig")
        except Exception as e:
            return f"Error reading text file: {e}"

    if ext == ".docx":
        try:
            # python-docx accepts a file-like object directly.
            document = docx.Document(file_obj)
            return "\n".join(para.text for para in document.paragraphs)
        except Exception as e:
            return f"Error processing DOCX file: {e}"

    if ext == ".doc":
        # textract works on a path, so spill the upload to a temporary file.
        tmp_filename = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                tmp_filename = tmp.name
                tmp.write(file_obj.read())
            # The file is closed (and flushed) before textract opens it; an
            # open NamedTemporaryFile cannot be reopened on every platform.
            return textract.process(tmp_filename).decode("utf-8")
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            # Best-effort cleanup; only attempted if the file was created.
            if tmp_filename is not None:
                try:
                    os.remove(tmp_filename)
                except OSError:
                    pass

    return "Unsupported file type."
68
 
69
  #####################################
70
+ # Function to Summarize Extracted Text
71
  #####################################
72
def summarize_text(text):
    """Summarize ``text`` with the module-level summarization pipeline.

    Args:
        text: Text to summarize. ``None``, empty, or whitespace-only input is
            not sent to the model.

    Returns:
        The generated summary, ``"No text available to summarize."`` when
        there is nothing to summarize, or an
        ``"Error during summarization: ..."`` message if the pipeline raises.
    """
    if not text or not text.strip():
        return "No text available to summarize."

    try:
        # truncation=True clips inputs longer than the model's maximum length
        # instead of failing on long documents. For full coverage of long
        # resumes, consider summarizing in chunks.
        summary = summarizer(
            text,
            max_length=150,
            min_length=40,
            do_sample=False,
            truncation=True,
        )
        return summary[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  #####################################
89
+ # Main Processing Logic
90
  #####################################
91
def process_resume(file_obj):
    """Extract the text of an uploaded resume and summarize it.

    Returns a ``(resume_text, summary_text)`` pair, or ``(None, None)`` when
    no file was supplied.
    """
    if file_obj is not None:
        extracted = extract_text_from_file(file_obj)
        return extracted, summarize_text(extracted)
    return None, None
 
98
 
99
  #####################################
100
  # Streamlit Interface
101
  #####################################
102
st.title("Resume Summarization App")
st.markdown(
    """
Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
The app will extract the text content from your resume and generate a summarization.
"""
)

uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

if st.button("Summarize Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, summary_text = process_resume(uploaded_file)
        # Streamlit discourages empty widget labels. Each text area gets a
        # real label, hidden because the subheader already titles it, and an
        # explicit key so the two widgets can never be confused.
        st.subheader("Extracted Resume Text")
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=250,
            label_visibility="collapsed",
            key="resume_text_area",
        )
        st.subheader("Summarized Resume")
        st.text_area(
            "Summarized resume",
            summary_text,
            height=150,
            label_visibility="collapsed",
            key="summary_text_area",
        )