Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,135 +1,121 @@
|
|
1 |
import os
|
2 |
-
import
|
3 |
import streamlit as st
|
4 |
-
from PIL import Image
|
5 |
from transformers import pipeline
|
6 |
-
|
|
|
7 |
|
8 |
-
|
|
|
|
|
9 |
@st.cache_resource(show_spinner=False)
|
10 |
-
def
|
11 |
try:
|
12 |
-
|
13 |
-
|
14 |
-
return ocr_pipe
|
15 |
except Exception as e:
|
16 |
-
st.error(f"Error loading model: {e}")
|
17 |
st.stop()
|
18 |
|
19 |
-
|
20 |
-
st.write("
|
21 |
|
22 |
#####################################
|
23 |
-
# Text
|
24 |
#####################################
|
25 |
def extract_text_from_file(file_obj):
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
if
|
|
|
30 |
try:
|
31 |
-
|
32 |
-
full_text = extract_text(file_obj)
|
33 |
except Exception as e:
|
34 |
-
|
35 |
-
|
|
|
36 |
try:
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
full_text = result[0]["text"]
|
41 |
except Exception as e:
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
#####################################
|
46 |
-
#
|
47 |
#####################################
|
48 |
-
def
|
49 |
"""
|
50 |
-
|
51 |
-
|
52 |
-
- Age
|
53 |
-
- Job Experience
|
54 |
-
- Skills
|
55 |
-
- Expected Industry/Direction
|
56 |
"""
|
57 |
-
|
58 |
-
"
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
if name_match:
|
68 |
-
info["Name"] = name_match.group(1).strip()
|
69 |
-
else:
|
70 |
-
# Fallback: heuristic for sequences of capitalized words.
|
71 |
-
potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
|
72 |
-
if potential_names:
|
73 |
-
info["Name"] = potential_names[0]
|
74 |
-
|
75 |
-
# Extract Age:
|
76 |
-
age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
|
77 |
-
if age_match:
|
78 |
-
info["Age"] = age_match.group(1)
|
79 |
-
|
80 |
-
# Extract Job Experience (years)
|
81 |
-
exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
|
82 |
-
if exp_match:
|
83 |
-
info["Job Experience"] = exp_match.group(1) + " years"
|
84 |
-
else:
|
85 |
-
exp_line = re.search(r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE)
|
86 |
-
if exp_line:
|
87 |
-
info["Job Experience"] = exp_line.group(2).strip()
|
88 |
-
|
89 |
-
# Extract Skills (e.g., "Skills: Python, Java, SQL")
|
90 |
-
skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
|
91 |
-
if skills_match:
|
92 |
-
skills_text = skills_match.group(1)
|
93 |
-
skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
|
94 |
-
info["Skills"] = skills
|
95 |
-
|
96 |
-
# Extract Expected Industry/Direction (e.g., "Interest: Software Development")
|
97 |
-
industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
|
98 |
-
if industry_match:
|
99 |
-
info["Expected Industry/Direction"] = industry_match.group(2).strip()
|
100 |
-
|
101 |
-
return info
|
102 |
|
103 |
#####################################
|
104 |
-
# Main
|
105 |
#####################################
|
106 |
def process_resume(file_obj):
|
107 |
if file_obj is None:
|
108 |
return None, None
|
109 |
|
110 |
-
# Extract text based on file type (PDF or image).
|
111 |
resume_text = extract_text_from_file(file_obj)
|
112 |
-
|
113 |
-
|
114 |
-
return resume_text, resume_info
|
115 |
|
116 |
#####################################
|
117 |
# Streamlit Interface
|
118 |
#####################################
|
119 |
-
st.title("Resume
|
120 |
-
st.markdown(
|
121 |
-
|
122 |
-
|
|
|
|
|
|
|
123 |
|
124 |
-
uploaded_file = st.file_uploader("Upload Resume", type=["
|
125 |
|
126 |
-
if st.button("
|
127 |
if uploaded_file is None:
|
128 |
st.error("Please upload a file first.")
|
129 |
else:
|
130 |
with st.spinner("Processing..."):
|
131 |
-
resume_text,
|
132 |
st.subheader("Extracted Resume Text")
|
133 |
-
st.text_area("", resume_text, height=
|
134 |
-
st.subheader("
|
135 |
-
st.
|
|
|
1 |
import os
|
2 |
+
import tempfile
|
3 |
import streamlit as st
|
|
|
4 |
from transformers import pipeline
|
5 |
+
import docx
|
6 |
+
import textract
|
7 |
|
8 |
+
#####################################
|
9 |
+
# Summarization Pipeline Setup
|
10 |
+
#####################################
|
11 |
@st.cache_resource(show_spinner=False)
def load_summarization_pipeline(model_name="recogna-nlp/ptt5-base-summ-xlsum"):
    """Build the Hugging Face summarization pipeline used by the app.

    The function is wrapped in ``st.cache_resource`` so the model is not
    reloaded on every Streamlit rerun.

    Args:
        model_name: Hub identifier of the summarization model. The default is
            the model the app has always used, so existing zero-argument
            callers are unaffected.

    Returns:
        The ``transformers`` pipeline object created for the
        ``"summarization"`` task.

    If the model cannot be loaded, the error is rendered in the page with
    ``st.error`` and the script run is halted with ``st.stop()``; nothing is
    raised to the caller.
    """
    try:
        return pipeline("summarization", model=model_name)
    except Exception as e:
        # Top-level boundary: any load failure (network, missing weights,
        # incompatible versions) is reported in the UI rather than as a
        # traceback, then the run is stopped because nothing below can work.
        st.error(f"Error loading summarization model: {e}")
        st.stop()


# Module-level pipeline shared by summarize_text().
summarizer = load_summarization_pipeline()
st.write("Summarization model loaded successfully!")
|
22 |
|
23 |
#####################################
|
24 |
+
# Function to Extract Text from File
|
25 |
#####################################
|
26 |
def extract_text_from_file(file_obj):
    """Extract plain text from an uploaded .txt, .docx, or .doc file.

    Args:
        file_obj: Binary file-like object exposing ``name`` and ``read()``
            (presumably the Streamlit upload passed in by the caller).

    Returns:
        The extracted text as ``str``. Failures are not raised: they are
        returned as a human-readable ``"Error ..."`` message, and an
        unrecognised extension yields ``"Unsupported file type."``.
    """
    ext = os.path.splitext(file_obj.name)[1].lower()

    # Rewind first so a stream that was already consumed (for example by an
    # earlier call on the same object) is read from the beginning again
    # instead of silently yielding empty content.
    try:
        file_obj.seek(0)
    except (AttributeError, OSError, ValueError):
        pass  # Not seekable: read from the current position, as before.

    if ext == ".txt":
        try:
            # "utf-8-sig" drops a leading BOM; errors="replace" keeps the
            # readable part of files saved in a legacy encoding instead of
            # turning the whole document into an error message.
            return file_obj.read().decode("utf-8-sig", errors="replace")
        except Exception as e:
            return f"Error reading text file: {e}"

    if ext == ".docx":
        try:
            # python-docx accepts a file-like object directly.
            document = docx.Document(file_obj)
            return "\n".join(para.text for para in document.paragraphs)
        except Exception as e:
            return f"Error processing DOCX file: {e}"

    if ext == ".doc":
        # textract expects a path on disk, so spill the upload to a
        # temporary file and delete it afterwards.
        tmp_filename = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                # Record the path before writing so cleanup still happens if
                # the write itself fails.
                tmp_filename = tmp.name
                tmp.write(file_obj.read())
            # The file is closed (and therefore flushed) before textract
            # opens it by name.
            return textract.process(tmp_filename).decode("utf-8")
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            # Only clean up a file that was actually created; ignore a
            # failed delete, since the extraction result is unaffected.
            if tmp_filename is not None:
                try:
                    os.remove(tmp_filename)
                except OSError:
                    pass

    return "Unsupported file type."
|
68 |
|
69 |
#####################################
|
70 |
+
# Function to Summarize Extracted Text
|
71 |
#####################################
|
72 |
+
def summarize_text(text, max_length=150, min_length=40):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize. ``None``, empty, and whitespace-only values
            are all treated as "nothing to summarize".
        max_length: Upper bound passed to the pipeline for the generated
            summary length.
        min_length: Lower bound passed to the pipeline for the generated
            summary length.

    Returns:
        The generated summary, ``"No text available to summarize."`` when
        there is no input, or an ``"Error during summarization: ..."``
        message if the pipeline call fails. Nothing is raised.
    """
    if not text or not text.strip():
        return "No text available to summarize."

    try:
        # truncation=True asks the pipeline to clip inputs that exceed the
        # model's maximum input length instead of failing on long documents.
        # Only the leading part of a long document is then summarized;
        # chunking would be needed to cover all of it.
        summary = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return summary[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
#####################################
|
89 |
+
# Main Processing Logic
|
90 |
#####################################
|
91 |
def process_resume(file_obj):
    """Run the extract-then-summarize flow for one uploaded resume.

    Args:
        file_obj: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A ``(resume_text, summary_text)`` tuple, or ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is not None:
        extracted = extract_text_from_file(file_obj)
        return extracted, summarize_text(extracted)
    return None, None
|
|
|
98 |
|
99 |
#####################################
|
100 |
# Streamlit Interface
|
101 |
#####################################
|
102 |
+
# Page header and usage instructions.
st.title("Resume Summarization App")
st.markdown(
    """
Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
The app will extract the text content from your resume and generate a summarization.
"""
)

uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

# Processing is triggered by the button, not by the upload itself.
if st.button("Summarize Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, summary_text = process_resume(uploaded_file)

        # Each text area gets a real label, hidden with
        # label_visibility="collapsed", instead of an empty string: Streamlit
        # discourages empty labels for accessibility reasons. The visible
        # heading still comes from the subheader above each area.
        st.subheader("Extracted Resume Text")
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=250,
            label_visibility="collapsed",
        )
        st.subheader("Summarized Resume")
        st.text_area(
            "Summarized resume",
            summary_text,
            height=150,
            label_visibility="collapsed",
        )
|