CR7CAD committed on
Commit
7716c5c
·
verified ·
1 Parent(s): 4c77f62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -110
app.py CHANGED
@@ -1,98 +1,31 @@
1
  import os
2
  import tempfile
3
- import textwrap
4
  import streamlit as st
5
- from transformers import pipeline
6
  import docx
7
  import textract
8
 
9
- from PIL import Image, ImageDraw, ImageFont
10
-
11
- #####################################
12
- # Model Loading: Image-Text to Text
13
- #####################################
14
- @st.cache_resource(show_spinner=False)
15
- def load_image_to_text_pipeline():
16
- try:
17
- # Load the image-text to text model.
18
- model_pipeline = pipeline(
19
- "image-to-text",
20
- model="deepseek-ai/deepseek-vl2-tiny",
21
- trust_remote_code=True
22
- )
23
- return model_pipeline
24
- except Exception as e:
25
- st.error(f"Error loading image-to-text model: {e}")
26
- st.stop()
27
-
28
- model_pipeline = load_image_to_text_pipeline()
29
- st.write("Image-text to text model loaded successfully!")
30
-
31
- #####################################
32
- # Function: Convert Text to an Image
33
- #####################################
34
- def text_to_image(text, img_width=800, bg_color="white", text_color="black", font_size=20):
35
- """
36
- Convert a long text string into a PIL Image.
37
- The function wraps text so that it fits within the desired width.
38
- """
39
- # Load a default font.
40
- try:
41
- font = ImageFont.truetype("arial.ttf", font_size)
42
- except IOError:
43
- # Fallback to default PIL font if arial is not found.
44
- font = ImageFont.load_default()
45
-
46
- # Wrap the text into lines.
47
- wrapper = textwrap.TextWrapper(width=80)
48
- lines = wrapper.wrap(text=text)
49
- if not lines:
50
- lines = [" "]
51
-
52
- # Calculate the required image height.
53
- line_height = font.getsize("A")[1]
54
- img_height = line_height * (len(lines) + 2)
55
-
56
- # Create a new image with white background.
57
- img = Image.new("RGB", (img_width, img_height), color=bg_color)
58
- draw = ImageDraw.Draw(img)
59
-
60
- # Draw each line of text
61
- y_text = 10
62
- for line in lines:
63
- # Center text horizontally.
64
- text_width, _ = draw.textsize(line, font=font)
65
- x_text = (img_width - text_width) / 2
66
- draw.text((x_text, y_text), line, font=font, fill=text_color)
67
- y_text += line_height
68
- return img
69
-
70
  #####################################
71
  # Function: Extract Text from File
72
  #####################################
73
  def extract_text_from_file(file_obj):
74
  """
75
- Extract text from .txt, .docx, and .doc files.
 
76
  """
77
  filename = file_obj.name
78
  ext = os.path.splitext(filename)[1].lower()
79
  text = ""
80
-
81
- if ext == ".txt":
82
- try:
83
- text = file_obj.read().decode("utf-8")
84
- except Exception as e:
85
- text = f"Error reading text file: {e}"
86
-
87
- elif ext == ".docx":
88
  try:
89
  document = docx.Document(file_obj)
90
  text = "\n".join([para.text for para in document.paragraphs])
91
  except Exception as e:
92
  text = f"Error processing DOCX file: {e}"
93
-
94
  elif ext == ".doc":
95
  try:
 
96
  with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
97
  tmp.write(file_obj.read())
98
  tmp.flush()
@@ -107,58 +40,92 @@ def extract_text_from_file(file_obj):
107
  pass
108
  else:
109
  text = "Unsupported file type."
110
-
111
  return text
112
 
113
  #####################################
114
- # Function: Process Resume Using the Model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  #####################################
116
  def process_resume(file_obj):
117
  if file_obj is None:
118
  return None, None
119
-
120
- # Extract text from file.
121
  resume_text = extract_text_from_file(file_obj)
122
- if not resume_text.strip():
123
- return resume_text, "No text available to process."
124
-
125
- # Convert the extracted text to an image.
126
- text_image = text_to_image(resume_text)
127
-
128
- try:
129
- # Pass the generated image to the image-to-text model.
130
- result = model_pipeline(text_image)
131
- # The expected output is a list of dictionaries with key "generated_text".
132
- if isinstance(result, list) and "generated_text" in result[0]:
133
- processed_text = result[0]["generated_text"]
134
- else:
135
- processed_text = "Unexpected model output format."
136
- except Exception as e:
137
- processed_text = f"Error during model inference: {e}"
138
-
139
- return resume_text, processed_text
140
 
141
  #####################################
142
  # Streamlit Interface
143
  #####################################
144
- st.title("Resume Processing App")
145
- st.markdown(
146
- """
147
- Upload your resume file supported formats: **.doc**, **.docx**, and **.txt**.
148
- The app will extract the text content from your resume, convert it to an image,
149
- and then use the image-text to text model to process it.
150
- """
151
- )
152
 
153
- uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])
154
 
155
- if st.button("Process Resume"):
156
  if uploaded_file is None:
157
  st.error("Please upload a file first.")
158
  else:
159
- with st.spinner("Processing..."):
160
- resume_text, processed_text = process_resume(uploaded_file)
 
161
  st.subheader("Extracted Resume Text")
162
- st.text_area("", resume_text, height=250)
163
- st.subheader("Model Output")
164
- st.text_area("", processed_text, height=150)
 
 
1
  import os
2
  import tempfile
3
+ import re
4
  import streamlit as st
 
5
  import docx
6
  import textract
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  #####################################
9
  # Function: Extract Text from File
10
  #####################################
11
  def extract_text_from_file(file_obj):
12
  """
13
+ Extract text from .doc and .docx files.
14
+ Returns the extracted text or an error message if extraction fails.
15
  """
16
  filename = file_obj.name
17
  ext = os.path.splitext(filename)[1].lower()
18
  text = ""
19
+
20
+ if ext == ".docx":
 
 
 
 
 
 
21
  try:
22
  document = docx.Document(file_obj)
23
  text = "\n".join([para.text for para in document.paragraphs])
24
  except Exception as e:
25
  text = f"Error processing DOCX file: {e}"
 
26
  elif ext == ".doc":
27
  try:
28
+ # textract requires a filename, so create a temporary file.
29
  with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
30
  tmp.write(file_obj.read())
31
  tmp.flush()
 
40
  pass
41
  else:
42
  text = "Unsupported file type."
43
+
44
  return text
45
 
46
  #####################################
47
+ # Function: Extract Basic Resume Information
48
+ #####################################
49
def extract_basic_resume_info(text):
    """
    Summarize basic details found in resume text using regex heuristics.

    Args:
        text: Plain text extracted from a resume. ``None`` or an empty
            string is accepted and yields a result with every field unset.

    Returns:
        A dict with the keys "Name", "Age", "Work Experience" and
        "Expected Industry/Direction". Each value is a string when a
        matching pattern was found in ``text`` and ``None`` otherwise.
    """
    info = {
        "Name": None,
        "Age": None,
        "Work Experience": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Name from an explicit label (e.g. "Name: John Doe"). The leading \b keeps
    # words such as "Username:" from matching, and the value is limited to
    # spaces/tabs so it cannot run past the end of the line into the next field.
    name_match = re.search(
        r"\bname[:\-][ \t]*([A-Za-z][A-Za-z \t]*)", text, re.IGNORECASE
    )
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback heuristic: the first run of two or three capitalized words
        # on a single line (no joining of words across line breaks).
        potential_name = re.search(
            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+){1,2}\b", text
        )
        if potential_name:
            info["Name"] = potential_name.group(0)

    # Age from a label such as "Age: 28". \b rejects "Page: 12"; the negative
    # lookahead rejects numbers with more than two digits instead of
    # truncating them.
    age_match = re.search(r"\bage[:\-]\s*(\d{1,2})(?!\d)", text, re.IGNORECASE)
    if age_match:
        info["Age"] = age_match.group(1)

    # Work experience stated as a duration, e.g. "5 years of experience",
    # "10+ yrs experience" or "1 year of experience".
    exp_match = re.search(
        r"(\d+)\+?\s*(years?|yrs?)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Work Experience"] = f"{exp_match.group(1)} {exp_match.group(2)}"
    else:
        # Otherwise take the text following an "Experience:"/"Background:" label.
        exp_line = re.search(
            r"\b(?:Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            # An empty remainder carries no information; leave the field unset.
            info["Work Experience"] = exp_line.group(1).strip() or None

    # Expected industry/direction
    # (e.g. "Interest: Software Development" or "Expected Industry: Healthcare").
    industry_match = re.search(
        r"\b(?:Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(1).strip()

    return info
96
+
97
+ #####################################
98
+ # Main Resume Processing Logic
99
  #####################################
100
def process_resume(file_obj):
    """
    Run the resume pipeline: text extraction followed by basic-info parsing.

    Returns a ``(resume_text, basic_info)`` pair, or ``(None, None)`` when no
    file object was supplied.
    """
    if file_obj is not None:
        # Pull the raw text out of the uploaded document, then summarize it.
        extracted = extract_text_from_file(file_obj)
        return extracted, extract_basic_resume_info(extracted)
    return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  #####################################
110
  # Streamlit Interface
111
  #####################################
112
# Page header and usage instructions.
st.title("Resume Summary App")
st.markdown("""
Upload your resume file (supported formats: **.doc** or **.docx**) to extract and summarize its content.
The basic details, including name, age, work experience, and expected industry, will be displayed along with the full text content.
""")

uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx"])

if st.button("Extract Information"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Extracting information..."):
            resume_text, resume_info = process_resume(uploaded_file)

        st.subheader("Extracted Resume Text")
        # Streamlit discourages empty widget labels for accessibility reasons;
        # give the widget a real label and hide it, since the subheader above
        # already serves as the visible caption.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=300,
            label_visibility="collapsed",
        )

        st.subheader("Basic Resume Information")
        st.json(resume_info)