CR7CAD committed on
Commit
4c77f62
·
verified ·
1 Parent(s): 67a9893

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -43
app.py CHANGED
@@ -1,33 +1,74 @@
1
  import os
2
  import tempfile
 
3
  import streamlit as st
4
  from transformers import pipeline
5
  import docx
6
  import textract
7
 
 
 
8
  #####################################
9
- # Summarization Pipeline Setup
10
  #####################################
11
  @st.cache_resource(show_spinner=False)
12
- def load_summarization_pipeline():
13
  try:
14
- # Initialize the summarization pipeline using the specified model.
15
- # Adding trust_remote_code=True allows loading models with custom code.
16
- summarizer = pipeline(
17
- "summarization",
18
- model="llava-hf/llava-interleave-qwen-0.5b-hf",
19
  trust_remote_code=True
20
  )
21
- return summarizer
22
  except Exception as e:
23
- st.error(f"Error loading summarization model: {e}")
24
  st.stop()
25
 
26
- summarizer = load_summarization_pipeline()
27
- st.write("Summarization model loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  #####################################
30
- # Function to Extract Text from File
31
  #####################################
32
  def extract_text_from_file(file_obj):
33
  """
@@ -38,7 +79,6 @@ def extract_text_from_file(file_obj):
38
  text = ""
39
 
40
  if ext == ".txt":
41
- # For text files, decode the byte stream into a string.
42
  try:
43
  text = file_obj.read().decode("utf-8")
44
  except Exception as e:
@@ -46,14 +86,12 @@ def extract_text_from_file(file_obj):
46
 
47
  elif ext == ".docx":
48
  try:
49
- # Use python-docx to read .docx files.
50
  document = docx.Document(file_obj)
51
  text = "\n".join([para.text for para in document.paragraphs])
52
  except Exception as e:
53
  text = f"Error processing DOCX file: {e}"
54
 
55
  elif ext == ".doc":
56
- # For .doc files, use textract. textract expects a filename, so save temporarily.
57
  try:
58
  with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
59
  tmp.write(file_obj.read())
@@ -73,55 +111,54 @@ def extract_text_from_file(file_obj):
73
  return text
74
 
75
  #####################################
76
- # Function to Summarize Extracted Text
77
- #####################################
78
- def summarize_text(text):
79
- """
80
- Summarize the given text using the summarization pipeline.
81
- Adjust max_length and min_length as needed.
82
- """
83
- if not text.strip():
84
- return "No text available to summarize."
85
-
86
- try:
87
- # The summarization pipeline might have limitations on text length.
88
- # For long documents, consider splitting the text into smaller chunks.
89
- summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
90
- return summary[0]["summary_text"]
91
- except Exception as e:
92
- return f"Error during summarization: {e}"
93
-
94
- #####################################
95
- # Main Processing Logic
96
  #####################################
97
  def process_resume(file_obj):
98
  if file_obj is None:
99
  return None, None
100
 
 
101
  resume_text = extract_text_from_file(file_obj)
102
- summary_text = summarize_text(resume_text)
103
- return resume_text, summary_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  #####################################
106
  # Streamlit Interface
107
  #####################################
108
- st.title("Resume Summarization App")
109
  st.markdown(
110
  """
111
  Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
112
- The app will extract the text content from your resume and generate a summary.
 
113
  """
114
  )
115
 
116
  uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])
117
 
118
- if st.button("Summarize Resume"):
119
  if uploaded_file is None:
120
  st.error("Please upload a file first.")
121
  else:
122
  with st.spinner("Processing..."):
123
- resume_text, summary_text = process_resume(uploaded_file)
124
  st.subheader("Extracted Resume Text")
125
  st.text_area("", resume_text, height=250)
126
- st.subheader("Summarized Resume")
127
- st.text_area("", summary_text, height=150)
 
1
  import os
2
  import tempfile
3
+ import textwrap
4
  import streamlit as st
5
  from transformers import pipeline
6
  import docx
7
  import textract
8
 
9
+ from PIL import Image, ImageDraw, ImageFont
10
+
11
  #####################################
12
+ # Model Loading: Image-Text to Text
13
  #####################################
@st.cache_resource(show_spinner=False)
def load_image_to_text_pipeline():
    """
    Build the Hugging Face "image-to-text" pipeline used by the app.

    The function is wrapped in st.cache_resource so the constructed pipeline
    is cached by Streamlit rather than rebuilt on every script run.
    If construction fails, the error is reported in the UI and the current
    script run is stopped.
    """
    checkpoint = "deepseek-ai/deepseek-vl2-tiny"
    try:
        # trust_remote_code=True allows loading models that ship custom code.
        return pipeline("image-to-text", model=checkpoint, trust_remote_code=True)
    except Exception as load_error:
        st.error(f"Error loading image-to-text model: {load_error}")
        st.stop()


# Load the model at script start and confirm it in the UI.
model_pipeline = load_image_to_text_pipeline()
st.write("Image-text to text model loaded successfully!")
30
+
31
+ #####################################
32
+ # Function: Convert Text to an Image
33
+ #####################################
34
def _measure_line_height(font):
    """Return the vertical advance, in pixels, for one line drawn with *font*."""
    if hasattr(font, "getbbox"):
        # "Ag" has both an ascender and a descender, so consecutive rows
        # leave room for letters such as "g" and "y".
        return font.getbbox("Ag")[3]
    # Older Pillow releases only expose the legacy getsize() on some fonts.
    return font.getsize("Ag")[1]


def _measure_text_width(draw, line, font):
    """Return the rendered width, in pixels, of *line* drawn with *font*."""
    if hasattr(draw, "textlength"):
        return draw.textlength(line, font=font)
    # Legacy API, removed in Pillow 10.
    return draw.textsize(line, font=font)[0]


def text_to_image(text, img_width=800, bg_color="white", text_color="black",
                  font_size=20, wrap_width=80):
    """
    Convert a long text string into a PIL Image.

    The text is wrapped to at most ``wrap_width`` characters per line and each
    line is drawn horizontally centered on a canvas ``img_width`` pixels wide.
    The canvas height grows with the number of wrapped lines.

    Args:
        text: The text to render.
        img_width: Width of the generated image in pixels.
        bg_color: Background color passed to ``Image.new``.
        text_color: Fill color passed to ``ImageDraw.text``.
        font_size: Point size requested for the TrueType font.
        wrap_width: Maximum number of characters per wrapped line.

    Returns:
        A new RGB ``PIL.Image.Image`` containing the rendered text.

    Note:
        ``textwrap`` replaces newlines with spaces by default, so the line
        breaks of the input are not preserved in the image.
    """
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except OSError:
        # Fall back to Pillow's built-in font if arial is not installed.
        font = ImageFont.load_default()

    # Always draw at least one (blank) line so the image has a valid height.
    lines = textwrap.TextWrapper(width=wrap_width).wrap(text=text) or [" "]

    # ImageFont.getsize / ImageDraw.textsize were removed in Pillow 10, so
    # measurements go through helpers that prefer getbbox / textlength.
    line_height = _measure_line_height(font)
    img_height = int(line_height * (len(lines) + 2))

    img = Image.new("RGB", (img_width, img_height), color=bg_color)
    draw = ImageDraw.Draw(img)

    y_text = 10
    for line in lines:
        text_width = _measure_text_width(draw, line, font)
        # Center the line; clamp at 0 so a line wider than the canvas keeps
        # its beginning visible instead of being clipped on both sides.
        x_text = max(0, (img_width - int(text_width)) // 2)
        draw.text((x_text, y_text), line, font=font, fill=text_color)
        y_text += line_height
    return img
69
 
70
  #####################################
71
+ # Function: Extract Text from File
72
  #####################################
73
  def extract_text_from_file(file_obj):
74
  """
 
79
  text = ""
80
 
81
  if ext == ".txt":
 
82
  try:
83
  text = file_obj.read().decode("utf-8")
84
  except Exception as e:
 
86
 
87
  elif ext == ".docx":
88
  try:
 
89
  document = docx.Document(file_obj)
90
  text = "\n".join([para.text for para in document.paragraphs])
91
  except Exception as e:
92
  text = f"Error processing DOCX file: {e}"
93
 
94
  elif ext == ".doc":
 
95
  try:
96
  with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
97
  tmp.write(file_obj.read())
 
111
  return text
112
 
113
  #####################################
114
+ # Function: Process Resume Using the Model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  #####################################
def _generated_text_from(result):
    """Return ``result[0]["generated_text"]`` or a fixed message if the shape differs."""
    if isinstance(result, list) and result:
        first = result[0]
        if isinstance(first, dict) and "generated_text" in first:
            return first["generated_text"]
    # Covers an empty list, a non-list result, and a first item without the key.
    return "Unexpected model output format."


def process_resume(file_obj):
    """
    Extract text from an uploaded resume and run it through the model.

    The extracted text is rendered to an image with ``text_to_image`` and that
    image is passed to ``model_pipeline``.

    Args:
        file_obj: The uploaded file object, or None.

    Returns:
        A ``(resume_text, processed_text)`` tuple. Both are None when
        ``file_obj`` is None. ``processed_text`` is a human-readable message
        when there is no text, when inference raises, or when the model
        output does not have the expected shape.
    """
    if file_obj is None:
        return None, None

    # Extract text from file.
    resume_text = extract_text_from_file(file_obj)
    if not resume_text.strip():
        return resume_text, "No text available to process."

    # Convert the extracted text to an image.
    text_image = text_to_image(resume_text)

    try:
        # The expected output is a list of dictionaries with key "generated_text".
        processed_text = _generated_text_from(model_pipeline(text_image))
    except Exception as e:
        processed_text = f"Error during model inference: {e}"

    return resume_text, processed_text
140
 
141
  #####################################
142
  # Streamlit Interface
143
  #####################################
144
st.title("Resume Processing App")
st.markdown(
    """
Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
The app will extract the text content from your resume, convert it to an image,
and then use the image-text to text model to process it.
"""
)

uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

if st.button("Process Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, processed_text = process_resume(uploaded_file)

        # Streamlit discourages empty widget labels; give each text area a
        # real label and hide it, since the subheader already titles the box.
        st.subheader("Extracted Resume Text")
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=250,
            label_visibility="collapsed",
        )
        st.subheader("Model Output")
        st.text_area(
            "Model output",
            processed_text,
            height=150,
            label_visibility="collapsed",
        )