Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

CR7CAD committed on Mar 16

Commit

7716c5c

verified ·

1 Parent(s): 4c77f62

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -110

app.py CHANGED Viewed

@@ -1,98 +1,31 @@
 import os
 import tempfile
-import textwrap
 import streamlit as st
-from transformers import pipeline
 import docx
 import textract
-from PIL import Image, ImageDraw, ImageFont
-#####################################
-# Model Loading: Image-Text to Text
-#####################################
-@st.cache_resource(show_spinner=False)
-def load_image_to_text_pipeline():
-    try:
-        # Load the image-text to text model.
-        model_pipeline = pipeline(
-            "image-to-text",
-            model="deepseek-ai/deepseek-vl2-tiny",
-            trust_remote_code=True
-        )
-        return model_pipeline
-    except Exception as e:
-        st.error(f"Error loading image-to-text model: {e}")
-        st.stop()
-model_pipeline = load_image_to_text_pipeline()
-st.write("Image-text to text model loaded successfully!")
-#####################################
-# Function: Convert Text to an Image
-#####################################
-def text_to_image(text, img_width=800, bg_color="white", text_color="black", font_size=20):
-    """
-    Convert a long text string into a PIL Image.
-    The function wraps text so that it fits within the desired width.
-    """
-    # Load a default font.
-    try:
-        font = ImageFont.truetype("arial.ttf", font_size)
-    except IOError:
-        # Fallback to default PIL font if arial is not found.
-        font = ImageFont.load_default()
-    # Wrap the text into lines.
-    wrapper = textwrap.TextWrapper(width=80)
-    lines = wrapper.wrap(text=text)
-    if not lines:
-        lines = [" "]
-    # Calculate the required image height.
-    line_height = font.getsize("A")[1]
-    img_height = line_height * (len(lines) + 2)
-    # Create a new image with white background.
-    img = Image.new("RGB", (img_width, img_height), color=bg_color)
-    draw = ImageDraw.Draw(img)
-    # Draw each line of text
-    y_text = 10
-    for line in lines:
-        # Center text horizontally.
-        text_width, _ = draw.textsize(line, font=font)
-        x_text = (img_width - text_width) / 2
-        draw.text((x_text, y_text), line, font=font, fill=text_color)
-        y_text += line_height
-    return img
 #####################################
 # Function: Extract Text from File
 #####################################
 def extract_text_from_file(file_obj):
     """
-    Extract text from .txt, .docx, and .doc files.
     """
     filename = file_obj.name
     ext = os.path.splitext(filename)[1].lower()
     text = ""
-    if ext == ".txt":
-        try:
-            text = file_obj.read().decode("utf-8")
-        except Exception as e:
-            text = f"Error reading text file: {e}"
-    elif ext == ".docx":
         try:
             document = docx.Document(file_obj)
             text = "\n".join([para.text for para in document.paragraphs])
         except Exception as e:
             text = f"Error processing DOCX file: {e}"
     elif ext == ".doc":
         try:
             with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                 tmp.write(file_obj.read())
                 tmp.flush()
@@ -107,58 +40,92 @@ def extract_text_from_file(file_obj):
                 pass
     else:
         text = "Unsupported file type."
     return text
 #####################################
-# Function: Process Resume Using the Model
 #####################################
 def process_resume(file_obj):
     if file_obj is None:
         return None, None
-    # Extract text from file.
     resume_text = extract_text_from_file(file_obj)
-    if not resume_text.strip():
-        return resume_text, "No text available to process."
-    # Convert the extracted text to an image.
-    text_image = text_to_image(resume_text)
-    try:
-        # Pass the generated image to the image-to-text model.
-        result = model_pipeline(text_image)
-        # The expected output is a list of dictionaries with key "generated_text".
-        if isinstance(result, list) and "generated_text" in result[0]:
-            processed_text = result[0]["generated_text"]
-        else:
-            processed_text = "Unexpected model output format."
-    except Exception as e:
-        processed_text = f"Error during model inference: {e}"
-    return resume_text, processed_text
 #####################################
 # Streamlit Interface
 #####################################
-st.title("Resume Processing App")
-st.markdown(
-    """
-    Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
-    The app will extract the text content from your resume, convert it to an image,
-    and then use the image-text to text model to process it.
-    """
-)
-uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])
-if st.button("Process Resume"):
     if uploaded_file is None:
         st.error("Please upload a file first.")
     else:
-        with st.spinner("Processing..."):
-            resume_text, processed_text = process_resume(uploaded_file)
         st.subheader("Extracted Resume Text")
-        st.text_area("", resume_text, height=250)
-        st.subheader("Model Output")
-        st.text_area("", processed_text, height=150)

 import os
 import tempfile
+import re
 import streamlit as st
 import docx
 import textract
 #####################################
 # Function: Extract Text from File
 #####################################
 def extract_text_from_file(file_obj):
     """
+    Extract text from .doc and .docx files.
+    Returns the extracted text or an error message if extraction fails.
     """
     filename = file_obj.name
     ext = os.path.splitext(filename)[1].lower()
     text = ""
+    if ext == ".docx":
         try:
             document = docx.Document(file_obj)
             text = "\n".join([para.text for para in document.paragraphs])
         except Exception as e:
             text = f"Error processing DOCX file: {e}"
     elif ext == ".doc":
         try:
+            # textract requires a filename, so create a temporary file.
             with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                 tmp.write(file_obj.read())
                 tmp.flush()
                 pass
     else:
         text = "Unsupported file type."
     return text
 #####################################
+# Function: Extract Basic Resume Information
+#####################################
+def extract_basic_resume_info(text):
+    """
+    Parse the extracted text to summarize basic info:
+    - Name
+    - Age
+    - Work Experience (e.g., number of years or description)
+    - Expected Industry/Direction
+    Matching is heuristic and regex-based. Returns a dict with those four
+    keys; each value is a string, or None when nothing was found. Empty or
+    None input yields a dict whose values are all None.
+    """
+    info = {
+        "Name": None,
+        "Age": None,
+        "Work Experience": None,
+        "Expected Industry/Direction": None,
+    }
+    # Nothing to parse: return the all-None summary instead of failing in re.
+    if not text:
+        return info
+    # Try to extract Name (e.g., lines like "Name: John Doe").
+    # \b keeps "Username:" / "Filename:" from matching, and the capture is
+    # limited to one line so it cannot swallow the next label ("John Doe\nAge").
+    name_match = re.search(r"\b[Nn]ame[:\-][ \t]*([A-Za-z][A-Za-z \t]*)", text)
+    if name_match:
+        info["Name"] = name_match.group(1).strip()
+    else:
+        # Fallback: two or three capitalized words on the same line.
+        potential_name = re.search(
+            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+){1,2}\b", text
+        )
+        if potential_name:
+            info["Name"] = potential_name.group(0)
+    # Extract Age (assuming a line like "Age: 28").
+    # \b avoids "Page: 12"; (?!\d) rejects longer numbers rather than
+    # silently truncating them to their first two digits.
+    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})(?!\d)", text)
+    if age_match:
+        info["Age"] = age_match.group(1)
+    # Extract Work Experience (e.g., "5 years of experience", "1 year of
+    # experience", "5+ years experience").
+    exp_match = re.search(
+        r"(\d+)\+?\s+(years?|yrs?)\s+(?:of\s+)?experience", text, re.IGNORECASE
+    )
+    if exp_match:
+        info["Work Experience"] = f"{exp_match.group(1)} {exp_match.group(2)}"
+    else:
+        # Look for a labelled line such as "Experience: ..." or "Background: ...".
+        exp_line = re.search(
+            r"(?:Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
+        )
+        if exp_line:
+            # Keep "not found" as None rather than an empty string.
+            info["Work Experience"] = exp_line.group(1).strip() or None
+    # Extract Expected Industry/Direction
+    # (e.g., "Interest: Software Development" or "Expected Industry: Healthcare")
+    industry_match = re.search(
+        r"(?:Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
+    )
+    if industry_match:
+        info["Expected Industry/Direction"] = industry_match.group(1).strip()
+    return info
+#####################################
+# Main Resume Processing Logic
 #####################################
 def process_resume(file_obj):
+    """
+    Extract a resume's text and summarize its basic details.
+    Returns a (resume_text, basic_info) tuple, or (None, None) when
+    file_obj is None.
+    """
     if file_obj is None:
         return None, None
+    # Extract text content from the file.
+    # NOTE(review): extract_text_from_file reports failures as ordinary text
+    # (e.g. "Unsupported file type."), so such a message is parsed below as if
+    # it were resume content.
     resume_text = extract_text_from_file(file_obj)
+    # Extract summarized basic info from the resume text.
+    basic_info = extract_basic_resume_info(resume_text)
+    return resume_text, basic_info
 #####################################
 # Streamlit Interface
 #####################################
+# Page header and usage instructions.
+st.title("Resume Summary App")
+st.markdown("""
+Upload your resume file (supported formats: **.doc** or **.docx**) to extract and summarize its content.
+The basic details, including name, age, work experience, and expected industry, will be displayed along with the full text content.
+""")
+# The uploader only offers .doc and .docx, matching the branches handled by
+# extract_text_from_file.
+uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx"])
+# Extraction runs only when the button is clicked.
+if st.button("Extract Information"):
     if uploaded_file is None:
         st.error("Please upload a file first.")
     else:
+        with st.spinner("Extracting information..."):
+            resume_text, resume_info = process_resume(uploaded_file)
         st.subheader("Extracted Resume Text")
+        # The subheader above serves as the visible label, so the widget label is empty.
+        st.text_area("", resume_text, height=300)
+        st.subheader("Basic Resume Information")
+        # resume_info is the dict returned by extract_basic_resume_info.
+        st.json(resume_info)