Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

CR7CAD committed on Mar 16

Commit

4c77f62

verified ·

1 Parent(s): 67a9893

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -43

app.py CHANGED Viewed

@@ -1,33 +1,74 @@
 import os
 import tempfile
 import streamlit as st
 from transformers import pipeline
 import docx
 import textract
 #####################################
-# Summarization Pipeline Setup
 #####################################
 @st.cache_resource(show_spinner=False)
-def load_summarization_pipeline():
     try:
-        # Initialize the summarization pipeline using the specified model.
-        # Adding trust_remote_code=True allows loading models with custom code.
-        summarizer = pipeline(
-            "summarization",
-            model="llava-hf/llava-interleave-qwen-0.5b-hf",
             trust_remote_code=True
         )
-        return summarizer
     except Exception as e:
-        st.error(f"Error loading summarization model: {e}")
         st.stop()
-summarizer = load_summarization_pipeline()
-st.write("Summarization model loaded successfully!")
 #####################################
-# Function to Extract Text from File
 #####################################
 def extract_text_from_file(file_obj):
     """
@@ -38,7 +79,6 @@ def extract_text_from_file(file_obj):
     text = ""
     if ext == ".txt":
-        # For text files, decode the byte stream into a string.
         try:
             text = file_obj.read().decode("utf-8")
         except Exception as e:
@@ -46,14 +86,12 @@ def extract_text_from_file(file_obj):
     elif ext == ".docx":
         try:
-            # Use python-docx to read .docx files.
             document = docx.Document(file_obj)
             text = "\n".join([para.text for para in document.paragraphs])
         except Exception as e:
             text = f"Error processing DOCX file: {e}"
     elif ext == ".doc":
-        # For .doc files, use textract. textract expects a filename, so save temporarily.
         try:
             with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                 tmp.write(file_obj.read())
@@ -73,55 +111,54 @@ def extract_text_from_file(file_obj):
     return text
 #####################################
-# Function to Summarize Extracted Text
-#####################################
-def summarize_text(text):
-    """
-    Summarize the given text using the summarization pipeline.
-    Adjust max_length and min_length as needed.
-    """
-    if not text.strip():
-        return "No text available to summarize."
-    try:
-        # The summarization pipeline might have limitations on text length.
-        # For long documents, consider splitting the text into smaller chunks.
-        summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
-        return summary[0]["summary_text"]
-    except Exception as e:
-        return f"Error during summarization: {e}"
-#####################################
-# Main Processing Logic
 #####################################
 def process_resume(file_obj):
     if file_obj is None:
         return None, None
     resume_text = extract_text_from_file(file_obj)
-    summary_text = summarize_text(resume_text)
-    return resume_text, summary_text
 #####################################
 # Streamlit Interface
 #####################################
-st.title("Resume Summarization App")
 st.markdown(
     """
     Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
-    The app will extract the text content from your resume and generate a summary.
     """
 )
 uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])
-if st.button("Summarize Resume"):
     if uploaded_file is None:
         st.error("Please upload a file first.")
     else:
         with st.spinner("Processing..."):
-            resume_text, summary_text = process_resume(uploaded_file)
         st.subheader("Extracted Resume Text")
         st.text_area("", resume_text, height=250)
-        st.subheader("Summarized Resume")
-        st.text_area("", summary_text, height=150)

 import os
 import tempfile
+import textwrap
 import streamlit as st
 from transformers import pipeline
 import docx
 import textract
+from PIL import Image, ImageDraw, ImageFont
 #####################################
+# Model Loading: Image-Text to Text
 #####################################
+# st.cache_resource keeps the returned pipeline so it is not rebuilt on every
+# Streamlit rerun; show_spinner=False suppresses the default loading spinner.
 @st.cache_resource(show_spinner=False)
+def load_image_to_text_pipeline():
+    """
+    Build the Hugging Face "image-to-text" pipeline used by the app.
+
+    Returns:
+        The pipeline object created for "deepseek-ai/deepseek-vl2-tiny".
+
+    On any loading error the message is shown with st.error and the
+    script run is halted with st.stop, so nothing is returned.
+    """
     try:
+        # Load the image-text to text model.
+        # NOTE(review): trust_remote_code=True allows code shipped with the
+        # model repository to run locally - confirm the source is trusted.
+        model_pipeline = pipeline(
+            "image-to-text",
+            model="deepseek-ai/deepseek-vl2-tiny",
             trust_remote_code=True
         )
+        return model_pipeline
     except Exception as e:
+        st.error(f"Error loading image-to-text model: {e}")
         st.stop()
+# Executed at module level: obtain the pipeline from the loader above, then
+# confirm in the UI that loading finished.
+model_pipeline = load_image_to_text_pipeline()
+st.write("Image-text to text model loaded successfully!")
+#####################################
+# Function: Convert Text to an Image
+#####################################
+def text_to_image(text, img_width=800, bg_color="white", text_color="black", font_size=20):
+    """
+    Convert a long text string into a PIL Image.
+    The function wraps text so that it fits within the desired width.
+    """
+    # Load a default font.
+    try:
+        font = ImageFont.truetype("arial.ttf", font_size)
+    except IOError:
+        # Fallback to default PIL font if arial is not found.
+        font = ImageFont.load_default()
+    # Wrap the text into lines.
+    wrapper = textwrap.TextWrapper(width=80)
+    lines = wrapper.wrap(text=text)
+    if not lines:
+        lines = [" "]
+    # Calculate the required image height.
+    line_height = font.getsize("A")[1]
+    img_height = line_height * (len(lines) + 2)
+    # Create a new image with white background.
+    img = Image.new("RGB", (img_width, img_height), color=bg_color)
+    draw = ImageDraw.Draw(img)
+    # Draw each line of text
+    y_text = 10
+    for line in lines:
+        # Center text horizontally.
+        text_width, _ = draw.textsize(line, font=font)
+        x_text = (img_width - text_width) / 2
+        draw.text((x_text, y_text), line, font=font, fill=text_color)
+        y_text += line_height
+    return img
 #####################################
+# Function: Extract Text from File
 #####################################
 def extract_text_from_file(file_obj):
     """
     text = ""
     if ext == ".txt":
         try:
             text = file_obj.read().decode("utf-8")
         except Exception as e:
     elif ext == ".docx":
         try:
             document = docx.Document(file_obj)
             text = "\n".join([para.text for para in document.paragraphs])
         except Exception as e:
             text = f"Error processing DOCX file: {e}"
     elif ext == ".doc":
         try:
             with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                 tmp.write(file_obj.read())
     return text
 #####################################
+# Function: Process Resume Using the Model
 #####################################
 def process_resume(file_obj):
     if file_obj is None:
         return None, None
+    # Extract text from file.
     resume_text = extract_text_from_file(file_obj)
+    if not resume_text.strip():
+        return resume_text, "No text available to process."
+    # Convert the extracted text to an image.
+    text_image = text_to_image(resume_text)
+    try:
+        # Pass the generated image to the image-to-text model.
+        result = model_pipeline(text_image)
+        # The expected output is a list of dictionaries with key "generated_text".
+        if isinstance(result, list) and "generated_text" in result[0]:
+            processed_text = result[0]["generated_text"]
+        else:
+            processed_text = "Unexpected model output format."
+    except Exception as e:
+        processed_text = f"Error during model inference: {e}"
+    return resume_text, processed_text
 #####################################
 # Streamlit Interface
 #####################################
+# Page header and short usage instructions.
+st.title("Resume Processing App")
 st.markdown(
     """
     Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
+    The app will extract the text content from your resume, convert it to an image,
+    and then use the image-text to text model to process it.
     """
 )
+# The uploader is limited to the three extensions named in the instructions.
 uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])
+# Processing only runs when the button is pressed.
+if st.button("Process Resume"):
     if uploaded_file is None:
         st.error("Please upload a file first.")
     else:
+        # Show a spinner while the text is extracted and the model runs.
         with st.spinner("Processing..."):
+            resume_text, processed_text = process_resume(uploaded_file)
+        # Display the extracted text and the model output in separate areas.
+        # NOTE(review): both text areas use an empty label - confirm this is
+        # accepted by the Streamlit version in use.
         st.subheader("Extracted Resume Text")
         st.text_area("", resume_text, height=250)
+        st.subheader("Model Output")
+        st.text_area("", processed_text, height=150)