resume_parser / app.py
gneya-bacancy's picture
Update app.py
ed66a21 verified
import streamlit as st
import os
import tempfile
# from pypdf import PdfReader, PdfWriter
# from pdf2image import convert_from_path
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.llm import LLMChain
import tempfile
import markdown2
from weasyprint import HTML
from io import BytesIO
def format_string(input_string):
    """Strip a leading "Here is ...:" style preamble from a model reply.

    The text after the first ":" is returned, but only when that colon
    closes the first non-empty line (nothing except whitespace follows it on
    that line), which is what an introductory sentence looks like. Any other
    first colon belongs to the content itself (for example
    ``**Skills:** Python``), and cutting there would silently drop part of
    the resume, so in that case the input is returned unchanged.

    Args:
        input_string: Raw text to clean up.

    Returns:
        The text following the preamble colon, or ``input_string`` itself
        when it contains no colon or no preamble is detected.
    """
    head, colon, tail = input_string.partition(":")
    if not colon:
        return input_string

    # The colon has to sit on the first non-empty line ...
    on_first_line = "\n" not in head.lstrip()
    # ... and be the last visible character of that line.
    rest_of_line = tail.partition("\n")[0]
    if on_first_line and not rest_of_line.strip():
        return tail
    return input_string
def save_uploaded_file(uploadedfile):
    """Write an uploaded file to disk and return the path it was saved at.

    The file is stored under its own base name inside a freshly created
    temporary directory. Using a new directory per call keeps two uploads
    with the same name from overwriting each other, and reducing the name to
    its last path component keeps a crafted name such as ``../../x.pdf``
    from escaping that directory.

    Args:
        uploadedfile: Object exposing ``name`` (the client-side file name)
            and ``getbuffer()`` (the file's bytes).

    Returns:
        Path of the saved copy.
    """
    # Treat both separator styles as path separators before taking the
    # final component; the name comes from the client and is not trusted.
    file_name = os.path.basename(uploadedfile.name.replace("\\", "/"))
    if file_name in ("", ".", ".."):
        file_name = "upload.pdf"

    upload_dir = tempfile.mkdtemp(prefix="resume_upload_")
    save_path = os.path.join(upload_dir, file_name)
    with open(save_path, "wb") as f:
        f.write(uploadedfile.getbuffer())
    return save_path
def read_pdf(file_path):
    """Return the text of the PDF at ``file_path`` as a single string.

    The document is loaded with ``PyPDFLoader.load_and_split()``; the
    ``page_content`` of every returned piece is prefixed with one space and
    followed by a blank line, and the pieces are concatenated in order.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The concatenated text, or an empty string when nothing was loaded.
    """
    pieces = PyPDFLoader(file_path).load_and_split()
    return "".join(" " + piece.page_content + "\n\n" for piece in pieces)
def generate_pdf_from_markup(markup_text):
    """Render Markdown text to a PDF file and return that file's path.

    The Markdown is converted to HTML with ``markdown2`` and the HTML is
    written out as a PDF with WeasyPrint. Every call writes to its own
    uniquely named temporary file, so overlapping calls (for example from
    different user sessions) can never overwrite or read each other's
    output. The caller owns the returned file and is responsible for
    deleting it.

    Args:
        markup_text: Markdown source to render.

    Returns:
        Path of the generated PDF.
    """
    html_content = markdown2.markdown(markup_text)

    # mkstemp creates the file atomically under a unique name; only the path
    # is needed here, so the descriptor is closed before the PDF is written.
    fd, pdf_path = tempfile.mkstemp(prefix="generated_", suffix=".pdf")
    os.close(fd)

    HTML(string=html_content).write_pdf(pdf_path)
    return pdf_path
def parse_resume(data):
    """Send resume text to the Groq-hosted model with anonymization rules.

    A single prompt is assembled from the fixed instruction block followed
    by the resume text, and run through an ``LLMChain``.

    Args:
        data: Resume text; it fills the ``resume_text`` prompt variable.

    Returns:
        Whatever ``LLMChain.invoke`` returns for this prompt.
    """
    instructions = """
You are an AI assistant designed to remove and format resume data. When provided with extracted text from a PDF resume, your task is to remove personal information and certain details while maintaining the professional content and structure.
Follow the guidelines below:
Keep projects, experience, technical skills as it is without any change.
Remove Salutations: Mr, Mrs, Ms, etc.
Remove Names: All instances of the candidate's names.
Remove Gender: Any mention of gender.
Remove Age/D.O.B./Astrology Info: Any references to age, date of birth, or astrological signs.
Remove Links of personal accounts for example: exail id, github url, linkedin url and all the other urls except the project and experience urls.
Remove email address, mobile number, or any other information that has personal identity.
Anonymize Location: Replace specific locations with more general terms (e.g., "Willing to relocate, currently based in Leicester").
Anonymize Education Institutions: Replace the names of educational institutions/schools with "top university (e.g. highly reputable university on the global stage) or top school" if applicable.
Anonymize Language Skills: Replace specific languages with regional groupings for multilingual candidates (e.g., "proficient in multiple European languages").
Remove Hobbies and INTERESTS: Remove specific details related to hobbies and interests
Anonymize Other Fields: Make specific removals as needed to protect the candidate's identity.
Remove professional summary, objective, agenda and all these type of sections.
Keep only related skills ACHIEVEMENTS, awards and certificate which are writen by you.
Ensure the remaining sections and information are formatted properly to maintain the professional appearance of the resume.
Ensure proper formatting of the resume with proper content justifications, add markdown, add bullet points and spacing wherever required.
Return the output of resume content only. Don't include any notes or comments.
"""
    # Disabled guideline, kept for reference:
    # Remove achievment, awards and certifactes that are not related to professional work.
    resume_slot = """
{resume_text}
"""

    # The API key is taken from the GROQ_API_KEY environment variable.
    model = ChatGroq(api_key=os.getenv("GROQ_API_KEY"), model="llama3-70b-8192")
    full_prompt = PromptTemplate(
        input_variables=["resume_text"],
        template=instructions + resume_slot,
    )
    chain = LLMChain(llm=model, prompt=full_prompt)
    return chain.invoke(data)
def handle_pdf(file_path):
    """Anonymize the resume at ``file_path`` and offer the result as a PDF.

    The PDF text is read with ``read_pdf``, rewritten by ``parse_resume``,
    cleaned with ``format_string`` and shown on the page. When the cleaned
    text is non-empty it is rendered to a PDF with
    ``generate_pdf_from_markup`` and exposed through a download button.
    Progress traces are printed to stdout.

    Args:
        file_path: Path of the PDF resume to process.
    """
    with st.spinner("Parsing Resume..."):
        data = read_pdf(file_path)
        modified_data = parse_resume(data)
        # The generated text is read from the "text" entry of the response.
        formatted_data = format_string(modified_data["text"])
        st.write(formatted_data)

    print("Formatted text generated")
    print(formatted_data)

    if formatted_data:
        # Trace left over from a former "Generate PDF" button; generation
        # now starts automatically.
        print("Button Clicked")
        with st.spinner("Generating PDF..."):
            pdf_path = generate_pdf_from_markup(formatted_data)
        st.success("PDF generated successfully.")

        try:
            with open(pdf_path, "rb") as f:
                pdf_bytes = f.read()
        finally:
            # The bytes are held in memory from here on, so the file on disk
            # is no longer needed; removing it keeps resume data from piling
            # up in the temp directory. Cleanup is best-effort.
            try:
                os.remove(pdf_path)
            except OSError:
                pass

        st.download_button(
            label="Download PDF",
            data=pdf_bytes,
            file_name="generated.pdf",
            mime="application/pdf",
        )

    print("AT LAST")
def main():
    """Build the page: pick the bundled demo PDF or upload one, then process it.

    With "Use Demo PDF" selected, ``demo.pdf`` is processed once the
    confirmation button is pressed. With "Browse Files" selected, a chosen
    PDF is saved to disk via ``save_uploaded_file`` and processed right away.
    """
    st.title("Resume Parser")
    choice = st.radio("Choose an option:", ("Use Demo PDF", "Browse Files"))

    if choice == "Use Demo PDF":
        st.info("You have selected the demo PDF.")
        if st.button("Click to go with Demo pdf"):
            handle_pdf("demo.pdf")
        return

    if choice != "Browse Files":
        return

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        # Nothing selected yet.
        return

    saved_path = save_uploaded_file(uploaded_file)
    st.success(f"File saved at {saved_path}")
    handle_pdf(saved_path)
# Start the app only when this file is run as the main module, not on import.
if __name__ == "__main__":
    main()