Spaces:

bacancydataprophets
/

resume_parser

Sleeping

App Files Files Community

gneya-bacancy committed on Jun 18, 2024

Commit

efe9805

verified ·

1 Parent(s): e0d8f76

Upload 3 files

Browse files

Files changed (3) hide show

app.py +177 -0
demo.pdf +0 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import streamlit as st
+import os
+import tempfile
+# from pypdf import PdfReader, PdfWriter
+# from pdf2image import convert_from_path
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.prompts import PromptTemplate
+from langchain_groq import ChatGroq
+from langchain.chains.llm import LLMChain
+import tempfile
+import markdown2
+from weasyprint import HTML
+from io import BytesIO
def format_string(input_string):
    """Return the part of ``input_string`` that follows its first colon.

    ``handle_pdf`` applies this to the model's reply, presumably to drop a
    lead-in sentence such as "Here is the resume:" (TODO confirm intent).

    Args:
        input_string: Text to trim.

    Returns:
        Everything after the first ``":"`` (the colon itself is dropped,
        surrounding whitespace is kept). When the text contains no colon
        it is returned unchanged.
    """
    # partition() yields an empty separator when no colon is present,
    # which is how the "not found" case is told apart from "x:".
    _, colon, remainder = input_string.partition(":")
    return remainder if colon else input_string
def save_uploaded_file(uploadedfile):
    """Write an uploaded file into the system temp directory.

    Args:
        uploadedfile: Object exposing ``name`` (the client-supplied file
            name) and ``getbuffer()`` (the file's bytes).

    Returns:
        Path of the file written inside ``tempfile.gettempdir()``.
    """
    # The name comes from the client, so never trust it as a path: keep
    # only the last component. Backslashes are normalised first because
    # os.path.basename does not treat them as separators on POSIX.
    raw_name = str(uploadedfile.name).replace("\\", "/")
    safe_name = os.path.basename(raw_name)
    if safe_name in ("", ".", ".."):
        # Nothing usable is left (e.g. the name ended with a separator).
        safe_name = "uploaded_file"

    save_path = os.path.join(tempfile.gettempdir(), safe_name)
    with open(save_path, "wb") as f:
        f.write(uploadedfile.getbuffer())
    return save_path
def read_pdf(file_path):
    """Extract the text of the PDF at ``file_path`` as a single string.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The page texts concatenated in order; each page is prefixed with
        one space and followed by a blank line.
    """
    # load() returns one Document per page. load_and_split() would also
    # run LangChain's default text splitter, which re-chunks long pages
    # with overlap and so duplicates text once the chunks are joined.
    pages = PyPDFLoader(file_path).load()

    # The resume text is deliberately not printed: it contains exactly the
    # personal data this app exists to remove, and stdout ends up in logs.
    return "".join(f" {page.page_content}\n\n" for page in pages)
def generate_pdf_from_markup(markup_text):
    """Render Markdown text to a PDF file and return that file's path.

    The text is converted to HTML with ``markdown2`` and the HTML is then
    written out as a PDF through WeasyPrint's ``HTML(...).write_pdf``.

    Args:
        markup_text: Markdown source to render.

    Returns:
        Path of the written PDF. The target is always ``generated.pdf`` in
        the system temp directory, so each call overwrites the last one.
    """
    output_file = os.path.join(tempfile.gettempdir(), "generated.pdf")
    rendered_html = markdown2.markdown(markup_text)
    HTML(string=rendered_html).write_pdf(output_file)
    return output_file
def parse_resume(data):
    """Ask the Groq-hosted LLM to anonymise and reformat resume text.

    Args:
        data: Plain text extracted from a resume.

    Returns:
        The result of ``LLMChain.invoke``; ``handle_pdf`` reads the
        model's reply from its ``"text"`` entry.
    """
    llm = ChatGroq(api_key=os.getenv("GROQ_API_KEY"), model="llama3-70b-8192")

    # Instruction text sent ahead of the resume. The wording is part of the
    # program's behaviour: spelling mistakes here weaken the instruction.
    system_prompt = """
    You are an AI assistant designed to remove and format resume data. When provided with extracted text from a PDF resume, your task is to remove personal information and certain details while maintaining the professional content and structure.
    Follow the guidelines below:
    Keep projects, experience, technical skills as it is without any change.
    Remove Salutations: Mr, Mrs, Ms, etc.
    Remove Names: All instances of the candidate's names.
    Remove Gender: Any mention of gender.
    Remove Age/D.O.B./Astrology Info: Any references to age, date of birth, or astrological signs.
    Remove Links of personal accounts for example: email id, github url, linkedin url and all the other urls except the project and experience urls.
    Remove email address, mobile number, or any other information that has personal identity.
    Anonymize Location: Replace specific locations with more general terms (e.g., "Willing to relocate, currently based in Leicester").
    Anonymize Education Institutions: Replace the names of educational institutions/schools with "top university (e.g. highly reputable university on the global stage) or top school" if applicable.
    Anonymize Language Skills: Replace specific languages with regional groupings for multilingual candidates (e.g., "proficient in multiple European languages").
    Remove Hobbies and INTERESTS: Remove specific details related to hobbies and interests
    Anonymize Other Fields: Make specific removals as needed to protect the candidate's identity.
    Remove professional summary, objective, agenda and all these type of sections.
    Add only professional achievements, awards and certificates
    Ensure the remaining sections and information are formatted properly to maintain the professional appearance of the resume.
    Ensure proper formatting of the resume with proper content justifications, add markdown, add bullet points and spacing wherever required.
    Return the output of resume content only. Don't include any notes or comments.
    """
    user_prompt_template = """
    {resume_text}
    """
    prompt_template = PromptTemplate(
        input_variables=["resume_text"],
        template=system_prompt + user_prompt_template,
    )
    anonymize_chain = LLMChain(llm=llm, prompt=prompt_template)

    # Name the template variable explicitly instead of relying on the
    # chain to map a bare string onto its only input key.
    return anonymize_chain.invoke({"resume_text": data})
def handle_pdf(file_path):
    """Anonymise the resume at ``file_path``, show it and offer a PDF.

    Streamlit re-executes the script on every widget interaction, so this
    function runs again when "Generate PDF" is clicked. The parsed text is
    therefore kept in ``st.session_state`` and reused while the file's
    content is unchanged; otherwise each click would call the LLM again
    and the PDF would be built from a different reply than the one shown.

    Args:
        file_path: Path of the PDF resume to process.
    """
    import hashlib  # local import: only this function needs it

    # Key the cache on content rather than path, so a different upload
    # saved under the same file name is still re-parsed.
    with open(file_path, "rb") as source:
        digest = hashlib.sha256(source.read()).hexdigest()

    cached = st.session_state.get("parsed_resume")
    if cached is None or cached["digest"] != digest:
        with st.spinner("Parsing Resume..."):
            data = read_pdf(file_path)
            modified_data = parse_resume(data)
            formatted_data = format_string(modified_data["text"])
        st.session_state["parsed_resume"] = {
            "digest": digest,
            "text": formatted_data,
        }
    else:
        formatted_data = cached["text"]

    # Widgets are rendered outside the spinner so it only covers parsing.
    st.write(formatted_data)

    if st.button("Generate PDF"):
        with st.spinner("Generating PDF..."):
            pdf_path = generate_pdf_from_markup(formatted_data)
        st.success("PDF generated successfully.")

        # Read the file back so its bytes can be handed to the browser.
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
        st.download_button(
            label="Download PDF",
            data=pdf_bytes,
            file_name="generated.pdf",
            mime="application/pdf",
        )
def main():
    """Render the Streamlit page: pick a resume source and process it.

    The user either runs the bundled ``demo.pdf`` or uploads a PDF; the
    chosen file is handed to ``handle_pdf``.
    """
    st.title("Resume Parser")
    option = st.radio(
        "Choose an option:",
        ("Use Demo PDF", "Browse Files"),
    )

    if option == "Use Demo PDF":
        demo_pdf_path = "demo.pdf"
        st.info("You have selected the demo PDF.")
        # st.button() is True only for the rerun triggered by its own
        # click. Remember the choice in session state; otherwise the
        # buttons rendered by handle_pdf would vanish on the next rerun
        # and could never be acted on.
        if st.button("Click to go with Demo pdf"):
            st.session_state["demo_requested"] = True
        if st.session_state.get("demo_requested"):
            handle_pdf(demo_pdf_path)
    elif option == "Browse Files":
        # Leaving demo mode: require a fresh click if the user returns.
        st.session_state["demo_requested"] = False
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
        if uploaded_file is not None:
            original_file_path = save_uploaded_file(uploaded_file)
            st.success(f"File saved at {original_file_path}")
            handle_pdf(original_file_path)


if __name__ == "__main__":
    main()

demo.pdf ADDED Viewed

Binary file (86.8 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+pypdf
+streamlit
+langchain
+langchain_groq
+langchain_community
+markdown2
+weasyprint