Spaces:
Sleeping
Sleeping
import streamlit as st | |
import os | |
import tempfile | |
# from pypdf import PdfReader, PdfWriter | |
# from pdf2image import convert_from_path | |
from langchain_community.document_loaders import PyPDFLoader | |
from langchain.prompts import PromptTemplate | |
from langchain_groq import ChatGroq | |
from langchain.chains.llm import LLMChain | |
import tempfile | |
import markdown2 | |
from weasyprint import HTML | |
from io import BytesIO | |
def format_string(input_string):
    """Return the part of *input_string* that follows its first ":".

    ``handle_pdf`` applies this to the text returned by the language model.
    When the string contains no ":" at all it is returned unchanged.
    """
    # partition() yields an empty separator when ":" is absent.
    _head, colon, tail = input_string.partition(":")
    return tail if colon else input_string
def save_uploaded_file(uploadedfile):
    """Write an uploaded file into the system temp directory.

    Args:
        uploadedfile: Object exposing ``name`` (the client-supplied file
            name) and ``getbuffer()`` (the file's bytes).

    Returns:
        The path of the file that was written.
    """
    # The name comes from the client, so strip any directory components
    # (either separator style) to keep the write inside the temp directory.
    safe_name = os.path.basename(str(uploadedfile.name).replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "uploaded_file"

    save_path = os.path.join(tempfile.gettempdir(), safe_name)
    with open(save_path, "wb") as f:
        f.write(uploadedfile.getbuffer())
    return save_path
def read_pdf(file_path):
    """Return the text of the PDF at *file_path* as a single string.

    The document is loaded with ``PyPDFLoader.load_and_split()``. Each
    returned chunk contributes one leading space, its ``page_content`` and
    a blank line to the result.
    """
    chunks = PyPDFLoader(file_path).load_and_split()
    return "".join(" " + chunk.page_content + "\n\n" for chunk in chunks)
def generate_pdf_from_markup(markup_text):
    """Render Markdown text to a PDF file and return the file's path.

    Args:
        markup_text: Markdown source to render.

    Returns:
        Path of the generated PDF inside the system temp directory. The
        file is not deleted here; the caller is responsible for removing it.
    """
    # Convert Markdown to HTML
    html_content = markdown2.markdown(markup_text)

    # Use a unique file per call. A fixed "generated.pdf" in the shared temp
    # directory lets concurrent sessions overwrite or read each other's output.
    fd, pdf_path = tempfile.mkstemp(prefix="generated_", suffix=".pdf")
    os.close(fd)  # the PDF writer reopens the file by path

    # Convert HTML to PDF
    HTML(string=html_content).write_pdf(pdf_path)
    return pdf_path
def parse_resume(data):
    """Send resume text to the Groq-hosted model for anonymisation.

    Args:
        data: Extracted resume text; it fills the ``resume_text`` slot of
            the prompt.

    Returns:
        The value returned by ``LLMChain.invoke``. ``handle_pdf`` reads the
        model output from its ``"text"`` key.
    """
    # Instructions sent ahead of the resume text.
    instructions = """
You are an AI assistant designed to remove and format resume data. When provided with extracted text from a PDF resume, your task is to remove personal information and certain details while maintaining the professional content and structure.
Follow the guidelines below:
Keep projects, experience, technical skills as it is without any change.
Remove Salutations: Mr, Mrs, Ms, etc.
Remove Names: All instances of the candidate's names.
Remove Gender: Any mention of gender.
Remove Age/D.O.B./Astrology Info: Any references to age, date of birth, or astrological signs.
Remove Links of personal accounts for example: exail id, github url, linkedin url and all the other urls except the project and experience urls.
Remove email address, mobile number, or any other information that has personal identity.
Anonymize Location: Replace specific locations with more general terms (e.g., "Willing to relocate, currently based in Leicester").
Anonymize Education Institutions: Replace the names of educational institutions/schools with "top university (e.g. highly reputable university on the global stage) or top school" if applicable.
Anonymize Language Skills: Replace specific languages with regional groupings for multilingual candidates (e.g., "proficient in multiple European languages").
Remove Hobbies and INTERESTS: Remove specific details related to hobbies and interests
Anonymize Other Fields: Make specific removals as needed to protect the candidate's identity.
Remove professional summary, objective, agenda and all these type of sections.
Keep only related skills ACHIEVEMENTS, awards and certificate which are writen by you.
Ensure the remaining sections and information are formatted properly to maintain the professional appearance of the resume.
Ensure proper formatting of the resume with proper content justifications, add markdown, add bullet points and spacing wherever required.
Return the output of resume content only. Don't include any notes or comments.
"""
    # Disabled guideline kept from the original source:
    # Remove achievment, awards and certifactes that are not related to professional work.

    # Placeholder that receives the resume text.
    resume_slot = """
{resume_text}
"""

    groq_model = ChatGroq(
        api_key=os.getenv("GROQ_API_KEY"),
        model="llama3-70b-8192",
    )
    full_prompt = PromptTemplate(
        template=instructions + resume_slot,
        input_variables=["resume_text"],
    )
    return LLMChain(llm=groq_model, prompt=full_prompt).invoke(data)
def handle_pdf(file_path):
    """Anonymise the resume at *file_path* and offer the result as a PDF.

    The PDF text is read, passed through ``parse_resume`` and
    ``format_string``, shown on the page, rendered to a PDF and exposed
    through a download button.

    Args:
        file_path: Path of the PDF resume to process.
    """
    with st.spinner("Parsing Resume..."):
        data = read_pdf(file_path)
        modified_data = parse_resume(data)
        formatted_data = format_string(modified_data["text"])
        st.write(formatted_data)

    # The resume text is deliberately not printed to stdout: it would end up
    # in the server logs.
    if not formatted_data.strip():
        # Tell the user instead of silently rendering nothing.
        st.warning("The model returned no content; no PDF was generated.")
        return

    with st.spinner("Generating PDF..."):
        pdf_path = generate_pdf_from_markup(formatted_data)
    st.success("PDF generated successfully.")

    try:
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
    finally:
        # The bytes are held in memory from here on, so drop the temp file.
        # Cleanup is best-effort: a failure must not block the download.
        try:
            os.remove(pdf_path)
        except OSError:
            pass

    st.download_button(
        label="Download PDF",
        data=pdf_bytes,
        file_name="generated.pdf",
        mime="application/pdf",
    )
def main():
    """Render the page and dispatch to the flow the user selected.

    The user either runs the bundled ``demo.pdf`` on a button click or
    uploads a PDF, which is saved to disk and processed immediately.
    """
    st.title("Resume Parser")
    choice = st.radio(
        "Choose an option:",
        ("Use Demo PDF", "Browse Files"),
    )

    if choice == "Use Demo PDF":
        st.info("You have selected the demo PDF.")
        if st.button("Click to go with Demo pdf"):
            handle_pdf("demo.pdf")
        return

    if choice != "Browse Files":
        return

    upload = st.file_uploader("Choose a PDF file", type="pdf")
    if upload is None:
        # Nothing uploaded yet on this run.
        return

    saved_path = save_uploaded_file(upload)
    st.success(f"File saved at {saved_path}")
    handle_pdf(saved_path)
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()