"""Streamlit app that turns the text of a PDF resume into a JSON structure."""

import re

import streamlit as st

from utils import extract_text_from_pdf

# Typographic characters normalised to ASCII before parsing.
_CHAR_TRANSLATION = str.maketrans({
    '\u2013': '-',   # en dash
    '\u2014': '-',   # em dash
    '\u201c': '"',   # left double quote
    '\u201d': '"',   # right double quote
    '\u2018': "'",   # left single quote
    '\u2019': "'",   # right single quote
    '\u2022': '',    # bullet: dropped entirely
})

_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_LINKEDIN_RE = re.compile(r'linkedin\.com/in/[A-Za-z0-9_-]+', re.IGNORECASE)
_GITHUB_RE = re.compile(r'github\.com/[A-Za-z0-9_-]+', re.IGNORECASE)
# Digit groups joined by at most ONE separator character, so that
# "2019 - 2023" is not read as a single number.
_PHONE_RE = re.compile(r'\+?\(?\d+\)?(?:[\s.\-]?\(?\d+\)?)*')
_YEAR_RANGE_RE = re.compile(r'(?:19|20)\d{2}-(?:19|20)\d{2}')
_NON_LETTERS_RE = re.compile(r'[^a-z]+')

# Order matters: e-mail and profile URLs are matched (and blanked out) before
# the phone search so digits inside them cannot be mistaken for a phone number.
_CONTACT_PATTERNS = (
    ("Email", _EMAIL_RE),
    ("LinkedIn", _LINKEDIN_RE),
    ("GitHub", _GITHUB_RE),
)

_SECTION_KEYWORDS = {
    "Contact Information": ["Contact Information", "Contact Info", "Contact"],
    "Professional Experience": ["Professional Experience", "Work Experience", "Experience"],
    "Projects": ["Projects", "Project"],
    "Skills": ["Skills", "Technical Skills"],
    "Education": ["Education", "Academic Background"],
    "Achievements": ["Achievements", "Awards", "Honors"],
    "Extra-Curricular Activities": [
        "Extra-Curricular Activities",
        "Extracurricular Activities",
        "Activities",
        "Volunteer Experience",
    ],
}

# Substrings (lower-case) that mark a line as carrying contact details.
_CONTACT_MARKERS = ("@", "linkedin", "github", "phone", "+91")

_CONTACT_FIELDS = ("Phone", "Email", "LinkedIn", "GitHub")


def _normalize_heading(text: str) -> str:
    """Lower-case *text* and collapse every run of non-letters to one space."""
    return _NON_LETTERS_RE.sub(' ', text.casefold()).strip()


# Normalised heading text -> canonical section name.
_HEADING_TO_SECTION = {
    _normalize_heading(keyword): section_name
    for section_name, keywords in _SECTION_KEYWORDS.items()
    for keyword in keywords
}


def clean_text(text: str) -> str:
    """Return *text* with typographic dashes/quotes made ASCII and bullets removed."""
    return text.translate(_CHAR_TRANSLATION)


def extract_contact_info(line: str, resume_json: dict) -> None:
    """Record e-mail, LinkedIn, GitHub and phone details found in *line*.

    Values are stored under ``resume_json["Contact Information"]``. A field
    that is already present is never overwritten, so the first occurrence in
    the document wins.

    The whole line is searched for each field. It is deliberately not split
    on hyphens or commas first, because hyphens are legal inside phone
    numbers, e-mail addresses and profile URLs.
    """
    contact = resume_json["Contact Information"]

    remainder = line
    for field, pattern in _CONTACT_PATTERNS:
        match = pattern.search(remainder)
        if match is None:
            continue
        contact.setdefault(field, match.group())
        # Blank out every occurrence so its digits are ignored by the phone scan.
        remainder = pattern.sub(' ', remainder)

    if "Phone" in contact:
        return
    for candidate in _PHONE_RE.finditer(remainder):
        number = candidate.group()
        digit_count = sum(ch.isdigit() for ch in number)
        # 7-15 digits covers local through full international numbers while
        # rejecting years, scores and other short digit runs.
        if 7 <= digit_count <= 15 and not _YEAR_RANGE_RE.fullmatch(number):
            contact["Phone"] = number
            break


def parse_resume(resume_text: str) -> dict:
    """Split resume text into sections and return them as a dictionary.

    The result has a ``"Contact Information"`` dict (always containing
    ``Phone``, ``Email``, ``LinkedIn`` and ``GitHub``, each defaulting to
    ``"Not Provided"``) and one list of text lines per content section.

    A line is treated as a section heading only when, ignoring case and
    punctuation, it consists of nothing but a known heading keyword. Lines
    that merely mention a keyword ("Gained experience with ...") are kept as
    content of the current section.
    """
    resume_json = {
        "Contact Information": {},
        "Professional Experience": [],
        "Projects": [],
        "Skills": [],
        "Education": [],
        "Achievements": [],
        "Extra-Curricular Activities": [],
    }

    section = None
    for raw_line in (resume_text or "").splitlines():
        line = clean_text(raw_line).strip()
        if not line:
            continue

        heading = _HEADING_TO_SECTION.get(_normalize_heading(line))
        if heading is not None:
            section = heading
            continue

        lowered = line.lower()
        has_contact_marker = any(marker in lowered for marker in _CONTACT_MARKERS)

        if section is None or section == "Contact Information":
            # Before the first heading, only lines that look like contact
            # details start the (implicit) contact block; once inside it,
            # every line is scanned until the next heading.
            if section is None and not has_contact_marker:
                continue
            section = "Contact Information"
            extract_contact_info(line, resume_json)
        else:
            # Inside a content section the line always belongs to that
            # section; a URL or e-mail in it must not end the section.
            resume_json[section].append(line)
            if has_contact_marker:
                extract_contact_info(line, resume_json)

    contact = resume_json["Contact Information"]
    for field in _CONTACT_FIELDS:
        contact.setdefault(field, "Not Provided")

    return resume_json


st.title("Resume Parser to JSON")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
default_file_path = "Kushagra_Sharma_Resume.pdf"

# NOTE(review): extract_text_from_pdf comes from utils and is not visible
# here; it is assumed to accept a binary file-like object and return text.
if uploaded_file is not None:
    resume_text = extract_text_from_pdf(uploaded_file)
    file_name = uploaded_file.name
else:
    try:
        with open(default_file_path, "rb") as file:
            resume_text = extract_text_from_pdf(file)
    except FileNotFoundError:
        # The bundled sample is optional; ask for an upload instead of crashing.
        st.warning(
            f"Default resume '{default_file_path}' was not found. "
            "Upload a PDF to continue."
        )
        st.stop()
    file_name = default_file_path

parsed_resume = parse_resume(resume_text)
st.text(f"Currently using file: {file_name}")
st.json(parsed_resume)