File size: 4,432 Bytes
d5cc663
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
from utils import extract_text_from_pdf
import streamlit as st

def clean_text(text):
    """Convert typographic punctuation in *text* to plain ASCII.

    En and em dashes become hyphens, curly double quotes become straight
    double quotes, the curly apostrophe becomes a straight one, and bullet
    characters are removed.  Returns the converted string.
    """
    # One translation table applies every substitution in a single pass.
    ascii_table = str.maketrans({
        '\u2013': '-',   # en dash
        '\u2014': '-',   # em dash
        '\u201c': '"',   # left double quote
        '\u201d': '"',   # right double quote
        '\u2022': '',    # bullet (dropped)
        '\u2019': "'",   # right single quote / apostrophe
    })
    return text.translate(ascii_table)

def extract_contact_info(line, resume_json):
    """Extract contact details from one line of text into *resume_json*.

    The line is split on separators (em/en dashes, whitespace-delimited
    hyphens, pipes and commas) and each piece is scanned for an email
    address, a LinkedIn profile URL, a GitHub URL and a phone number.

    Args:
        line: A single line of resume text.
        resume_json: The resume dictionary; results are stored in its
            "Contact Information" mapping under the keys "Email", "Phone",
            "LinkedIn" and "GitHub".  The mapping is created if missing.

    Returns:
        None. ``resume_json`` is updated in place; a field that is already
        present is never overwritten (the first value found wins).
    """
    phone_number_pattern = re.compile(r'\+?\d[\d\s\-]+\d')
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    linkedin_pattern = re.compile(r'linkedin\.com/in/[A-Za-z0-9_-]+', re.IGNORECASE)
    github_pattern = re.compile(r'github\.com/[A-Za-z0-9_-]+', re.IGNORECASE)
    # A plain hyphen only separates items when surrounded by whitespace, so
    # hyphens inside emails, phone numbers and profile URLs stay intact.
    separator_pattern = re.compile(r'\s*[\u2014\u2013]\s*|\s+-\s+|\s*\|+\s*|\s*,\s*')

    contact = resume_json.setdefault("Contact Information", {})
    address_patterns = (
        ("Email", email_pattern),
        ("LinkedIn", linkedin_pattern),
        ("GitHub", github_pattern),
    )

    for item in separator_pattern.split(line):
        item = item.strip()
        if not item:
            continue
        for field, pattern in address_patterns:
            match = pattern.search(item)
            if match:
                contact.setdefault(field, match.group())
                # Blank out the matched text so digits inside an email or
                # URL cannot be mistaken for a phone number below.
                item = item[:match.start()] + " " + item[match.end():]
        phone_match = phone_number_pattern.search(item)
        if phone_match:
            contact.setdefault("Phone", phone_match.group())

def _split_section_header(line, header_patterns, max_header_words):
    """Return ``(section, inline_content)`` if *line* is a section header.

    A line counts as a header when it starts with a section keyword followed
    by a colon (the text after the colon is returned as inline content), or
    when it is short (at most *max_header_words* words) and contains a
    section keyword as a whole word.  Returns ``None`` otherwise.
    """
    word_count = len(line.split())
    for pattern, section_name in header_patterns:
        match = pattern.search(line)
        if match is None:
            continue
        remainder = line[match.end():].lstrip()
        if match.start() == 0 and remainder.startswith(":"):
            return section_name, remainder[1:].strip()
        if word_count <= max_header_words:
            return section_name, ""
    return None


def parse_resume(resume_text):
    """Parse plain resume text into a dictionary of sections.

    The text is processed line by line.  Header lines switch the current
    section; other lines are appended to the current section, or scanned
    for contact details when they belong to the contact block.

    Args:
        resume_text: The resume as a single string (``None`` is treated as
            empty text).

    Returns:
        A dict with the keys "Contact Information" (a dict holding "Phone",
        "Email", "LinkedIn" and "GitHub", each "Not Provided" when absent)
        and "Professional Experience", "Projects", "Skills", "Education",
        "Achievements", "Extra-Curricular Activities" (lists of lines).
    """
    resume_json = {
        "Contact Information": {},
        "Professional Experience": [],
        "Projects": [],
        "Skills": [],
        "Education": [],
        "Achievements": [],
        "Extra-Curricular Activities": []
    }

    section_keywords = {
        "Contact Information": ["Contact Information", "Contact Info", "Contact"],
        "Professional Experience": ["Professional Experience", "Work Experience", "Experience"],
        "Projects": ["Projects", "Project"],
        "Skills": ["Skills", "Technical Skills"],
        "Education": ["Education", "Academic Background"],
        "Achievements": ["Achievements", "Awards", "Honors"],
        "Extra-Curricular Activities": ["Extra-Curricular Activities", "Extracurricular Activities", "Activities", "Volunteer Experience"]
    }

    contact_info_patterns = ["@", "linkedin", "github", "phone", "+91"]
    # Longer lines that merely mention a keyword are content, not headers.
    max_header_words = 5

    # Longest keywords are tried first so that e.g. "Volunteer Experience"
    # is not claimed by the shorter "Experience" keyword of another section.
    keyword_pairs = sorted(
        (
            (keyword, section_name)
            for section_name, keywords in section_keywords.items()
            for keyword in keywords
        ),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
    # Whole-word, case-insensitive match; an optional trailing "s" keeps
    # plural headers such as "Work Experiences" recognised.
    header_patterns = [
        (re.compile(r'(?<!\w)' + re.escape(keyword) + r's?(?!\w)', re.IGNORECASE), section_name)
        for keyword, section_name in keyword_pairs
    ]

    section = None
    for raw_line in (resume_text or "").splitlines():
        # Strip after cleaning so a removed bullet leaves no leading space.
        line = clean_text(raw_line).strip()
        if not line:
            continue

        header = _split_section_header(line, header_patterns, max_header_words)
        if header is not None:
            section, line = header
            if not line:
                continue
            # Otherwise fall through: "Skills: Python, Go" carries content.

        lowered = line.lower()
        has_contact_hint = any(pattern in lowered for pattern in contact_info_patterns)

        if section == "Contact Information" or (section is None and has_contact_hint):
            # Before the first header, a contact-looking line opens the
            # contact block so following lines are scanned as well.
            section = "Contact Information"
            extract_contact_info(line, resume_json)
        elif section is not None:
            if has_contact_hint:
                # Still harvest contact details, but do not leave the current
                # section: the rest of its lines must not be lost.
                extract_contact_info(line, resume_json)
            resume_json[section].append(line)

    for field in ("Phone", "Email", "LinkedIn", "GitHub"):
        resume_json["Contact Information"].setdefault(field, "Not Provided")

    return resume_json

# Streamlit page: parse an uploaded resume PDF (or the bundled default one)
# and display the parsed structure as JSON.
st.title("Resume Parser to JSON")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
default_file_path = "Kushagra_Sharma_Resume.pdf"

if uploaded_file is not None:
    resume_text = extract_text_from_pdf(uploaded_file)
    file_name = uploaded_file.name
else:
    try:
        with open(default_file_path, "rb") as file:
            resume_text = extract_text_from_pdf(file)
    except FileNotFoundError:
        # The default resume is optional; without it, ask for an upload
        # instead of crashing the app with a traceback.
        st.warning(f"Default file '{default_file_path}' was not found. Please upload a PDF file.")
        st.stop()
    file_name = default_file_path

parsed_resume = parse_resume(resume_text)
st.text(f"Currently using file: {file_name}")
st.json(parsed_resume)