Spaces:
Sleeping
Sleeping
working resume parser with streamlit front end, tested OpenAI and Anthropic.
Browse files- app.py +110 -0
- prompts/resume_extraction.prompt +28 -0
- requirements.txt +2 -1
- resume_template.py +70 -0
app.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
import io
|
3 |
+
import streamlit as st
|
4 |
+
from langchain.prompts import PromptTemplate
|
5 |
+
from langchain.output_parsers import PydanticOutputParser
|
6 |
+
from langchain_community.chat_models import ChatAnthropic
|
7 |
+
from langchain_openai import ChatOpenAI
|
8 |
+
from pydantic import ValidationError
|
9 |
+
from resume_template import Resume
|
10 |
+
from json import JSONDecodeError
|
11 |
+
import PyPDF2
|
12 |
+
import json
|
13 |
+
|
14 |
+
load_dotenv()
|
15 |
+
|
16 |
+
def pdf_to_string(file):
    """
    Convert a PDF file to a string.

    Parameters:
    file (io.BytesIO): A file-like object representing the PDF file. It is
        closed before this function returns, including when reading fails.

    Returns:
    str: The text extracted from each page, concatenated in page order
        with no separator between pages.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ""` keeps join() safe should a page ever yield None instead of
        # a string.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    finally:
        # Always release the stream, even if PdfReader or extract_text raises.
        file.close()
|
34 |
+
|
35 |
+
def extract_resume_fields(full_text, model):
    """
    Analyze a resume text and extract structured information using a specified language model.

    Parameters:
    full_text (str): The text content of the resume.
    model (str): Name of the language model to use. A name present in the
        module-level ``llm_dict`` uses the client stored there; any other
        name is passed to ``ChatOpenAI`` as the model identifier.

    Returns:
    str: The parsed ``Resume`` serialized as a JSON string. If the model's
        reply cannot be parsed or validated, the raw reply content is
        returned instead.
    """
    # The Resume object is imported from the local resume_template file

    with open("prompts/resume_extraction.prompt", "r", encoding="utf-8") as f:
        template = f.read()

    parser = PydanticOutputParser(pydantic_object=Resume)

    prompt_template = PromptTemplate(
        template=template,
        input_variables=["resume"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )
    # Invoke the language model and process the resume
    formatted_input = prompt_template.format_prompt(resume=full_text)

    # Build the fallback client lazily: passing it as the default argument of
    # dict.get() would construct a ChatOpenAI on every call, even for models
    # that are already registered in llm_dict.
    llm = llm_dict.get(model)
    if llm is None:
        llm = ChatOpenAI(temperature=0, model=model)

    output = llm.invoke(formatted_input.to_string())

    try:
        parsed_output = parser.parse(output.content)
        json_output = parsed_output.json()
        print(json_output)
        return json_output

    except ValidationError as e:
        print(f"Validation error: {e}")
        print(output)
        return output.content

    except JSONDecodeError as e:
        print(f"JSONDecodeError error: {e}")
        print(output)
        return output.content

    except ValueError as e:
        # The parser re-raises JSON/validation failures as LangChain's
        # OutputParserException, which derives from ValueError; without this
        # handler a malformed model reply would propagate to the caller.
        print(f"Output parsing error: {e}")
        print(output)
        return output.content
|
81 |
+
|
82 |
+
st.title("Resume Parser")

# Chat-model clients selectable in the UI, keyed by model name.
# OpenAI entries are inserted first, then Anthropic, which fixes the order
# in which the names appear in the dropdown.
llm_dict = {
    name: ChatOpenAI(temperature=0, model=name)
    for name in ("gpt-4-1106-preview", "gpt-4", "gpt-3.5-turbo-1106")
}
llm_dict.update(
    (name, ChatAnthropic(model=name, max_tokens=20_000))
    for name in ("claude-2", "claude-instant-1")
)

# Model picker followed by the PDF upload widget.
selected_model = st.selectbox("Select a model", list(llm_dict))
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# `and` short-circuits, so the button is only rendered once a file has been
# uploaded; clicking it runs text extraction and then field extraction.
if uploaded_file is not None and st.button("Convert PDF to Text"):
    text = pdf_to_string(uploaded_file)
    extracted_fields = extract_resume_fields(text, selected_model)
    # Show the result of the extraction on the page.
    st.json(extracted_fields)
|
prompts/resume_extraction.prompt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Given the task of analyzing the resume provided below, identify and organize the following information into structured data:
|
2 |
+
|
3 |
+
- Personal Details: Full name, contact information, and any professional summary or objective.
|
4 |
+
- Education: List of educational institutions attended, degrees obtained, fields of study, and graduation dates.
|
5 |
+
- Work Experience: For each position held, extract the company name, job title, duration of employment, a brief description of the role, and notable contributions or responsibilities.
|
6 |
+
- Projects: Details of significant projects worked on, including the project name, description, technologies used, and the role in the project.
|
7 |
+
- Skills: A compilation of technical and soft skills listed.
|
8 |
+
- Certifications: Any professional certifications, the certifying body, and the date of certification.
|
9 |
+
- Publications: Titles of publications, co-authors if applicable, and date of publication.
|
10 |
+
- Awards: Titles of any awards or honors received, awarding bodies, and date of receipt.
|
11 |
+
|
12 |
+
For fields not explicitly mentioned by the user, be sure to check for common sections such as volunteer experience, languages spoken, and hobbies or interests if they are professionally relevant.
|
13 |
+
|
14 |
+
Throughout each field, make sure that you retain as much detail as possible; for example, notable contributions should not be summarized but rather listed in full detail, without adding any new material that isn't in the document.
|
15 |
+
|
16 |
+
Use the JSON structure below to format your output. If any section does not apply or information is not available, it should be omitted from the JSON object. Ensure the output is formatted properly with the correct data types for each field (e.g., arrays, strings, objects).
|
17 |
+
|
18 |
+
----
|
19 |
+
|
20 |
+
{response_template}
|
21 |
+
|
22 |
+
----
|
23 |
+
|
24 |
+
{resume}
|
25 |
+
|
26 |
+
----
|
27 |
+
|
28 |
+
Ensure that the output is formatted properly with the correct data types for each field.
|
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ PyPDF2
|
|
5 |
|
6 |
openai
|
7 |
anthropic
|
8 |
-
langchain
|
|
|
|
5 |
|
6 |
openai
|
7 |
anthropic
|
8 |
+
langchain
|
9 |
+
langchain-community
|
resume_template.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel, Field, ValidationError
|
2 |
+
from typing import List, Optional, Dict
|
3 |
+
|
4 |
+
# The following classes are for the resume template
|
5 |
+
|
6 |
+
# Contact channels for the resume's owner; every field is optional and
# defaults to None.
class ContactInfo(BaseModel):
    email: Optional[str] = None
    phone: Optional[str] = None
    linkedin: Optional[str] = None
|
10 |
+
|
11 |
+
# Identity section of a resume. `full_name` and `contact_info` have no
# default and are therefore required; the summary is optional.
class PersonalDetails(BaseModel):
    full_name: str
    contact_info: ContactInfo
    professional_summary: Optional[str] = None
|
15 |
+
|
16 |
+
# One education entry; every field is optional and defaults to None.
class Education(BaseModel):
    institution: Optional[str] = None
    degree: Optional[str] = None
    field_of_study: Optional[str] = None
    # Stored as free-form text, not a date type.
    graduation_date: Optional[str] = None
|
21 |
+
|
22 |
+
# One position held; every field is optional and defaults to None.
class WorkExperience(BaseModel):
    company: Optional[str] = None
    title: Optional[str] = None
    # Stored as free-form text, not a date range type.
    duration: Optional[str] = None
    description: Optional[str] = None
    # List of individual contribution strings; None when absent.
    notable_contributions: Optional[List[str]] = None
|
28 |
+
|
29 |
+
# One project entry; every field is optional and defaults to None.
class Project(BaseModel):
    name: Optional[str] = None
    description: Optional[str] = None
    # NOTE(review): declared as a single string, whereas `Resume.skills` is a
    # list of strings — confirm a single string is the intended shape here.
    technologies: Optional[str] = None
    role: Optional[str] = None
|
34 |
+
|
35 |
+
# One professional certification; every field is optional and defaults to None.
class Certification(BaseModel):
    title: Optional[str] = None
    certifying_body: Optional[str] = None
    # Stored as free-form text, not a date type.
    date: Optional[str] = None
|
39 |
+
|
40 |
+
# One publication entry.
class Publication(BaseModel):
    title: Optional[str] = None
    # Defaults to an empty list rather than None, unlike the other fields.
    co_authors: List[str] = []
    # Stored as free-form text, not a date type.
    date: Optional[str] = None
|
44 |
+
|
45 |
+
# One award or honor; every field is optional and defaults to None.
class Award(BaseModel):
    title: Optional[str] = None
    awarding_body: Optional[str] = None
    # Stored as free-form text, not a date type.
    date: Optional[str] = None
|
49 |
+
|
50 |
+
# One volunteer engagement; every field is optional and defaults to None.
class VolunteerExperience(BaseModel):
    organization: Optional[str] = None
    role: Optional[str] = None
    # Stored as free-form text, not a date range type.
    duration: Optional[str] = None
    description: Optional[str] = None
|
55 |
+
|
56 |
+
# Optional extra resume sections. Each field accepts None but defaults to an
# empty list.
class AdditionalSections(BaseModel):
    volunteer_experience: Optional[List[VolunteerExperience]] = []
    languages: Optional[List[str]] = []
    interests: Optional[List[str]] = []
|
60 |
+
|
61 |
+
# Top-level resume model composed of the section models above.
# `personal_details` has no default and is therefore required; every list
# section defaults to an empty list, and `additional_sections` to None.
class Resume(BaseModel):
    personal_details: PersonalDetails
    education: List[Education] = []
    work_experience: List[WorkExperience] = []
    projects: List[Project] = []
    skills: List[str] = []
    certifications: List[Certification] = []
    publications: List[Publication] = []
    awards: List[Award] = []
    additional_sections: Optional[AdditionalSections] = None
|