import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from json import JSONDecodeError
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatAnthropic
from langchain_openai import ChatOpenAI
from pydantic import ValidationError

from resume_template import Resume

# Load environment variables from a local .env file
load_dotenv()
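# Note: the OpenAI and Anthropic clients read their API keys from the
# environment, so the .env file loaded above is presumably expected to define
# OPENAI_API_KEY and ANTHROPIC_API_KEY (or they must be set in the shell).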

def pdf_to_string(file):
    """
    Convert a PDF file to a string.

    Parameters:
        file (io.BytesIO): A file-like object representing the PDF file.

    Returns:
        str: The extracted text from the PDF.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    text = ''
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += page.extract_text() or ''
    file.close()
    return text
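
# Hypothetical quick test outside the Streamlit flow: pdf_to_string accepts
# any binary file-like object (and closes it when done), so a local file
# works for sanity-checking the extraction. The filename is illustrative.
# with open("sample_resume.pdf", "rb") as f:
#     preview = pdf_to_string(f)
#     print(preview[:200])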

def extract_resume_fields(full_text, model):
    """
    Analyze resume text and extract structured information using the selected
    language model.

    Parameters:
        full_text (str): The text content of the resume.
        model (str): The name of the language model to use (a key in llm_dict).

    Returns:
        str: A JSON string of the structured fields extracted from the resume,
            or the raw model output if parsing fails.
    """
    # The Resume schema is imported from the local resume_template module
    with open("prompts/resume_extraction.prompt", "r") as f:
        template = f.read()

    # The parser derives format instructions from the Resume schema and later
    # validates the model's output against it
    parser = PydanticOutputParser(pydantic_object=Resume)

    prompt_template = PromptTemplate(
        template=template,
        input_variables=["resume"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )
    # Format the prompt and invoke the selected language model; fall back to
    # instantiating a ChatOpenAI model of that name if it is not in llm_dict
    formatted_input = prompt_template.format_prompt(resume=full_text)
    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
    output = llm.invoke(formatted_input.to_string())

    try:
        # Validate the raw output against the Resume schema, then serialize
        parsed_output = parser.parse(output.content)
        json_output = parsed_output.json()
        print(json_output)
        return json_output
    except (ValidationError, JSONDecodeError) as e:
        # Fall back to the raw model output if validation or JSON parsing fails
        print(f"Parsing error: {e}")
        print(output)
        return output.content
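
# Hypothetical direct call, assuming prompts/resume_extraction.prompt exists
# and the chosen key is present in llm_dict (defined below); on success the
# result is a JSON string validated against the Resume schema:
# fields_json = extract_resume_fields(resume_text, "gpt-3.5-turbo-1106")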

st.title("Resume Parser")

# Map model names to pre-configured chat model instances
llm_dict = {
    "gpt-4-1106-preview": ChatOpenAI(temperature=0, model="gpt-4-1106-preview"),
    "gpt-4": ChatOpenAI(temperature=0, model="gpt-4"),
    "gpt-3.5-turbo-1106": ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106"),
    "claude-2": ChatAnthropic(model="claude-2", max_tokens=20_000),
    "claude-instant-1": ChatAnthropic(model="claude-instant-1", max_tokens=20_000),
}
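
# Other chat models could be registered under new keys the same way; the
# model name below is illustrative, not part of the original app:
# llm_dict["gpt-4-turbo"] = ChatOpenAI(temperature=0, model="gpt-4-turbo")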
# Dropdown menu for model selection
selected_model = st.selectbox("Select a model", list(llm_dict.keys()))

# File uploader for the resume PDF
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # Button to trigger conversion and extraction
    if st.button("Convert PDF to Text"):
        # Convert the uploaded PDF to plain text
        text = pdf_to_string(uploaded_file)

        # Extract structured resume fields using the selected model
        extracted_fields = extract_resume_fields(text, selected_model)

        # Display the extracted fields in the app
        st.json(extracted_fields)