from json import JSONDecodeError

import PyPDF2
import streamlit as st
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatAnthropic
from langchain_openai import ChatOpenAI
from pydantic import ValidationError

from resume_template import Resume

load_dotenv()
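# ChatOpenAI and ChatAnthropic read OPENAI_API_KEY and ANTHROPIC_API_KEY
# from the environment, so a local .env file can supply both keys.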

def pdf_to_string(file):
    """
    Convert a PDF file to a string.

    Parameters:
    file (io.BytesIO): A file-like object representing the PDF file.

    Returns:
    str: The extracted text from the PDF.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    text = ''
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ''
    file.close()
    return text

def extract_resume_fields(full_text, model):
    """
    Analyze a resume text and extract structured information using a specified language model.

    Parameters:
    full_text (str): The text content of the resume.
    model (str): The name of the language model to use (a key of llm_dict).

    Returns:
    str: A JSON string of the extracted fields, or the raw model output if parsing fails.
    """
    # The Resume object is imported from the local resume_template file
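    # The prompt file is expected to contain {resume} and {response_template}
    # placeholders, matching the variables declared on the PromptTemplate below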

    with open("prompts/resume_extraction.prompt", "r") as f:
        template = f.read()

    parser = PydanticOutputParser(pydantic_object=Resume)

    prompt_template = PromptTemplate(
        template=template,
        input_variables=["resume"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )
    # Format the prompt and invoke the selected model; names not in llm_dict
    # fall back to a ChatOpenAI instance with that model name
    formatted_input = prompt_template.format_prompt(resume=full_text)
    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
    # invoke() returns a chat message; the generated text is in .content
    output = llm.invoke(formatted_input.to_string())
    
    try:
        parsed_output = parser.parse(output.content)
        return parsed_output.json()

    except (ValidationError, JSONDecodeError) as e:
        # Fall back to the raw model output when it cannot be parsed
        # into the Resume schema
        print(f"Parsing error: {e}")
        print(output)
        return output.content

st.title("Resume Parser")

# Set up the LLM dictionary
llm_dict = {
    "gpt-4-1106-preview": ChatOpenAI(temperature=0, model="gpt-4-1106-preview"),
    "gpt-4": ChatOpenAI(temperature=0, model="gpt-4"),
    "gpt-3.5-turbo-1106": ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106"),
    "claude-2": ChatAnthropic(model="claude-2", max_tokens=20_000),
    "claude-instant-1": ChatAnthropic(model="claude-instant-1", max_tokens=20_000)
}

# Add a Streamlit dropdown menu for model selection
selected_model = st.selectbox("Select a model", list(llm_dict.keys()))

# Add a file uploader
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Parse the uploaded file when the user clicks the button
if uploaded_file is not None:
    if st.button("Parse Resume"):
        # Convert the uploaded PDF to plain text
        text = pdf_to_string(uploaded_file)

        # Extract structured fields using the selected model
        extracted_fields = extract_resume_fields(text, selected_model)

        # Display the extracted fields in the app
        st.json(extracted_fields)
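
# Assuming this file is saved as app.py, launch the app locally with:
#   streamlit run app.py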