File size: 3,285 Bytes
67ba08f
 
 
 
c0605d9
 
 
 
 
 
67ba08f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0605d9
 
 
 
 
67ba08f
 
c0605d9
67ba08f
 
 
c0605d9
67ba08f
 
 
c0605d9
67ba08f
c0605d9
67ba08f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0605d9
67ba08f
 
 
 
c0605d9
 
 
 
67ba08f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
import pdfplumber
import re
import openpyxl
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Authenticate Hugging Face API (ensure you're logged in already)
# NOTE(review): this is a gated, very large (~70B-parameter) model loaded
# eagerly at import time with no quantization/device_map — confirm the
# deployment host has the required license access and RAM/VRAM.
model_name = "meta-llama/Llama-3.1-70B-Instruct"  # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF file.

    Returns:
        One string containing all extractable page text, in page order
        (no separator between pages, matching the original behavior).
    """
    with pdfplumber.open(pdf_path) as pdf:
        pages = []
        for page in pdf.pages:
            # Bug fix: extract_text() returns None for pages with no text
            # layer (e.g. scanned images); the original `text += ...`
            # raised TypeError on such pages. Treat them as empty.
            pages.append(page.extract_text() or "")
    return "".join(pages)

# Function to parse the resume text for name, email, phone, and skills
def parse_resume(text):
    """Extract name, email, phone, and skills from resume text via the LLM.

    Args:
        text: Plain-text resume content (e.g. from extract_text_from_pdf).

    Returns:
        dict with keys "name", "email", "phone", "skills". The "email" and
        "phone" values are regex-validated and may be None when nothing
        matches; "name" and "skills" hold the raw decoded model output.
    """
    # One focused prompt per field keeps each generation task simple.
    prompts = {
        "name": "Extract the name from this resume:\n",
        "email": "Extract the email address from this resume:\n",
        "phone": "Extract the phone number from this resume:\n",
        "skills": "Extract the technical skills from this resume:\n"
    }

    results = {}

    for key, prompt in prompts.items():
        inputs = tokenizer(prompt + text, return_tensors="pt")
        # Bug fix: the original passed max_length=500, which bounds
        # prompt + output *combined* — any resume longer than ~500 tokens
        # left no room for an answer. max_new_tokens bounds only the
        # generated continuation.
        outputs = model.generate(**inputs, max_new_tokens=500)
        # NOTE: the decoded text includes the prompt and the resume itself,
        # so the regexes below may also match values straight from the resume.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if key == 'email':
            # Loose "something@something" shape check; first match wins.
            email = re.findall(r'\S+@\S+', response)
            results[key] = email[0] if email else None
        elif key == 'phone':
            # 10-15 consecutive digits with no separators — TODO confirm
            # this matches the phone formats expected in input resumes.
            phone = re.findall(r'\b\d{10,15}\b', response)
            results[key] = phone[0] if phone else None
        else:
            # "name" and "skills": keep the raw model output. (The original
            # had a separate 'skills' branch identical to this one — merged.)
            results[key] = response

    return results

# Function to save parsed data to Excel file
def save_to_excel(parsed_data, output_file):
    """Persist parsed resume records to an Excel workbook.

    Args:
        parsed_data: iterable of dicts, each with "name", "email",
            "phone", and "skills" keys.
        output_file: destination .xlsx path.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    # Header row, then one row per resume in a fixed column order.
    sheet.append(["Name", "Email", "Phone", "Skills"])
    field_order = ("name", "email", "phone", "skills")
    for record in parsed_data:
        sheet.append([record[field] for field in field_order])

    workbook.save(output_file)

# Function to process PDF files and output an Excel file
def process_pdfs(pdfs):
    """Parse each uploaded PDF resume and write the results to one Excel file.

    Args:
        pdfs: list of uploaded file objects (anything exposing a `.name`
            path attribute, as Gradio's file component passes) or plain
            filesystem path strings.

    Returns:
        Path of the generated workbook ("parsed_resumes.xlsx").
    """
    parsed_data = []

    for pdf in pdfs:
        # Robustness: newer Gradio file components (type="filepath") hand
        # back plain path strings; older ones pass objects with .name.
        # Accept both without changing the original calling convention.
        path = getattr(pdf, "name", pdf)

        # Extract the raw text, then parse out the structured fields.
        text = extract_text_from_pdf(path)
        parsed_data.append(parse_resume(text))

    # Save the parsed data to an Excel file and hand the path back to Gradio.
    output_file = "parsed_resumes.xlsx"
    save_to_excel(parsed_data, output_file)

    return output_file

# Gradio interface setup with blank API space (Hugging Face integration)
iface = gr.Interface(
    fn=process_pdfs,
    # NOTE(review): type="file" was removed in Gradio 4.x (valid values are
    # "filepath"/"binary") — confirm the pinned Gradio version still accepts
    # it; switching to "filepath" would also require process_pdfs to take
    # path strings instead of objects with .name.
    inputs=gr.File(file_count="multiple", type="file"),
    outputs=gr.File(),
    # NOTE(review): live=True re-runs the full LLM pipeline on every input
    # change — verify a submit-button flow isn't intended for this slow fn.
    live=True,
    title="AI Resume Parser",
    description="Upload PDF resumes, and the app will parse and extract Name, Email, Phone, and Skills from them.",
    # NOTE(review): placeholder path — Gradio resolves examples at startup,
    # so this must point at a real file (or be removed) before deploying.
    examples=[["path_to_sample_resume.pdf"]]  # Provide sample files if necessary
)

# Launch the Gradio app
iface.launch()