Manojajj committed on
Commit
75da080
·
verified ·
1 Parent(s): d328c3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -67
app.py CHANGED
@@ -1,87 +1,77 @@
1
  import gradio as gr
 
2
  import torch
3
  from transformers import pipeline
4
- import pdfplumber
5
- import re
6
  import pandas as pd
 
7
 
8
- # Load a different pre-trained model for Named Entity Recognition (NER)
9
- nlp = pipeline("ner", model="huggingface/bert-base-cased", framework="pt")
 
 
 
 
 
10
 
11
  def extract_text_from_pdf(pdf_file):
12
- """Extract text from the uploaded PDF resume."""
13
- with pdfplumber.open(pdf_file) as pdf:
14
- text = ""
15
  for page in pdf.pages:
16
  text += page.extract_text()
17
  return text
18
 
19
- def parse_resume(resume_text):
20
- """Parse the resume and extract details like name, email, phone, skills, etc."""
21
- # Define regex for phone and email extraction
22
- phone_pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
23
- email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
24
-
25
- # Extract phone and email using regex
26
- phone = re.findall(phone_pattern, resume_text)
27
- email = re.findall(email_pattern, resume_text)
28
-
29
- # Extract named entities (e.g., skills, education, and experience)
30
- entities = nlp(resume_text)
31
 
32
- # For simplicity, we just list out the entities here
33
- skills = []
34
- experience = []
35
- education = []
36
- certifications = []
37
 
38
- # Iterate through recognized entities and categorize them
 
 
 
39
  for entity in entities:
40
- if 'ORG' in entity['entity']:
41
- experience.append(entity['word'])
42
- elif 'MISC' in entity['entity']:
43
- skills.append(entity['word'])
44
- elif 'LOC' in entity['entity']:
45
- education.append(entity['word'])
46
- else:
47
- certifications.append(entity['word'])
48
 
49
- # Create a dictionary of parsed data
50
- parsed_data = {
51
- "Phone": phone[0] if phone else "Not found",
52
- "Email": email[0] if email else "Not found",
53
- "Skills": ", ".join(skills),
54
- "Experience": ", ".join(experience),
55
- "Education": ", ".join(education),
56
- "Certifications": ", ".join(certifications)
57
  }
58
 
59
- return parsed_data
60
-
61
- def process_resumes(pdf_files):
62
- """Process multiple resumes and output a single Excel file."""
63
- all_parsed_data = []
64
-
65
- # Loop through each uploaded PDF file and parse the data
66
  for pdf_file in pdf_files:
67
- resume_text = extract_text_from_pdf(pdf_file)
68
- parsed_info = parse_resume(resume_text)
69
- all_parsed_data.append(parsed_info)
70
-
71
- # Convert the parsed data into a pandas DataFrame
72
- df = pd.DataFrame(all_parsed_data)
73
-
74
- # Save the DataFrame to an Excel file
75
- output_file = "parsed_resumes.xlsx"
76
- df.to_excel(output_file, index=False)
77
 
78
- return output_file
 
 
 
 
 
 
 
79
 
80
- # Define Gradio interface
81
- gr.Interface(
82
- fn=process_resumes,
83
- inputs=gr.File(file_count="multiple", label="Upload Resumes (PDFs)"),
84
- outputs=gr.File(label="Download Parsed Data (Excel)"),
85
- title="AI Resume Parser",
86
- description="Upload multiple resumes (PDFs) to extract details like Name, Email, Phone, Skills, Experience, Education, and Certifications. The results will be saved in an Excel file."
87
- ).launch()
 
1
  import gradio as gr
2
+ import pdfplumber
3
  import torch
4
  from transformers import pipeline
 
 
5
  import pandas as pd
6
+ from huggingface_hub import login
7
 
8
import os

# Log in to Hugging Face only when a real token is supplied via the
# environment -- the original unconditional login(token="your_huggingface_token")
# placeholder would fail at startup. Generate a token at
# https://huggingface.co/settings/tokens and export it as HF_TOKEN.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

# Load the Named Entity Recognition (NER) pipeline once at module import.
# Replace the model id with any other token-classification checkpoint if needed.
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", framework="pt")
15
 
16
def extract_text_from_pdf(pdf_file):
    """Extract all text from an uploaded PDF using pdfplumber.

    Args:
        pdf_file: An uploaded-file object; only its ``.name`` path is opened.

    Returns:
        str: The concatenated text of every page. Pages with no extractable
        text (e.g. scanned images) contribute an empty string instead of
        crashing -- ``page.extract_text()`` returns ``None`` for such pages,
        and the original ``text += None`` raised ``TypeError``.
    """
    with pdfplumber.open(pdf_file.name) as pdf:
        # "or ''" guards the None case; join avoids quadratic string +=.
        return "".join(page.extract_text() or "" for page in pdf.pages)
23
 
24
def parse_resume(pdf_file):
    """Parse a resume PDF and extract candidate information.

    Args:
        pdf_file: Uploaded PDF file object (forwarded to extract_text_from_pdf).

    Returns:
        dict: Keys ``Name``/``Email``/``Phone``/``Education``/``Skills``/
        ``Experience``; any field that could not be detected is ``None``.
    """
    import re

    # Extract raw text from the PDF.
    resume_text = extract_text_from_pdf(pdf_file)

    # Run the NER pipeline over the resume text.
    entities = nlp(resume_text)

    # Initialize all fields as not-found.
    name = email = phone = education = skills = experience = None

    # Email and phone are far more reliably found with regexes than with a
    # CoNLL-03 NER model (which has no EMAIL label at all).
    email_match = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', resume_text)
    if email_match:
        email = email_match.group(0)
    phone_match = re.search(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', resume_text)
    if phone_match:
        phone = phone_match.group(0)

    # The transformers NER pipeline returns dicts keyed 'entity' (or
    # 'entity_group' when aggregation is enabled), NOT 'label' -- the original
    # entity['label'] raised KeyError on every entity. Tags look like
    # 'B-PER'/'I-PER', so match by substring rather than equality.
    for entity in entities:
        tag = entity.get('entity') or entity.get('entity_group') or ''
        word = entity['word']
        if 'PER' in tag and name is None:
            name = word          # first person name detected
        elif 'ORG' in tag and experience is None:
            experience = word    # organization name (e.g. an employer)
        elif 'MISC' in tag and skills is None:
            skills = word        # loose proxy for skills/qualifications

    return {
        'Name': name,
        'Email': email,
        'Phone': phone,
        'Education': education,
        'Skills': skills,
        'Experience': experience,
    }
54
 
55
def batch_process_resumes(pdf_files):
    """Parse every uploaded resume PDF and write the results to one Excel file.

    Args:
        pdf_files: Iterable of uploaded PDF file objects.

    Returns:
        str: A status message naming the Excel file that was written to the
        current working directory.
    """
    # Collect one parsed-fields dict per resume, then dump them all at once.
    parsed_rows = [parse_resume(resume) for resume in pdf_files]
    pd.DataFrame(parsed_rows).to_excel("parsed_resumes.xlsx", index=False)
    return "Excel file with parsed resumes has been saved as 'parsed_resumes.xlsx'."
 
 
67
 
68
# Gradio interface: a Blocks layout with a multi-file upload, a status
# textbox, and a button that triggers the batch parser.
with gr.Blocks() as demo:
    gr.Markdown("### AI Resume Parser")
    file_input = gr.File(file_count="multiple", label="Upload Resumes (PDFs)")
    output = gr.Textbox(label="Result")
    process_button = gr.Button("Process Resumes")

    # Wire the button: uploaded files in, status message out. The Excel file
    # itself is written to the working directory by batch_process_resumes.
    process_button.click(batch_process_resumes, inputs=file_input, outputs=output)

demo.launch()