import re

import gradio as gr
import pdfplumber
import pandas as pd
from huggingface_hub import login
from transformers import pipeline


def login_with_token(hf_token):
    """Log in to Hugging Face using the provided API token."""
    try:
        login(token=hf_token)
        return "Logged in successfully!"
    except Exception as e:
        return f"Error: {str(e)}"


# Load the Named Entity Recognition (NER) model once at startup.
# framework="pt" requires PyTorch to be installed.
# aggregation_strategy="simple" merges subword pieces (e.g. "Jo", "##hn")
# into whole words, so each entity arrives as a complete span instead of
# token fragments.
nlp = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    framework="pt",
    aggregation_strategy="simple",
)


def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF file using pdfplumber."""
    # Gradio may hand us a plain path string or a tempfile-like object,
    # depending on the Gradio version; handle both.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    try:
        with pdfplumber.open(path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""  # Handle pages with no text
            return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""


def parse_resume(pdf_file):
    """Parse a resume PDF and extract relevant information."""
    resume_text = extract_text_from_pdf(pdf_file)
    if not resume_text.strip():
        print("No text found in PDF.")
        return {}

    # BERT accepts at most 512 tokens, so run NER over character chunks to
    # avoid errors on long resumes (a naive split that may cut an entity at
    # a chunk boundary).
    entities = []
    chunk_size = 1000
    for i in range(0, len(resume_text), chunk_size):
        entities.extend(nlp(resume_text[i:i + chunk_size]))

    name = education = skills = experience = None

    # Example parsing logic based on the aggregated NER output.
    for entity in entities:
        label = entity.get("entity_group", "")
        word = entity.get("word", "").strip()
        if label == "PER" and name is None:
            name = word  # Assume the first person mentioned is the candidate
        elif label == "ORG":
            experience = (experience or "") + word + ", "
        elif label == "MISC":
            skills = (skills or "") + word + ", "
    # The CoNLL-03 label set (PER/ORG/LOC/MISC) has no education class,
    # so that field is left empty here.

    # Emails and phone numbers are rarely tagged by NER models;
    # naive regexes over the raw text are more reliable.
    email_match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", resume_text)
    email = email_match.group(0) if email_match else None
    phone_match = re.search(r"\+?\d[\d\s().-]{7,}\d", resume_text)
    phone = phone_match.group(0) if phone_match else None

    # Clean up trailing separators.
    skills = skills.rstrip(", ") if skills else None
    experience = experience.rstrip(", ") if experience else None

    # Log the final parsed information.
    print(f"Parsed Info: Name={name}, Email={email}, Skills={skills}, Experience={experience}")

    return {
        "Name": name,
        "Email": email,
        "Phone": phone,
        "Education": education,
        "Skills": skills,
        "Experience": experience,
    }


def batch_process_resumes(pdf_files):
    """Process a batch of resume PDFs and write the results to an Excel file."""
    all_resumes = []
    for pdf_file in pdf_files:
        resume_info = parse_resume(pdf_file)
        # Only keep resumes that yielded at least one field.
        if any(resume_info.values()):
            all_resumes.append(resume_info)

    if not all_resumes:
        print("No valid resume information was parsed.")
        return None

    # Convert to a DataFrame and save as Excel
    # (writing .xlsx requires the openpyxl package).
    df = pd.DataFrame(all_resumes)
    output_file = "/tmp/parsed_resumes.xlsx"
    df.to_excel(output_file, index=False)

    # Return the path to the file for download.
    return output_file


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("### AI Resume Parser")

    # User input for the Hugging Face token (masked on screen)
    hf_token_input = gr.Textbox(
        label="Hugging Face Token",
        placeholder="Enter your Hugging Face API token here",
        type="password",
    )
    # File input for resume PDFs
    file_input = gr.File(file_count="multiple", label="Upload Resumes (PDFs)")
    # Output for results
    output = gr.Textbox(label="Result")
    # File output for the download link
    download_link = gr.File(label="Download Excel File", file_count="single")
    # Process button that triggers the login and resume parsing
    process_button = gr.Button("Process Resumes")

    def process_resumes(hf_token, pdf_files):
        # Attempt to log in with the provided token before doing any work.
        login_message = login_with_token(hf_token)
        if "Error" in login_message:
            return login_message, None
        # Process resumes and generate the download link.
        excel_file_path = batch_process_resumes(pdf_files)
        if excel_file_path:
            return login_message + "\nExcel file with parsed resumes is ready for download.", excel_file_path
        return login_message + "\nNo valid resume information was parsed.", None

    # Set up the button click event
    process_button.click(
        process_resumes,
        inputs=[hf_token_input, file_input],
        outputs=[output, download_link],
    )

# Launch the Gradio interface
demo.launch()