Spaces:
Running
Running
import os
import re
from itertools import islice

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
def read_pdf_text(pdf_file, max_pages=None):
    """Return the concatenated text of the pages of a PDF.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source; it is passed
            through unchanged.
        max_pages: Optional upper bound on the number of leading pages to
            read. ``None`` (the default) reads every page, which matches the
            previous behaviour. Must be a non-negative integer when given
            (``itertools.islice`` rejects negative values with ``ValueError``).

    Returns:
        A single string with the text of each processed page appended in
        page order, with no separator inserted between pages.
    """
    pdf_reader = PdfReader(pdf_file)

    pages = pdf_reader.pages
    if max_pages is not None:
        # Only look at the first ``max_pages`` pages; islice stops early
        # without requiring the page container to support slicing.
        pages = islice(pages, max_pages)

    # ``or ""`` guards against a page whose extraction yields nothing usable
    # (None), which would otherwise raise TypeError on concatenation.
    # join() avoids repeatedly re-allocating a growing string.
    return "".join(page.extract_text() or "" for page in pages)
# Split point: a newline that is immediately followed by a line consisting of
# nothing but one of the known section headers (optionally with trailing
# spaces/tabs/CR). Requiring the header to fill the whole line prevents body
# text such as "Experience in sales ..." from being mistaken for a header.
_LINKEDIN_SECTION_BREAK = re.compile(
    r'\n(?=(?:Experience|Contact|Education|Top Skills|Languages|Honors-Awards)'
    r'[ \t\r]*(?:\n|\Z))'
)


def parse_linkedin_pdf(pdf_text):
    """Split extracted profile text into a mapping of section name to body.

    The text is cut in front of every line that is exactly one of the known
    headers (Experience, Contact, Education, Top Skills, Languages,
    Honors-Awards). For each resulting chunk, the first line (stripped of
    surrounding whitespace) becomes the key and the remaining lines become
    the value.

    Notes:
        * Any text preceding the first recognised header forms its own
          chunk, keyed by its first line.
        * If the same header occurs more than once, the bodies are joined
          with a newline instead of the later one overwriting the earlier.
        * Chunks containing only whitespace are ignored, so empty input
          returns an empty dict.

    Args:
        pdf_text: The text to parse, as a single string with ``\\n`` line
            breaks.

    Returns:
        A dict mapping each section's first line to the rest of its text.
    """
    parsed_data = {}
    for chunk in _LINKEDIN_SECTION_BREAK.split(pdf_text):
        if not chunk.strip():
            # Nothing but whitespace (e.g. empty input or a leading blank
            # line before the first header): no section to record.
            continue

        first_line, _, body = chunk.partition('\n')
        section_name = first_line.strip()

        if section_name in parsed_data:
            # Preserve earlier content for a repeated header rather than
            # silently dropping it.
            body = '\n'.join(
                part for part in (parsed_data[section_name], body) if part
            )
        parsed_data[section_name] = body
    return parsed_data