Spaces:
Running
Running
File size: 723 Bytes
d064c90 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
import requests
from bs4 import BeautifulSoup
import re
import os
from PyPDF2 import PdfReader
def read_pdf_text(pdf_file, max_pages=None):
    """Extract and concatenate the text of the pages of a PDF.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source
            (passed through unchanged).
        max_pages: Optional upper bound on the number of leading pages to
            read. ``None`` (the default) reads every page, which is the
            original behaviour.

    Returns:
        The text of the processed pages joined together with no separator.
    """
    pdf_reader = PdfReader(pdf_file)
    page_texts = []
    for page_index, page in enumerate(pdf_reader.pages):
        if max_pages is not None and page_index >= max_pages:
            break
        # Guard against a page yielding no text at all, so the final join
        # cannot fail on a non-string value.
        page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)
def parse_linkedin_pdf(pdf_text):
    """Split the extracted text of a LinkedIn profile PDF into sections.

    The text is cut at every newline that is immediately followed by one of
    the known headers (Experience, Contact, Education, Top Skills,
    Languages, Honors-Awards). The first line of each chunk becomes the
    section name and the remaining lines become its text. Any text before
    the first recognised header ends up keyed by its own first line.

    Args:
        pdf_text: Full text of the PDF as a single string.

    Returns:
        A dict mapping section name (surrounding whitespace removed) to the
        section's text. If the same section name occurs more than once, the
        texts are joined with a newline rather than overwriting each other.
    """
    sections = re.split(r'\n(?=\b(?:Experience|Contact|Education|Top Skills|Languages|Honors-Awards)\b)', pdf_text)
    parsed_data = {}
    for section in sections:
        header, _, body = section.partition('\n')
        # Extracted PDF text often carries trailing spaces or '\r' on the
        # header line; strip so callers can look up e.g. "Experience".
        section_name = header.strip()
        if section_name in parsed_data:
            # Keep earlier content instead of silently dropping it.
            parsed_data[section_name] += '\n' + body
        else:
            parsed_data[section_name] = body
    return parsed_data
|