File size: 723 Bytes
d064c90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import requests
from bs4 import BeautifulSoup
import re
import os
from PyPDF2 import PdfReader

def read_pdf_text(pdf_file, max_pages=None):
    """Return the concatenated text of the pages of a PDF.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source (passed
            through unchanged).
        max_pages: Optional upper bound on the number of leading pages to
            read. ``None`` (the default) reads every page, which matches
            the previous behaviour. Must be ``None`` or a non-negative
            integer.

    Returns:
        The text of the selected pages joined together with no separator.

    Raises:
        ValueError: If ``max_pages`` is negative (raised by ``islice``).
    """
    from itertools import islice

    pdf_reader = PdfReader(pdf_file)
    # islice stops early without needing the page list to support slicing.
    selected_pages = islice(pdf_reader.pages, max_pages)
    # `or ""` is defensive: a page that yields no text (None/empty) must not
    # break the join.
    return "".join(page.extract_text() or "" for page in selected_pages)

def parse_linkedin_pdf(pdf_text):
    """Split text extracted from a LinkedIn profile PDF into named sections.

    The text is cut at every newline that is immediately followed by one of
    the known headings (Experience, Contact, Education, Top Skills,
    Languages, Honors-Awards). Within each chunk the first line becomes the
    section name and the remaining lines become the section body. Any text
    before the first heading is keyed by its own first line.

    Args:
        pdf_text: The raw text of the PDF. ``None`` or an empty string is
            treated as "no content".

    Returns:
        A dict mapping section name (surrounding whitespace stripped) to the
        section body. If the same section name occurs more than once, the
        bodies are joined with a newline in order of appearance.
    """
    if not pdf_text:
        return {}

    sections = re.split(
        r'\n(?=\b(?:Experience|Contact|Education|Top Skills|Languages|Honors-Awards)\b)',
        pdf_text,
    )

    parsed_data = {}
    for section in sections:
        # A chunk with no visible content (e.g. text that starts with a
        # newline) would otherwise produce a meaningless "" -> "" entry.
        if not section.strip():
            continue
        section_name, _, section_text = section.partition('\n')
        # Extracted PDF text often carries stray spaces around headings;
        # strip them so callers can look up "Contact" rather than "Contact ".
        section_name = section_name.strip()
        if section_name in parsed_data:
            # Keep earlier content instead of silently overwriting it.
            parsed_data[section_name] += '\n' + section_text
        else:
            parsed_data[section_name] = section_text
    return parsed_data