LinkedIn-Profile-QA / doc_loading.py
Atif20024's picture
Main. Uploaded all the files required to run this app
d064c90 verified
raw
history blame contribute delete
723 Bytes
import requests
from bs4 import BeautifulSoup
import re
import os
from PyPDF2 import PdfReader
def read_pdf_text(pdf_file):
    """Return the extracted text of every page in *pdf_file* as one string.

    Args:
        pdf_file: The PDF source, handed unchanged to ``PdfReader``
            (presumably a path or a binary file object -- confirm
            against the caller).

    Returns:
        The per-page results of ``extract_text()`` concatenated in page
        order, with no separator inserted between pages.
    """
    reader = PdfReader(pdf_file)
    # NOTE: every page is read. Capping the number of pages (e.g. the
    # first 5) has been suggested but is not implemented here.
    return "".join(page.extract_text() for page in reader.pages)
def parse_linkedin_pdf(pdf_text):
    """Split the text of a LinkedIn profile PDF into named sections.

    The text is cut at every newline that is immediately followed by one
    of the known header words (Experience, Contact, Education, Top Skills,
    Languages, Honors-Awards). The first line of each piece becomes the
    section name and the remaining lines become the section text. Any text
    before the first header forms its own section, keyed by its first line.

    Args:
        pdf_text: The full profile text as a single string.

    Returns:
        A dict mapping section name to section text. Section names have
        surrounding whitespace removed. If the same section name occurs
        more than once, the texts are joined with a newline in document
        order instead of the later one replacing the earlier one. Empty
        input yields ``{"": ""}``.
    """
    sections = re.split(
        r'\n(?=\b(?:Experience|Contact|Education|Top Skills|Languages|Honors-Awards)\b)',
        pdf_text,
    )
    parsed_data = {}
    for section in sections:
        # Everything up to the first newline is the header line; the rest
        # (possibly empty) is the body of the section.
        header, _, section_text = section.partition('\n')
        # Strip stray spaces / '\r' so lookups by the clean header name work
        # even when the extracted text has padded or CRLF-terminated lines.
        section_name = header.strip()
        # Merge with any earlier section of the same name rather than
        # overwriting it; skip empty parts so no blank lines are introduced.
        earlier_text = parsed_data.get(section_name, '')
        parsed_data[section_name] = '\n'.join(
            part for part in (earlier_text, section_text) if part
        )
    return parsed_data