Spaces:
Running
Running
File size: 2,312 Bytes
0d375ed 547a2b6 0d375ed 547a2b6 0d375ed 547a2b6 0d375ed 547a2b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import pypdfium2 as pdfium
import re
import wordninja
from PIL import Image
from pytesseract import image_to_string
from utils import recover_text, get_average_line_len
import pdfplumber
class ResumeReader:
    """Extract a resume's text from a PDF or image as a list of clean lines.

    ``read`` is the entry point: it picks ``read_pdf`` or ``read_image``
    from the file extension. Both readers return a list of non-empty,
    whitespace-normalised, ASCII-only strings (one per text line).
    """

    # Patterns are compiled once at class level because every call reuses them.
    _NEWLINE_RUN_RE = re.compile(r"\n+")
    _NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
    # "(cid:N)" placeholders show up in extracted PDF text; N is not limited
    # to three digits, so match any number of digits.
    _CID_RE = re.compile(r"\(cid:\d*\)")
    _WHITESPACE_RE = re.compile(r"\s+")
    _PDF_SUFFIXES = (".pdf",)
    _IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")

    def clean_text(self, raw_text):
        """Return ``raw_text`` with layout noise removed.

        Steps, in order:
          * carriage returns become newlines and tabs become spaces;
          * the private-use glyph U+F0B7 becomes a space;
          * every remaining non-ASCII character is dropped (this also
            removes bullet characters such as U+2022);
          * ``(cid:N)`` placeholders become a space;
          * runs of newlines collapse to a single newline.

        Args:
            raw_text: Text as produced by the PDF/OCR extraction step.

        Returns:
            The cleaned string; line breaks are preserved as single ``\\n``.
        """
        text = raw_text.replace("\r", "\n").replace("\t", " ")
        text = text.replace("\uf0b7", " ")
        text = self._NON_ASCII_RE.sub("", text)
        text = self._CID_RE.sub(" ", text)
        # Collapse last: the substitutions above can create new newline runs
        # (e.g. "\r\n" -> "\n\n"), which an earlier collapse would miss.
        return self._NEWLINE_RUN_RE.sub("\n", text)

    def recover_text(self, text_without_spaces):
        """Re-insert spaces into text whose words have run together.

        Args:
            text_without_spaces: String to segment.

        Returns:
            The pieces returned by ``wordninja.split`` joined by single spaces.
        """
        return " ".join(wordninja.split(text_without_spaces))

    def _to_lines(self, raw_text):
        """Clean ``raw_text`` and split it into non-empty, single-spaced lines."""
        cleaned = self.clean_text(raw_text)
        return [
            self._WHITESPACE_RE.sub(" ", line.strip())
            for line in cleaned.splitlines()
            if line.strip()
        ]

    def read_image(self, path_file):
        """Run OCR on an image file and return its text as cleaned lines.

        Args:
            path_file: Path of the image to read.

        Returns:
            List of non-empty, whitespace-normalised lines.
        """
        # The context manager closes the underlying file handle once OCR
        # has finished instead of leaving it to the garbage collector.
        with Image.open(path_file) as image:
            raw_text = str(image_to_string(image))
        return self._to_lines(raw_text)

    def read_pdf(self, path_file):
        """Extract the text of every page of a PDF as cleaned lines.

        Args:
            path_file: Path of the PDF to read.

        Returns:
            List of non-empty, whitespace-normalised lines, in page order.
        """
        with pdfplumber.open(path_file) as pdf:
            # ``or ""`` guards against a None result for a page without
            # text (behaviour reported for some pdfplumber versions).
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join pages with a newline so the last line of one page is not
        # fused with the first line of the next.
        return self._to_lines("\n".join(page_texts))

    def read(self, path_file):
        """Read a resume, choosing the reader from the file extension.

        The extension check is case-insensitive. Supported extensions are
        ``.pdf``, ``.jpg``, ``.jpeg`` and ``.png``.

        Args:
            path_file: Path of the resume file.

        Returns:
            List of cleaned lines, or ``None`` (after printing a message)
            when the extension is not supported.
        """
        lowered = str(path_file).lower()
        if lowered.endswith(self._PDF_SUFFIXES):
            return self.read_pdf(path_file)
        if lowered.endswith(self._IMAGE_SUFFIXES):
            return self.read_image(path_file)
        print("Unsupported file format")
        return None