File size: 2,312 Bytes
0d375ed
 
547a2b6
 
 
 
 
 
0d375ed
 
 
 
 
 
 
 
 
 
 
 
547a2b6
 
 
 
 
 
0d375ed
 
 
547a2b6
 
0d375ed
 
547a2b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pypdfium2 as pdfium
import re
import wordninja
from PIL import Image
from pytesseract import image_to_string
from utils import recover_text, get_average_line_len
import pdfplumber

class ResumeReader:
    """Read a resume from a PDF or image file and return its text lines.

    Every reader returns a list of non-empty lines in which runs of
    whitespace are collapsed to a single space and non-ASCII characters
    have been removed.
    """

    def clean_text(self, raw_text: str) -> str:
        """Normalise raw extracted text.

        Args:
            raw_text: Text as produced by OCR or PDF extraction.

        Returns:
            The text with carriage returns turned into newlines, consecutive
            newlines collapsed, tabs and the private-use bullet glyph
            (U+F0B7) replaced by a space, all other non-ASCII characters
            removed, and ``(cid:N)`` placeholders replaced by a space.
        """
        # Convert "\r" first so that "\r\n" sequences are collapsed as well.
        text = raw_text.replace("\r", "\n")
        text = re.sub(r"\n+", "\n", text)
        text = text.replace("\t", " ")
        text = text.replace("\uf0b7", " ")
        # Dropping non-ASCII also removes bullet characters such as U+2022,
        # so they need no separate handling.
        text = re.sub(r"[^\x00-\x7F]+", "", text)
        text = re.sub(r"\(cid:\d{0,3}\)", " ", text)
        return text

    def recover_text(self, text_without_spaces: str) -> str:
        """Re-insert spaces into run-together text using ``wordninja``.

        Args:
            text_without_spaces: Text to split into words.

        Returns:
            The pieces returned by ``wordninja.split`` joined by single spaces.
        """
        return " ".join(wordninja.split(text_without_spaces))

    def _split_lines(self, text: str) -> list:
        """Split *text* into stripped, non-empty, whitespace-collapsed lines."""
        stripped_lines = (line.strip() for line in text.splitlines())
        return [re.sub(r"\s+", " ", line) for line in stripped_lines if line]

    def read_image(self, path_file) -> list:
        """OCR an image file and return its cleaned text lines.

        Args:
            path_file: Path of the image to read.

        Returns:
            List of cleaned, non-empty lines recognised in the image.
        """
        # The context manager closes the underlying file handle after OCR.
        with Image.open(path_file) as image:
            raw_text = str(image_to_string(image))
        return self._split_lines(self.clean_text(raw_text))

    def read_pdf(self, path_file) -> list:
        """Extract the text of every page of a PDF as cleaned lines.

        Args:
            path_file: Path of the PDF to read.

        Returns:
            List of cleaned, non-empty lines from all pages, in page order.
        """
        with pdfplumber.open(path_file) as pdf:
            # ``or ""`` guards against a page yielding None instead of text
            # (reported for text-less pages in some pdfplumber versions).
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join pages with a newline so the last line of one page does not
        # fuse with the first line of the next.
        raw_text = "\n".join(page_texts)
        return self._split_lines(self.clean_text(raw_text))

    def read(self, path_file: str):
        """Dispatch to the reader matching the file extension.

        The extension check is case-insensitive.

        Args:
            path_file: Path ending in ``.pdf``, ``.jpg``, ``.jpeg`` or ``.png``.

        Returns:
            List of cleaned lines, or ``None`` (after printing a message)
            when the extension is not supported.
        """
        lowered = path_file.lower()
        if lowered.endswith('.pdf'):
            return self.read_pdf(path_file)
        if lowered.endswith(('.jpg', '.png', '.jpeg')):
            return self.read_image(path_file)
        print("Unsupported file format")
        return None