import logging
import os
import re
import zipfile
from xml.etree import ElementTree

import pdfplumber


class ResumeReader:
    """Load a resume from a .docx/.doc, .pdf or .txt file as a list of text lines."""

    # XML namespace prefix of WordprocessingML elements inside word/document.xml.
    _WORD_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    @staticmethod
    def _normalise_breaks(text):
        """Collapse newline runs, then map carriage returns to newlines and tabs to spaces."""
        text = re.sub(r"\n+", "\n", text)
        return text.replace("\r", "\n").replace("\t", " ")

    @staticmethod
    def _clean_lines(text):
        """Split text into lines, trim each one, squeeze whitespace runs and drop blank lines."""
        stripped_lines = (line.strip() for line in text.splitlines())
        return [re.sub(r"\s+", " ", line) for line in stripped_lines if line]

    @classmethod
    def _extract_docx_text(cls, docx_file):
        """Return the text stored in the main document part of a .docx archive."""
        # NOTE(review): the archive may come from a user upload and is parsed
        # with the standard-library XML parser; harden this if untrusted input
        # is a concern for the deployment.
        with zipfile.ZipFile(docx_file) as archive:
            document_xml = archive.read("word/document.xml")

        ns = cls._WORD_NS
        pieces = []
        # iter() walks in document order, so a paragraph element is seen before
        # its runs: emitting the newline there separates consecutive paragraphs.
        for node in ElementTree.fromstring(document_xml).iter():
            if node.tag == ns + "t":
                pieces.append(node.text or "")
            elif node.tag == ns + "tab":
                pieces.append("\t")
            elif node.tag in (ns + "br", ns + "cr", ns + "p"):
                pieces.append("\n")
        return "".join(pieces)

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """
        A utility function to convert a Microsoft docx file to raw text.

        The text is read from ``word/document.xml`` inside the archive using
        only the standard library. Anything that cannot be read that way (for
        example a legacy binary .doc file) is logged and reported as empty.

        :param docx_file: path to, or binary file object of, the docx file
        :param docx_parser: accepted for backward compatibility; not used by
            this implementation
        :return: ``(resume_lines, text)`` where ``resume_lines`` is the list of
            cleaned, non-blank lines and ``text`` is the extracted raw text;
            ``([], " ")`` when the file could not be processed
        :rtype: tuple
        """
        try:
            text = self._extract_docx_text(docx_file)
            resume_lines = self._clean_lines(self._normalise_breaks(text))
            return resume_lines, text
        except Exception as e:
            # Deliberately broad: reading a resume is best-effort, so every
            # failure is logged and turned into the empty result.
            logging.error('Error in docx file:: ' + str(e))
            return [], " "

    def convert_pdf_to_txt(self, pdf_file):
        """
        A utility function to convert a machine-readable PDF to raw text.

        Errors raised while opening the PDF or extracting page text are not
        caught here and propagate to the caller.

        :param pdf_file: the .pdf file to convert, passed to ``pdfplumber.open``
        :return: ``(resume_lines, raw_text)`` where ``resume_lines`` is the list
            of cleaned, non-blank lines and ``raw_text`` is the concatenated
            page text; ``([], " ")`` if cleaning the text fails
        :rtype: tuple
        """
        # The context manager closes the PDF even when a page fails to extract.
        with pdfplumber.open(pdf_file) as pdf:
            # extract_text() may yield None for a page without text; treat it
            # as empty instead of failing on None + str.
            raw_text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

        try:
            full_string = self._normalise_breaks(raw_text)

            # Drop bullet glyphs and unmapped-glyph "(cid:N)" markers. The order
            # matters: tabs are already spaces here, so "• " also matches a
            # bullet that was followed by a tab.
            full_string = re.sub(r"\uf0b7", " ", full_string)
            full_string = re.sub(r"\(cid:\d{0,3}\)", " ", full_string)
            full_string = re.sub(r'• ', " ", full_string)

            return self._clean_lines(full_string), raw_text
        except Exception as e:
            logging.error('Error in pdf file:: ' + str(e))
            return [], " "

    def read_file(self, file, docx_parser="tika"):
        """
        Read a resume file and return its lines.

        The file type is chosen from the end of the file name, ignoring case.

        :param file: path of the resume file
        :param docx_parser: forwarded to ``convert_docx_to_txt``; documented
            values are "docx2txt" and "tika" (default "tika")
        :return: list of lines, or ``None`` for an unsupported file type. Lines
            of a .txt file are returned exactly as read (line endings kept),
            whereas docx and pdf lines are whitespace-normalised.
        """
        print("Reading the Resume...")

        # Accept os.PathLike objects as well as plain strings.
        file = os.fspath(file)
        lowered = file.lower()

        if lowered.endswith(('docx', 'doc')):
            resume_lines, _ = self.convert_docx_to_txt(file, docx_parser)
        elif lowered.endswith('pdf'):
            resume_lines, _ = self.convert_pdf_to_txt(file)
        elif lowered.endswith('txt'):
            with open(file, 'r', encoding='utf-8') as f:
                resume_lines = f.readlines()
        else:
            resume_lines = None

        return resume_lines