import spacy

nlp = spacy.load("en_core_web_sm")


def split_text_recursively(text):
    # Split the text into lines, peeling off one '\n' per recursive call.
    if '\n' not in text:
        return [text]
    parts = text.split('\n', 1)
    return [parts[0]] + split_text_recursively(parts[1])


def parse_post(path):
    # Read the file
    with open(path, 'r') as file:
        text = file.read()

    # Sentence tokenization: split into lines, strip whitespace, drop empty lines
    str_list = split_text_recursively(text)
    str_list = [i.strip() for i in str_list]
    str_list = list(filter(None, str_list))

    # Run each line through spaCy and collect the sentences it contains
    sents = []
    for line in str_list:
        doc = nlp(line)
        for sent in doc.sents:
            print(sent.text)
            sents.append(sent.text)

    # Skill/knowledge extraction
    return sents


path = './job-postings/03-01-2024/2.txt'
parse_post(path)
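

# The skill/knowledge extraction step above is only a placeholder comment.
# One possible direction (a sketch, not the original implementation): match the
# collected sentences against a hand-picked skill list with spaCy's PhraseMatcher.
# The skill list and helper below are assumptions for illustration only.
from spacy.matcher import PhraseMatcher

skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
example_skills = ["Python", "SQL", "machine learning", "data analysis", "TensorFlow"]
skill_matcher.add("SKILL", [nlp.make_doc(s) for s in example_skills])


def extract_skills(sentences):
    # Return the set of skill phrases found in a list of sentence strings.
    found = set()
    for sent_text in sentences:
        doc = nlp(sent_text)
        for match_id, start, end in skill_matcher(doc):
            found.add(doc[start:end].text)
    return found


# Example usage with the sentences returned by parse_post:
# print(extract_skills(parse_post(path)))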