Spaces:

Robzy
/

jobbert_knowledge_extraction

Running

File size: 785 Bytes

0049d2e

import spacy
import re

# Load the spaCy "en_core_web_sm" pipeline once at import time; parse_post()
# calls it on every line to obtain sentence boundaries.
nlp = spacy.load("en_core_web_sm")

def split_text_recursively(text):
    """Split *text* on newline characters and return the pieces in order.

    The result always contains at least one element: text without any
    newline (including the empty string) comes back as ``[text]``. Empty
    pieces produced by consecutive or trailing newlines are kept; callers
    that do not want them must filter them out.

    The name is historical. The split is done iteratively by ``str.split``
    so that inputs with thousands of lines cannot exhaust Python's
    recursion limit, and so the work stays linear in the text length.

    Args:
        text: The string to split.

    Returns:
        A list of the substrings between newline characters.
    """
    return text.split('\n')

def parse_post(path):
    """Read a text file and split its contents into sentences.

    The file is split into lines, each line is stripped of surrounding
    whitespace, and empty lines are dropped. Every remaining line is run
    through the module-level spaCy pipeline ``nlp``, and each sentence it
    yields is printed and collected.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with the text of every sentence found, in document order.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Read the file. The encoding is pinned so that decoding does not depend
    # on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Sentence tokenization: work line by line so that a line break always
    # ends a sentence, whatever the pipeline would decide on the joined text.
    stripped = (line.strip() for line in split_text_recursively(text))
    lines = [line for line in stripped if line]

    sents = []
    for line in lines:
        doc = nlp(line)
        for sent in doc.sents:
            print(f"{sent.text}")
            sents.append(sent.text)

    # Skill/knowledge extraction
    # TODO: not implemented yet; for now the sentences are returned as-is.

    return sents
    
    


# Script driver: parse one sample posting. There is no ``__main__`` guard, so
# these statements also run whenever this module is imported.
path = './job-postings/03-01-2024/2.txt'
parse_post(path)