Pathfinder / utils.py
celise88's picture
reorganize flow
6d4e9bd
raw
history blame
1.4 kB
from cleantext import clean
import cohere
import string
import numpy as np
from numpy.linalg import norm
from nltk.tokenize import SpaceTokenizer
import nltk
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one is found) into the process
# environment at import time, so os.getenv("COHERE_TOKEN") in coSkillEmbed
# can find the API token.
load_dotenv()
def coSkillEmbed(text):
    """Embed a single piece of text with Cohere's 'large' model.

    A new ``cohere.Client`` is built on every call using the token read from
    the ``COHERE_TOKEN`` environment variable.

    Args:
        text: The text to embed; it is sent as a one-element ``texts`` list.

    Returns:
        The ``embeddings`` attribute of the embed response
        (presumably one vector per submitted text - TODO confirm against
        the installed cohere SDK version).
    """
    api_token = os.getenv("COHERE_TOKEN")
    client = cohere.Client(api_token)
    result = client.embed(model='large', texts=[text])
    return result.embeddings
def cosine(A, B):
    """Return the cosine similarity between ``A`` and ``B``.

    The value is ``dot(A, B)`` divided by the product of the two norms as
    computed by ``numpy.linalg.norm``. A zero-norm input makes the
    denominator zero; that case is not handled here.
    """
    dot_product = np.dot(A, B)
    magnitude = norm(A) * norm(B)
    return dot_product / magnitude
# Resume filler words removed in addition to NLTK's English stopwords.
_EXTRA_STOPWORDS = frozenset({
    'eg', 'ie', 'etc', 'experience', 'experiences', 'experienced', 'experiencing', 'knowledge',
    'ability', 'abilities', 'skill', 'skills', 'skilled', 'including', 'includes', 'included', 'include',
    'education', 'follow', 'following', 'follows', 'followed', 'make', 'made', 'makes', 'making', 'maker',
    'available', 'large', 'larger', 'largescale', 'client', 'clients', 'responsible', 'x', 'many', 'team', 'teams',
})


def clean_my_text(resume):
    """Normalise resume text and return its non-stopword tokens.

    Steps:
      1. Join all lines with single spaces.
      2. Replace ``-`` and ``/`` with spaces so compounds split into words.
      3. Strip ASCII punctuation, then run cleantext's ``clean`` with its
         default options (NOTE(review): the defaults are expected to
         lowercase and normalise whitespace - confirm for the installed
         version).
      4. Split on spaces and drop NLTK English stopwords, the extra resume
         filler words above, empty tokens and any token containing a
         parenthesis.

    Fixes relative to the previous version:
      * The token list is built from the cleaned text; previously the raw
        ``resume`` was tokenized and the cleaning above was discarded.
      * ``'include'`` and ``'education'`` are separate stopwords; a missing
        comma had merged them into ``'includeeducation'``.

    Args:
        resume: Raw resume text as a single string.

    Returns:
        A list of token strings in their original order.

    Note:
        Requires the NLTK ``stopwords`` corpus to be available locally.
    """
    clean_text = ' '.join(resume.splitlines())
    clean_text = clean_text.replace('-', " ").replace("/", " ")
    clean_text = clean(clean_text.translate(str.maketrans('', '', string.punctuation)))

    stops = set(nltk.corpus.stopwords.words('english')) | _EXTRA_STOPWORDS

    # The parenthesis check is kept as a safeguard in case clean() maps a
    # non-ASCII bracket to an ASCII one after punctuation was stripped.
    return [
        word
        for word in SpaceTokenizer().tokenize(clean_text)
        if word and word not in stops and "(" not in word and ")" not in word
    ]