sebastiansarasti committed
adding the files for the app

- data_processing.py +10 -0
- main.py +80 -0
- model.py +42 -0
- requirements.txt +3 -0
data_processing.py
ADDED
@@ -0,0 +1,10 @@
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def get_tokens(string):
    text = tokenizer(string, return_tensors="pt", padding="max_length", max_length=256, truncation=True)
    return {
        "input_ids": text["input_ids"],
        "attention_mask": text["attention_mask"]
    }
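A quick usage sketch of get_tokens (illustrative only; the sample string is made up, and the shapes follow from max_length=256 with return_tensors="pt"):

tokens = get_tokens("Remote data entry clerk, no experience required")
print(tokens["input_ids"].shape)       # torch.Size([1, 256])
print(tokens["attention_mask"].shape)  # torch.Size([1, 256])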
main.py
ADDED
@@ -0,0 +1,80 @@
import streamlit as st
from huggingface_hub import hf_hub_download
from model import JobFakeModel
import torch
import torch.nn.functional as F
from data_processing import get_tokens
import numpy as np
import time

def get_model_list():
    model_name = "sebastiansarasti/fakeJobs"
    filename = "best_model.pth"
    file_path = hf_hub_download(repo_id=model_name, filename=filename)
    return file_path

def load_model(path):
    model = JobFakeModel(base_model="distilbert", freeze_base=True)
    model.load_state_dict(torch.load(path))
    return model

st.title('Fake Jobs Streamlit App')
st.write('This app estimates the probability that a job posting is fake.')

# download the model
path = get_model_list()
model = load_model(path)

with st.sidebar:
    st.subheader('About the App')
    st.markdown('The data used for training comes from the following source: https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction')
    st.empty()
    st.subheader('Author')
    st.markdown('Sebastián Sarasti Zambonino')
    st.markdown('Data Scientist - Machine Learning Engineer')
    st.markdown('https://www.linkedin.com/in/sebastiansarasti/')
    st.markdown('https://github.com/sebassaras02')

# Create columns for the inputs
col1, col2, col3 = st.columns(3)

with col1:
    # Create an input text box for the description
    description = st.text_area('Description', 'Enter the job description here')

with col2:
    # Create an input text box for the requirements
    requirements = st.text_area('Requirements', 'Enter the job requirements here')

with col3:
    # Create an input text box for the benefits
    benefits = st.text_area('Benefits', 'Enter the job benefits here')

# Fall back to a placeholder when the optional fields are left empty
if not benefits:
    benefits = 'Nothing'
if not requirements:
    requirements = 'Nothing'
if not description:
    raise ValueError('Description cannot be empty')

# Create a button to submit the job
with st.spinner('Wait for it...'):
    time.sleep(2)
    if st.button('Submit Job'):
        tokens_des = get_tokens(description)
        tokens_req = get_tokens(requirements)
        tokens_ben = get_tokens(benefits)
        model.eval()
        with torch.no_grad():
            output = model(tokens_des, tokens_req, tokens_ben)
        # calculate the probability
        output = F.sigmoid(output).item() * 100
        # Create a box to show the result
        st.metric('Fake Job Probability', f"{np.round(output, 5)}%")
        if output > 75:
            st.error('⚠️ High probability of being a fake job!')
        elif output > 50:
            st.warning('⚠️ Medium probability of being a fake job')
        else:
            st.success('✅ Low probability of being a fake job')
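If the best_model.pth checkpoint was saved from a GPU session, loading it in a CPU-only environment may fail; a hedged variant of load_model using map_location (an assumption, not part of this commit) would look like this:

import torch
from model import JobFakeModel

def load_model_cpu(path):
    # Same as load_model above, but forces all tensors onto the CPU
    # (assumes the state dict matches the JobFakeModel architecture).
    model = JobFakeModel(base_model="distilbert", freeze_base=True)
    state_dict = torch.load(path, map_location=torch.device("cpu"))
    model.load_state_dict(state_dict)
    return model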
model.py
ADDED
@@ -0,0 +1,42 @@
import torch.nn as nn
from torch import cat
from transformers import DistilBertModel

class JobFakeModel(nn.Module):
    def __init__(self, base_model, freeze_base):
        super(JobFakeModel, self).__init__()
        self.base_model = base_model
        self.fc = nn.Sequential(
            nn.Linear(768*3, 600),
            nn.ReLU(),
            nn.Linear(600, 300),
            nn.ReLU(),
            nn.Linear(300, 1)
        )
        self.head1, self.head2, self.head3 = self._create_base_model()

        if freeze_base:
            for param in self.head1.parameters():
                param.requires_grad = False
            for param in self.head2.parameters():
                param.requires_grad = False
            for param in self.head3.parameters():
                param.requires_grad = False

    def forward(self, x, y, z):
        x = self.head1(**x).last_hidden_state.mean(dim=1)
        y = self.head2(**y).last_hidden_state.mean(dim=1)
        z = self.head3(**z).last_hidden_state.mean(dim=1)
        output = cat([x, y, z], dim=1)
        output = self.fc(output)
        return output

    def _create_base_model(self):
        if self.base_model == "distilbert":
            model1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
            model2 = DistilBertModel.from_pretrained("distilbert-base-uncased")
            model3 = DistilBertModel.from_pretrained("distilbert-base-uncased")
            return model1, model2, model3
        else:
            raise ValueError("Model not supported")
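A minimal sketch of how the three tokenized fields flow through JobFakeModel (the sample strings are made up; here the classifier head is randomly initialized, whereas the app loads the trained checkpoint first; the shapes follow from DistilBERT's hidden size of 768):

import torch
from model import JobFakeModel
from data_processing import get_tokens

model = JobFakeModel(base_model="distilbert", freeze_base=True)
model.eval()

des = get_tokens("Software engineer, fully remote")  # one tokenized dict per field
req = get_tokens("5 years of Python experience")
ben = get_tokens("Health insurance and stock options")

with torch.no_grad():
    logit = model(des, req, ben)   # three [1, 768] mean-pooled vectors -> cat -> [1, 2304] -> fc -> [1, 1]
print(torch.sigmoid(logit).item())  # probability that the posting is fake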
requirements.txt
ADDED
@@ -0,0 +1,3 @@
torch==2.4.0
huggingface-hub==0.26.5
numpy==2.2.0