sebastiansarasti committed on
Commit
4736ae1
·
verified ·
1 Parent(s): 19ad08c

adding the files for the app

Browse files
Files changed (4) hide show
  1. data_processing.py +10 -0
  2. main.py +80 -0
  3. model.py +42 -0
  4. requirements.txt +3 -0
data_processing.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import DistilBertTokenizer
2
+
3
# Shared tokenizer instance, created once when the module is imported.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def get_tokens(string):
    """Tokenize *string* into the two tensors that are passed to the model.

    The text is truncated or padded to exactly 256 tokens and returned as
    PyTorch tensors (``return_tensors="pt"``).

    Args:
        string: Text to tokenize.

    Returns:
        A dict with the keys ``"input_ids"`` and ``"attention_mask"`` taken
        from the tokenizer output.
    """
    encoded = tokenizer(
        string,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt",
    )
    # Keep only the two entries that are forwarded to the encoder.
    return {key: encoded[key] for key in ("input_ids", "attention_mask")}
main.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from huggingface_hub import hf_hub_download
3
+ from model import JobFakeModel
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from data_processing import get_tokens
7
+ import numpy as np
8
+ import time
9
+
10
def get_model_list():
    """Download the trained checkpoint from the Hugging Face Hub.

    Returns:
        The value returned by ``hf_hub_download`` for ``best_model.pth`` in
        the ``sebastiansarasti/fakeJobs`` repository (the local file path).
    """
    return hf_hub_download(
        repo_id="sebastiansarasti/fakeJobs",
        filename="best_model.pth",
    )
15
+
16
def load_model(path):
    """Build a ``JobFakeModel`` and load the trained weights stored at *path*.

    Args:
        path: Location of the checkpoint file; it must contain the
            ``state_dict`` expected by ``JobFakeModel``.

    Returns:
        The ``JobFakeModel`` instance with the checkpoint weights loaded.
    """
    model = JobFakeModel(base_model="distilbert", freeze_base=True)
    # map_location="cpu": a checkpoint saved from a CUDA device cannot be
    # deserialized on a CPU-only host unless its tensors are remapped.
    # weights_only=True: checkpoint files are pickles, so restrict loading to
    # tensors and plain containers instead of arbitrary Python objects.
    state_dict = torch.load(path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    return model
20
+
21
st.title('Fake Jobs Streamlit App')
st.write('This is a fake jobs streamlit app')


@st.cache_resource
def _get_cached_model():
    """Download the checkpoint and build the model once, then reuse it.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the model avoids fetching the checkpoint and
    rebuilding the network on each rerun.
    """
    return load_model(get_model_list())


model = _get_cached_model()

with st.sidebar:
    st.subheader('About the App')
    st.markdown('Data used for the training come from the following source: https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction')
    st.empty()
    st.subheader('Author')
    st.markdown('Sebastián Sarasti Zambonino')
    st.markdown('Data Scientist - Machine Learning Engineer')
    st.markdown('https://www.linkedin.com/in/sebastiansarasti/')
    st.markdown('https://github.com/sebassaras02')

# Create columns for the inputs
col1, col2, col3 = st.columns(3)

# The prompts are shown as placeholders rather than initial values, so an
# untouched field is empty instead of sending the prompt text to the model.
with col1:
    description = st.text_area('Description', placeholder='Enter the job description here')

with col2:
    requirements = st.text_area('Requirements', placeholder='Enter the job requirements here')

with col3:
    benefits = st.text_area('Benefits', placeholder='Enter the job benefits here')

# Run the prediction only when the user submits the form
if st.button('Submit Job'):
    # Each field is normalised independently: blank optional fields fall back
    # to 'Nothing', while a blank description is reported to the user.
    description = (description or '').strip()
    requirements = (requirements or '').strip() or 'Nothing'
    benefits = (benefits or '').strip() or 'Nothing'

    if not description:
        st.error('Description cannot be empty')
    else:
        # The spinner wraps the actual work (tokenization + inference).
        with st.spinner('Wait for it...'):
            tokens_des = get_tokens(description)
            tokens_req = get_tokens(requirements)
            tokens_ben = get_tokens(benefits)
            model.eval()
            with torch.no_grad():
                output = model(tokens_des, tokens_req, tokens_ben)
            # Convert the single logit into a percentage
            output = torch.sigmoid(output).item() * 100

        # Show the result
        st.metric('Fake Job Probability', f"{np.round(output, 5)}%")
        if output > 75:
            st.error('⚠️ High probability of being a fake job!')
        elif output > 50:
            st.warning('⚠️ Medium probability of being a fake job')
        else:
            st.success('✅ Low probability of being a fake job')
model.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from torch import cat
3
+ from transformers import DistilBertModel
4
+
5
class JobFakeModel(nn.Module):
    """Classifier that scores three tokenized text fields with one logit.

    Each of the three inputs is encoded by its own DistilBERT instance
    (``head1``, ``head2``, ``head3``). The encoder outputs are averaged over
    dimension 1, concatenated, and passed through a small feed-forward
    network (``fc``) that ends in a single output unit.
    """

    def __init__(self, base_model, freeze_base):
        """Create the encoders and the classification network.

        Args:
            base_model: Name of the encoder family; only ``"distilbert"`` is
                accepted.
            freeze_base: When truthy, gradients are disabled for every
                parameter of the three encoders.

        Raises:
            ValueError: If ``base_model`` is not supported.
        """
        super().__init__()
        self.base_model = base_model
        # Input width is three concatenated 768-wide pooled vectors.
        self.fc = nn.Sequential(
            nn.Linear(3 * 768, 600),
            nn.ReLU(),
            nn.Linear(600, 300),
            nn.ReLU(),
            nn.Linear(300, 1),
        )
        self.head1, self.head2, self.head3 = self._create_base_model()

        if freeze_base:
            for encoder in (self.head1, self.head2, self.head3):
                for weight in encoder.parameters():
                    weight.requires_grad = False

    def forward(self, x, y, z):
        """Encode the three inputs and return the output of ``fc``.

        Args:
            x: Keyword arguments for ``head1`` (unpacked with ``**``).
            y: Keyword arguments for ``head2``.
            z: Keyword arguments for ``head3``.

        Returns:
            The result of applying ``fc`` to the concatenated pooled vectors.
        """
        encoders = (self.head1, self.head2, self.head3)
        # Mean over dimension 1 of each encoder's last hidden state.
        pooled = [
            encoder(**tokens).last_hidden_state.mean(dim=1)
            for encoder, tokens in zip(encoders, (x, y, z))
        ]
        return self.fc(cat(pooled, dim=1))

    def _create_base_model(self):
        """Return three separately instantiated pretrained encoders.

        Raises:
            ValueError: If ``self.base_model`` is not ``"distilbert"``.
        """
        if self.base_model != "distilbert":
            raise ValueError("Model not supported")
        return tuple(
            DistilBertModel.from_pretrained("distilbert-base-uncased")
            for _ in range(3)
        )
42
+
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
torch==2.4.0
huggingface-hub==0.26.5
numpy==2.2.0
# data_processing.py and model.py import from `transformers`
transformers