runaksh committed
Commit 0c99867 · Parent(s): ef51994

Upload ResumeCode.txt

Files changed (1): ResumeCode.txt (+214, -0)
ResumeCode.txt ADDED
!pip install opendatasets
#!pip install wandb
!pip install transformers[torch]
!pip install evaluate

import os
import random
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od
from datasets import Dataset

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline
)
import evaluate

plt.style.use('seaborn-v0_8')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas.plotting import scatter_matrix
from matplotlib.gridspec import GridSpec
from wordcloud import WordCloud

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

DIRECTORY = '/content/UpdatedResumeDataSet.csv'
MODEL_NAME = 'distilbert-base-uncased'
BATCH_SIZE = 32
LR = 2e-5
EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# read the dataset (the path is defined in DIRECTORY above)
df = pd.read_csv(DIRECTORY)
print(df.shape)
df.head(10)  # first 10 rows

# display the distinct categories of resume
df['Category'].unique()

# display the distinct categories of resume and the number of records in each
df['Category'].value_counts()

# plot how many resumes fall into each category
sns.countplot(y='Category', data=df)

# convert all characters to lowercase
def convert_lower(text):
    return text.lower()

df['Resume'] = df['Resume'].apply(convert_lower)

def cleanResume(resumeText):
    resumeText = re.sub(r'http\S+', '', resumeText, flags=re.MULTILINE)  # remove URLs
    resumeText = re.sub(r'RT|cc', '', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(string.punctuation), '', resumeText)  # remove punctuation
    resumeText = re.sub(r'â\S+', '', resumeText)  # remove mojibake such as 'â€¢' (mis-decoded bullets)
    resumeText = re.sub(r'€+', '', resumeText)  # remove stray '€' characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # collapse extra whitespace
    return resumeText

# apply the cleaning function and save the result in a new column
df['cleaned_resume'] = df['Resume'].apply(cleanResume)
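
# Quick illustrative check of cleanResume (the sample string below is made
# up, not taken from the dataset): the URL, hashtag, mention and punctuation
# should be stripped and whitespace collapsed.
print(cleanResume("see my work at http://example.com  #python @dev !!"))
# expected output along the lines of: 'see my work at'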

# stop words
stopword_list = nltk.corpus.stopwords.words('english')
print(stopword_list)

# removing the stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    # split the string into tokens (a list of words)
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

# apply the function on the cleaned resumes to remove stopwords
df['text'] = df['cleaned_resume'].apply(remove_stopwords)
df['label'] = df['Category']

# keep only the columns the model needs, in order
df = df[['text', 'label']]
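
# Quick illustrative check of remove_stopwords (the sample sentence is made
# up): common stopwords such as 'in' and 'and' should be dropped.
print(remove_stopwords("skilled in java and python development"))
# expected output along the lines of: 'skilled java python development'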

# view shape
df.shape

# view number of classes
n_classes = df['label'].nunique()
print(f"Number of Resume classes: {n_classes}")

# view some statistics about our texts
lengths = df['text'].apply(lambda x: len(x))
print(
    f'Max text length: {lengths.max()}\nMin text length: {lengths.min()}\nAvg text length: {lengths.mean():.2f}'
)
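
# Optional visual check (a minimal sketch using the 'lengths' Series built
# above): plot the distribution of text lengths in characters.
plt.hist(lengths, bins=30)
plt.title('Resume text length (characters)')
plt.xlabel('Length')
plt.ylabel('Count')
plt.show()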

# create mappings
id2label = {idx: label for idx, label in enumerate(df['label'].unique())}
label2id = {label: idx for idx, label in id2label.items()}

# label encode our labels
df['label'] = df['label'].map(label2id)
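
# Round-trip sanity check on the mappings (a small sketch; 'lbl' is just a
# loop variable introduced here): every label should survive label -> id -> label.
assert all(id2label[label2id[lbl]] == lbl for lbl in label2id)
print(dict(list(id2label.items())[:5]))  # peek at the first few id->label pairs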

# create and split dataset (80% train / 20% test)
dataset = Dataset.from_pandas(df).train_test_split(train_size=0.8)
print(dataset)

# initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize and encode the dataset
def tokenize(batch):
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True)
    return tokenized_batch

dataset_enc = dataset.map(tokenize, batched=True)

print(dataset_enc)
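
# Peek at one encoded example (illustrative only): the tokenizer adds
# 'input_ids' and 'attention_mask' alongside the original columns.
first_ids = dataset_enc['train'][0]['input_ids']
print(tokenizer.convert_ids_to_tokens(first_ids[:10]))  # first 10 wordpieces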

accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
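
# Minimal self-test of compute_metrics with made-up logits (2 samples,
# 3 classes): both argmax predictions match the references, so this should
# report an accuracy of 1.0.
dummy_logits = np.array([[0.1, 0.8, 0.1], [0.7, 0.2, 0.1]])
print(compute_metrics((dummy_logits, np.array([1, 0]))))  # {'accuracy': 1.0}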

# define model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=n_classes,
    id2label=id2label,
    label2id=label2id
)

model.to(DEVICE)
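
# Optional sanity check (a sketch, not part of the original pipeline):
# report how many trainable parameters the classifier has.
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {n_params:,}")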

# define collator function (pads each batch to its longest sequence)
collator_fn = DataCollatorWithPadding(tokenizer, return_tensors='pt')

!pip install accelerate -U

import accelerate
import transformers

transformers.__version__, accelerate.__version__

training_args = TrainingArguments(
    output_dir="Resume_training",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_enc["train"],
    eval_dataset=dataset_enc["test"],
    tokenizer=tokenizer,
    data_collator=collator_fn,
    compute_metrics=compute_metrics
)

trainer.train()

trainer.save_model('ResumeClassification_distilBERT')

trainer.evaluate()

def predict(sample):
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
    pred = classifier(sample)[0]['label']
    return pred

sample1 = "I have working experience in Java and JavaScript"

predict(sample1)
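
# A second illustrative call (the sample text is made up): the pipeline
# decodes the predicted class id back to its name via the id2label mapping
# stored in the model config.
sample2 = "built machine learning models and data pipelines in python"
print(predict(sample2))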