runaksh committed on
Commit e983676 · 1 Parent(s): 0c99867

Delete ResumeCode.txt

Files changed (1)
  1. ResumeCode.txt +0 -214
ResumeCode.txt DELETED
@@ -1,214 +0,0 @@
- !pip install opendatasets
-
- #!pip install wandb
-
- !pip install transformers[torch]
-
- !pip install evaluate
-
- import pandas as pd
- import numpy as np
- import os
- import random
- from datasets import Dataset
- import opendatasets as od
- import matplotlib.pyplot as plt
-
- import torch
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSequenceClassification,
-     TrainingArguments,
-     Trainer,
-     DataCollatorWithPadding,
-     pipeline
- )
- import evaluate
-
- plt.style.use('seaborn-v0_8')
-
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import LabelEncoder
- from sklearn.naive_bayes import MultinomialNB
- from sklearn import metrics
- from sklearn.metrics import accuracy_score
- from pandas.plotting import scatter_matrix
- from sklearn.feature_extraction.text import TfidfVectorizer
- from matplotlib.gridspec import GridSpec
- import nltk
- nltk.download('stopwords')
- nltk.download('punkt')
- from nltk.corpus import stopwords
- import string
- from wordcloud import WordCloud
-
- DIRECTORY = '/content/UpdatedResumeDataSet.csv'
- MODEL_NAME = 'distilbert-base-uncased'
- BATCH_SIZE = 32
- LR = 2e-5
- EPOCHS = 10
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # read the dataset
- df = pd.read_csv(DIRECTORY)
- print(df.shape)
- df.head(10) # first 10 rows
-
- # Display the distinct categories of resume
- df['Category'].unique()
-
- # Display the distinct categories of resume and the number of records belonging to each category
- df['Category'].value_counts()
-
- import seaborn as sns
-
- # plot the number of resumes per category (pass the DataFrame as data and the column name as y)
- sns.countplot(y='Category', data=df)
-
- # Convert all characters to lowercase
- def convert_lower(text):
-     return text.lower()
-
- df['Resume'] = df['Resume'].apply(convert_lower)
-
- import re
- def cleanResume(resumeText):
-     resumeText = re.sub(r'http\S+', '', resumeText, flags=re.MULTILINE) # remove URLs
-     resumeText = re.sub(r'RT|cc', '', resumeText) # remove RT and cc
-     resumeText = re.sub(r'#\S+', '', resumeText) # remove hashtags
-     resumeText = re.sub(r'@\S+', '', resumeText) # remove mentions
-     resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), '', resumeText) # remove punctuation
-     resumeText = re.sub(r'â\S+', '', resumeText) # remove mis-encoded bullet characters (•)
-     resumeText = re.sub(r'€+', '', resumeText) # remove €
-     resumeText = re.sub(r'\s+', ' ', resumeText) # collapse extra whitespace
-
-     return resumeText
-
- # apply the cleaning function defined above and save the result in a new column
- df['cleaned_resume'] = df['Resume'].apply(cleanResume)
-
- # stop words
- stopword_list = nltk.corpus.stopwords.words('english')
- print(stopword_list)
-
- # removing the stopwords
- from nltk.tokenize import word_tokenize
-
- def remove_stopwords(text):
-     # split the string into tokens (list of words)
-     tokens = word_tokenize(text)
-     tokens = [token.strip() for token in tokens]
-     # drop stopwords and re-join the remaining tokens
-     filtered_tokens = [token for token in tokens if token not in stopword_list]
-     filtered_text = ' '.join(filtered_tokens)
-     return filtered_text
-
- # apply function on cleaned resume to remove stopwords
- df['text'] = df['cleaned_resume'].apply(remove_stopwords)
- df['label'] = df['Category']
-
- # reorder dataframe columns
- df = df[['text', 'label']]
-
- # view shape
- df.shape
-
- # view number of classes
- n_classes = df['label'].nunique()
- print(f"Number of Resume classes: {n_classes}")
-
- # view some statistics about our texts
- lengths = df['text'].apply(lambda x: len(x))
- print(
-     f'Max text length: {lengths.max()}\nMin text length: {lengths.min()}\nAvg text length: {lengths.mean():.2f}'
- )
-
- # create mappings
- id2label = {idx: label for idx, label in enumerate(df['label'].unique())}
- label2id = {label: idx for idx, label in id2label.items()}
-
- # label encode our labels
- df['label'] = df['label'].map(label2id)
-
- # create and split dataset
- dataset = Dataset.from_pandas(df).train_test_split(train_size=0.8)
- print(dataset)
-
- # initialize tokenizer
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
- # Tokenize and encode the dataset
- def tokenize(batch):
-     tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True)
-     return tokenized_batch
-
- dataset_enc = dataset.map(tokenize, batched=True)
-
- print(dataset_enc)
-
- accuracy = evaluate.load('accuracy')
-
- def compute_metrics(eval_pred):
-     predictions, labels = eval_pred
-     predictions = np.argmax(predictions, axis=1)
-     return accuracy.compute(predictions=predictions, references=labels)
-
- # define model
- model = AutoModelForSequenceClassification.from_pretrained(
-     MODEL_NAME,
-     num_labels=n_classes,
-     id2label=id2label,
-     label2id=label2id
- )
-
- model.to(DEVICE)
-
- # define collator function
- collator_fn = DataCollatorWithPadding(tokenizer, return_tensors='pt')
-
- !pip install accelerate -U
-
- import accelerate
- import transformers
-
- transformers.__version__, accelerate.__version__
-
- from transformers import TrainingArguments
-
- training_args = TrainingArguments(
-     output_dir="Resume_training",
-     learning_rate=LR,
-     per_device_train_batch_size=BATCH_SIZE,
-     per_device_eval_batch_size=BATCH_SIZE,
-     num_train_epochs=EPOCHS,
-     weight_decay=0.01,
-     evaluation_strategy="epoch",
-     save_strategy="epoch",
-     load_best_model_at_end=True,
-     push_to_hub=False,
-     report_to="none"
- )
-
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=dataset_enc["train"],
-     eval_dataset=dataset_enc["test"],
-     tokenizer=tokenizer,
-     data_collator=collator_fn,
-     compute_metrics=compute_metrics
- )
-
- trainer.train()
-
- trainer.save_model('ResumeClassification_distilBERT')
-
- trainer.evaluate()
-
- def predict(sample):
-     # build a text-classification pipeline around the fine-tuned model and return the predicted label
-     classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
-     pred = classifier(sample)[0]['label']
-     return pred
-
- sample1 = "I have working experience in Java and javascript"
-
- predict(sample1)
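
Note: the deleted script saves the fine-tuned model to 'ResumeClassification_distilBERT' but builds its inference pipeline from the in-memory model object. A minimal sketch of reloading that saved checkpoint for standalone inference, assuming the tokenizer was saved into the same directory (which Trainer.save_model does when a tokenizer is passed to the Trainer), might look like:

# Sketch only: reload the saved checkpoint directory produced by trainer.save_model above.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="ResumeClassification_distilBERT",
    tokenizer="ResumeClassification_distilBERT",
)
# returns the predicted resume category label for the sample text
print(classifier("I have working experience in Java and JavaScript")[0]["label"])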