manasvinid committed
Commit b5bf8cc · verified · 1 Parent(s): b0df9c3

Create functions.py

Files changed (1)
functions.py +386 -0
functions.py ADDED
@@ -0,0 +1,386 @@
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk
import zipfile
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize  # needed by preprocess_text and add_token_count_column
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform

def setup_nltk_resources():
    """
    Sets up the custom NLTK data path and downloads necessary resources.
    Downloads 'wordnet' for lemmatization, 'stopwords' for stopword removal,
    and 'punkt' for sentence tokenization.
    """
    nltk_data_path = "/kaggle/working/nltk_data"
    nltk.data.path.append(nltk_data_path)

    nltk.download('wordnet', download_dir=nltk_data_path)
    nltk.download('stopwords', download_dir=nltk_data_path)
    nltk.download('punkt', download_dir=nltk_data_path)

def unzip_nltk_resource(zip_path, extract_to):
    """
    Unzips an NLTK resource file to a specified directory.

    Args:
        zip_path (str): The path to the zipped NLTK resource file.
        extract_to (str): The directory where the contents of the zip file will be extracted.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

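# Illustrative usage sketch for the two helpers above. The paths are assumptions
# for a Kaggle-style environment (nltk.download() leaves zipped archives under
# corpora/ and tokenizers/), not values taken from this commit.
def _example_nltk_setup():
    setup_nltk_resources()
    unzip_nltk_resource(
        "/kaggle/working/nltk_data/corpora/wordnet.zip",   # hypothetical path
        "/kaggle/working/nltk_data/corpora/"
    )
    unzip_nltk_resource(
        "/kaggle/working/nltk_data/tokenizers/punkt.zip",  # hypothetical path
        "/kaggle/working/nltk_data/tokenizers/"
    )
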
def preprocess_text(text):
    """
    Preprocesses a given text string for NLP tasks. This includes cleaning the text,
    tokenizing, removing stopwords, and lemmatizing the words.

    Args:
        text (str): The text string to preprocess.

    Returns:
        str: The preprocessed text.
    """
    if not text:
        return ""
    text = re.sub(r'[\r\n\t]+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_text)

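# Illustrative example (sketch): the input string is made up, but shows the expected
# effect of preprocess_text -- lowercasing, stripping punctuation/digits, dropping
# stopwords, and lemmatizing.
def _example_preprocess():
    raw = "Senior Data Scientists needed!\r\n5+ years of experience with NLP models."
    cleaned = preprocess_text(raw)
    # Roughly: "senior data scientist needed year experience nlp model"
    print(cleaned)
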
def drop_duplicates(df, column_name):
    """
    Drops duplicates based on a specified column from the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame from which to remove duplicates.
        column_name (str): The name of the column based on which duplicates will be identified.

    Returns:
        pd.DataFrame: DataFrame with duplicates removed based on the specified column.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    original_size = df.shape[0]
    df_cleaned = df.drop_duplicates(subset=[column_name])
    new_size = df_cleaned.shape[0]

    print(f"Dropped {original_size - new_size} duplicates from '{column_name}'. New dataset size: {new_size}")

    return df_cleaned

def add_token_count_column(df, column_name):
    """
    Adds a new column to the DataFrame with the token count for each entry in the specified column.
    This function creates a copy of the DataFrame to avoid 'SettingWithCopyWarning'.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_name (str): The name of the column for which to count tokens.

    Returns:
        pd.DataFrame: DataFrame with an additional column 'token_count'.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    # Creating a copy of the DataFrame to avoid modifying a slice
    df_copy = df.copy()

    # Tokenize each entry in the specified column and count the number of tokens
    df_copy['token_count'] = df_copy[column_name].apply(lambda x: len(word_tokenize(x)) if pd.notnull(x) else 0)

    return df_copy

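# Illustrative example (sketch): a tiny, made-up DataFrame showing how the two
# DataFrame helpers above compose; 'description' is a hypothetical column name.
def _example_dataframe_helpers():
    df = pd.DataFrame({
        "description": [
            "python developer with pandas experience",
            "python developer with pandas experience",   # duplicate row
            "registered nurse for night shifts",
        ]
    })
    df = drop_duplicates(df, "description")           # drops 1 duplicate
    df = add_token_count_column(df, "description")    # adds 'token_count'
    print(df[["description", "token_count"]])
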
class TextSummarizer:
    """
    A text summarization class that uses a fine-tuned BART model to summarize text.

    Attributes:
        device (str): Device to run the model on, either 'cuda' or 'cpu'.
        model (BartForConditionalGeneration): The loaded BART model.
        tokenizer (BartTokenizer): The tokenizer for the BART model.
    """

    def __init__(self, model_name):
        """
        Initializes the TextSummarizer with a specified BART model.

        Args:
            model_name (str): The name or path of the fine-tuned BART model.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.tokenizer = BartTokenizer.from_pretrained(model_name)

    def summarize(self, text, max_input_length=1024, max_output_length=150, min_output_length=40):
        """
        Summarizes the given text using the fine-tuned BART model.

        Args:
            text (str): The text to be summarized.
            max_input_length (int): The maximum length of the input text in tokens.
            max_output_length (int): The maximum length of the summary text in tokens.
            min_output_length (int): The minimum length of the summary text in tokens.

        Returns:
            str: The summarized text.
        """
        inputs = self.tokenizer([text], max_length=max_input_length, return_tensors='pt', truncation=True)
        summary_ids = self.model.generate(
            inputs['input_ids'].to(self.device),
            max_length=max_output_length,
            min_length=min_output_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

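# Illustrative usage sketch: 'facebook/bart-large-cnn' is an assumed public BART
# checkpoint, not necessarily the fine-tuned model this project actually loads.
def _example_summarizer():
    summarizer = TextSummarizer("facebook/bart-large-cnn")
    summary = summarizer.summarize(
        "We are hiring a machine learning engineer to build and deploy NLP pipelines. "
        "Responsibilities include data preprocessing, model training, and monitoring."
    )
    print(summary)
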
def batch_summarize(df, text_col, summarizer, batch_size=10, output_col=None):
    """
    Summarizes text in batches.

    Args:
        df (pd.DataFrame): The DataFrame containing text to summarize.
        text_col (str): The column in the DataFrame with text to summarize.
        summarizer: The summarizer object or function.
        batch_size (int): The size of each batch for summarization.
        output_col (str, optional): The name of the output column for summarized text.
            If None, defaults to text_col.

    Returns:
        pd.DataFrame: DataFrame with summarized text in the specified output column.
    """
    summarized_texts = []

    # Use the text_col as output_col if not specified
    if output_col is None:
        output_col = text_col

    # Iterate through the DataFrame in batches
    for start_idx in tqdm(range(0, len(df), batch_size), desc="Summarizing"):
        end_idx = start_idx + batch_size
        batch = df[text_col][start_idx:end_idx]

        # Summarize each batch
        summarized_batch = [summarizer.summarize(text) for text in batch]
        summarized_texts.extend(summarized_batch)

    # Create a new DataFrame with the summarized text
    return pd.DataFrame({output_col: summarized_texts})

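# Illustrative sketch: batch-summarize a hypothetical 'job_description' column;
# the output column name 'summary' is an assumption. Note that batch_summarize
# returns a new DataFrame containing only the summarized column.
def _example_batch_summarize(df, summarizer):
    summaries = batch_summarize(df, "job_description", summarizer,
                                batch_size=10, output_col="summary")
    return summaries
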
class SentenceTransformerEncoder:
    """
    A class to handle sentence encoding using Sentence Transformers, directly working with pandas DataFrames.
    This class encodes text data in a specified DataFrame column into vector representations.

    Attributes:
        model (SentenceTransformer): The Sentence Transformer model used for encoding.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initializes the SentenceTransformerEncoder with a specified Sentence Transformer model.

        Args:
            model_name (str): The name of the Sentence Transformer model.
        """
        self.model = SentenceTransformer(model_name)

    def encode_column(self, df, column, batch_size=32, encoded_column_suffix='_encoded'):
        """
        Encodes a specific column in a DataFrame and adds a new column with encoded vectors.

        Args:
            df (pd.DataFrame): The DataFrame containing the texts to encode.
            column (str): The name of the column to encode.
            batch_size (int): The size of each batch for processing.
            encoded_column_suffix (str): Suffix for the new column containing encoded vectors.

        Returns:
            pd.DataFrame: The original DataFrame with an additional column containing encoded vectors.

        Raises:
            ValueError: If the specified column is not found in the DataFrame.
        """
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame")

        # Encoding the text data in batches
        encoded_vectors = []
        for start_idx in range(0, len(df), batch_size):
            end_idx = min(start_idx + batch_size, len(df))
            batch_texts = df[column][start_idx:end_idx].tolist()
            batch_encoded = self.model.encode(batch_texts, show_progress_bar=True)
            encoded_vectors.extend(batch_encoded)

        # Adding the encoded vectors as a new column in the DataFrame
        df[column + encoded_column_suffix] = encoded_vectors
        return df

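# Illustrative sketch: encode a hypothetical 'summary' column. With the default
# all-MiniLM-L6-v2 model the vectors are 384-dimensional, which is the dimension
# the Qdrant collection below would need.
def _example_encode(df):
    encoder = SentenceTransformerEncoder(model_name='all-MiniLM-L6-v2')
    df = encoder.encode_column(df, "summary", batch_size=32)
    # New column: 'summary_encoded', one 384-d vector per row
    return df
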
class QdrantInterface:
    """
    A class for interfacing with the Qdrant vector database.

    Attributes:
        client (QdrantClient): Client instance for interacting with Qdrant.
        vector_dimension (int): Dimension of the vectors used in the collection.
    """

    def __init__(self, url, api_key, vector_dimension):
        """
        Initializes the QdrantInterface with the specified Qdrant URL, API key, and vector dimension.

        Args:
            url (str): Full URL of the Qdrant server.
            api_key (str): API key for Qdrant.
            vector_dimension (int): Dimension of vectors to be stored in Qdrant.
        """
        self.client = QdrantClient(url=url, api_key=api_key)
        self.vector_dimension = vector_dimension

    def create_collection(self, collection_name, distance_metric=Distance.COSINE):
        """
        Creates or recreates a collection in Qdrant.

        Args:
            collection_name (str): Name of the collection.
            distance_metric (Distance): Distance metric for vector comparisons.
        """
        self.client.recreate_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=self.vector_dimension, distance=distance_metric)
        )

    def save_to_qdrant(self, df, collection_name, vector_col, payload_cols, batch_size=100):
        """
        Saves a DataFrame to Qdrant in batches.

        Args:
            df (pd.DataFrame): DataFrame containing data to save.
            collection_name (str): Name of the collection in Qdrant.
            vector_col (str): Name of the column containing vectors.
            payload_cols (list[str]): List of column names to include as payload.
            batch_size (int): Number of records to process in each batch.
        """
        for start_idx in range(0, len(df), batch_size):
            end_idx = min(start_idx + batch_size, len(df))
            batch = df.iloc[start_idx:end_idx]
            records = []
            for idx, row in batch.iterrows():
                # Debug print
                print(f"Index: {idx}, Vector Type: {type(row[vector_col])}, First 10 Elements: {row[vector_col][:10]}")
                # Qdrant record models expect plain Python lists, so convert
                # numpy arrays (e.g. SentenceTransformer output) if necessary.
                vector = row[vector_col]
                if hasattr(vector, "tolist"):
                    vector = vector.tolist()
                record = Record(
                    id=idx,
                    vector=vector,
                    payload={col: row[col] for col in payload_cols}
                )
                records.append(record)
            self.client.upload_records(collection_name=collection_name, records=records)

    def retrieve_specific_records(self, collection_name, ids):
        """
        Retrieves specific records by their IDs from a Qdrant collection.

        Args:
            collection_name (str): The name of the collection.
            ids (list): List of record IDs to retrieve.

        Returns:
            List of specific records from the collection.
        """
        return self.client.retrieve(collection_name=collection_name, ids=ids)

    def view_sample_records(self, collection_name, vector_dimension, limit=10):
        """
        Retrieves a sample of records from a Qdrant collection using a dummy search.

        Args:
            collection_name (str): The name of the collection.
            vector_dimension (int): Dimension of vectors in the collection.
            limit (int): The number of records to retrieve.

        Returns:
            List of sample records from the collection.
        """
        # Generate a random vector
        random_vector = [uniform(-1, 1) for _ in range(vector_dimension)]

        # Perform a dummy search
        return self.client.search(
            collection_name=collection_name,
            query_vector=random_vector,
            limit=limit
        )

    def match_resumes_to_jobs(self, resume_vector, top_k=10):
        """
        Matches a given resume vector to job postings.

        Args:
            resume_vector (list): The vector representation of a resume.
            top_k (int): Number of top similar matches to return.

        Returns:
            List of matched job postings with similarity scores.
        """
        hits = self.client.search(
            collection_name="jobs",
            query_vector=resume_vector,
            limit=top_k,
            with_payload=True
        )
        return [(hit.payload, hit.score) for hit in hits]

    def match_jobs_to_resumes(self, job_vector, top_k=10):
        """
        Matches a given job vector to resumes.

        Args:
            job_vector (list): The vector representation of a job posting.
            top_k (int): Number of top similar matches to return.

        Returns:
            List of tuples containing matched resumes and their similarity scores.
        """
        hits = self.client.search(
            collection_name="resumes",
            query_vector=job_vector,
            limit=top_k,
            with_payload=True
        )
        return [(hit.payload, hit.score) for hit in hits]

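# Illustrative end-to-end sketch: the URL, API key, and column names are placeholders,
# not values from this commit. Assumes 384-d MiniLM vectors stored in a 'jobs' collection
# built with the helpers above.
def _example_qdrant_flow(jobs_df):
    qdrant = QdrantInterface(
        url="https://your-qdrant-instance:6333",   # placeholder
        api_key="YOUR_API_KEY",                    # placeholder
        vector_dimension=384
    )
    qdrant.create_collection("jobs")
    qdrant.save_to_qdrant(
        jobs_df,
        collection_name="jobs",
        vector_col="summary_encoded",              # assumed encoded column
        payload_cols=["summary"]
    )
    # Match one resume vector against the stored job postings.
    resume_vector = list(jobs_df["summary_encoded"].iloc[0])
    matches = qdrant.match_resumes_to_jobs(resume_vector, top_k=5)
    for payload, score in matches:
        print(score, payload)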