Create functions.py
functions.py  ADDED  +386 -0
@@ -0,0 +1,386 @@
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk
import zipfile
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize  # used by preprocess_text and add_token_count_column
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform

def setup_nltk_resources():
    """
    Sets up the custom NLTK data path and downloads the necessary resources:
    'wordnet' for lemmatization, 'stopwords' for stopword removal, and 'punkt'
    for tokenization.
    """
    nltk_data_path = "/kaggle/working/nltk_data"
    nltk.data.path.append(nltk_data_path)

    nltk.download('wordnet', download_dir=nltk_data_path)
    nltk.download('stopwords', download_dir=nltk_data_path)
    nltk.download('punkt', download_dir=nltk_data_path)


def unzip_nltk_resource(zip_path, extract_to):
    """
    Unzips an NLTK resource file to a specified directory.

    Args:
        zip_path (str): The path to the zipped NLTK resource file.
        extract_to (str): The directory where the contents of the zip file will be extracted.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
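
# Illustrative usage sketch (not part of the original pipeline code): how the two helpers
# above might be combined on Kaggle, where some NLTK corpora are downloaded as zip
# archives that still need extracting before use. The paths below are assumed, not fixed.
def example_setup_nltk():
    setup_nltk_resources()
    # 'wordnet' is distributed zipped; on some NLTK versions it must be unzipped in place
    # before WordNetLemmatizer can load it (hypothetical Kaggle layout shown here).
    unzip_nltk_resource(
        "/kaggle/working/nltk_data/corpora/wordnet.zip",
        "/kaggle/working/nltk_data/corpora"
    )
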
def preprocess_text(text):
    """
    Preprocesses a given text string for NLP tasks. This includes cleaning the text,
    tokenizing, removing stopwords, and lemmatizing the words.

    Args:
        text (str): The text string to preprocess.

    Returns:
        str: The preprocessed text.
    """
    if not text:
        return ""
    text = re.sub(r'[\r\n\t]+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_text)
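
# Quick illustrative check of preprocess_text (assumes setup_nltk_resources() has already
# fetched 'punkt', 'stopwords', and 'wordnet'). The sample sentence is made up.
def example_preprocess():
    raw = "Senior Data Scientists needed!\nExperience with NLP, 5+ years preferred."
    cleaned = preprocess_text(raw)
    # Lowercased, punctuation/digits stripped, stopwords dropped, words lemmatized,
    # e.g. roughly: "senior data scientist needed experience nlp year preferred"
    print(cleaned)
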
def drop_duplicates(df, column_name):
    """
    Drops duplicates based on a specified column from the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame from which to remove duplicates.
        column_name (str): The name of the column based on which duplicates will be identified.

    Returns:
        pd.DataFrame: DataFrame with duplicates removed based on the specified column.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    original_size = df.shape[0]
    df_cleaned = df.drop_duplicates(subset=[column_name])
    new_size = df_cleaned.shape[0]

    print(f"Dropped {original_size - new_size} duplicates from '{column_name}'. New dataset size: {new_size}")

    return df_cleaned


def add_token_count_column(df, column_name):
    """
    Adds a new column to the DataFrame with the token count for each entry in the specified column.
    This function creates a copy of the DataFrame to avoid 'SettingWithCopyWarning'.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_name (str): The name of the column for which to count tokens.

    Returns:
        pd.DataFrame: DataFrame with an additional column 'token_count'.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    # Creating a copy of the DataFrame to avoid modifying a slice
    df_copy = df.copy()

    # Tokenize each entry in the specified column and count the number of tokens
    df_copy['token_count'] = df_copy[column_name].apply(lambda x: len(word_tokenize(x)) if pd.notnull(x) else 0)

    return df_copy
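
# Illustrative sketch of the two DataFrame helpers above on a tiny in-memory frame;
# the column name and rows are made up for demonstration.
def example_dataframe_helpers():
    df = pd.DataFrame({
        "description": [
            "python developer with pandas experience",
            "python developer with pandas experience",  # duplicate row
            "registered nurse for night shifts",
        ]
    })
    df = drop_duplicates(df, "description")         # drops the duplicate and prints the count
    df = add_token_count_column(df, "description")  # adds a 'token_count' column
    print(df[["description", "token_count"]])
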
class TextSummarizer:
    """
    A text summarization class that uses a fine-tuned BART model to summarize text.

    Attributes:
        device (str): Device to run the model on, either 'cuda' or 'cpu'.
        model (BartForConditionalGeneration): The loaded BART model.
        tokenizer (BartTokenizer): The tokenizer for the BART model.
    """

    def __init__(self, model_name):
        """
        Initializes the TextSummarizer with a specified BART model.

        Args:
            model_name (str): The name or path of the fine-tuned BART model.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.tokenizer = BartTokenizer.from_pretrained(model_name)

    def summarize(self, text, max_input_length=1024, max_output_length=150, min_output_length=40):
        """
        Summarizes the given text using the fine-tuned BART model.

        Args:
            text (str): The text to be summarized.
            max_input_length (int): The maximum length of the input text in tokens.
            max_output_length (int): The maximum length of the summary text in tokens.
            min_output_length (int): The minimum length of the summary text in tokens.

        Returns:
            str: The summarized text.
        """
        inputs = self.tokenizer([text], max_length=max_input_length, return_tensors='pt', truncation=True)
        summary_ids = self.model.generate(
            inputs['input_ids'].to(self.device),
            max_length=max_output_length,
            min_length=min_output_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def batch_summarize(df, text_col, summarizer, batch_size=10, output_col=None):
    """
    Summarizes text in batches.

    Args:
        df (pd.DataFrame): The DataFrame containing text to summarize.
        text_col (str): The column in the DataFrame with text to summarize.
        summarizer: The summarizer object or function.
        batch_size (int): The size of each batch for summarization.
        output_col (str, optional): The name of the output column for summarized text.
            If None, defaults to text_col.

    Returns:
        pd.DataFrame: DataFrame with summarized text in the specified output column.
    """
    summarized_texts = []

    # Use the text_col as output_col if not specified
    if output_col is None:
        output_col = text_col

    # Iterate through the DataFrame in batches
    for start_idx in tqdm(range(0, len(df), batch_size), desc="Summarizing"):
        end_idx = start_idx + batch_size
        batch = df[text_col][start_idx:end_idx]

        # Summarize each batch
        summarized_batch = [summarizer.summarize(text) for text in batch]
        summarized_texts.extend(summarized_batch)

    # Create a new DataFrame with the summarized text
    return pd.DataFrame({output_col: summarized_texts})
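
# Illustrative usage sketch for TextSummarizer and batch_summarize. The checkpoint name is
# an assumption (any BART-style summarization model on the Hugging Face Hub should load
# the same way); the 'description' column is also assumed. Running this downloads weights.
def example_summarize(df):
    summarizer = TextSummarizer("facebook/bart-large-cnn")  # stand-in for the fine-tuned model
    # Summarize the 'description' column in batches of 10 into a new 'summary' column.
    summaries = batch_summarize(df, "description", summarizer, batch_size=10, output_col="summary")
    return pd.concat([df.reset_index(drop=True), summaries], axis=1)
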
class SentenceTransformerEncoder:
    """
    A class to handle sentence encoding using Sentence Transformers, directly working with pandas DataFrames.
    This class encodes text data in a specified DataFrame column into vector representations.

    Attributes:
        model (SentenceTransformer): The Sentence Transformer model used for encoding.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initializes the SentenceTransformerEncoder with a specified Sentence Transformer model.

        Args:
            model_name (str): The name of the Sentence Transformer model.
        """
        self.model = SentenceTransformer(model_name)

    def encode_column(self, df, column, batch_size=32, encoded_column_suffix='_encoded'):
        """
        Encodes a specific column in a DataFrame and adds a new column with encoded vectors.

        Args:
            df (pd.DataFrame): The DataFrame containing the texts to encode.
            column (str): The name of the column to encode.
            batch_size (int): The size of each batch for processing.
            encoded_column_suffix (str): Suffix for the new column containing encoded vectors.

        Returns:
            pd.DataFrame: The original DataFrame with an additional column containing encoded vectors.

        Raises:
            ValueError: If the specified column is not found in the DataFrame.
        """
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame")

        # Encoding the text data in batches
        encoded_vectors = []
        for start_idx in range(0, len(df), batch_size):
            end_idx = min(start_idx + batch_size, len(df))
            batch_texts = df[column][start_idx:end_idx].tolist()
            batch_encoded = self.model.encode(batch_texts, show_progress_bar=True)
            encoded_vectors.extend(batch_encoded)

        # Adding the encoded vectors as a new column in the DataFrame
        df[column + encoded_column_suffix] = encoded_vectors
        return df
+
class QdrantInterface:
|
250 |
+
"""
|
251 |
+
A class for interfacing with the Qdrant vector database.
|
252 |
+
|
253 |
+
Attributes:
|
254 |
+
client (QdrantClient): Client instance for interacting with Qdrant.
|
255 |
+
vector_dimension (int): Dimension of the vectors used in the collection.
|
256 |
+
"""
|
257 |
+
|
258 |
+
"""
|
259 |
+
A class for interfacing with the Qdrant vector database.
|
260 |
+
...
|
261 |
+
"""
|
262 |
+
def __init__(self, url, api_key, vector_dimension):
|
263 |
+
"""
|
264 |
+
Initializes the QdrantInterface with the specified Qdrant URL, API key, and vector dimension.
|
265 |
+
|
266 |
+
Args:
|
267 |
+
url (str): Full URL of the Qdrant server.
|
268 |
+
api_key (str): API key for Qdrant.
|
269 |
+
vector_dimension (int): Dimension of vectors to be stored in Qdrant.
|
270 |
+
"""
|
271 |
+
self.client = QdrantClient(url=url, api_key=api_key)
|
272 |
+
self.vector_dimension = vector_dimension
|
273 |
+
def create_collection(self, collection_name, distance_metric=Distance.COSINE):
|
274 |
+
"""
|
275 |
+
Creates or recreates a collection in Qdrant.
|
276 |
+
|
277 |
+
Args:
|
278 |
+
collection_name (str): Name of the collection.
|
279 |
+
distance_metric (Distance): Distance metric for vector comparisons.
|
280 |
+
"""
|
281 |
+
self.client.recreate_collection(
|
282 |
+
collection_name=collection_name,
|
283 |
+
vectors_config=VectorParams(size=self.vector_dimension, distance=distance_metric)
|
284 |
+
)
|
285 |
+
def save_to_qdrant(self, df, collection_name, vector_col, payload_cols, batch_size=100):
|
286 |
+
"""
|
287 |
+
Saves a DataFrame to Qdrant in batches.
|
288 |
+
|
289 |
+
Args:
|
290 |
+
df (pd.DataFrame): DataFrame containing data to save.
|
291 |
+
collection_name (str): Name of the collection in Qdrant.
|
292 |
+
vector_col (str): Name of the column containing vectors.
|
293 |
+
payload_cols (list[str]): List of column names to include as payload.
|
294 |
+
batch_size (int): Number of records to process in each batch.
|
295 |
+
"""
|
296 |
+
|
297 |
+
for start_idx in range(0, len(df), batch_size):
|
298 |
+
end_idx = min(start_idx + batch_size, len(df))
|
299 |
+
batch = df.iloc[start_idx:end_idx]
|
300 |
+
records = []
|
301 |
+
for idx, row in batch.iterrows():
|
302 |
+
# Debug print
|
303 |
+
print(f"Index: {idx}, Vector Type: {type(row[vector_col])}, First 10 Elements: {row[vector_col][:10]}")
|
304 |
+
record = Record(
|
305 |
+
id=idx,
|
306 |
+
vector=row[vector_col],
|
307 |
+
payload={col: row[col] for col in payload_cols}
|
308 |
+
)
|
309 |
+
records.append(record)
|
310 |
+
self.client.upload_records(collection_name=collection_name, records=records)
|
311 |
+
|
312 |
+
|
313 |
+
def retrieve_specific_records(self, collection_name, ids):
|
314 |
+
"""
|
315 |
+
Retrieves specific records by their IDs from a Qdrant collection.
|
316 |
+
|
317 |
+
Args:
|
318 |
+
collection_name (str): The name of the collection.
|
319 |
+
ids (list): List of record IDs to retrieve.
|
320 |
+
|
321 |
+
Returns:
|
322 |
+
List of specific records from the collection.
|
323 |
+
"""
|
324 |
+
return self.client.retrieve(collection_name=collection_name, ids=ids)
|
325 |
+
|
326 |
+
def view_sample_records(self, collection_name, vector_dimension, limit=10):
|
327 |
+
"""
|
328 |
+
Retrieves a sample of records from a Qdrant collection using a dummy search.
|
329 |
+
|
330 |
+
Args:
|
331 |
+
collection_name (str): The name of the collection.
|
332 |
+
vector_dimension (int): Dimension of vectors in the collection.
|
333 |
+
limit (int): The number of records to retrieve.
|
334 |
+
|
335 |
+
Returns:
|
336 |
+
List of sample records from the collection.
|
337 |
+
"""
|
338 |
+
# Generate a random vector
|
339 |
+
random_vector = [uniform(-1, 1) for _ in range(vector_dimension)]
|
340 |
+
|
341 |
+
# Perform a dummy search
|
342 |
+
return self.client.search(
|
343 |
+
collection_name=collection_name,
|
344 |
+
query_vector=random_vector,
|
345 |
+
limit=limit
|
346 |
+
)
|
347 |
+
def match_resumes_to_jobs(self, resume_vector, top_k=10):
|
348 |
+
"""
|
349 |
+
Matches a given resume vector to job postings.
|
350 |
+
|
351 |
+
Args:
|
352 |
+
resume_vector (list): The vector representation of a resume.
|
353 |
+
top_k (int): Number of top similar matches to return.
|
354 |
+
|
355 |
+
Returns:
|
356 |
+
List of matched job postings with similarity scores.
|
357 |
+
"""
|
358 |
+
hits = self.client.search(
|
359 |
+
collection_name="jobs",
|
360 |
+
query_vector=resume_vector,
|
361 |
+
limit=top_k,
|
362 |
+
with_payload=True
|
363 |
+
)
|
364 |
+
return [(hit.payload, hit.score) for hit in hits]
|
365 |
+
def match_jobs_to_resumes(self, job_vector, top_k=10):
|
366 |
+
"""
|
367 |
+
Matches a given job vector to resumes.
|
368 |
+
|
369 |
+
Args:
|
370 |
+
job_vector (list): The vector representation of a job posting.
|
371 |
+
top_k (int): Number of top similar matches to return.
|
372 |
+
|
373 |
+
Returns:
|
374 |
+
List of tuples containing matched resumes and their similarity scores.
|
375 |
+
"""
|
376 |
+
hits = self.client.search(
|
377 |
+
collection_name="resumes",
|
378 |
+
query_vector=job_vector,
|
379 |
+
limit=top_k,
|
380 |
+
with_payload=True
|
381 |
+
)
|
382 |
+
return [(hit.payload, hit.score) for hit in hits]
|
383 |
+
|
384 |
+
|
385 |
+
|
386 |
+
|
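
# Illustrative end-to-end sketch of QdrantInterface: create a collection, upload encoded rows,
# and run a match. The URL, API key, and column names are placeholders; vector_dimension=384
# matches the 'all-MiniLM-L6-v2' encoder above, and "jobs" is the collection the matching
# methods query by name.
def example_qdrant(jobs_df):
    qdrant = QdrantInterface(
        url="https://your-qdrant-instance:6333",  # placeholder URL
        api_key="YOUR_API_KEY",                   # placeholder key
        vector_dimension=384
    )
    qdrant.create_collection("jobs")
    qdrant.save_to_qdrant(
        jobs_df,
        collection_name="jobs",
        vector_col="summary_encoded",
        payload_cols=["description", "summary"]
    )
    # Match one embedding against the stored job postings (a real resume embedding would
    # come from the same SentenceTransformer encoder).
    resume_vector = list(jobs_df["summary_encoded"].iloc[0])
    for payload, score in qdrant.match_resumes_to_jobs(resume_vector, top_k=5):
        print(round(score, 3), payload.get("summary"))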