Create functions.py
functions.py  ADDED  +386 -0
@@ -0,0 +1,386 @@
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk
import zipfile
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize  # used by preprocess_text and add_token_count_column
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform

def setup_nltk_resources():
    """
    Sets up the custom NLTK data path and downloads the necessary resources:
    'wordnet' for lemmatization, 'stopwords' for stopword removal, and 'punkt'
    for tokenization.
    """
    nltk_data_path = "/kaggle/working/nltk_data"
    nltk.data.path.append(nltk_data_path)

    nltk.download('wordnet', download_dir=nltk_data_path)
    nltk.download('stopwords', download_dir=nltk_data_path)
    nltk.download('punkt', download_dir=nltk_data_path)


def unzip_nltk_resource(zip_path, extract_to):
    """
    Unzips an NLTK resource file to a specified directory.

    Args:
        zip_path (str): The path to the zipped NLTK resource file.
        extract_to (str): The directory where the contents of the zip file will be extracted.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
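
# Illustrative usage sketch (not part of the original pipeline code): how the two helpers
# above might be combined on Kaggle, where some NLTK corpora are downloaded as zip
# archives that still need extracting before use. The paths below are assumed, not fixed.
def example_setup_nltk():
    setup_nltk_resources()
    # 'wordnet' is distributed zipped; on some NLTK versions it must be unzipped in place
    # before WordNetLemmatizer can load it (hypothetical Kaggle layout shown here).
    unzip_nltk_resource(
        "/kaggle/working/nltk_data/corpora/wordnet.zip",
        "/kaggle/working/nltk_data/corpora"
    )
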
def preprocess_text(text):
    """
    Preprocesses a given text string for NLP tasks. This includes cleaning the text,
    tokenizing, removing stopwords, and lemmatizing the words.

    Args:
        text (str): The text string to preprocess.

    Returns:
        str: The preprocessed text.
    """
    if not text:
        return ""
    text = re.sub(r'[\r\n\t]+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_text)
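
# Quick illustrative check of preprocess_text (assumes setup_nltk_resources() has already
# fetched 'punkt', 'stopwords', and 'wordnet'). The sample sentence is made up.
def example_preprocess():
    raw = "Senior Data Scientists needed!\nExperience with NLP, 5+ years preferred."
    cleaned = preprocess_text(raw)
    # Lowercased, punctuation/digits stripped, stopwords dropped, words lemmatized,
    # e.g. roughly: "senior data scientist needed experience nlp year preferred"
    print(cleaned)
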
def drop_duplicates(df, column_name):
    """
    Drops duplicates based on a specified column from the DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame from which to remove duplicates.
        column_name (str): The name of the column based on which duplicates will be identified.

    Returns:
        pd.DataFrame: DataFrame with duplicates removed based on the specified column.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    original_size = df.shape[0]
    df_cleaned = df.drop_duplicates(subset=[column_name])
    new_size = df_cleaned.shape[0]

    print(f"Dropped {original_size - new_size} duplicates from '{column_name}'. New dataset size: {new_size}")

    return df_cleaned


def add_token_count_column(df, column_name):
    """
    Adds a new column to the DataFrame with the token count for each entry in the specified column.
    This function creates a copy of the DataFrame to avoid 'SettingWithCopyWarning'.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_name (str): The name of the column for which to count tokens.

    Returns:
        pd.DataFrame: DataFrame with an additional column 'token_count'.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    # Creating a copy of the DataFrame to avoid modifying a slice
    df_copy = df.copy()

    # Tokenize each entry in the specified column and count the number of tokens
    df_copy['token_count'] = df_copy[column_name].apply(lambda x: len(word_tokenize(x)) if pd.notnull(x) else 0)

    return df_copy
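
# Illustrative sketch of the two DataFrame helpers above on a tiny in-memory frame;
# the column name and rows are made up for demonstration.
def example_dataframe_helpers():
    df = pd.DataFrame({
        "description": [
            "python developer with pandas experience",
            "python developer with pandas experience",  # duplicate row
            "registered nurse for night shifts",
        ]
    })
    df = drop_duplicates(df, "description")         # drops the duplicate and prints the count
    df = add_token_count_column(df, "description")  # adds a 'token_count' column
    print(df[["description", "token_count"]])
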
class TextSummarizer:
    """
    A text summarization class that uses a fine-tuned BART model to summarize text.

    Attributes:
        device (str): Device to run the model on, either 'cuda' or 'cpu'.
        model (BartForConditionalGeneration): The loaded BART model.
        tokenizer (BartTokenizer): The tokenizer for the BART model.
    """

    def __init__(self, model_name):
        """
        Initializes the TextSummarizer with a specified BART model.

        Args:
            model_name (str): The name or path of the fine-tuned BART model.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.tokenizer = BartTokenizer.from_pretrained(model_name)

    def summarize(self, text, max_input_length=1024, max_output_length=150, min_output_length=40):
        """
        Summarizes the given text using the fine-tuned BART model.

        Args:
            text (str): The text to be summarized.
            max_input_length (int): The maximum length of the input text in tokens.
            max_output_length (int): The maximum length of the summary text in tokens.
            min_output_length (int): The minimum length of the summary text in tokens.

        Returns:
            str: The summarized text.
        """
        inputs = self.tokenizer([text], max_length=max_input_length, return_tensors='pt', truncation=True)
        summary_ids = self.model.generate(
            inputs['input_ids'].to(self.device),
            max_length=max_output_length,
            min_length=min_output_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def batch_summarize(df, text_col, summarizer, batch_size=10, output_col=None):
    """
    Summarizes text in batches.

    Args:
        df (pd.DataFrame): The DataFrame containing text to summarize.
        text_col (str): The column in the DataFrame with text to summarize.
        summarizer: The summarizer object or function.
        batch_size (int): The size of each batch for summarization.
        output_col (str, optional): The name of the output column for summarized text.
            If None, defaults to text_col.

    Returns:
        pd.DataFrame: DataFrame with summarized text in the specified output column.
    """
    summarized_texts = []

    # Use the text_col as output_col if not specified
    if output_col is None:
        output_col = text_col

    # Iterate through the DataFrame in batches
    for start_idx in tqdm(range(0, len(df), batch_size), desc="Summarizing"):
        end_idx = start_idx + batch_size
        batch = df[text_col][start_idx:end_idx]

        # Summarize each batch
        summarized_batch = [summarizer.summarize(text) for text in batch]
        summarized_texts.extend(summarized_batch)

    # Create a new DataFrame with the summarized text
    return pd.DataFrame({output_col: summarized_texts})
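
# Illustrative usage sketch for TextSummarizer and batch_summarize. The checkpoint name is
# an assumption (any BART-style summarization model on the Hugging Face Hub should load
# the same way); the 'description' column is also assumed. Running this downloads weights.
def example_summarize(df):
    summarizer = TextSummarizer("facebook/bart-large-cnn")  # stand-in for the fine-tuned model
    # Summarize the 'description' column in batches of 10 into a new 'summary' column.
    summaries = batch_summarize(df, "description", summarizer, batch_size=10, output_col="summary")
    return pd.concat([df.reset_index(drop=True), summaries], axis=1)
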
class SentenceTransformerEncoder:
    """
    A class to handle sentence encoding using Sentence Transformers, directly working with pandas DataFrames.
    This class encodes text data in a specified DataFrame column into vector representations.

    Attributes:
        model (SentenceTransformer): The Sentence Transformer model used for encoding.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initializes the SentenceTransformerEncoder with a specified Sentence Transformer model.

        Args:
            model_name (str): The name of the Sentence Transformer model.
        """
        self.model = SentenceTransformer(model_name)

    def encode_column(self, df, column, batch_size=32, encoded_column_suffix='_encoded'):
        """
        Encodes a specific column in a DataFrame and adds a new column with encoded vectors.

        Args:
            df (pd.DataFrame): The DataFrame containing the texts to encode.
            column (str): The name of the column to encode.
            batch_size (int): The size of each batch for processing.
            encoded_column_suffix (str): Suffix for the new column containing encoded vectors.

        Returns:
            pd.DataFrame: The original DataFrame with an additional column containing encoded vectors.

        Raises:
            ValueError: If the specified column is not found in the DataFrame.
        """
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame")

        # Encoding the text data in batches
        encoded_vectors = []
        for start_idx in range(0, len(df), batch_size):
            end_idx = min(start_idx + batch_size, len(df))
            batch_texts = df[column][start_idx:end_idx].tolist()
            batch_encoded = self.model.encode(batch_texts, show_progress_bar=True)
            encoded_vectors.extend(batch_encoded)

        # Adding the encoded vectors as a new column in the DataFrame
        df[column + encoded_column_suffix] = encoded_vectors
        return df
+
class QdrantInterface:
|
250 |
+
"""
|
251 |
+
A class for interfacing with the Qdrant vector database.
|
252 |
+
|
253 |
+
Attributes:
|
254 |
+
client (QdrantClient): Client instance for interacting with Qdrant.
|
255 |
+
vector_dimension (int): Dimension of the vectors used in the collection.
|
256 |
+
"""
|
257 |
+
|
258 |
+
"""
|
259 |
+
A class for interfacing with the Qdrant vector database.
|
260 |
+
...
|
261 |
+
"""
|
262 |
+
def __init__(self, url, api_key, vector_dimension):
|
263 |
+
"""
|
264 |
+
Initializes the QdrantInterface with the specified Qdrant URL, API key, and vector dimension.
|
265 |
+
|
266 |
+
Args:
|
267 |
+
url (str): Full URL of the Qdrant server.
|
268 |
+
api_key (str): API key for Qdrant.
|
269 |
+
vector_dimension (int): Dimension of vectors to be stored in Qdrant.
|
270 |
+
"""
|
271 |
+
self.client = QdrantClient(url=url, api_key=api_key)
|
272 |
+
self.vector_dimension = vector_dimension
|
273 |
+
def create_collection(self, collection_name, distance_metric=Distance.COSINE):
|
274 |
+
"""
|
275 |
+
Creates or recreates a collection in Qdrant.
|
276 |
+
|
277 |
+
Args:
|
278 |
+
collection_name (str): Name of the collection.
|
279 |
+
distance_metric (Distance): Distance metric for vector comparisons.
|
280 |
+
"""
|
281 |
+
self.client.recreate_collection(
|
282 |
+
collection_name=collection_name,
|
283 |
+
vectors_config=VectorParams(size=self.vector_dimension, distance=distance_metric)
|
284 |
+
)
|
285 |
+
def save_to_qdrant(self, df, collection_name, vector_col, payload_cols, batch_size=100):
|
286 |
+
"""
|
287 |
+
Saves a DataFrame to Qdrant in batches.
|
288 |
+
|
289 |
+
Args:
|
290 |
+
df (pd.DataFrame): DataFrame containing data to save.
|
291 |
+
collection_name (str): Name of the collection in Qdrant.
|
292 |
+
vector_col (str): Name of the column containing vectors.
|
293 |
+
payload_cols (list[str]): List of column names to include as payload.
|
294 |
+
batch_size (int): Number of records to process in each batch.
|
295 |
+
"""
|
296 |
+
|
297 |
+
for start_idx in range(0, len(df), batch_size):
|
298 |
+
end_idx = min(start_idx + batch_size, len(df))
|
299 |
+
batch = df.iloc[start_idx:end_idx]
|
300 |
+
records = []
|
301 |
+
for idx, row in batch.iterrows():
|
302 |
+
# Debug print
|
303 |
+
print(f"Index: {idx}, Vector Type: {type(row[vector_col])}, First 10 Elements: {row[vector_col][:10]}")
|
304 |
+
record = Record(
|
305 |
+
id=idx,
|
306 |
+
vector=row[vector_col],
|
307 |
+
payload={col: row[col] for col in payload_cols}
|
308 |
+
)
|
309 |
+
records.append(record)
|
310 |
+
self.client.upload_records(collection_name=collection_name, records=records)
|
311 |
+
|
312 |
+
|
313 |
+
def retrieve_specific_records(self, collection_name, ids):
|
314 |
+
"""
|
315 |
+
Retrieves specific records by their IDs from a Qdrant collection.
|
316 |
+
|
317 |
+
Args:
|
318 |
+
collection_name (str): The name of the collection.
|
319 |
+
ids (list): List of record IDs to retrieve.
|
320 |
+
|
321 |
+
Returns:
|
322 |
+
List of specific records from the collection.
|
323 |
+
"""
|
324 |
+
return self.client.retrieve(collection_name=collection_name, ids=ids)
|
325 |
+
|
326 |
+
def view_sample_records(self, collection_name, vector_dimension, limit=10):
|
327 |
+
"""
|
328 |
+
Retrieves a sample of records from a Qdrant collection using a dummy search.
|
329 |
+
|
330 |
+
Args:
|
331 |
+
collection_name (str): The name of the collection.
|
332 |
+
vector_dimension (int): Dimension of vectors in the collection.
|
333 |
+
limit (int): The number of records to retrieve.
|
334 |
+
|
335 |
+
Returns:
|
336 |
+
List of sample records from the collection.
|
337 |
+
"""
|
338 |
+
# Generate a random vector
|
339 |
+
random_vector = [uniform(-1, 1) for _ in range(vector_dimension)]
|
340 |
+
|
341 |
+
# Perform a dummy search
|
342 |
+
return self.client.search(
|
343 |
+
collection_name=collection_name,
|
344 |
+
query_vector=random_vector,
|
345 |
+
limit=limit
|
346 |
+
)
|
347 |
+
def match_resumes_to_jobs(self, resume_vector, top_k=10):
|
348 |
+
"""
|
349 |
+
Matches a given resume vector to job postings.
|
350 |
+
|
351 |
+
Args:
|
352 |
+
resume_vector (list): The vector representation of a resume.
|
353 |
+
top_k (int): Number of top similar matches to return.
|
354 |
+
|
355 |
+
Returns:
|
356 |
+
List of matched job postings with similarity scores.
|
357 |
+
"""
|
358 |
+
hits = self.client.search(
|
359 |
+
collection_name="jobs",
|
360 |
+
query_vector=resume_vector,
|
361 |
+
limit=top_k,
|
362 |
+
with_payload=True
|
363 |
+
)
|
364 |
+
return [(hit.payload, hit.score) for hit in hits]
|
365 |
+
def match_jobs_to_resumes(self, job_vector, top_k=10):
|
366 |
+
"""
|
367 |
+
Matches a given job vector to resumes.
|
368 |
+
|
369 |
+
Args:
|
370 |
+
job_vector (list): The vector representation of a job posting.
|
371 |
+
top_k (int): Number of top similar matches to return.
|
372 |
+
|
373 |
+
Returns:
|
374 |
+
List of tuples containing matched resumes and their similarity scores.
|
375 |
+
"""
|
376 |
+
hits = self.client.search(
|
377 |
+
collection_name="resumes",
|
378 |
+
query_vector=job_vector,
|
379 |
+
limit=top_k,
|
380 |
+
with_payload=True
|
381 |
+
)
|
382 |
+
return [(hit.payload, hit.score) for hit in hits]
|
383 |
+
|
384 |
+
|
385 |
+
|
386 |
+
|
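
# Illustrative end-to-end sketch of QdrantInterface: create a collection, upload encoded rows,
# and run a match. The URL, API key, and column names are placeholders; vector_dimension=384
# matches the 'all-MiniLM-L6-v2' encoder above, and "jobs" is the collection the matching
# methods query by name.
def example_qdrant(jobs_df):
    qdrant = QdrantInterface(
        url="https://your-qdrant-instance:6333",  # placeholder URL
        api_key="YOUR_API_KEY",                   # placeholder key
        vector_dimension=384
    )
    qdrant.create_collection("jobs")
    qdrant.save_to_qdrant(
        jobs_df,
        collection_name="jobs",
        vector_col="summary_encoded",
        payload_cols=["description", "summary"]
    )
    # Match one embedding against the stored job postings (a real resume embedding would
    # come from the same SentenceTransformer encoder).
    resume_vector = list(jobs_df["summary_encoded"].iloc[0])
    for payload, score in qdrant.match_resumes_to_jobs(resume_vector, top_k=5):
        print(round(score, 3), payload.get("summary"))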