import os
from ast import literal_eval

import numpy as np
import pandas as pd
import spacy
import yake
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class ScriptMatcher:
    """Match a new script synopsis against a reference dataset of series.

    Combines sentence-transformer embeddings with YAKE keyword extraction
    and spaCy named-entity masking to rank the most similar series.
    """

    def __init__(self, data_path=None, model_name='paraphrase-mpnet-base-v2', dataframe=None):
        """
        Initialize the ScriptMatcher.

        Parameters:
            data_path (str, optional): Path to a CSV dataset file.
            model_name (str): Name of the sentence transformer model.
                Default is 'paraphrase-mpnet-base-v2'.
            dataframe (pd.DataFrame, optional): Pre-loaded dataset; takes
                precedence over ``data_path`` when both are given.
        """
        # NOTE(review): if neither data_path nor dataframe is supplied,
        # self.dataset is never set (original behavior preserved).
        if data_path is not None:
            self.dataset = pd.read_csv(data_path)
        if dataframe is not None:
            self.dataset = dataframe

        self.model = SentenceTransformer(model_name)
        # Unigram keywords only (n=1); dedupLim=0.9 suppresses near-duplicates.
        self.kw_extractor = yake.KeywordExtractor("en", n=1, dedupLim=0.9)
        self.k_dataset = pd.read_csv('models/Similarity_K_Dataset/K_Dataset.csv')
        # spaCy entity labels used as masking placeholders; also used to
        # filter placeholder words out of extracted keywords.
        self._ent_type = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC",
                          "PRODUCT", "EVENT", "WORK", "ART", "LAW",
                          "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY",
                          "QUANTITY", "ORDINAL", "CARDINAL"]
        # BUGFIX: these two matrices were previously loaded from each
        # other's files (the synopsis attribute read plot_embeddings.npy and
        # vice versa), so the 0.75/0.25 weighting in find_similar_series was
        # applied to the wrong matrices. Each attribute now loads the file
        # matching its name.
        self.embeddings_synopsis_list = np.load("models/Similarity_K_Dataset/synopsis_embeddings.npy")
        self.plot_embedding_list = np.load("models/Similarity_K_Dataset/plot_embeddings.npy")
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:  # narrowed from a bare except: model not installed
            print("Downloading spaCy NLP model...")
            # HACK: runtime pip install via os.system is fragile; prefer
            # declaring the spaCy model as a packaging dependency.
            os.system(
                "pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl")
            self.nlp = spacy.load("en_core_web_sm")

    def extract_keywords(self, text):
        """
        Extract keywords from a given text using the YAKE keyword
        extraction algorithm.

        Entity-label words (e.g. "PERSON") are filtered out so masked
        placeholders never surface as keywords.

        Parameters:
            text (str): Text from which to extract keywords.

        Returns:
            str: A string of extracted keywords joined by spaces.
        """
        extracted_keywords = self.kw_extractor.extract_keywords(text)
        # YAKE yields (keyword, score) pairs; keep only the keyword text.
        return " ".join(kw for kw, _score in extracted_keywords
                        if kw not in self._ent_type)

    def preprocess_text(self, text):
        """
        Process a given text to replace named entities and extract keywords.

        Parameters:
            text (str): The text to process.

        Returns:
            str: Space-joined keywords of the entity-masked text.
        """
        doc = self.nlp(text)
        replaced_text = text
        # NOTE(review): str.replace substitutes *every* occurrence of the
        # token text, so short entity tokens may over-mask unrelated words;
        # preserved as-is from the original implementation.
        for token in doc:
            if token.ent_type_ != "MISC" and token.ent_type_ != "":
                replaced_text = replaced_text.replace(token.text, f"<{token.ent_type_}>")
        return self.extract_keywords(replaced_text)

    def find_similar_series(self, new_synopsis, genres_keywords, k=5):
        """
        Find series similar to a new synopsis.

        Parameters:
            new_synopsis (str): The synopsis to compare.
            genres_keywords (iterable of str): Genre keywords prepended to
                the synopsis keywords before embedding.
            k (int): The number of similar series to return. Default 5.

        Returns:
            list[dict]: Records with "Series", "Genre" and "Score" keys,
            ordered from most to least similar.
        """
        processed_synopsis = self.preprocess_text(new_synopsis)
        genre_keywords = " ".join(genres_keywords)
        # BUGFIX: preprocess_text already returns extracted keywords, so the
        # previous second extract_keywords pass was redundant; a separating
        # space was also missing, fusing the last genre word with the first
        # keyword. (A leftover debug print was removed as well.)
        synopsis_sentence = f"{genre_keywords} {processed_synopsis}"
        synopsis_embedding = self.model.encode([synopsis_sentence])

        # Weighted blend: synopsis similarity dominates (0.75) over plot (0.25).
        similarity = (0.75 * cosine_similarity(synopsis_embedding, self.embeddings_synopsis_list)
                      + 0.25 * cosine_similarity(synopsis_embedding, self.plot_embedding_list))
        # Indices of the k highest-scoring rows, best first.
        top_k_indices = similarity.argsort()[0, -k:][::-1]
        # .copy() avoids pandas' SettingWithCopyWarning on the iloc slice.
        closest_series = self.k_dataset.iloc[top_k_indices].copy()
        closest_series["Score"] = similarity[0, top_k_indices]
        return closest_series[["Series", "Genre", "Score"]].to_dict(orient='records')