HConley committed on
Commit
ac2467f
1 Parent(s): e0aaf9b

Upload 17 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ streamlit/plot_embeddings/index.faiss filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+
5
+ # C extensions
6
+ *.so
7
+
8
+ # Distribution / packaging
9
+ .Python
10
+ env/
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # PyInstaller
27
+ # Usually these files are written by a python script from a template
28
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+
46
+ # Translations
47
+ *.mo
48
+ *.pot
49
+
50
+ # Django stuff:
51
+ *.log
52
+
53
+ # Sphinx documentation
54
+ docs/_build/
55
+
56
+ # PyBuilder
57
+ target/
58
+
59
+ # DotEnv configuration
60
+ .env
61
+
62
+ # Database
63
+ *.db
64
+ *.rdb
65
+
66
+ # Pycharm
67
+ .idea
68
+
69
+ # VS Code
70
+ .vscode/
71
+
72
+ # Spyder
73
+ .spyproject/
74
+
75
+ # Jupyter NB Checkpoints
76
+ .ipynb_checkpoints/
77
+
78
+ # exclude data from source control by default
79
+ /data/
80
+
81
+ # Mac OS-specific storage files
82
+ .DS_Store
83
+
84
+ # vim
85
+ *.swp
86
+ *.swo
87
+
88
+ # Mypy cache
89
+ .mypy_cache/
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # external requirements
2
+
3
+ spacy~=3.7.1
4
+ pandas~=2.1.1
5
+ faiss-cpu~=1.7.4
6
+
7
+ langchain~=0.0.311
8
+ streamlit~=1.27.2
9
+ sentence-transformers~=2.2.2
10
+
11
+ en_core_web_trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (155 Bytes). View file
 
src/data/.gitkeep ADDED
File without changes
src/data/__init__.py ADDED
File without changes
src/data/emb_generator.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code to generate embeddings based on movie plots.

import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings


def main():
    """Embed the cleaned movie plots and persist them as a local FAISS index."""
    # Load the cleaned dataset produced by metadata_dataset.py.
    movies = pd.read_csv('../../data/processed/movies_clean.csv')
    # Drop the stray index column written by a previous to_csv() without index=False.
    movies.drop('Unnamed: 0', axis=1, inplace=True)

    # Wrap each row as a LangChain document; every column other than the
    # plot text becomes document metadata.
    df_loader = DataFrameLoader(movies, page_content_column='plot_sin_nombres')
    df_document = df_loader.load()

    # Sentence-transformer model used to embed the plots.
    embedding_function = SentenceTransformerEmbeddings(model_name="sentence-t5-xl")
    print('Transformer descargado.')

    # Build the FAISS vector store from the documents.
    db = FAISS.from_documents(df_document, embedding_function)
    print('DB vectorial creada.')

    # Persist the index for later similarity searches.
    db.save_local('plot_embeddings')


if __name__ == '__main__':
    # Proper entry-point guard: the original ran the whole pipeline at import
    # time and the guarded body was only the no-op expression `__name__`.
    main()
src/data/metadata_dataset.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code to generate the working database, taking into consideration the findings
# from the Exploratory Data Analysis (EDA) in a Jupyter notebook.

import pandas as pd
import spacy

# Load the transformer-based English pipeline; kept at module level so
# remove_names() can be imported and used without re-loading the model.
nlp = spacy.load("en_core_web_trf")


def remove_names(text):
    """Remove the names of people from a given text.

    Tokens tagged by NER as part of a PERSON entity are dropped and the
    remaining tokens are re-joined with single spaces (note that original
    spacing/punctuation attachment is not preserved).

    :param text: the text from which names will be removed.
    :return: text without the names.

    >>> remove_names('My name is John Connor, leader of the rebellion.')
    'My name is , leader of the rebellion .'
    """
    doc = nlp(text)
    # Keep every token that NER did not tag as part of a PERSON entity.
    words_wo_names = [token.text for token in doc if token.ent_type_ != "PERSON"]
    return " ".join(words_wo_names)


def main():
    """Clean the raw movie dataset and write the processed CSV."""
    # Load raw data.
    movies = pd.read_csv('../../data/raw/0_inicial/movies.csv')
    print(movies.columns)

    # Drop columns not used downstream.
    movies.drop(['Unnamed: 0', 'Genre', 'Wiki Page', 'title'], inplace=True, axis=1)

    # Strip person names out of each plot into a new column, then drop the
    # original plot. This is the slow step: one full NLP pass per row.
    movies['plot_sin_nombres'] = movies['Plot'].apply(remove_names)
    movies.drop('Plot', inplace=True, axis=1)

    # Save the processed dataset for the embedding generator.
    movies.to_csv('../../data/processed/movies_clean.csv')


if __name__ == '__main__':
    # Proper entry-point guard: the original ran the whole pipeline at import
    # time and the guarded body was only the no-op expression `__name__`.
    main()
src/models/.gitkeep ADDED
File without changes
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (162 Bytes). View file
 
src/models/__pycache__/plot_similarity_finder.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
src/models/plot_similarity_finder.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code to find the most similar plot, using the embedding index.

from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Embedding function used to encode the user's plot for the similarity search.
# NOTE(fix): this must be the same model the index was built with.
# src/data/emb_generator.py builds the index with "sentence-t5-xl", but this
# module originally loaded "sentence-t5-large" — a different model whose
# embedding space does not match the index, silently degrading every search.
embedding_function = SentenceTransformerEmbeddings(model_name="sentence-t5-xl")

# Load the persisted FAISS vector store (built by src/data/emb_generator.py).
db = FAISS.load_local('../streamlit/plot_embeddings', embedding_function)


def plot_simil(user_plot, num_recom=5):
    """Find movies with plots similar to the one given by the user.

    :param user_plot: free-text plot description written by the user.
    :param num_recom: number of recommendations to return (default 5).
    :return: list of [Release Year, Title] pairs, most similar first.
    """
    docs = db.similarity_search(user_plot, num_recom)
    # Each document carries the original dataframe columns as metadata.
    return [[doc.metadata['Release Year'], doc.metadata['Title']] for doc in docs]
streamlit/__pycache__/streamlit_code.cpython-310.pyc ADDED
Binary file (366 Bytes). View file
 
streamlit/plot_embeddings/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d00eda167b1dd22d915b827ac23e5a283087dcc49d50abf4ff98a3b440fbf639
3
+ size 14592045
streamlit/plot_embeddings/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68397f28520df1553fd80f4d7baa519077550ff78c974a563a1b3f5d1f98e589
3
+ size 16287401
streamlit/streamlit_code.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code for the Streamlit webapp.

import streamlit as st

import sys
import os

# Make the project root importable so `src` resolves when Streamlit runs
# this file from the streamlit/ directory.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
from src.models.plot_similarity_finder import plot_simil

# WebApp title and subtitle.
st.title('Movie Recommendation.')
st.subheader('Write your movie plot and find a similar one.')

# Plot input.
user_plot = st.text_area("Write the plot here...")

# Generate the recommendations when the user clicks the button.
if st.button('Search'):
    with st.spinner('Reading plot...'):
        result = plot_simil(user_plot)

    # Show each recommendation as "Title, Year". Iterating over the list
    # (instead of hard-coding result[0]..result[4]) avoids an IndexError
    # when the search returns fewer than five matches.
    for year, title in result:
        st.markdown(f'{str.capitalize(title)}, {year}')