Upload 17 files
Browse files- .gitattributes +1 -0
- .gitignore +89 -0
- requirements.txt +11 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/data/.gitkeep +0 -0
- src/data/__init__.py +0 -0
- src/data/emb_generator.py +31 -0
- src/data/metadata_dataset.py +43 -0
- src/models/.gitkeep +0 -0
- src/models/__init__.py +0 -0
- src/models/__pycache__/__init__.cpython-310.pyc +0 -0
- src/models/__pycache__/plot_similarity_finder.cpython-310.pyc +0 -0
- src/models/plot_similarity_finder.py +49 -0
- streamlit/__pycache__/streamlit_code.cpython-310.pyc +0 -0
- streamlit/plot_embeddings/index.faiss +3 -0
- streamlit/plot_embeddings/index.pkl +3 -0
- streamlit/streamlit_code.py +32 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
streamlit/plot_embeddings/index.faiss filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
|
5 |
+
# C extensions
|
6 |
+
*.so
|
7 |
+
|
8 |
+
# Distribution / packaging
|
9 |
+
.Python
|
10 |
+
env/
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
*.egg-info/
|
23 |
+
.installed.cfg
|
24 |
+
*.egg
|
25 |
+
|
26 |
+
# PyInstaller
|
27 |
+
# Usually these files are written by a python script from a template
|
28 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
29 |
+
*.manifest
|
30 |
+
*.spec
|
31 |
+
|
32 |
+
# Installer logs
|
33 |
+
pip-log.txt
|
34 |
+
pip-delete-this-directory.txt
|
35 |
+
|
36 |
+
# Unit test / coverage reports
|
37 |
+
htmlcov/
|
38 |
+
.tox/
|
39 |
+
.coverage
|
40 |
+
.coverage.*
|
41 |
+
.cache
|
42 |
+
nosetests.xml
|
43 |
+
coverage.xml
|
44 |
+
*.cover
|
45 |
+
|
46 |
+
# Translations
|
47 |
+
*.mo
|
48 |
+
*.pot
|
49 |
+
|
50 |
+
# Django stuff:
|
51 |
+
*.log
|
52 |
+
|
53 |
+
# Sphinx documentation
|
54 |
+
docs/_build/
|
55 |
+
|
56 |
+
# PyBuilder
|
57 |
+
target/
|
58 |
+
|
59 |
+
# DotEnv configuration
|
60 |
+
.env
|
61 |
+
|
62 |
+
# Database
|
63 |
+
*.db
|
64 |
+
*.rdb
|
65 |
+
|
66 |
+
# Pycharm
|
67 |
+
.idea
|
68 |
+
|
69 |
+
# VS Code
|
70 |
+
.vscode/
|
71 |
+
|
72 |
+
# Spyder
|
73 |
+
.spyproject/
|
74 |
+
|
75 |
+
# Jupyter NB Checkpoints
|
76 |
+
.ipynb_checkpoints/
|
77 |
+
|
78 |
+
# exclude data from source control by default
|
79 |
+
/data/
|
80 |
+
|
81 |
+
# Mac OS-specific storage files
|
82 |
+
.DS_Store
|
83 |
+
|
84 |
+
# vim
|
85 |
+
*.swp
|
86 |
+
*.swo
|
87 |
+
|
88 |
+
# Mypy cache
|
89 |
+
.mypy_cache/
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# external requirements
|
2 |
+
|
3 |
+
spacy~=3.7.1
|
4 |
+
pandas~=2.1.1
|
5 |
+
faiss-cpu~=1.7.4
|
6 |
+
|
7 |
+
langchain~=0.0.311
|
8 |
+
streamlit~=1.27.2
|
9 |
+
sentence-transformers~=2.2.2
|
10 |
+
|
11 |
+
en_core_web_trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl
|
src/__init__.py
ADDED
File without changes
|
src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (155 Bytes). View file
|
|
src/data/.gitkeep
ADDED
File without changes
|
src/data/__init__.py
ADDED
File without changes
|
src/data/emb_generator.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Code to generate embeddings based on movie plots.

import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings


def main():
    """Build and persist a FAISS vector store of movie-plot embeddings.

    Loads the cleaned movie dataset, embeds the 'plot_sin_nombres'
    column with a SentenceTransformer model, and saves the index to the
    local 'plot_embeddings' directory.
    """
    # Importing the database ready to work.
    movies = pd.read_csv('../../data/processed/movies_clean.csv')
    # 'Unnamed: 0' is the leftover index column written by the earlier to_csv export.
    movies.drop('Unnamed: 0', axis=1, inplace=True)

    # Creating the 'document' list: each row becomes a document whose page
    # content is the name-stripped plot and whose metadata is the other columns.
    df_loader = DataFrameLoader(movies, page_content_column='plot_sin_nombres')
    df_document = df_loader.load()

    # Defining the model to use for generating embeddings.
    # NOTE(review): plot_similarity_finder.py queries the saved index with
    # "sentence-t5-large" — the build-time and query-time model names should
    # match; confirm which one the committed index was actually built with.
    embedding_function = SentenceTransformerEmbeddings(model_name="sentence-t5-xl")
    print('Transformer descargado.')

    # Creating the vectorial database.
    db = FAISS.from_documents(df_document, embedding_function)
    print('DB vectorial creada.')

    # Saving the database.
    db.save_local('plot_embeddings')


if __name__ == '__main__':
    # Previously the guard body was the no-op expression `__name__` and the
    # whole pipeline ran at import time; now it only runs when executed.
    main()
|
src/data/metadata_dataset.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Code to generate the working database, taking into consideration the findings
# from the Exploratory Data Analysis (EDA) in a Jupyter notebook.

import pandas as pd
import spacy

# Load NLP model (transformer pipeline; requires the en_core_web_trf package).
nlp = spacy.load("en_core_web_trf")


# Function to remove names of individuals from a text.
def remove_names(text):
    """ Function to remove the names of people from a given text.

    :param text: the text from which names will be removed.
    :return: text without the names.

    Note: tokens are re-joined with single spaces, so punctuation spacing
    differs slightly from the input (see the example below).

    >>> remove_names('My name is John Connor, leader of the rebellion.')
    'My name is , leader of the rebellion .'
    """
    doc = nlp(text)
    # Keep every token that is not tagged as part of a PERSON entity.
    words_wo_names = [token.text for token in doc if token.ent_type_ != "PERSON"]
    return " ".join(words_wo_names)


def main():
    """Clean the raw movie dataset and save it for embedding generation."""
    # Load raw data
    movies = pd.read_csv('../../data/raw/0_inicial/movies.csv')
    print(movies.columns)

    # Drop not-used columns
    movies.drop(['Unnamed: 0', 'Genre', 'Wiki Page', 'title'], inplace=True, axis=1)

    # Removing names from plots and creating a new column in the DB
    movies['plot_sin_nombres'] = movies['Plot'].apply(remove_names)
    movies.drop('Plot', inplace=True, axis=1)

    # Save
    movies.to_csv('../../data/processed/movies_clean.csv')


if __name__ == '__main__':
    # Previously the guard body was the no-op expression `__name__` and the
    # whole pipeline ran at import time; now it only runs when executed.
    main()
|
src/models/.gitkeep
ADDED
File without changes
|
src/models/__init__.py
ADDED
File without changes
|
src/models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (162 Bytes). View file
|
|
src/models/__pycache__/plot_similarity_finder.cpython-310.pyc
ADDED
Binary file (1.94 kB). View file
|
|
src/models/plot_similarity_finder.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Code to find the most similar plot, using the embedding index.

from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings


# Searching for similar plots to the user's plot/input.
# Loading embedding function.
# NOTE(review): src/data/emb_generator.py builds the index with
# "sentence-t5-xl" but this module queries it with "sentence-t5-large";
# a build/query model mismatch silently degrades similarity quality.
# Confirm which model produced the committed index and align both names.
embedding_function = SentenceTransformerEmbeddings(model_name="sentence-t5-large")

# Loading vectorstore. The path is relative to the streamlit working
# directory, which is how streamlit_code.py imports this module.
db = FAISS.load_local('../streamlit/plot_embeddings', embedding_function)


# Find the most similar plots to the user plot/input.
def plot_simil(user_plot, num_recom=5):
    """ Find movies with plots similar to the one given by the user.

    Example: describing the plot of "Back to the Future" returns entries
    such as [1985, 'back to the future'] among the top matches.

    :param user_plot: free-text plot description written by the user.
    :param num_recom: number of recommendations to return (default 5).
    :return: list of [Release Year, Title] pairs, most similar first.
    """
    docs = db.similarity_search(user_plot, num_recom)
    # Each document carries the original dataframe row as metadata,
    # so year and title can be read back directly.
    return [[doc.metadata['Release Year'], doc.metadata['Title']] for doc in docs]
|
streamlit/__pycache__/streamlit_code.cpython-310.pyc
ADDED
Binary file (366 Bytes). View file
|
|
streamlit/plot_embeddings/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d00eda167b1dd22d915b827ac23e5a283087dcc49d50abf4ff98a3b440fbf639
|
3 |
+
size 14592045
|
streamlit/plot_embeddings/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68397f28520df1553fd80f4d7baa519077550ff78c974a563a1b3f5d1f98e589
|
3 |
+
size 16287401
|
streamlit/streamlit_code.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Code for Streamlit webapp

import streamlit as st

import sys
import os

# Make the project root importable so the 'src' package resolves when
# Streamlit runs this file from inside the streamlit/ directory.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
from src.models.plot_similarity_finder import plot_simil


# WebApp title and subtitle
st.title('Movie Recommendation.')
st.subheader('Write your movie plot and find a similar one.')

# Plot input.
user_plot = st.text_area("Write the plot here...")


# Response generation: a "Search" button triggers the similarity lookup.
if st.button('Search'):
    with st.spinner('Reading plot...'):
        # Query the FAISS index for the most similar plots.
        result = plot_simil(user_plot)

    # Show one line per recommendation. Looping over the returned list
    # (instead of hard-coding result[0]..result[4]) avoids an IndexError
    # when fewer than five matches come back.
    for year, title in result:
        st.markdown(f'{title.capitalize()}, {year}')
|