HConley committed on
Commit
ac2467f
1 Parent(s): e0aaf9b

Upload 17 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ streamlit/plot_embeddings/index.faiss filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+
5
+ # C extensions
6
+ *.so
7
+
8
+ # Distribution / packaging
9
+ .Python
10
+ env/
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # PyInstaller
27
+ # Usually these files are written by a python script from a template
28
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+
46
+ # Translations
47
+ *.mo
48
+ *.pot
49
+
50
+ # Django stuff:
51
+ *.log
52
+
53
+ # Sphinx documentation
54
+ docs/_build/
55
+
56
+ # PyBuilder
57
+ target/
58
+
59
+ # DotEnv configuration
60
+ .env
61
+
62
+ # Database
63
+ *.db
64
+ *.rdb
65
+
66
+ # Pycharm
67
+ .idea
68
+
69
+ # VS Code
70
+ .vscode/
71
+
72
+ # Spyder
73
+ .spyproject/
74
+
75
+ # Jupyter NB Checkpoints
76
+ .ipynb_checkpoints/
77
+
78
+ # exclude data from source control by default
79
+ /data/
80
+
81
+ # Mac OS-specific storage files
82
+ .DS_Store
83
+
84
+ # vim
85
+ *.swp
86
+ *.swo
87
+
88
+ # Mypy cache
89
+ .mypy_cache/
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # external requirements
2
+
3
+ spacy~=3.7.1
4
+ pandas~=2.1.1
5
+ faiss-cpu~=1.7.4
6
+
7
+ langchain~=0.0.311
8
+ streamlit~=1.27.2
9
+ sentence-transformers~=2.2.2
10
+
11
+ en_core_web_trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (155 Bytes). View file
 
src/data/.gitkeep ADDED
File without changes
src/data/__init__.py ADDED
File without changes
src/data/emb_generator.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code to generate embeddings based on movie plots.

import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings


def main():
    """Embed the cleaned movie plots and persist them as a local FAISS index."""
    # Load the cleaned dataset produced by metadata_dataset.py.
    movies = pd.read_csv('../../data/processed/movies_clean.csv')
    # Drop the stray index column written by a previous to_csv() without index=False.
    movies.drop('Unnamed: 0', axis=1, inplace=True)

    # Wrap each row as a LangChain document; every column other than the
    # plot text becomes document metadata.
    df_loader = DataFrameLoader(movies, page_content_column='plot_sin_nombres')
    df_document = df_loader.load()

    # Sentence-transformer model used to embed the plots.
    embedding_function = SentenceTransformerEmbeddings(model_name="sentence-t5-xl")
    print('Transformer descargado.')

    # Build the FAISS vector store from the documents.
    db = FAISS.from_documents(df_document, embedding_function)
    print('DB vectorial creada.')

    # Persist the index for later similarity searches.
    db.save_local('plot_embeddings')


if __name__ == '__main__':
    # Proper entry-point guard: the original ran the whole pipeline at import
    # time and the guarded body was only the no-op expression `__name__`.
    main()
src/data/metadata_dataset.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code to generate the working database, taking into consideration the findings
# from the Exploratory Data Analysis (EDA) in a Jupyter notebook.

import pandas as pd
import spacy

# Load the transformer-based English pipeline; kept at module level so
# remove_names() can be imported and used without re-loading the model.
nlp = spacy.load("en_core_web_trf")


def remove_names(text):
    """Remove the names of people from a given text.

    Tokens tagged by NER as part of a PERSON entity are dropped and the
    remaining tokens are re-joined with single spaces (note that original
    spacing/punctuation attachment is not preserved).

    :param text: the text from which names will be removed.
    :return: text without the names.

    >>> remove_names('My name is John Connor, leader of the rebellion.')
    'My name is , leader of the rebellion .'
    """
    doc = nlp(text)
    # Keep every token that NER did not tag as part of a PERSON entity.
    words_wo_names = [token.text for token in doc if token.ent_type_ != "PERSON"]
    return " ".join(words_wo_names)


def main():
    """Clean the raw movie dataset and write the processed CSV."""
    # Load raw data.
    movies = pd.read_csv('../../data/raw/0_inicial/movies.csv')
    print(movies.columns)

    # Drop columns not used downstream.
    movies.drop(['Unnamed: 0', 'Genre', 'Wiki Page', 'title'], inplace=True, axis=1)

    # Strip person names out of each plot into a new column, then drop the
    # original plot. This is the slow step: one full NLP pass per row.
    movies['plot_sin_nombres'] = movies['Plot'].apply(remove_names)
    movies.drop('Plot', inplace=True, axis=1)

    # Save the processed dataset for the embedding generator.
    movies.to_csv('../../data/processed/movies_clean.csv')


if __name__ == '__main__':
    # Proper entry-point guard: the original ran the whole pipeline at import
    # time and the guarded body was only the no-op expression `__name__`.
    main()
src/models/.gitkeep ADDED
File without changes
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (162 Bytes). View file
 
src/models/__pycache__/plot_similarity_finder.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
src/models/plot_similarity_finder.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code to find the most similar plot, using the embedding index.

from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Embedding function used to encode the user's plot for the similarity search.
# NOTE(fix): this must be the same model the index was built with.
# src/data/emb_generator.py builds the index with "sentence-t5-xl", but this
# module originally loaded "sentence-t5-large" — a different model whose
# embedding space does not match the index, silently degrading every search.
embedding_function = SentenceTransformerEmbeddings(model_name="sentence-t5-xl")

# Load the persisted FAISS vector store (built by src/data/emb_generator.py).
db = FAISS.load_local('../streamlit/plot_embeddings', embedding_function)


def plot_simil(user_plot, num_recom=5):
    """Find movies with plots similar to the one given by the user.

    :param user_plot: free-text plot description written by the user.
    :param num_recom: number of recommendations to return (default 5).
    :return: list of [Release Year, Title] pairs, most similar first.
    """
    docs = db.similarity_search(user_plot, num_recom)
    # Each document carries the original dataframe columns as metadata.
    return [[doc.metadata['Release Year'], doc.metadata['Title']] for doc in docs]
streamlit/__pycache__/streamlit_code.cpython-310.pyc ADDED
Binary file (366 Bytes). View file
 
streamlit/plot_embeddings/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d00eda167b1dd22d915b827ac23e5a283087dcc49d50abf4ff98a3b440fbf639
3
+ size 14592045
streamlit/plot_embeddings/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68397f28520df1553fd80f4d7baa519077550ff78c974a563a1b3f5d1f98e589
3
+ size 16287401
streamlit/streamlit_code.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Code for the Streamlit webapp.

import streamlit as st

import sys
import os

# Make the project root importable so `src` resolves when Streamlit runs
# this file from the streamlit/ directory.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
from src.models.plot_similarity_finder import plot_simil

# WebApp title and subtitle.
st.title('Movie Recommendation.')
st.subheader('Write your movie plot and find a similar one.')

# Plot input.
user_plot = st.text_area("Write the plot here...")

# Generate the recommendations when the user clicks the button.
if st.button('Search'):
    with st.spinner('Reading plot...'):
        result = plot_simil(user_plot)

    # Show each recommendation as "Title, Year". Iterating over the list
    # (instead of hard-coding result[0]..result[4]) avoids an IndexError
    # when the search returns fewer than five matches.
    for year, title in result:
        st.markdown(f'{str.capitalize(title)}, {year}')