# Last commit not found
import os | |
import json | |
from sentence_transformers import SentenceTransformer | |
# Shared sentence-embedding model, instantiated once when this module is
# loaded; the main script below calls model.encode() on each anime's
# description text and on each review text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Labels that must never be offered as filter options. Built once at import
# time instead of on every loop iteration.
_EXCLUDED_FILTER_VALUES = frozenset({
    'Boys Love', 'Erotica', 'Girls Love', 'Hentai', 'Ecchi', 'Gore',
    'Crossdressing', 'Magical Sex Shift', 'Rx - Hentai', 'R+ - Mild Nudity',
})


def update_filters(filters, data):
    """Collect the filterable values of one record into ``filters``.

    Args:
        filters: Mapping of field name to the set of values seen so far.
            The sets are updated in place.
        data: One record (a dict). ``data['rating']`` is treated as a single
            value; every other field named in ``filters`` is treated as an
            iterable of values.

    Returns:
        The same ``filters`` mapping, for call-site convenience.
    """
    for key, seen in filters.items():
        raw = data.get(key)
        if not raw:
            # Missing, None or empty field: nothing to collect. Without this
            # guard a None value would be iterated and raise TypeError.
            continue
        # 'rating' holds one scalar; wrap it so both shapes share one path.
        candidates = [raw] if key == 'rating' else raw
        seen.update(
            value for value in candidates
            if value and value not in _EXCLUDED_FILTER_VALUES
        )
    return filters
def clean_filters(filters):
    """Turn each collected filter set into a JSON-serialisable list.

    Every list starts with the catch-all option ``'ALL'`` followed by the
    remaining values in sorted order, so the result is identical from run to
    run (plain ``list(set)`` ordering varies with string hash randomisation).

    Args:
        filters: Mapping of field name to a collection of values. Sets and
            lists are both accepted, so calling this twice is harmless.

    Returns:
        The same ``filters`` mapping with every value replaced by a list.
    """
    for key in filters:
        options = set(filters[key])
        options.discard('ALL')  # re-added below, exactly once, at the front
        # key=str keeps the sort total even if a value is not a string.
        filters[key] = ['ALL', *sorted(options, key=str)]
    return filters
if __name__ == '__main__':
    # Reads every JSON record under ./anime, encodes its description text and
    # each of its review texts with `model`, and writes all records plus the
    # collected filter options to ./embeddings/data.json.
    print('Embedding Started')
    filters = {
        'genres': set(),
        'themes': set(),
        'rating': set()
    }
    embeddings = {}
    # Sorted so the output file is laid out identically from run to run.
    for filename in sorted(os.listdir('./anime')):
        name, extension = os.path.splitext(filename)
        if extension != '.json':
            # Skip stray directory entries instead of crashing in json.load.
            continue
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(f"./anime/{filename}", 'r', encoding='utf-8') as file:
            data = json.load(file)
        if not data:
            continue
        filters = update_filters(filters, data)
        data['image'] = f"./images/{name}.jpg"
        # Continuation lines stay at column 0 because any leading whitespace
        # inside the literal becomes part of the text that is encoded.
        # NOTE(review): confirm this matches the intended prompt layout.
        text = f'''
This anime has {data['episodes']} Episodes
This anime premiered on {data['premiered']}
This anime was broadcasted on: {data['broadcast']}
This anime was produced by {' '.join(data['producers'])}
This anime was licensed by Licensors: {' '.join(data['licensors'])}
The studios in charge of this anime was {' '.join(data['studios'])}
The source of this anime was {' '.join(data['source'])}
The genres of this anime are {' '.join(data['genres'])}
The themes of this anime are {' '.join(data['themes'])}
The demographic of this anime is {data['demographic']}
The duration of this anime is {data['duration']}
The rating of this anime is {data['rating']}
The description of this anime is {data['description']}'''
        record = data.copy()
        # Stored as a one-element list of vectors, mirroring the shape of
        # 'subjective_embeddings' (a list with one vector per review).
        record['objective_embedding'] = [model.encode(text).tolist()]
        record['subjective_embeddings'] = [
            model.encode(review['text']).tolist()
            for review in record['reviews']
        ]
        embeddings[name] = record
    filters = clean_filters(filters)
    # Create the output directory on first run rather than failing in open().
    os.makedirs('./embeddings', exist_ok=True)
    with open('./embeddings/data.json', 'w', encoding='utf-8') as f:
        json.dump({'embeddings': embeddings, 'filters': filters}, f)
    print('Embedding Complete')