from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import pickle
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


class TfidfWikiGuesser:
    """Guesses Wikipedia page titles for questions using TF-IDF cosine similarity."""

    def __init__(self, wikidump='resources/wiki_text_16.json') -> None:
        self.tfidf = None
        self.corpus = None
        self.titles = None
        self.vectorizer = None
        self.lemmatizer = WordNetLemmatizer()

        model_file = "processed_tfidf_wiki_page_text_model.pkl"
        full_model_path = os.path.join("./models", model_file)

        # Reuse a previously fitted model if one has been pickled; otherwise
        # build the model from the raw Wikipedia dump and cache it to disk.
        if os.path.exists(full_model_path):
            print("Loading model from pickle...")
            self.load_from_pkl(full_model_path)
        elif wikidump:
            print("No pre-trained model found, loading data from dump...")
            self.load_model(wikidump)
            self.save_model(full_model_path)

    def load_model(self, wikidump):
        """Builds the TF-IDF model from a JSON dump of Wikipedia pages."""
        with open(wikidump) as f:
            doc = json.load(f)

        self.corpus, self.titles = self.create_corpus(doc)

        # Fit the vectorizer on the full corpus; scikit-learn's built-in
        # English stop-word list handles stop-word removal at this stage.
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf = self.vectorizer.fit_transform(self.corpus)

    def preprocess_text(self, text):
        """Lowercases, tokenizes, removes stop words, and lemmatizes text."""
        # Some dump entries can surface as floats (e.g., NaN); stringify them.
        if isinstance(text, float):
            return str(text)
        tokens = word_tokenize(text.lower())
        # Build the stop-word set once instead of re-reading it per token.
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [token for token in tokens if token not in stop_words]
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in filtered_tokens]
        return ' '.join(lemmatized_tokens)

    def create_corpus(self, json_file):
        """Extracts page text and page titles from the parsed JSON dump."""
        corpus = []
        page_titles = []

        for json_obj in json_file:
            corpus.append(json_obj['text'])
            page_titles.append(json_obj['page'])

        return corpus, page_titles

    def make_guess(self, question, num_guesses=1):
        """Returns the num_guesses page titles most similar to the question."""
        tfidf_question = self.vectorizer.transform([question])

        # Cosine similarity between every page vector and the question vector.
        sim = cosine_similarity(self.tfidf, tfidf_question)

        # Rank pages from most to least similar and keep the top matches.
        sim_indices = np.argsort(sim.flatten())[::-1]
        best_indices = sim_indices[:num_guesses]

        return [self.titles[i] for i in best_indices]

    def save_model(self, file_name):
        """Pickles the fitted vectorizer, TF-IDF matrix, and page titles."""
        # Make sure the target directory exists before writing the pickle.
        os.makedirs(os.path.dirname(file_name) or '.', exist_ok=True)
        with open(file_name, 'wb') as f:
            pickle.dump({
                'vectorizer': self.vectorizer,
                'tfidf_matrix': self.tfidf,
                'titles': self.titles,
            }, f)

    def load_from_pkl(self, file_name):
        """Restores a previously pickled model from disk."""
        with open(file_name, 'rb') as f:
            data = pickle.load(f)
        self.vectorizer = data['vectorizer']
        self.tfidf = data['tfidf_matrix']
        self.titles = data['titles']
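

# Example usage (a minimal sketch, not part of the original module): assumes
# the resources/wiki_text_16.json dump is present (or a cached pickle exists
# under ./models) and that the NLTK 'punkt', 'stopwords', and 'wordnet' data
# have been downloaded, e.g. via nltk.download(). The question text below is
# purely illustrative.
if __name__ == "__main__":
    guesser = TfidfWikiGuesser()
    guesses = guesser.make_guess(
        "This physicist developed the theory of general relativity.",
        num_guesses=3,
    )
    print(guesses)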