StKirill committed on
Commit 25a8deb · verified · 1 Parent(s): 3ba9296

Update app.py

Files changed (1)
  1. app.py +126 -1
app.py CHANGED
@@ -1 +1,126 @@
- import parsing
+ import parsing
+
+ import pandas as pd
+ import nltk
+ import numpy as np
+ import os
+ import re  # regular expressions
+ from nltk.stem import wordnet  # for lemmatization
+ from sklearn.feature_extraction.text import CountVectorizer  # for bag of words (bow)
+ from sklearn.feature_extraction.text import TfidfVectorizer  # for tf-idf
+ from nltk import pos_tag  # for parts of speech
+ from sklearn.metrics import pairwise_distances  # cosine similarity
+ from nltk import word_tokenize
+ from nltk.corpus import stopwords
+ from sklearn.metrics.pairwise import cosine_similarity
+ import gradio as gr
+ import time
+ nltk.download('omw-1.4')  # needed for the lemmatization applied below
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('wordnet')
+ nltk.download('stopwords')
+
+
+ # Take Rachel as the main character
+ df = pd.read_csv("rachel_friends.csv")  # read the database into a data frame
+
+ # Define function for text normalization
+ def text_normalization(text):
+     text = str(text).lower()  # convert everything to lower case
+     spl_char_text = re.sub(r'[^a-z]', ' ', text)  # remove special characters, including numbers
+     tokens = nltk.word_tokenize(spl_char_text)  # tokenize into words
+     lema = wordnet.WordNetLemmatizer()  # initialize the lemmatizer
+     tags_list = pos_tag(tokens, tagset=None)  # parts of speech
+     lema_words = []
+     for token, pos_token in tags_list:
+         if pos_token.startswith('V'):  # if the tag from tags_list is a verb, assign 'v' to its pos_val
+             pos_val = 'v'
+         elif pos_token.startswith('J'):  # adjective
+             pos_val = 'a'
+         elif pos_token.startswith('R'):  # adverb
+             pos_val = 'r'
+         else:  # otherwise treat it as a noun
+             pos_val = 'n'
+         lema_token = lema.lemmatize(token, pos_val)  # perform lemmatization
+         lema_words.append(lema_token)  # add the lemmatized word to our list
+     return " ".join(lema_words)  # return our list as a human-readable sentence
+
+ # Preprocess data and insert into the dataframe
+ question_normalized = df['question'].apply(text_normalization)
+ df.insert(2, 'Normalized question', question_normalized, True)
+
+ # Define function to delete stopwords from the sentences
+ stop = stopwords.words('english')  # include stop words
+ stop = []  # exclude stop words
+ def removeStopWords(text):
+     Q = []
+     s = text.split()  # cut the sentence into an array of words
+     q = ''
+     for w in s:  # for every word in the given sentence, skip it if it is a stop word
+         if w in stop:
+             continue
+         else:  # otherwise add it to the end of our array
+             Q.append(w)
+     q = " ".join(Q)  # build a sentence out of our array of non-stop words
+     return q
+
+ # Preprocess data and insert into the dataframe
+ question_norm_and_stop = df['Normalized question'].apply(removeStopWords)
+ df.insert(3, 'Normalized and StopWords question', question_norm_and_stop, True)
+
+ tfidf = TfidfVectorizer()  # initialize tf-idf
+ x_tfidf = tfidf.fit_transform(df['Normalized and StopWords question']).toarray()  # oversimplifying, this converts sentences to vectors
+ features_tfidf = tfidf.get_feature_names_out()  # get all the normalized words
+ df_tfidf = pd.DataFrame(x_tfidf, columns=features_tfidf)  # create a dataframe with the tf-idf value for each word
+
+ def chat_tfidf(question):
+     tidy_question = text_normalization(removeStopWords(question))  # clean & lemmatize the question
+     tf = tfidf.transform([tidy_question]).toarray()  # convert the question into a vector
+     cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine')  # calculate the cosine similarity
+     index_value = cos.argmax()  # find the index of the maximum cosine value
+     # answer = Answer("Ross", df['answer'].loc[index_value])
+     answer = df['answer'].loc[index_value]
+     return answer
+
+
+
+
+
+ def echo(message, history, model):
+     print(model)
+     print(history)
+     if model == "TF-IDF":
+         answer = chat_tfidf(message)
+         return answer
+
+     elif model == "W2V":
+         answer = chat_word2vec(message)
+         return answer
+
+     elif model == "BERT":
+         answer = chat_bert(message)
+         return answer
+
+
+
+
+ title = "Chatbot who speaks like Rachel from Friends"
+ description = "You have a good opportunity to have a dialog with the Friends character Rachel Green"
+
+ # model = gr.CheckboxGroup(["TF-IDF", "W2V", "BERT", "BI-Encoder", "Cross-Encoder"], label="Model", info="What model do you want to use?", value="TF-IDF")
+ model = gr.Dropdown(["TF-IDF", "W2V", "BERT", "BI-Encoder", "Cross-Encoder"], label="Retrieval model", info="What model do you want to use?", value="TF-IDF")
+
+ with gr.Blocks() as demo:
+
+     gr.ChatInterface(
+         fn=echo,
+         title=title,
+         description=description,
+         additional_inputs=[model],
+         retry_btn=None,
+         undo_btn=None,
+         clear_btn=None,
+     )
+
+ demo.launch(debug=True)
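
For context, the chat_tfidf path added in this commit boils down to TF-IDF vectorization of the stored questions plus a nearest-neighbour lookup by cosine similarity. Below is a minimal, self-contained sketch of that idea using toy data invented for illustration (not the real rachel_friends.csv) and skipping the lemmatization and stop-word steps:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy question/answer pairs standing in for the Friends corpus (hypothetical data)
corpus = pd.DataFrame({
    "question": ["how are you", "where do you work", "do you like coffee"],
    "answer":   ["I'm great!", "At Central Perk.", "I love it."],
})

vectorizer = TfidfVectorizer()
question_vectors = vectorizer.fit_transform(corpus["question"])  # one TF-IDF vector per stored question

def retrieve(user_question: str) -> str:
    query_vector = vectorizer.transform([user_question])        # vectorize the incoming question
    scores = cosine_similarity(question_vectors, query_vector)  # similarity against every stored question
    best_index = scores.argmax()                                 # the closest stored question wins
    return corpus["answer"].iloc[best_index]

print(retrieve("do you enjoy coffee"))  # -> "I love it."

The app does the same thing, except the query is first normalized with text_normalization and removeStopWords, and the similarity is computed as 1 - pairwise_distances(df_tfidf, tf, metric='cosine') over the precomputed df_tfidf matrix.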