Spaces:

StKirill
/

chatbot

Sleeping

App Files Files Community

StKirill committed on Feb 16, 2024

Commit

25a8deb

verified ·

1 Parent(s): 3ba9296

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -1

app.py CHANGED Viewed

	@@ -1 +1,126 @@
1	- import parsing

+import parsing
+import pandas as pd
+import nltk
+import numpy as np
+import os
+import re  #regular expressions
+from nltk.stem import wordnet  # for lemmatization
+from sklearn.feature_extraction.text import CountVectorizer  # for bag of words (bow)
+from sklearn.feature_extraction.text import TfidfVectorizer  #for tfidf
+from nltk import pos_tag  # for parts of speech
+from sklearn.metrics import pairwise_distances  # cosine similarity
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from sklearn.metrics.pairwise import cosine_similarity
+import gradio as gr
+import time
+nltk.download('omw-1.4')  #this is for the .apply() function to work
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('wordnet')
+nltk.download('stopwords')
+# Take Rachel as main character
+df = pd.read_csv("rachel_friends.csv")  # read the database into a data frame
+# Define function for text normalization
+def text_normalization(text):
+    text = str(text).lower()  # convert to all lower letters
+    spl_char_text = re.sub(r'[^a-z]', ' ', text)  # remove any special characters including numbers
+    tokens = nltk.word_tokenize(spl_char_text)  # tokenize words
+    lema = wordnet.WordNetLemmatizer()  # lemmatizer initiation
+    tags_list = pos_tag(tokens, tagset = None)  # parts of speech
+    lema_words = []
+    for token, pos_token in tags_list:
+        if pos_token.startswith('V'):  # if the tag from tag_list is a verb, assign 'v' to it's pos_val
+            pos_val = 'v'
+        elif pos_token.startswith('J'):  # adjective
+            pos_val = 'a'
+        elif pos_token.startswith('R'):  # adverb
+            pos_val = 'r'
+        else:  # otherwise it must be a noun
+            pos_val = 'n'
+        lema_token = lema.lemmatize(token, pos_val)  # performing lemmatization
+        lema_words.append(lema_token)  # addid the lemmatized words into our list
+    return " ".join(lema_words)  # return our list as a human sentence
+# Lemmatize every question and store the result as a new column at position 2
+question_normalized = df['question'].apply(text_normalization)
+df.insert(2, 'Normalized question', question_normalized, True)  # 4th positional argument is allow_duplicates
+# Stop-word list consulted by removeStopWords (defined right below)
+stop = stopwords.words('english') # NLTK English stop words; overridden by the next line
+stop = [] # the empty list wins, so stop-word filtering is effectively switched off
+def removeStopWords(text):
+  Q = []
+  s = text.split()  # create an array of words from our text sentence, cut it into words
+  q = ''
+  for w in s:  # for every word in the given sentence if the word is a stop word ignore it
+      if w in stop:
+          continue
+      else:  # otherwise add it to the end of our array
+          Q.append(w)
+      q = " ".join(Q)  # create a sentence out of our array of non stop words
+  return q
+# Strip stop words from the normalized questions and store the result as a new column at position 3
+question_norm_and_stop = df['Normalized question'].apply(removeStopWords)
+df.insert(3, 'Normalized and StopWords question', question_norm_and_stop, True)
+tfidf = TfidfVectorizer()  # TF-IDF vectorizer with default settings
+x_tfidf = tfidf.fit_transform(df['Normalized and StopWords question']).toarray()  # learn the vocabulary and build a dense question-by-term weight matrix
+features_tfidf = tfidf.get_feature_names_out()  # vocabulary terms, one per matrix column
+df_tfidf = pd.DataFrame(x_tfidf, columns = features_tfidf)  # one row per question, one column per term, holding TF-IDF weights (not 0/1 flags)
+def chat_tfidf(question):
+    tidy_question = text_normalization(removeStopWords(question))  # clean & lemmatize the question
+    tf = tfidf.transform([tidy_question]).toarray()  # convert the question into a vector
+    cos = 1- pairwise_distances(df_tfidf, tf, metric = 'cosine')  # calculate the cosine value
+    index_value = cos.argmax()  # find the index of the maximum cosine value
+    # answer = Answer("Ross", df['answer'].loc[index_value])
+    answer = df['answer'].loc[index_value]
+    return answer
+def echo(message, history, model):
+  print(model)
+  print(history)
+  if model=="TF-IDF":
+    answer = chat_tfidf(message)
+    return answer
+  elif model=="W2V":
+    answer = chat_word2vec(message)
+    return answer
+  elif model=="BERT":
+    answer = chat_bert(message)
+    return answer
+title = "Chatbot who speaks like Rachel from Friends"  # heading passed to the chat interface
+description = "You have a good opportunity to have a dialog with friend's actor - Rachel Green"  # blurb passed to the chat interface
+# Dropdown whose value is handed to echo() as its `model` argument; of the listed choices only "TF-IDF" has a handler defined in this file
+model = gr.Dropdown(["TF-IDF", "W2V", "BERT", "BI-Encoder", "Cross-Encoder"], label="Retrieval model", info="What model do you want to use?", value="TF-IDF")
+with gr.Blocks() as demo:
+    gr.ChatInterface(
+        fn=echo,  # invoked as echo(message, history, model)
+        title=title,
+        description=description,
+        additional_inputs=[model],  # extra component whose value is passed after message and history
+        retry_btn=None,  # NOTE(review): the *_btn keyword arguments appear to be specific to older Gradio releases; confirm the pinned version
+        undo_btn=None,
+        clear_btn=None,
+    )
+demo.launch(debug=True)  # start the app with Gradio's debug mode enabled