Spaces:

bibekyess
/

bgpt

Sleeping

App Files Files Community

bibekyess commited on May 8, 2023

Commit

42e3a78

1 Parent(s): 873c8d9

Upload 9 files

Browse files

Files changed (9) hide show

README.md +51 -12
b_bot.py +76 -0
chat.py +105 -0
data.pth +3 -0
model.py +22 -0
nltk_utils.py +48 -0
requirements.txt +8 -0
spell_check.py +35 -0
train.py +148 -0

README.md CHANGED Viewed

@@ -1,12 +1,51 @@
----
-title: Bgpt
-emoji: 📉
-colorFrom: yellow
-colorTo: green
-sdk: streamlit
-sdk_version: 1.19.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Hosted Link:
+https://bibekbot.streamlit.app/
+courses? --response later on
+## Setup:
+Clone the repo
+```console
+git clone git@github.com:bibekyess/Personal-Chatbot.git
+cd Personal-Chatbot
+```
+### Create an environment using pipenv
+Install pipenv if not installed. [Pipenv is required to install streamlit in macOS]
+```console
+pip3 install pipenv
+```
+Creates a new Pipenv environment using python-3.9 and activates it
+```console
+pipenv --python 3.9
+pipenv shell
+```
+Installs streamlit in the recently created environment
+```console
+ pipenv install streamlit==1.11.1
+```
+Installs the dependencies mentioned in requirements.txt file
+```console
+pip install -r requirements.txt
+```
+Runs the b_bot app
+```console
+streamlit run b_bot.py
+```
+# For training:
+```console
+python train.py
+```
+This saves the checkpoint to the 'data.pth' file. Then run the b_bot app:
+```console
+streamlit run b_bot.py
+```
+Reference:
+I referred to https://github.com/patrickloeber/pytorch-chatbot for a simple implementation of a contextual chatbot that is used from the terminal.
+I referred to https://github.com/AI-Yash/st-chat for the beautiful chatbot interface.

b_bot.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import time
+import streamlit as st
+from streamlit_chat import message
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from chat import generate_response
+if "tokenizer" not in st.session_state:
+    st.session_state["tokenizer"] = AutoTokenizer.from_pretrained(
+        "./generative_model/LaMini-Flan-T5-783M"
+    )
+    st.session_state["model"] = AutoModelForSeq2SeqLM.from_pretrained(
+        "./generative_model/LaMini-Flan-T5-783M"
+    )
+st.title("B-Bot : Bibek's Personal Chatbot")
+# Storing the chat
+if "generated" not in st.session_state:
+    st.session_state["generated"] = []
+if "past" not in st.session_state:
+    st.session_state["past"] = []
+# We will get the user's input by calling the get_text function
+def get_text():
+    input_text = st.text_input("Enter your inquiries here: ", "Hi!!")
+    return input_text
+user_input = get_text()
+if user_input:
+    tokenizer = st.session_state["tokenizer"]
+    model = st.session_state["model"]
+    output = generate_response(user_input)
+    prompt_template = "\nPlease make meaningful sentence and try to be descriptive as possible, ending with proper punctuations. If you don't have descriptive answers from the available prompt, write sorry and advise them to contact Bibek directly."  # NoQA
+    short_response_template = "\nIf your response is very short like 1 or 2 sentence, add a followup sentence like 'Let me know if there's anything else I can help you with. or If there's anything else I can assist with, please don't hesitate to ask. I mean something similar in polite way."  # NoQA
+    start = time.time()
+    input_ids = tokenizer(
+        output + user_input + prompt_template + short_response_template,
+        return_tensors="pt",
+    ).input_ids
+    outputs = model.generate(input_ids, max_length=512, do_sample=True)
+    output = tokenizer.decode(outputs[0]).strip('<pad></s>').strip()
+    end = time.time()
+    print("Time for model inference: ", end - start)
+    # Checks for memory overflow
+    if len(st.session_state.past) == 15:
+        st.session_state.past.pop(0)
+        st.session_state.generated.pop(0)
+    # store the output
+    st.session_state.past.append(user_input)
+    st.session_state.generated.append(output)
+if st.session_state["generated"]:
+    # print(st.session_state)
+    for i in range(len(st.session_state["generated"]) - 1, -1, -1):
+        message(
+            st.session_state["generated"][i],
+            avatar_style="bottts",
+            seed=39,
+            key=str(i),  # NoQA
+        )
+        message(
+            st.session_state["past"][i],
+            is_user=True,
+            avatar_style="identicon",
+            seed=4,
+            key=str(i) + "_user",
+        )  # NoQA

chat.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import json
+import random
+import torch
+from model import NeuralNet
+from nltk_utils import bag_of_words, tokenize
+from spell_check import correct_typos
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+with open("intents.json") as json_data:
+    intents = json.load(json_data)
+FILE = "data.pth"
+data = torch.load(FILE)
+input_size = data["input_size"]
+hidden_size = data["hidden_size"]
+output_size = data["output_size"]
+all_words = data["all_words"]
+tags = data["tags"]
+model_state = data["model_state"]
+model = NeuralNet(input_size, hidden_size, output_size).to(device)
+model.load_state_dict(model_state)
+model.eval()
+bot_name = "B-Bot"
+# print(
+#     "Hello, I am B-BOT, personal ChatBOT of Mr. Bibek. Let's chat! (type 'quit' or 'q' to exit)"  # NoQA
+# )
+def generate_response(sentence):
+    # sentence = input("You: ")
+    sentence = correct_typos(sentence)
+    # print(sentence)
+    if sentence.lower() == "quit" or sentence.lower() == "q":
+        # Needs to quit
+        pass
+    sentence = tokenize(sentence)
+    X = bag_of_words(sentence, all_words)
+    X = X.reshape(1, X.shape[0])
+    X = torch.from_numpy(X).to(device)
+    output = model(X)
+    _, predicted = torch.max(output, dim=1)
+    tag = tags[predicted.item()]
+    probs = torch.softmax(output, dim=1)
+    prob = probs[0][predicted.item()]
+    print(prob.item())
+    if prob.item() > 0.95:
+        for intent in intents["intents"]:
+            if tag == intent["tag"]:
+                return f"{bot_name}: {random.choice(intent['responses'])}"
+    else:
+        return (
+            f"{bot_name}: Sorry, I didn't understand... Can you be more "
+            "specific on your question? You can ask about Bibek's skillset, "
+            "experiences, portfolio, education, achievements "
+            "and KAIST activities."
+            "These are some sample questions: "
+            "(I) Tell me about Bibek,\n"
+            "(II) What skills does he have?,\n"
+            "(III) What work experience does Bibek have?,\n"
+            "(IV) What is Bibek's educational background?,\n"
+            "(V) What awards has he won?,\n"
+            "(VI) What projects has he completed? &\n"
+            "(VII) How can I contact Bibek?"
+        )
+# while True:
+#     # sentence = "do you use credit cards?"
+#     sentence = input("You: ")
+#     if sentence.lower() == "quit" or sentence.lower() == "q":
+#         break
+#     sentence = tokenize(sentence)
+#     X = bag_of_words(sentence, all_words)
+#     X = X.reshape(1, X.shape[0])
+#     X = torch.from_numpy(X).to(device)
+#     output = model(X)
+#     _, predicted = torch.max(output, dim=1)
+#     tag = tags[predicted.item()]
+#     probs = torch.softmax(output, dim=1)
+#     prob = probs[0][predicted.item()]
+#     if prob.item() > 0.8:
+#         for intent in intents["intents"]:
+#             if tag == intent["tag"]:
+#                 print(f"{bot_name}: {random.choice(intent['responses'])}")
+#     else:
+#         print(
+#             f"{bot_name}: Sorry, I do not understand... Can you be more "
+#             "specific on your question? You can ask about Bibek's skillset, "
+#             "experiences, portfolio, education, achievements "
+#             "and KAIST activities."
+#         )

data.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edbf28e6717a7af532466dda112419472c7b8fb747dd9a10105a3aeb117ddce4
+size 102591

model.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch.nn as nn
+class NeuralNet(nn.Module):
+    def __init__(self, input_size, hidden_size, num_classes):
+        super().__init__()
+        self.l1 = nn.Linear(input_size, hidden_size)
+        self.l2 = nn.Linear(hidden_size, hidden_size)
+        self.l3 = nn.Linear(hidden_size, num_classes)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(p=0.5)
+    def forward(self, x):
+        out = self.l1(x)
+        out = self.relu(out)
+        out = self.dropout(out)
+        out = self.l2(out)
+        out = self.relu(out)
+        out = self.dropout(out)
+        out = self.l3(out)
+        # no activation and no softmax at the end
+        return out

nltk_utils.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import nltk
+import numpy as np
+from nltk.stem.porter import PorterStemmer
+# package with a pretrained tokenizer, may need to uncomment the following
+# to download for the first time
+# nltk.download('punkt')
+stemmer = PorterStemmer()
+def tokenize(sentence):
+    """
+    split sentence into array of words/tokens
+    a token can be a word or punctuation character, or number
+    """
+    return nltk.word_tokenize(sentence)
+def stem(word):
+    """
+    stemming = find the root form of the word
+    examples:
+    words = ["organize", "organizes", "organizing"]
+    words = [stem(w) for w in words]
+    -> ["organ", "organ", "organ"]
+    """
+    return stemmer.stem(word.lower())
+def bag_of_words(tokenized_sentence, words):
+    """
+    return bag of words array:
+    1 for each known word that exists in the sentence, 0 otherwise
+    example:
+    sentence = ["hello", "how", "are", "you"]
+    words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
+    bog   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
+    """
+    # stem each word
+    sentence_words = [stem(word) for word in tokenized_sentence]
+    # initialize bag with 0 for each word
+    bag = np.zeros(len(words), dtype=np.float32)
+    for idx, w in enumerate(words):
+        if w in sentence_words:
+            bag[idx] = 1
+    return bag

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+nltk
+pyspellchecker
+streamlit-chat
+torch
+torchaudio
+torchvision
+huggingface_hub
+transformers

spell_check.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import spellchecker
+def correct_typos(sentence):
+    # Initialize the spell checker object
+    spell = spellchecker.SpellChecker(language="en")
+    # Adds Bibek to its frequency dictionary to make it a known word
+    spell.word_frequency.load_words(
+        [
+            "Bibek",
+            "Bibek's",
+            "skillsets",
+            "skillset",
+            "CV",
+            "RIRO",
+            "Bisonai",
+            "IC",
+            "BMC",
+            "KAIST",
+        ]
+    )
+    sentence_split = sentence.split()
+    # Find the typos in the input sentence
+    typos = spell.unknown(sentence_split)
+    # Correct the typos
+    corrected_sentence = [
+        spell.correction(word)
+        if spell.correction(word)
+        else word
+        if word in typos
+        else word
+        for word in sentence_split
+    ]
+    # Return the corrected sentence as a string
+    return " ".join(corrected_sentence)

train.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import json
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, Dataset
+from model import NeuralNet
+from nltk_utils import bag_of_words, stem, tokenize
+with open("intents.json") as f:
+    intents = json.load(f)
+all_words = []
+tags = []
+xy = []
+# loop through each sentence in our intents patterns
+for intent in intents["intents"]:
+    tag = intent["tag"]
+    # add to tag list
+    tags.append(tag)
+    for pattern in intent["patterns"]:
+        # tokenize each word in the sentence
+        w = tokenize(pattern)
+        # add to our words list
+        all_words.extend(w)
+        # add to xy pair
+        xy.append((w, tag))
+        AUGMENT = False
+        if "Bibek" in pattern:
+            pattern = pattern.replace("Bibek", "he")
+            AUGMENT = True
+        elif "bibek" in pattern:
+            pattern = pattern.replace("bibek", "he")
+            AUGMENT = True
+        elif "BIBEK" in pattern:
+            pattern = pattern.replace("BIBEK", "he")
+            AUGMENT = True
+        if AUGMENT:
+            w = tokenize(pattern)
+            all_words.extend(w)
+            xy.append((w, tag))
+# stem and lower each word
+ignore_words = ["?", ".", "!"]
+all_words = [stem(w) for w in all_words if w not in ignore_words]
+# remove duplicates and sort
+all_words = sorted(set(all_words))
+tags = sorted(set(tags))
+print(len(xy), "patterns")
+print(len(tags), "tags:", tags)
+print(len(all_words), "unique stemmed words:", all_words)
+# create training data
+X_train = []
+y_train = []
+for (pattern_sentence, tag) in xy:
+    # X: bag of words for each pattern_sentence
+    bag = bag_of_words(pattern_sentence, all_words)
+    X_train.append(bag)
+    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
+    label = tags.index(tag)
+    y_train.append(label)
+X_train = np.array(X_train)
+y_train = np.array(y_train)
+# Hyper-parameters
+num_epochs = 1000
+batch_size = 32
+learning_rate = 0.001
+input_size = len(X_train[0])
+hidden_size = 64
+num_heads = 8
+num_layer = 6
+output_size = len(tags)
+print(input_size, output_size)
+class ChatDataset(Dataset):
+    """
+    Creates PyTorch dataset to automatically iterate and do batch training
+    """
+    def __init__(self):
+        self.n_samples = len(X_train)
+        self.x_data = X_train
+        self.y_data = y_train
+    # support indexing such that dataset[i] can be used to get i-th sample
+    def __getitem__(self, index):
+        return self.x_data[index], self.y_data[index]
+    # we can call len(dataset) to return the size
+    def __len__(self):
+        return self.n_samples
+dataset = ChatDataset()
+train_loader = DataLoader(
+    dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0
+)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = NeuralNet(input_size, hidden_size, output_size).to(device)
+# Loss and optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+# Train the model
+for epoch in range(num_epochs):
+    for (words, labels) in train_loader:
+        words = words.to(device)
+        labels = labels.to(dtype=torch.long).to(device)
+        # Forward pass
+        outputs = model(words)
+        # if y would be one-hot, we must apply
+        # labels = torch.max(labels, 1)[1]
+        loss = criterion(outputs, labels)
+        # Backward and optimize
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    if (epoch + 1) % 100 == 0:
+        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
+print(f"final loss: {loss.item():.4f}")
+data = {
+    "model_state": model.state_dict(),
+    "input_size": input_size,
+    "hidden_size": hidden_size,
+    "output_size": output_size,
+    "all_words": all_words,
+    "tags": tags,
+}
+FILE = "data.pth"
+torch.save(data, FILE)
+print(f"training complete. file saved to {FILE}")