Aldo07 commited on
Commit
e54728d
·
verified ·
1 Parent(s): 16e196d

Add project

Browse files
Makefile ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ##@ Clean-up
2
+ clean: ## remove output files from pytest & coverage
3
+ @find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
4
+ @find . | grep -E ".ipynb_checkpoints" | xargs rm -rf
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from model.model import model
4
+
5
+ # Making question examples
6
+ example1 = "Apa dashboard terkait KPI operational?"
7
+ example2 = "Apa link Dashboard Operational KPIs 2023?"
8
+ example3 = "Mau lihat data performance cabang, ada di dashboard apa?"
9
+ example4 = "Apa saja data di Dashboard Transactions?"
10
+
11
+ # Making UI
12
+ with gr.Blocks() as demo:
13
+ gr.Markdown(
14
+ """
15
+ # Chatbot Dashboard
16
+ This project is a chatbot based on LLM using the RAG method.
17
+ This chatbot will answer questions related to the company's dashboard.
18
+ You can ask questions according to the dashboard data below.
19
+ """)
20
+ gr.Interface(fn=model,
21
+ inputs="text",
22
+ outputs="text",
23
+ theme=gr.themes.Monochrome(),
24
+ examples = [example1, example2, example3, example4])
25
+ gr.Markdown(
26
+ """
27
+ ## Data dashboard
28
+ """)
29
+ gr.HTML("<img src='https://huggingface.co/spaces/Aldo07/chatbot_dashboard/resolve/main/data%20dashboard.png' width='500'>")
30
+
31
+ demo.launch()
32
+
data dashboard.png ADDED
data/data.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ "Dashboard Financial KPIs 2023 merupakan dashboard berisi KPI Financial tahun 2023. Dashboard Financial KPIs 2023 berisi data terkait Expenses dan revenue di tahun 2023.
2
+ Link Dashboard Financial KPIs 2023 adalah https://dashboard.com/financial_kpi_2023/"
3
+ Dashboard Operational KPIs 2023 merupakan dashboard KPI operational tahun 2023. Dashboard Operational KPIs 2023 berisi data terkait total deposit dan pinjaman di tahun 2023. Link Dashboard Operational KPIs 2023 adalah https://dashboard.com/operational_kpi_2023/
4
+ "Dashboard Customer Satisfaction merupakan dashboard yang berisi terkait kepuasan pelanggan terhadap pelayanan. Skor kepuasan pelanggan dihitung berdasarkan tren, demografi,, tingkat respons, survei, dll. Link Dashboard Customer Satisfaction adalah https://dashboard.com/customer_satisfactione/"
5
+ "Dashboard Teller Performance merupakan dashboard yang berisi informasi terkait performa teller. Dashboard Teller Performance berisi data terkait jumlah total customer yang dilayani, jumlah tiket yang diterbitkan per jam, dan rata-rata waktu yang dihabiskan customer di cabang, Link Dashboard Teller Performance adalah https://dashboard.com/teller_performance/"
6
+ Dashboard Branch Performance merupakan dashboard yang berisi informasi terkait performa cabang. Link Dashboard Branch Performance adalah https://dashboard.com/branch_performance/
7
+ Dashboard Transactions merupakan dashboard yang berisi informasi terkait seluruh transaksi yang terjadi. Dashboard Transactions berisi data terkait seluruh transaksi secara tahunan maupun historis. Link Dashboard Transactions adalah https://dashboard.com/transactions_report/
8
+ Dashboard Marketing merupakan dashboard yang berisi informasi terkait marketing tahun ini dan secara historikal.Link Dashboard Marketing adalah https://dashboard.com/marketing_report/
9
+ Dashboard Hardware Monitoring merupakan dashboard yang berisi informasi terkait performa hardware atau perangkat keras.Link Dashboard Hardware Monitoring adalah https://dashboard.com/hardware_monitoring/
model/__init__.py ADDED
File without changes
model/config/__init__.py ADDED
File without changes
model/config/config.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ modelPath = 'firqaaa/indo-sentence-bert-base'
2
+ model_kwargs = {'device':'cpu'}
3
+ encode_kwargs = {'normalize_embeddings': False}
model/model.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.embeddings import HuggingFaceEmbeddings
2
+ from langchain.vectorstores import FAISS
3
+ from langchain import HuggingFacePipeline
4
+ from langchain.chains import RetrievalQA
5
+ from sentence_transformers import SentenceTransformer
6
+ import pandas as pd
7
+
8
+
9
+ from model.config.config import modelPath, model_kwargs, encode_kwargs
10
+ from model.processing.data_manager import text_chunk
11
+ from model.processing.pipeline_qa import pipeline
12
+
13
+
14
+ def embedding():
15
+ embeddings = HuggingFaceEmbeddings(
16
+ model_name=modelPath,
17
+ model_kwargs=model_kwargs,
18
+ encode_kwargs=encode_kwargs
19
+ )
20
+
21
+ return embeddings
22
+
23
+
24
+ def model(question):
25
+ text_chunks = text_chunk()
26
+ embeddings =embedding()
27
+
28
+ db = FAISS.from_documents(text_chunks, embeddings)
29
+
30
+ pipe = pipeline()
31
+
32
+ retriever = db.as_retriever(search_kwargs={"k": 1})
33
+ docs = retriever.get_relevant_documents(question)
34
+
35
+ answer = pipe({
36
+ 'context': docs[0],
37
+ 'question': question})
38
+
39
+ return answer
model/processing/__init__.py ADDED
File without changes
model/processing/data_manager.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from langchain.document_loaders import DataFrameLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+
8
+
9
+ def load_data():
10
+
11
+ data_folder = Path("data")
12
+ data_path = os.path.join(data_folder, 'data.csv')
13
+ data = pd.read_csv(data_path, header=None, names=['description'])
14
+ data = pd.DataFrame(data)
15
+
16
+ return data
17
+
18
+ def text_chunk():
19
+ data = load_data()
20
+ text_chunks = DataFrameLoader(
21
+ data, page_content_column="description").load_and_split(
22
+ text_splitter=RecursiveCharacterTextSplitter(
23
+ chunk_size=1000, chunk_overlap=0, length_function=len
24
+ ))
25
+
26
+ return text_chunks
model/processing/pipeline_qa.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ from typing import Dict, Union
3
+
4
+ from nltk import sent_tokenize
5
+ import nltk
6
+ nltk.download('punkt')
7
+ import torch
8
+ from transformers import(
9
+ AutoModelForSeq2SeqLM,
10
+ AutoTokenizer
11
+ )
12
+
13
+ class QAPipeline:
14
+
15
+ def __init__(
16
+ self
17
+ ):
18
+
19
+ self.model = AutoModelForSeq2SeqLM.from_pretrained("muchad/idt5-qa-qg")
20
+ self.tokenizer = AutoTokenizer.from_pretrained("muchad/idt5-qa-qg")
21
+ self.qg_format = "highlight"
22
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
23
+ self.model.to(self.device)
24
+ assert self.model.__class__.__name__ in ["T5ForConditionalGeneration"]
25
+ self.model_type = "t5"
26
+
27
+ def __call__(self, inputs: str):
28
+ inputs = " ".join(inputs.split())
29
+ answers = self._extract_answers(inputs)
30
+ flat_answers = list(itertools.chain(*answers))
31
+
32
+ if len(flat_answers) == 0:
33
+ return []
34
+
35
+ def _tokenize(self,
36
+ inputs,
37
+ padding=True,
38
+ truncation=True,
39
+ add_special_tokens=True,
40
+ max_length=512
41
+ ):
42
+ inputs = self.tokenizer.batch_encode_plus(
43
+ inputs,
44
+ max_length=max_length,
45
+ add_special_tokens=add_special_tokens,
46
+ truncation=truncation,
47
+ padding="max_length" if padding else False,
48
+ pad_to_max_length=padding,
49
+ return_tensors="pt"
50
+ )
51
+ return inputs
52
+
53
+ class TaskPipeline(QAPipeline):
54
+ def __init__(self, **kwargs):
55
+ super().__init__(**kwargs)
56
+
57
+ def __call__(self, inputs: Union[Dict, str]):
58
+ return self._extract_answer(inputs["question"], inputs["context"])
59
+
60
+ def _prepare_inputs(self, question, context):
61
+ source_text = f"question: {question} context: {context}"
62
+ source_text = source_text + " </s>"
63
+ return source_text
64
+
65
+ def _extract_answer(self, question, context):
66
+ source_text = self._prepare_inputs(question, context)
67
+ inputs = self._tokenize([source_text], padding=False)
68
+
69
+ outs = self.model.generate(
70
+ input_ids=inputs['input_ids'].to(self.device),
71
+ attention_mask=inputs['attention_mask'].to(self.device),
72
+ max_length=80,
73
+ )
74
+ answer = self.tokenizer.decode(outs[0], skip_special_tokens=True)
75
+ return answer
76
+
77
+ def pipeline():
78
+ task = TaskPipeline
79
+ return task()
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "chatbot data"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Your Name <[email protected]>"]
6
+ readme = "README.md"
7
+ packages = [{include = "model"}]
8
+
9
+
10
+ [tool.poetry.dependencies]
11
+ python = "^3.9"
12
+ transformers = "^4.38.1"
13
+ langchain = "^0.1.9"
14
+ sentence-transformers = "^2.4.0"
15
+ faiss-cpu = "^1.7.4"
16
+ gradio = "^4.19.2"
17
+ pandas = "^2.2.1"
18
+ torch = "^2.2.1"
19
+ nltk = "^3.8.1"
20
+
21
+
22
+ [build-system]
23
+ requires = ["poetry-core"]
24
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ langchain
3
+ sentence-transformers
4
+ faiss-cpu
5
+ gradio
6
+ pandas
7
+ torch
8
+ nltk