Spaces:
Runtime error
Runtime error
VenkateshRoshan
committed on
Commit
·
94c58a1
1
Parent(s):
519f007
Initial Code Added
Browse files- app.py +22 -0
- dockerfile +0 -0
- notes.txt +7 -0
- project-structure.txt +24 -0
- requirements.txt +9 -0
- src/data.py +10 -0
- src/preprocess.py +33 -0
- src/train.py +0 -0
- tests/test_api.py +0 -0
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI
|
2 |
+
from pydantic import BaseModel
|
3 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
4 |
+
|
5 |
+
# ASGI application object that the HTTP routes are registered on.
app = FastAPI()

# Model weights come from a local directory (presumably the fine-tuned
# checkpoint -- confirm against the training script); the tokenizer is loaded
# by its "EleutherAI/gpt-neo-125M" identifier.
model = AutoModelForCausalLM.from_pretrained("models/customer_support_gpt")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
|
9 |
+
|
10 |
+
class Query(BaseModel):
    """Request body for the prediction endpoint: a single text query."""

    # Text that is tokenized and passed to the model.
    query: str
|
12 |
+
|
13 |
+
# The prompt budget mirrors the 256-token limit applied to instructions in
# src/preprocess.py. The generation budget is counted separately so a long
# prompt can never consume the room reserved for the answer.
_MAX_PROMPT_TOKENS = 256
_MAX_NEW_TOKENS = 150


@app.post("/predict/")
def predict(query: Query):
    """Generate a reply for the submitted query.

    Args:
        query: Request body whose ``query`` field holds the user's text.

    Returns:
        A dict ``{"response": <decoded text>}``. The decoded text is the full
        generated sequence (prompt followed by the continuation), with
        special tokens removed. A blank query yields an empty response.
    """
    # A blank query would tokenize to zero tokens and leave generate() with
    # nothing to condition on, so answer it without calling the model.
    if not query.query.strip():
        return {"response": ""}

    inputs = tokenizer(
        query.query,
        return_tensors="pt",
        truncation=True,
        max_length=_MAX_PROMPT_TOKENS,
    )
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # max_new_tokens bounds only the continuation; the previous
        # max_length=150 also counted the prompt, so prompts of 150 tokens or
        # more left no room for generation.
        max_new_tokens=_MAX_NEW_TOKENS,
        # The tokenizer is loaded without a pad token here, so reuse EOS
        # explicitly instead of relying on generate()'s fallback.
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}
|
19 |
+
|
20 |
+
# Start a development server only when this file is executed directly.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
|
dockerfile
ADDED
File without changes
|
notes.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Download the dataset with the Hugging Face `datasets` library using:
|
2 |
+
|
3 |
+
ds = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
|
4 |
+
|
5 |
+
Select the `instruction` and `response` columns as the training features and save the DataFrame to a CSV file.
|
6 |
+
|
7 |
+
Then preprocess the CSV file by tokenizing the text.
|
project-structure.txt
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
customer-support-chatbot/
├── data/                     # Data folder for storing datasets
│   ├── raw/                  # Original, unprocessed datasets
│   └── processed/            # Preprocessed datasets
├── models/                   # Directory for saving fine-tuned models
├── src/                      # Source code directory
│   ├── preprocess.py         # Data preprocessing scripts
│   ├── train.py              # Fine-tuning script for the model
│   ├── api.py                # FastAPI app for real-time chatbot API
│   ├── chatbot.py            # Core chatbot logic (loading model, handling queries)
│   ├── deploy_sagemaker.py   # Script for AWS SageMaker deployment
│   └── config.py             # Configuration file for hyperparameters
├── tests/                    # Unit and integration tests
│   └── test_api.py           # Test script for API responses
├── docker/                   # Docker setup for containerized deployment
│   ├── Dockerfile            # Dockerfile for building the chatbot image
│   └── docker-compose.yml    # Optional docker-compose setup for local testing
├── mlflow/                   # MLflow setup for model tracking
│   └── mlflow_config.yml     # MLflow configuration file
├── .github/                  # GitHub Actions workflows for CI/CD
│   └── workflows/
│       └── deployment.yml    # CI/CD pipeline for testing, building, and deploying
├── requirements.txt          # Python dependencies
└── README.md                 # Documentation
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
torch
|
3 |
+
fastapi
|
4 |
+
uvicorn
|
5 |
+
mlflow
|
6 |
+
boto3
|
7 |
+
pytest
|
8 |
+
pydantic
|
9 |
+
datasets
|
src/data.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Export the customer-support dataset's train split to a raw CSV file."""
from pathlib import Path

from datasets import load_dataset
import pandas as pd

# Destination of the exported CSV, relative to the working directory.
RAW_CSV_PATH = Path("data/raw/customer_support.csv")

ds = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")

# Build a DataFrame from the train split and keep only the instruction and
# response features.
df = pd.DataFrame(ds['train'])
df = df[['instruction', 'response']]

# to_csv does not create missing parent directories, so make sure data/raw/
# exists before writing.
RAW_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(RAW_CSV_PATH, index=False)
|
src/preprocess.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
def load_data(file_path) -> "pd.DataFrame":
    """Load the customer support dataset from a CSV file.

    Args:
        file_path: Path (or any other source accepted by ``pandas.read_csv``)
            of the CSV file to read.

    Returns:
        The parsed file as a DataFrame, with pandas' default parsing options.
    """
    return pd.read_csv(file_path)
|
10 |
+
|
11 |
+
def preprocess_data(data):
    """Tokenize the ``instruction`` and ``response`` columns of ``data``.

    Each text is tokenized on its own with the "EleutherAI/gpt-neo-125M"
    tokenizer, truncated and padded to 256 tokens. The end-of-sequence token
    doubles as the padding token.

    Args:
        data: DataFrame with ``instruction`` and ``response`` columns. It is
            not modified.

    Returns:
        A new DataFrame sharing ``data``'s index, with two columns,
        ``instruction_tokens`` and ``response_tokens``, holding the tokenizer
        output for each row.
    """
    columns = ["instruction_tokens", "response_tokens"]

    # Nothing to tokenize: skip loading the tokenizer. This also avoids the
    # unpacking error the zip(*...) idiom hits on an empty frame.
    if data.empty:
        return pd.DataFrame(columns=columns, index=data.index)

    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_text(text):
        """Tokenize one text to a fixed length of 256 tokens."""
        return tokenizer(
            text, truncation=True, padding="max_length", max_length=256
        )

    # Build the result in a fresh frame so the caller's DataFrame does not
    # gain extra columns as a side effect.
    tokenized = pd.DataFrame(index=data.index)
    tokenized["instruction_tokens"] = [
        tokenize_text(text) for text in data["instruction"]
    ]
    tokenized["response_tokens"] = [
        tokenize_text(text) for text in data["response"]
    ]
    return tokenized[columns]
|
29 |
+
|
30 |
+
# Script entry point: read the raw CSV, tokenize it, and write the result.
if __name__ == "__main__":
    import os

    output_path = 'data/processed/customer_support_preprocessed.csv'

    data = load_data('data/raw/customer_support.csv')
    processed_data = preprocess_data(data)

    # to_csv does not create missing parent directories, so make sure
    # data/processed/ exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    processed_data.to_csv(output_path, index=False)
|
src/train.py
ADDED
File without changes
|
tests/test_api.py
ADDED
File without changes
|