VenkateshRoshan committed on
Commit
94c58a1
·
1 Parent(s): 519f007

Initial Code Added

Browse files
Files changed (9) hide show
  1. app.py +22 -0
  2. dockerfile +0 -0
  3. notes.txt +7 -0
  4. project-structure.txt +24 -0
  5. requirements.txt +9 -0
  6. src/data.py +10 -0
  7. src/preprocess.py +33 -0
  8. src/train.py +0 -0
  9. tests/test_api.py +0 -0
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI service that serves the fine-tuned customer-support language model.

Run directly (``python app.py``) to start a uvicorn server on port 8000, or
import ``app`` from an ASGI server.
"""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "models/customer_support_gpt"
TOKENIZER_NAME = "EleutherAI/gpt-neo-125M"

# The prompt is capped separately from the generation budget so that a long
# prompt can never consume the whole budget and leave nothing to generate.
MAX_PROMPT_TOKENS = 512
MAX_NEW_TOKENS = 150

app = FastAPI()

# Loaded once at import time so that every request reuses the same objects.
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
if tokenizer.pad_token is None:
    # generate() needs a pad token id; fall back to EOS when none is defined,
    # which is the same convention the preprocessing script uses.
    tokenizer.pad_token = tokenizer.eos_token


class Query(BaseModel):
    """Request body for ``POST /predict/``."""

    query: str


@app.post("/predict/")
def predict(query: Query):
    """Generate a model reply for the submitted query.

    Args:
        query: Request body carrying the user's text in ``query.query``.

    Returns:
        A dict ``{"response": <decoded text>}``. The decoded text is the full
        generated sequence, exactly as before.
        NOTE(review): for a causal LM this presumably includes the prompt
        itself -- confirm whether clients expect the echo.

    Raises:
        HTTPException: 422 when the query is empty or whitespace only.
    """
    if not query.query.strip():
        raise HTTPException(status_code=422, detail="query must not be empty")

    inputs = tokenizer(
        query.query,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    )
    # Pass the whole encoding so the attention mask reaches generate(), and
    # bound the number of *new* tokens instead of the total sequence length.
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        pad_token_id=tokenizer.pad_token_id,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
dockerfile ADDED
File without changes
notes.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Download the dataset from the Hugging Face Hub using the `datasets` library:
2
+
3
+ ds = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
4
+
5
+ Select the `instruction` and `response` columns as the training features and save the resulting DataFrame to a CSV file.
6
+
7
+ Then preprocess the CSV file by tokenizing the instruction and response text.
project-structure.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ customer-support-chatbot/
2
+ ├── data/ # Data folder for storing datasets
3
+ │   ├── raw/ # Original, unprocessed datasets
4
+ │   └── processed/ # Preprocessed datasets
5
+ ├── models/ # Directory for saving fine-tuned models
6
+ ├── src/ # Source code directory
7
+ │   ├── preprocess.py # Data preprocessing scripts
8
+ │   ├── train.py # Fine-tuning script for the model
9
+ │   ├── api.py # FastAPI app for real-time chatbot API
10
+ │   ├── chatbot.py # Core chatbot logic (loading model, handling queries)
11
+ │   ├── deploy_sagemaker.py # Script for AWS SageMaker deployment
12
+ │   └── config.py # Configuration file for hyperparameters
13
+ ├── tests/ # Unit and integration tests
14
+ │   └── test_api.py # Test script for API responses
15
+ ├── docker/ # Docker setup for containerized deployment
16
+ │   ├── Dockerfile # Dockerfile for building the chatbot image
17
+ │   └── docker-compose.yml # Optional docker-compose setup for local testing
18
+ ├── mlflow/ # MLflow setup for model tracking
19
+ │   └── mlflow_config.yml # MLflow configuration file
20
+ ├── .github/ # GitHub Actions workflows for CI/CD
21
+ │   └── workflows/
22
+ │       └── deployment.yml # CI/CD pipeline for testing, building, and deploying
23
+ ├── requirements.txt # Python dependencies
24
+ └── README.md # Documentation
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ fastapi
4
+ uvicorn
5
+ mlflow
6
+ boto3
7
+ pytest
8
+ pydantic
9
+ datasets
src/data.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
"""Download the Bitext customer-support dataset and save it as a raw CSV.

Only the ``instruction`` and ``response`` columns of the ``train`` split are
kept. The output path is relative to the current working directory.
"""

from pathlib import Path

from datasets import load_dataset
import pandas as pd

RAW_CSV_PATH = Path('data/raw/customer_support.csv')

ds = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")

# Convert the train split to a DataFrame and keep only the two text columns
# used for fine-tuning.
df = ds['train'].to_pandas()
df = df[['instruction', 'response']]

# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
RAW_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(RAW_CSV_PATH, index=False)
src/preprocess.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Tokenize the raw customer-support CSV for model fine-tuning."""

from pathlib import Path

import pandas as pd
from transformers import AutoTokenizer

TOKENIZER_NAME = "EleutherAI/gpt-neo-125M"
MAX_LENGTH = 256

RAW_CSV_PATH = 'data/raw/customer_support.csv'
PROCESSED_CSV_PATH = 'data/processed/customer_support_preprocessed.csv'


def load_data(file_path):
    """Load the customer support dataset from a CSV file.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV to read.

    Returns:
        The CSV contents as a ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path)


def preprocess_data(data):
    """Tokenize the ``instruction`` and ``response`` columns of *data*.

    Each text is truncated and padded to ``MAX_LENGTH`` tokens, with the EOS
    token used as the pad token. The input frame is left unmodified.

    Args:
        data: DataFrame with ``instruction`` and ``response`` columns.

    Returns:
        A new DataFrame, sharing the index of *data*, with the columns
        ``instruction_tokens`` and ``response_tokens``; each cell holds the
        tokenizer output for the corresponding text. An empty input yields an
        empty frame with those two columns.

    Raises:
        KeyError: If either source column is missing.
    """
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_text(text):
        """Tokenize one string with the shared truncation/padding settings."""
        return tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
        )

    def tokenize_column(name):
        """Return the tokenizer output for every cell of column *name*."""
        # read_csv turns empty cells into NaN (a float), which the tokenizer
        # cannot handle; treat missing text as an empty string instead.
        texts = data[name].fillna("").astype(str)
        # Series.map handles the empty-frame case, unlike zip(*rows) unpacking.
        return texts.map(tokenize_text)

    return pd.DataFrame(
        {
            'instruction_tokens': tokenize_column('instruction'),
            'response_tokens': tokenize_column('response'),
        }
    )


if __name__ == "__main__":
    data = load_data(RAW_CSV_PATH)
    processed_data = preprocess_data(data)
    # to_csv does not create missing directories.
    Path(PROCESSED_CSV_PATH).parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the token columns hold tokenizer output objects, which
    # to_csv writes as their string form -- confirm the training script can
    # parse that, or switch to a structured format.
    processed_data.to_csv(PROCESSED_CSV_PATH, index=False)
src/train.py ADDED
File without changes
tests/test_api.py ADDED
File without changes