Spaces:

abven
/

Customer-Support-Chatbot

Runtime error

App Files Files Community

VenkateshRoshan committed on Nov 8, 2024

Commit

94c58a1

1 Parent(s): 519f007

Initial Code Added

Browse files

Files changed (9) hide show

app.py +22 -0
dockerfile +0 -0
notes.txt +7 -0
project-structure.txt +24 -0
requirements.txt +9 -0
src/data.py +10 -0
src/preprocess.py +33 -0
src/train.py +0 -0
tests/test_api.py +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# Application object; the route below is registered on it through its decorator.
+app = FastAPI()
+# Loaded once at import time. The identifier looks like a local directory (cf. the
+# models/ folder in project-structure.txt) -- presumably startup fails if it has not
+# been populated by training; confirm.
+model = AutoModelForCausalLM.from_pretrained("models/customer_support_gpt")
+# NOTE(review): the tokenizer is taken from the base GPT-Neo checkpoint rather than from
+# the model directory above -- assumes the fine-tuned model keeps the same vocabulary; confirm.
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
+class Query(BaseModel):
+    """Request body accepted by ``/predict/``; carries the user's text in ``query``."""
+    query: str
+@app.post("/predict/")
+def predict(query: Query):
+    """Generate a model reply for a customer-support query.
+
+    Args:
+        query: Request body whose ``query`` field holds the user's text.
+
+    Returns:
+        A dict ``{"response": <decoded text>}``. The decoded text is the whole
+        generated sequence, so it begins with the (possibly truncated) prompt.
+
+    Raises:
+        HTTPException: 422 when the query is empty or whitespace-only.
+    """
+    # Imported locally so the handler does not depend on a name the module's
+    # import block does not provide.
+    from fastapi import HTTPException
+    if not query.query.strip():
+        # An empty prompt tokenizes to a zero-length input_ids tensor, which
+        # leaves generate() nothing to condition on.
+        raise HTTPException(status_code=422, detail="query must not be empty")
+    # Bound the prompt explicitly; 256 matches the instruction length used in
+    # src/preprocess.py.
+    inputs = tokenizer(query.query, return_tensors="pt", truncation=True, max_length=256)
+    outputs = model.generate(
+        inputs["input_ids"],
+        attention_mask=inputs["attention_mask"],
+        # max_new_tokens budgets only the reply. The previous max_length=150
+        # also counted prompt tokens, so long prompts left no room to generate.
+        max_new_tokens=150,
+        # Pass the padding id explicitly (EOS) instead of relying on generate()'s
+        # fallback when no pad token is configured.
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return {"response": response}
+# Running this file as a script starts the server directly.
+if __name__ == "__main__":
+    import uvicorn
+    # host 0.0.0.0 listens on every network interface; port 8000.
+    uvicorn.run(app, host="0.0.0.0", port=8000)

dockerfile ADDED Viewed

File without changes

notes.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+Download the dataset with the Hugging Face `datasets` library:
+ds = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
+Select the `instruction` and `response` columns as the training features and save the DataFrame to a CSV file.
+Then preprocess the CSV file by tokenizing the instruction and response text.

project-structure.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+customer-support-chatbot/
+├── data/                    # Data folder for storing datasets
+│   ├── raw/                 # Original, unprocessed datasets
+│   └── processed/           # Preprocessed datasets
+├── models/                  # Directory for saving fine-tuned models
+├── src/                     # Source code directory
+│   ├── preprocess.py        # Data preprocessing scripts
+│   ├── train.py             # Fine-tuning script for the model
+│   ├── api.py               # FastAPI app for real-time chatbot API
+│   ├── chatbot.py           # Core chatbot logic (loading model, handling queries)
+│   ├── deploy_sagemaker.py  # Script for AWS SageMaker deployment
+│   └── config.py            # Configuration file for hyperparameters
+├── tests/                   # Unit and integration tests
+│   └── test_api.py          # Test script for API responses
+├── docker/                  # Docker setup for containerized deployment
+│   ├── Dockerfile           # Dockerfile for building the chatbot image
+│   └── docker-compose.yml   # Optional docker-compose setup for local testing
+├── mlflow/                  # MLflow setup for model tracking
+│   └── mlflow_config.yml    # MLflow configuration file
+├── .github/                 # GitHub Actions workflows for CI/CD
+│   └── workflows/
+│       └── deployment.yml   # CI/CD pipeline for testing, building, and deploying
+├── requirements.txt         # Python dependencies
+└── README.md                # Documentation

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+transformers
+torch
+fastapi
+uvicorn
+mlflow
+boto3
+pytest
+pydantic
+datasets
+# Imported directly by src/data.py and src/preprocess.py.
+pandas

src/data.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from datasets import load_dataset
+import pandas as pd
+# Local to this script body so the block is self-contained.
+from pathlib import Path
+# Output location, relative to the current working directory.
+RAW_CSV_PATH = Path('data/raw/customer_support.csv')
+ds = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
+# Convert the train split to a DataFrame and keep only the two columns used for training.
+df = pd.DataFrame(ds['train'])
+df = df[['instruction', 'response']]
+# to_csv does not create missing parent directories, so make sure data/raw/ exists first.
+RAW_CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
+df.to_csv(RAW_CSV_PATH, index=False)

src/preprocess.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import pandas as pd
+from transformers import AutoTokenizer
+def load_data(file_path):
+    """
+    Read the customer support CSV at ``file_path`` and return it as a DataFrame.
+    """
+    return pd.read_csv(file_path)
+def preprocess_data(data, model_name="EleutherAI/gpt-neo-125M", max_length=256):
+    """
+    Tokenize the ``instruction`` and ``response`` columns of ``data``.
+
+    Args:
+        data: DataFrame with ``instruction`` and ``response`` columns. It is
+            not modified.
+        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
+        max_length: Length each text is truncated or padded to.
+
+    Returns:
+        A new DataFrame with columns ``instruction_tokens`` and
+        ``response_tokens``; each cell holds the tokenizer output for that
+        row's text. Rows with a missing instruction or response are dropped,
+        and the surviving rows keep their original index labels.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # padding="max_length" needs a pad token; reuse EOS for it.
+    tokenizer.pad_token = tokenizer.eos_token
+    def tokenize_text(text):
+        """
+        Tokenize one string, truncated/padded to ``max_length``.
+        """
+        return tokenizer(text, truncation=True, padding="max_length", max_length=max_length)
+    # Missing CSV cells are read back as NaN (a float), which the tokenizer
+    # rejects, so skip those rows instead of crashing midway.
+    clean = data.dropna(subset=['instruction', 'response'])
+    # Build a fresh frame column by column: the caller's DataFrame stays
+    # untouched, and zero rows work too (unpacking zip(*rows) raised there).
+    return pd.DataFrame(
+        {
+            'instruction_tokens': [tokenize_text(str(text)) for text in clean['instruction']],
+            'response_tokens': [tokenize_text(str(text)) for text in clean['response']],
+        },
+        index=clean.index,
+    )
+if __name__ == "__main__":
+    # Only this script entry point needs pathlib, so import it here.
+    from pathlib import Path
+    data = load_data('data/raw/customer_support.csv')
+    processed_data = preprocess_data(data)
+    output_path = Path('data/processed/customer_support_preprocessed.csv')
+    # to_csv does not create missing parent directories, so create data/processed/ first.
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # NOTE(review): each cell is written as the string form of the tokenizer output,
+    # which a training script would have to parse back -- consider a structured format.
+    processed_data.to_csv(output_path, index=False)

src/train.py ADDED Viewed

File without changes

tests/test_api.py ADDED Viewed

File without changes