tharu22 committed on
Commit
877a8c0
·
1 Parent(s): b5a9ca6
sms_process_data_main.xlsx → data/sms_process_data_main.xlsx RENAMED
File without changes
main.py CHANGED
@@ -1,40 +1,22 @@
 
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
- from sentence_transformers import SentenceTransformer
4
  import numpy as np
5
- import pandas as pd
 
 
6
 
7
  # Initialize FastAPI
8
  app = FastAPI()
9
 
10
- # Load the sentence transformer model
11
- try:
12
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", trust_remote_code=True)
13
- print("βœ… Model loaded successfully")
14
- except Exception as e:
15
- raise RuntimeError(f"❌ Failed to load model: {str(e)}")
16
-
17
- # Define request schemas
18
- class CosineSimilarityInput(BaseModel):
19
- text1: str
20
- text2: str
21
-
22
- class MessageInput(BaseModel):
23
- message: str
24
-
25
- # Load SMS dataset from Excel
26
- file_path = "sms_process_data_main.xlsx"
27
- df = pd.read_excel(file_path)
28
-
29
- # Precompute embeddings
30
- transactional_examples = df[df['label'] == 'Transaction']['MessageText'].tolist()
31
- offer_examples = df[df['label'] == 'Offer']['MessageText'].tolist()
32
-
33
- transactional_embeddings = [model.encode(msg, convert_to_tensor=True).cpu().numpy() for msg in transactional_examples]
34
- offer_embeddings = [model.encode(msg, convert_to_tensor=True).cpu().numpy() for msg in offer_examples]
35
 
36
  # Function to compute cosine similarity
37
  def cosine_similarity(vec1, vec2):
 
 
 
38
  norm1 = np.linalg.norm(vec1)
39
  norm2 = np.linalg.norm(vec2)
40
  if norm1 == 0 or norm2 == 0:
@@ -44,57 +26,48 @@ def cosine_similarity(vec1, vec2):
44
  # 🚀 1️⃣ Homepage Endpoint
45
  @app.get("/")
46
  async def home():
47
- return {"message": "Welcome to Classification of SMS"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- # 🔢 2️⃣ Cosine Similarity Endpoint
50
- @app.post("/cosine_similarity")
51
  async def compute_similarity(input_data: CosineSimilarityInput):
52
  """
53
  Compute cosine similarity between two input texts.
54
  """
55
  try:
56
- emb1 = model.encode(input_data.text1, convert_to_tensor=True).cpu().numpy()
57
- emb2 = model.encode(input_data.text2, convert_to_tensor=True).cpu().numpy()
58
- similarity = cosine_similarity(emb1, emb2)
59
- return {"cosine_similarity": round(float(similarity), 4)}
 
 
 
60
  except Exception as e:
61
  raise HTTPException(status_code=500, detail=f"Error computing similarity: {str(e)}")
62
 
63
- # 📩 3️⃣ SMS Classification Endpoint
64
- @app.post("/predict_label/")
65
- async def classify_message(input_data: MessageInput):
66
  """
67
- Classify an SMS as either 'Transaction' or 'Offer'.
68
  """
69
  try:
70
- # Validate input
71
- text_input = input_data.message.strip()
72
- if not text_input:
73
- raise HTTPException(status_code=400, detail="Input message cannot be empty")
74
-
75
- # Encode input text
76
- input_embedding = model.encode(text_input, convert_to_tensor=True).cpu().numpy()
77
-
78
- # Compute similarity scores
79
- transactional_scores = [cosine_similarity(input_embedding, emb) for emb in transactional_embeddings]
80
- offer_scores = [cosine_similarity(input_embedding, emb) for emb in offer_embeddings]
81
-
82
- # Get max similarity
83
- max_transactional = max(transactional_scores, default=0)
84
- max_offer = max(offer_scores, default=0)
85
-
86
- # Determine label and probability
87
- if max_transactional > max_offer:
88
- label = "Transaction"
89
-
90
- else:
91
- label = "Offer"
92
-
93
- return {
94
- "label": label
95
- }
96
-
97
  except Exception as e:
98
- raise HTTPException(status_code=500, detail=f"Unexpected error: {str(e)}")
99
-
100
-
 
1
+ # app/main.py
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
 
4
  import numpy as np
5
+ from linear.services.sms_service import classify_sms, load_trained_model
6
+ from linear.schemas.input_schemas import CosineSimilarityInput, CosineSimilarityOutput
7
+ from linear.schemas.input_schemas import EmbeddingInput, EmbeddingOutput
8
 
9
  # Initialize FastAPI
10
  app = FastAPI()
11
 
12
+ # Load the models from the 'models' folder
13
+ model, vectorizer = load_trained_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Function to compute cosine similarity
16
  def cosine_similarity(vec1, vec2):
17
+ """
18
+ Compute cosine similarity between two vectors.
19
+ """
20
  norm1 = np.linalg.norm(vec1)
21
  norm2 = np.linalg.norm(vec2)
22
  if norm1 == 0 or norm2 == 0:
 
26
  # 🚀 1️⃣ Homepage Endpoint
27
  @app.get("/")
28
  async def home():
29
+ return {"message": "Welcome to SMS Classification API"}
30
+
31
+ # 📩 2️⃣ SMS Classification Endpoint
32
+ class MessageInput(BaseModel):
33
+ message: str
34
+
35
+ @app.post("/predict_label/")
36
+ async def classify_sms_endpoint(input_data: MessageInput):
37
+ """
38
+ Classify an SMS as either 'Transaction' or 'Offer'.
39
+ """
40
+ try:
41
+ return classify_sms(input_data.message, model, vectorizer)
42
+ except Exception as e:
43
+ raise HTTPException(status_code=500, detail=f"Unexpected error: {str(e)}")
44
 
45
+ # 🔢 3️⃣ Cosine Similarity Endpoint
46
+ @app.post("/cosine_similarity/", response_model=CosineSimilarityOutput)
47
  async def compute_similarity(input_data: CosineSimilarityInput):
48
  """
49
  Compute cosine similarity between two input texts.
50
  """
51
  try:
52
+ # Transform the input texts using the TF-IDF vectorizer
53
+ text1_vectorized = vectorizer.transform([input_data.text1])
54
+ text2_vectorized = vectorizer.transform([input_data.text2])
55
+
56
+ # Compute the cosine similarity between the two text embeddings
57
+ similarity = cosine_similarity(text1_vectorized.toarray(), text2_vectorized.toarray())
58
+ return CosineSimilarityOutput(cosine_similarity=round(float(similarity), 4))
59
  except Exception as e:
60
  raise HTTPException(status_code=500, detail=f"Error computing similarity: {str(e)}")
61
 
62
+ # 🧠 4️⃣ Get Embedding of Text Message
63
+ @app.post("/get_embedding/", response_model=EmbeddingOutput)
64
+ async def get_embedding(input_data: EmbeddingInput):
65
  """
66
+ Get the embedding (vector representation) of an input text message.
67
  """
68
  try:
69
+ # Transform the input text using the TF-IDF vectorizer
70
+ text_embedding = vectorizer.transform([input_data.message]).toarray().tolist()
71
+ return EmbeddingOutput(embedding=text_embedding[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  except Exception as e:
73
+ raise HTTPException(status_code=500, detail=f"Error generating embedding: {str(e)}")
 
 
models/sms_classifier_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c2b852be29075447f1306196af754cb31984406f591abc7891815e3b7c0e972
3
+ size 21305
models/tfidf_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cebef878c5c22a6e58d1e80484d053b18402989ba83bbea4ce766ec3ace1bc6
3
+ size 93623
schemas/input_schemas.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/schemas/input_schemas.py
2
+ from pydantic import BaseModel
3
+
4
+ class CosineSimilarityInput(BaseModel):
5
+ text1: str
6
+ text2: str
7
+
8
+ class MessageInput(BaseModel):
9
+ message: str
10
+
11
+
12
+
13
+ class CosineSimilarityResponse(BaseModel):
14
+ cosine_similarity: float
15
+
16
+
17
+ class EmbeddingResponse(BaseModel):
18
+ embeddings: list
19
+
services/sms_service.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/sms_service.py
2
+ import pickle
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from fastapi import HTTPException
6
+ from linear.schemas.input_schemas import CosineSimilarityResponse
7
+ from linear.schemas.input_schemas import EmbeddingResponse
8
+
9
+ # Load the trained model and vectorizer
10
+ def load_model():
11
+ model_path = "models/sms_classifier_model.pkl"
12
+ vectorizer_path = "models/tfidf_vectorizer.pkl"
13
+
14
+ try:
15
+ with open(model_path, 'rb') as f:
16
+ classifier = pickle.load(f)
17
+
18
+ with open(vectorizer_path, 'rb') as f:
19
+ vectorizer = pickle.load(f)
20
+
21
+ return classifier, vectorizer
22
+ except Exception as e:
23
+ raise HTTPException(status_code=500, detail=f"Error loading model: {str(e)}")
24
+
25
+ async def predict_label(message: str):
26
+ try:
27
+ classifier, vectorizer = load_model()
28
+ # Vectorize the input message
29
+ message_vec = vectorizer.transform([message])
30
+
31
+ # Predict the label
32
+ label = classifier.predict(message_vec)[0]
33
+ return {"label": label}
34
+ except Exception as e:
35
+ raise HTTPException(status_code=500, detail=f"Error predicting label: {str(e)}")
36
+
37
+ async def compute_cosine_similarity(text1: str, text2: str):
38
+ try:
39
+ classifier, vectorizer = load_model()
40
+
41
+ # Vectorize the input texts
42
+ vec1 = vectorizer.transform([text1]).toarray()
43
+ vec2 = vectorizer.transform([text2]).toarray()
44
+
45
+ # Compute cosine similarity
46
+ cosine_sim = np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
47
+ return CosineSimilarityResponse(cosine_similarity=cosine_sim[0][0])
48
+ except Exception as e:
49
+ raise HTTPException(status_code=500, detail=f"Error computing similarity: {str(e)}")
50
+
51
+ async def compute_embeddings(message: str):
52
+ try:
53
+ classifier, vectorizer = load_model()
54
+
55
+ # Vectorize the input message
56
+ embedding = vectorizer.transform([message]).toarray().tolist()
57
+ return EmbeddingResponse(embeddings=embedding)
58
+ except Exception as e:
59
+ raise HTTPException(status_code=500, detail=f"Error computing embeddings: {str(e)}")
services/train_model.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/train_model.py
2
+ import pandas as pd
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.linear_model import LogisticRegression
5
+ from sklearn.model_selection import train_test_split
6
+ import pickle
7
+ import os
8
+
9
+ # Load the dataset
10
+ file_path = "data/sms_process_data_main.xlsx"
11
+ df = pd.read_excel(file_path)
12
+
13
+ # Prepare the features and labels
14
+ X = df['MessageText'] # SMS messages
15
+ y = df['label'] # Labels: 'Transaction' or 'Offer'
16
+
17
+ # Split the data into training and testing sets
18
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
19
+
20
+ # Initialize the TF-IDF Vectorizer
21
+ vectorizer = TfidfVectorizer(max_features=5000)
22
+
23
+ # Fit the vectorizer on the training data and transform the training data
24
+ X_train_vec = vectorizer.fit_transform(X_train)
25
+
26
+ # Initialize and train the logistic regression model
27
+ classifier = LogisticRegression()
28
+ classifier.fit(X_train_vec, y_train)
29
+
30
+ # Save the trained model and vectorizer
31
+ models_dir = "models"
32
+ if not os.path.exists(models_dir):
33
+ os.makedirs(models_dir)
34
+
35
+ # Save the classifier model
36
+ with open(os.path.join(models_dir, 'sms_classifier_model.pkl'), 'wb') as model_file:
37
+ pickle.dump(classifier, model_file)
38
+
39
+ # Save the vectorizer
40
+ with open(os.path.join(models_dir, 'tfidf_vectorizer.pkl'), 'wb') as vectorizer_file:
41
+ pickle.dump(vectorizer, vectorizer_file)
42
+
43
+ print("Model and vectorizer saved successfully!")