Spaces:

Reyad-Ahmmed
/

HF_Python

Paused

App Files Files Community

Reyad-Ahmmed committed on Dec 29, 2024

Commit

99870b0

verified ·

1 Parent(s): affb121

Update app.py

Browse files

Files changed (1) hide show

app.py +229 -1

app.py CHANGED Viewed

	@@ -1 +1,229 @@
1	- print("hello world")

+import pandas as pd
+from sklearn.model_selection import train_test_split
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+import torch
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+from transformers import RobertaTokenizer, RobertaForSequenceClassification
+import pandas as pd
+#from sklearn.linear_model import LogisticRegression
+#from sklearn.metrics import accuracy_score, confusion_matrix
+#import matplotlib.pyplot as plt
+import seaborn as sns
+#import numpy as np
+import sys
+import torch.nn.functional as F
+#from torch.nn import CrossEntropyLoss
+#from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+if len(sys.argv) > 1:
+    # sys.argv[0] is the script name, sys.argv[1] is the first argument, etc.
+    runModel = sys.argv[1]
+    print(f"Passed value: {runModel}")
+    print (sys.argv[2])
+else:
+    print("No argument was passed.")
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+modelNameToUse = sys.argv[2]
+if (runModel=='1'):
+    dataFileName = sys.argv[2] + '.csv'
+    print (dataFileName)
+    # Load the data from the CSV file
+    df = pd.read_csv(dataFileName)
+    # Access the text and labels
+    texts = df['text'].tolist()
+    labels = df['label'].tolist()
+    print('Train Model')
+     # Encode the labels
+    sorted_labels = sorted(df['label'].unique())
+    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
+    df['label'] = df['label'].map(label_mapping)
+    print(df['label'])
+    # Train/test split
+    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+    # Tokenization
+    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+    # Model and training setup
+    model = RobertaForSequenceClassification.from_pretrained('roberta-base', output_attentions=True, num_labels=len(label_mapping)).to('cpu')
+    model.resize_token_embeddings(len(tokenizer))
+    train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=64)
+    test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=64)
+    # Dataset class
+    class IntentDataset(Dataset):
+        def __init__(self, encodings, labels):
+            self.encodings = encodings
+            self.labels = labels
+        def __getitem__(self, idx):
+            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+            label = self.labels[idx]
+            item['labels'] = torch.tensor(self.labels[idx])
+            return item
+        def __len__(self):
+            return len(self.labels)
+    train_dataset = IntentDataset(train_encodings, list(train_df['label']))
+    test_dataset = IntentDataset(test_encodings, list(test_df['label']))
+    # Create an instance of the custom loss function
+    training_args = TrainingArguments(
+        output_dir='./results_' + modelNameToUse,
+        num_train_epochs=25,
+        per_device_train_batch_size=2,
+        per_device_eval_batch_size=2,
+        warmup_steps=500,
+        weight_decay=0.02,
+        logging_dir='./logs_' + modelNameToUse,
+        logging_steps=10,
+        evaluation_strategy="epoch",
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=test_dataset
+    )
+    # Train the model
+    trainer.train()
+    # Evaluate the model
+    trainer.evaluate()
+    label_mapping = {
+        0: "lastmonth",
+        1: "nextweek",
+        2: "sevendays",
+        3: "today",
+        4: "tomorrow",
+        5: "yesterday"
+    }
+    def evaluate_and_report_errors(model, dataloader, tokenizer):
+        model.eval()
+        incorrect_predictions = []
+        with torch.no_grad():
+            #print(dataloader)
+            for batch in dataloader:
+                input_ids = batch['input_ids'].to(device)
+                attention_mask = batch['attention_mask'].to(device)
+                labels = batch['labels'].to(device)
+                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+                logits = outputs.logits
+                predictions = torch.argmax(logits, dim=1)
+                for i, prediction in enumerate(predictions):
+                    if prediction != labels[i]:
+                        incorrect_predictions.append({
+                            "prompt": tokenizer.decode(input_ids[i], skip_special_tokens=True),
+                            "predicted": prediction.item(),
+                            "actual": labels[i].item()
+                        })
+        # Print incorrect predictions
+        if incorrect_predictions:
+            print("\nIncorrect Predictions:")
+            for error in incorrect_predictions:
+                print(f"Sentence: {error['prompt']}")
+                #print(f"Predicted Label: {GetCategoryFromCategoryLong(error['predicted'])} | Actual Label: {GetCategoryFromCategoryLong(error['actual'])}\n")
+                print(f"Predicted Label: {label_mapping[error['predicted']]} | Actual Label: {label_mapping[error['actual']]}\n")
+                #print(f"Predicted Label: {error['predicted']} | Actual Label: {label_mapping[error['actual']]}\n")
+        else:
+            print("\nNo incorrect predictions found.")
+    train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
+    evaluate_and_report_errors(model,train_dataloader, tokenizer)
+    # Save the model and tokenizer
+    model.save_pretrained('./'  + modelNameToUse + '_model')
+    tokenizer.save_pretrained('./' + modelNameToUse + '_tokenizer')
+else:
+    print('Load Pre-trained')
+    model_save_path = "./" + modelNameToUse + "_model"
+    tokenizer_save_path = "./" + modelNameToUse + "_tokenizer"
+    # RobertaTokenizer.from_pretrained(model_save_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
+#Define the label mappings (this must match the mapping used during training)
+label_mapping = {
+    0: "lastmonth",
+    1: "nextweek",
+    2: "sevendays",
+    3: "today",
+    4: "tomorrow",
+    5: "yesterday"
+}
+#Function to classify user input
+def classifyTimeFrame():
+    while True:
+        user_input = input("Enter a command (or type 'q' to quit): ")
+        if user_input.lower() == 'q':
+            print("Exiting...")
+            break
+        # Tokenize and predict
+        input_encoding = tokenizer(user_input, padding=True, truncation=True, return_tensors="pt").to('cpu')
+        with torch.no_grad():
+            attention_mask = input_encoding['attention_mask'].clone()
+            # Modify the attention mask to emphasize certain key tokens
+            # for idx, token_id in enumerate(input_encoding['input_ids'][0]):
+            #     word = tokenizer.decode([token_id])
+            #     print(word)
+            #     if word.strip() in ["now", "same", "continue", "again", "also"]:  # Target key tokens
+            #         attention_mask[0, idx] = 3  # Increase attention weight for these words
+            #     else:
+            #         attention_mask[0, idx] = 0
+            # print (attention_mask)
+            # input_encoding['attention_mask'] = attention_mask
+            # print (input_encoding)
+            output = model(**input_encoding, output_hidden_states=True)
+            probabilities = F.softmax(output.logits, dim=-1)
+            prediction = torch.argmax(output.logits, dim=1).cpu().numpy()
+            # Map prediction back to label
+            print(prediction)
+            predicted_label = label_mapping[prediction[0]]
+            print(f"Predicted intent: {predicted_label}\n")
+            # Print the confidence for each label
+            print("\nLabel Confidence Scores:")
+            for i, label in label_mapping.items():
+                confidence = probabilities[0][i].item()  # Get confidence score for each label
+                print(f"{label}: {confidence:.4f}")
+            print("\n")
+#Run the function
+classifyTimeFrame()