Create app.py
app.py
ADDED
@@ -0,0 +1,272 @@
# Dataset: https://www.kaggle.com/datasets/wcukierski/enron-email-dataset
from google.colab import drive
drive.mount('/content/drive')

# Libraries (uncomment in a fresh Colab runtime)
#!pip install transformers --upgrade
#!pip install gradio
#!pip install datasets
#!pip install huggingface-hub
#!pip install chromadb
#!pip install accelerate==0.21.0
#!pip install transformers[torch]
#!pip install git+https://github.com/huggingface/accelerate.git

import pandas as pd
import numpy as np
import torch  # needed for torch.argmax in the question-answering function below
import gradio as gr
import chromadb
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
    Trainer,
    TrainingArguments,
)

file_path = '/content/drive/MyDrive/emails.csv'
df = pd.read_csv(file_path)
df_columns = df.columns
print(df.head(10))

messages_df = df['message']  # extract the message column
print(messages_df.head())
print(type(messages_df))

# Originally meant to take 1% (~5,000 of ~500,000 emails) as the sample;
# test_size was repeatedly reduced (here to a handful of emails) to stop
# Colab from crashing.
emails_train, emails_test = train_test_split(messages_df, test_size=0.000008, random_state=42)
print(emails_test)
print(type(emails_test))

pd.set_option('display.max_colwidth', None)  # show full message content
print(emails_test.head())  # first 5 rows
print(type(emails_test))

# Embeddings
# Define the maximum sequence length
max_seq_length = 512
# Truncate long emails up front (note: this truncates by characters, not tokens;
# a 512-character prefix may still tokenize to slightly over 512 tokens)
truncated_emails_test = [email[:max_seq_length] for email in emails_test]
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
embeddings_pipeline = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
embeddings = embeddings_pipeline(truncated_emails_test)
print(type(embeddings))
#print(embeddings[:5])  # printing the raw nested lists is unreadable

# To inspect the embeddings, save each one to its own file...
for i, emb in enumerate(embeddings):
    np.save(f"embedding_{i}.npy", emb)

# ...then load each embedding back from its file
loaded_embeddings = []
for i in range(len(embeddings)):
    emb = np.load(f"embedding_{i}.npy")
    loaded_embeddings.append(emb)

for i, emb in enumerate(loaded_embeddings):
    print(f"Embedding {i}:")
    print(emb)

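# A more compact option (a sketch, not part of the original run): store all
# embeddings in a single .npz archive instead of one .npy file per email.
# np.savez names unkeyed arrays arr_0, arr_1, ... by default.
np.savez("embeddings.npz", *[np.asarray(e) for e in embeddings])
archive = np.load("embeddings.npz")
print(archive["arr_0"].shape)  # shape of the first stored embedding
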
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="michelletest")

# The feature-extraction pipeline returns a [1 x seq_len x hidden] nested list
# per email; [0][0] keeps the first-token ([CLS]) vector as the embedding
extracted_embeddings = [embedding[0][0] for embedding in embeddings]

# Add embeddings to the ChromaDB collection
collection.add(
    embeddings=extracted_embeddings[:5],  # the first 5 embeddings
    documents=emails_test.tolist()[:5],  # the first 5 documents
    metadatas=[{"source": "emails_test"} for _ in range(5)],  # metadata for each
    ids=[f"id{i}" for i in range(5)]  # an ID for each
)

print(collection.count())  # check how many entries are in the database

# Retrieve everything from the collection to check that it worked properly
print(collection.get())

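# A quick retrieval check (a sketch; the query text is just an example): embed
# a query with the same BERT pipeline and ask ChromaDB for the closest emails.
query_embedding = embeddings_pipeline("meeting schedule")[0][0]
results = collection.query(query_embeddings=[query_embedding], n_results=2)
print(results["documents"])  # the nearest stored emails
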
# Convert the Series to a DataFrame and check its columns
emails_test_df = emails_test.to_frame()
print(emails_test_df.columns)

print(emails_test_df['message'])  # checking content of messages for fine-tuning the model

print(emails_test_df['message'].head())

num_entries = emails_test_df.shape[0]
print("Number of entries in emails_test_df:", num_entries)

# A second split along the same lines; about 60 emails were used in the end
emails_train, emails_test2 = train_test_split(messages_df, test_size=0.00001, random_state=42)
print(emails_test2)
print(type(emails_test2))
num_entries2 = emails_test2.shape[0]
print("Number of entries in emails_test2:", num_entries2)

# Convert the pandas Series to a list of strings
text_list = emails_test_df['message'].tolist()

# Verify the type and content
print(type(text_list))
print(text_list[:5])  # the first 5 entries as an example

print(text_list)

print(text_list[2])  # inspect an average email to decide what to clean up

def remove_sections(email):  # clean each email of content that is not useful
    """Remove sections including the original-message marker, From, Sent, To,
    and Subject lines, and additional headers."""
    sections_to_remove = [
        "----- Original Message -----",
        "From:",
        "Sent:",
        "To:",
        "CC:",
        "Subject:",
        "Message-ID:",
        "Date:",
        "Mime-Version:",
        "Content-Type:",
        "Content-Transfer-Encoding:",
        "X-cc:",
        "X-bcc:",
        "X-Folder:",
        "X-Origin:",
        "X-FileName:",
        "-----Original Message-----"
    ]

    for section in sections_to_remove:
        email = [line for line in email if section not in line]

    return email

# Remove header sections from each email (split into lines first)
cleaned_text_list = [remove_sections(email.split("\n")) for email in text_list]

# Print the cleaned emails to see if the content looks OK
for cleaned_email in cleaned_text_list:
    print("\n".join(cleaned_email))
    print("=" * 50)  # separate each cleaned email for better readability

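# An alternative worth knowing (a sketch; the line filter above is what this
# script actually uses): the raw Enron messages are RFC 822 emails, so Python's
# stdlib parser can split headers from the body directly.
from email import message_from_string

def email_body(raw):
    msg = message_from_string(raw)
    return msg.get_payload()  # body text with the header block already removed

# Example usage: bodies = [email_body(raw) for raw in text_list]
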
# Fine-tune the language model

# Define the pre-trained model name (bart-base)
model_name = "facebook/bart-base"

# Load the tokenizer for bart-base
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Function to preprocess text_list for training
def prepare_data(text_list):
    """Preprocesses text data for training the BART model.

    Args:
        text_list: A list of strings containing the text data.

    Returns:
        A Dataset object containing the preprocessed data.
    """
    # Tokenize the text with padding and truncation (BART handles these well)
    inputs = tokenizer(text_list, padding="max_length", truncation=True)

    # Copy the input IDs for the labels (the desired output during training)
    labels = inputs.input_ids.copy()

    # Create a Dataset object from the preprocessed data
    return Dataset.from_dict({"input_ids": inputs["input_ids"], "labels": labels})


# Prepare the training data from the text list
train_data = prepare_data(text_list)

# Define the fine-tuning model (BART for sequence-to-sequence tasks)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Training hyperparameters (adjust as needed)
batch_size = 8
learning_rate = 2e-5
num_epochs = 3

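# Optional refinement (a sketch, not applied in the original run): with
# padding="max_length" the pad tokens also end up in `labels`, where they
# contribute to the loss; replacing them with -100 makes the loss skip them.
def mask_pad_labels(labels, pad_token_id):
    return [[-100 if tok == pad_token_id else tok for tok in seq] for seq in labels]

# Example usage inside prepare_data:
# labels = mask_pad_labels(labels, tokenizer.pad_token_id)
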
# Define the Trainer object for training management
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results",  # output directory for checkpoints etc.
        overwrite_output_dir=True,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
    ),
    train_dataset=train_data,
)

# Start the fine-tuning process
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine-tuned_bart")
tokenizer.save_pretrained("./fine-tuned_bart")

print("Fine-tuning completed! Model saved in ./fine-tuned_bart")

# A very small amount of input was used so that Colab stopped crashing.

+
import gradio as gr
|
244 |
+
from transformers import BartForQuestionAnswering, BartTokenizer
|
245 |
+
|
246 |
+
# Load the fine-tuned BART model
|
247 |
+
model = BartForQuestionAnswering.from_pretrained("./fine-tuned_bart")
|
248 |
+
tokenizer = BartTokenizer.from_pretrained("./fine-tuned_bart")
|
249 |
+
|
250 |
+
# Function to answer questions
|
251 |
+
def answer_question(question):
|
252 |
+
inputs = tokenizer.encode_plus(question, return_tensors="pt", max_length=512, truncation=True)
|
253 |
+
input_ids = inputs["input_ids"].tolist()[0]
|
254 |
+
|
255 |
+
answer_start_scores, answer_end_scores = model(**inputs)
|
256 |
+
answer_start = torch.argmax(answer_start_scores)
|
257 |
+
answer_end = torch.argmax(answer_end_scores) + 1
|
258 |
+
|
259 |
+
answer = tokenizer.decode(input_ids[answer_start:answer_end])
|
260 |
+
return answer
|
261 |
+
|
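# A possible extension (a sketch, not part of the original app): retrieve the
# closest email from ChromaDB and answer against it, so the QA model sees real
# context instead of only the question.
def answer_with_context(question):
    q_emb = embeddings_pipeline(question)[0][0]  # embed with the same BERT pipeline
    hits = collection.query(query_embeddings=[q_emb], n_results=1)
    context = hits["documents"][0][0]  # best-matching stored email
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)
    outputs = model(**inputs)
    start = torch.argmax(outputs.start_logits)
    end = torch.argmax(outputs.end_logits) + 1
    return tokenizer.decode(inputs["input_ids"][0][start:end])
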
# Create the Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs="text",
    outputs="text",
    title="Question Answering Model",
    description="Enter a question to get the answer."
)

# Launch the interface
iface.launch()