drewThomasson
/

Quotation_identification_BERT.v1

Model card Files Files and versions Community

drewThomasson committed on Jul 22, 2024

Commit

1b9ef96

verified ·

1 Parent(s): b83b2ed

Upload 9 files

Browse files

Files changed (9) hide show

Metal_gui_original_quotation_identification_BERT_infrence.py +308 -0
quotation_identifer_model/checkpoint-1000/config.json +25 -0
quotation_identifer_model/checkpoint-1000/optimizer.pt +3 -0
quotation_identifer_model/checkpoint-1000/pytorch_model.bin +3 -0
quotation_identifer_model/checkpoint-1000/rng_state.pth +3 -0
quotation_identifer_model/checkpoint-1000/scheduler.pt +3 -0
quotation_identifer_model/checkpoint-1000/trainer_state.json +0 -0
quotation_identifer_model/checkpoint-1000/training_args.bin +3 -0
sample_text.txt +67 -0

Metal_gui_original_quotation_identification_BERT_infrence.py ADDED Viewed

	@@ -0,0 +1,308 @@

+import pandas as pd
+import re
+import torch
+import threading
+from transformers import BertTokenizerFast, DistilBertTokenizer, DistilBertForSequenceClassification
+from tqdm import tqdm
+import tkinter as tk
+from tkinter import filedialog, messagebox, scrolledtext, ttk
+from tkinter.font import Font
# Select the compute device: Apple's Metal (MPS) backend when this torch build
# exposes it and reports it usable, otherwise the CPU.
# `torch.backends.mps` is looked up defensively so that a torch build without
# that attribute falls back to the CPU instead of raising AttributeError at import.
_mps_backend = getattr(torch.backends, 'mps', None)
device = torch.device('mps') if _mps_backend is not None and _mps_backend.is_available() else torch.device('cpu')
# (abbreviation, placeholder) pairs shared by the two helpers below, applied in
# this order. Every placeholder is unique so that protecting and then restoring
# a text is lossless; in particular "Ms." and "M.S." must not share a placeholder.
_ABBREVIATION_PLACEHOLDERS = (
    ("Mr.", "<MR>"), ("Ms.", "<MS>"), ("Mrs.", "<MRS>"), ("Dr.", "<DR>"),
    ("Prof.", "<PROF>"), ("Rev.", "<REV>"), ("Gen.", "<GEN>"), ("Sen.", "<SEN>"),
    ("Rep.", "<REP>"), ("Gov.", "<GOV>"), ("Lt.", "<LT>"), ("Sgt.", "<SGT>"),
    ("Capt.", "<CAPT>"), ("Cmdr.", "<CMDR>"), ("Adm.", "<ADM>"), ("Maj.", "<MAJ>"),
    ("Col.", "<COL>"), ("St.", "<ST>"), ("Co.", "<CO>"), ("Inc.", "<INC>"),
    ("Corp.", "<CORP>"), ("Ltd.", "<LTD>"), ("Jr.", "<JR>"), ("Sr.", "<SR>"),
    ("Ph.D.", "<PHD>"), ("M.D.", "<MD>"), ("B.A.", "<BA>"), ("B.S.", "<BS>"),
    ("M.A.", "<MA>"), ("M.S.", "<MSC>"), ("LL.B.", "<LLB>"), ("LL.M.", "<LLM>"),
    ("J.D.", "<JD>"), ("Esq.", "<ESQ>"),
)


def replace_titles_and_abbreviations(text):
    """Replace known titles/abbreviations with period-free placeholders.

    This protects the periods inside abbreviations such as "Mr." or "Ph.D."
    from being treated as pause marks by later punctuation-based splitting.

    Args:
        text: The text to protect.

    Returns:
        The text with every listed abbreviation replaced by its placeholder.
    """
    # The abbreviations are literal strings (matching is case-sensitive), so
    # plain str.replace is sufficient and no regex is needed.
    for abbreviation, placeholder in _ABBREVIATION_PLACEHOLDERS:
        text = text.replace(abbreviation, placeholder)
    return text


def revert_titles_and_abbreviations(text):
    """Restore the abbreviations hidden by replace_titles_and_abbreviations.

    Args:
        text: Text containing placeholders such as "<MR>".

    Returns:
        The text with every placeholder turned back into its abbreviation.
    """
    for abbreviation, placeholder in _ABBREVIATION_PLACEHOLDERS:
        text = text.replace(placeholder, abbreviation)
    return text
def split_text_by_pauses(text):
    """Split text into fragments at pause punctuation.

    The text is cut at every '.', '!', ',', ';', '?' and ':'. Each fragment is
    stripped of surrounding whitespace and, when the character directly after
    it is one of '.', '!', ',', ';' or '?', that character is kept at the end
    of the fragment. Periods belonging to known titles/abbreviations are
    protected during the split and restored afterwards.

    Args:
        text: The raw text to split.

    Returns:
        A list of non-empty fragments in document order.
    """
    text = replace_titles_and_abbreviations(text)
    fragments = []
    # Each match is one maximal run of characters between pause marks, so its
    # span gives the fragment's exact position. Searching for the fragment's
    # text instead would always hit the first occurrence and attach the wrong
    # punctuation to repeated fragments.
    for match in re.finditer(r'[^.!,;?:]+', text):
        raw = match.group()
        fragment = raw.strip()
        if not fragment:
            continue
        # Index of the character immediately after the stripped fragment.
        follow = match.start() + len(raw.rstrip())
        # NOTE(review): ':' splits the text but is not re-attached; this
        # mirrors the original punctuation set -- confirm it is intentional.
        if follow < len(text) and text[follow] in '.!,;?':
            fragment += text[follow]
        fragments.append(revert_titles_and_abbreviations(fragment))
    return fragments
def Process_txt_into_BERT_quotes_input_dataframe(filepath, context_tokens=200):
    """Build the per-fragment input DataFrame for the quote classifier.

    The file is read as UTF-8, split into fragments with split_text_by_pauses,
    and each fragment is paired with a context window taken from the
    surrounding text.

    Args:
        filepath: Path of the text file to process.
        context_tokens: Number of 'bert-base-uncased' tokens of context to
            take on each side of a fragment (default 200).

    Returns:
        A pandas DataFrame with the columns 'Text', 'Context',
        'Text start char', 'Text end char', 'Context start char',
        'Context end char', 'Is Quote' and 'Speaker'. The last two are filled
        with empty strings. Fragments that cannot be located in the text, or
        whose boundaries do not coincide with token boundaries, are omitted.
    """
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = split_text_by_pauses(text)
    data = {
        'Text': [],
        'Context': [],
        'Text start char': [],
        'Text end char': [],
        'Context start char': [],
        'Context end char': [],
        'Is Quote': [],
        'Speaker': []
    }
    encoded_text = tokenizer.encode_plus(text, add_special_tokens=False, return_offsets_mapping=True)
    offsets = encoded_text['offset_mapping']

    # Index the token boundaries once so each fragment is resolved with two
    # dict lookups instead of a scan over every token. setdefault keeps the
    # first token seen for a given character position.
    token_at_start = {}
    token_at_end = {}
    for token_idx, (char_start, char_end) in enumerate(offsets):
        token_at_start.setdefault(char_start, token_idx)
        token_at_end.setdefault(char_end, token_idx)

    last_token_idx = len(offsets) - 1
    cursor = 0  # fragments arrive in document order; search forward from here
    for sentence in sentences:
        # Searching from the cursor gives repeated fragments their own
        # position instead of the position of their first occurrence.
        start_idx = text.find(sentence, cursor)
        if start_idx == -1:
            # Fall back to a whole-text search if the fragment is not ahead.
            start_idx = text.find(sentence)
        if start_idx == -1:
            continue
        end_idx = start_idx + len(sentence)
        cursor = max(cursor, end_idx)

        start_token_idx = token_at_start.get(start_idx)
        end_token_idx = token_at_end.get(end_idx)
        if start_token_idx is None or end_token_idx is None:
            continue

        context_start_token_idx = max(0, start_token_idx - context_tokens)
        context_end_token_idx = min(last_token_idx, end_token_idx + context_tokens)
        context_start_char = offsets[context_start_token_idx][0]
        context_end_char = offsets[context_end_token_idx][1]

        data['Text'].append(sentence)
        data['Context'].append(text[context_start_char:context_end_char])
        data['Text start char'].append(start_idx)
        data['Text end char'].append(end_idx)
        data['Context start char'].append(context_start_char)
        data['Context end char'].append(context_end_char)
        data['Is Quote'].append('')
        data['Speaker'].append('')
    return pd.DataFrame(data)
# Loaded (model, tokenizer) pairs keyed by checkpoint path, so that the weights
# are read from disk once per checkpoint instead of once per prediction.
_QUOTE_MODEL_CACHE = {}


def _load_quote_model(model_checkpoint_path):
    """Return the (model, tokenizer) pair for a checkpoint, loading it on first use."""
    if model_checkpoint_path not in _QUOTE_MODEL_CACHE:
        model = DistilBertForSequenceClassification.from_pretrained(model_checkpoint_path).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        _QUOTE_MODEL_CACHE[model_checkpoint_path] = (model, tokenizer)
    return _QUOTE_MODEL_CACHE[model_checkpoint_path]


def predict_quote(context, text, model_checkpoint_path="./quotation_identifer_model/checkpoint-1000"):
    """Classify whether a text fragment is a quotation.

    Args:
        context: Surrounding text that precedes the query in the model input.
        text: The fragment to classify.
        model_checkpoint_path: Directory of the fine-tuned DistilBERT
            sequence-classification checkpoint.

    Returns:
        True when the model predicts label 1 ("Quote"), otherwise False.
    """
    formatted_input = f"{context} : Is Sentence Quote : {text}"
    model, tokenizer = _load_quote_model(model_checkpoint_path)
    tokenized_input = tokenizer(formatted_input, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**tokenized_input)
    # A single input is scored, so the arg-max over the last axis is its label.
    predicted_label = torch.argmax(outputs.logits, dim=-1).item()
    label_encoder = {0: "Not a Quote", 1: "Quote"}
    return label_encoder[predicted_label] == "Quote"
def fill_is_quote_column(df, model_checkpoint_path="./quotation_identifer_model/checkpoint-1000"):
    """Fill the 'Is Quote' column of df with the classifier's predictions.

    Args:
        df: DataFrame with 'Context' and 'Text' columns; modified in place.
        model_checkpoint_path: Checkpoint directory forwarded to predict_quote.

    Returns:
        The same DataFrame, with 'Is Quote' set to the prediction for each row.
    """
    if 'Is Quote' not in df.columns:
        df['Is Quote'] = None
    # The description and unit are given to the bar that is actually shown.
    # Previously they went to an unused tqdm.pandas() registration.
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows", unit="row"):
        df.at[index, 'Is Quote'] = predict_quote(row['Context'], row['Text'], model_checkpoint_path)
    return df
def transfer_quotes(complete_df, incomplete_df):
    """Copy the non-missing 'Is Quote' values of complete_df into incomplete_df.

    Values are written under the same index label they have in complete_df.

    Args:
        complete_df: DataFrame providing 'Is Quote' values.
        incomplete_df: DataFrame that receives them; modified in place.

    Returns:
        incomplete_df, after the update.
    """
    labelled = ((label, record['Is Quote']) for label, record in complete_df.iterrows())
    for label, verdict in labelled:
        if pd.isna(verdict):
            continue  # nothing to transfer for this row
        incomplete_df.at[label, 'Is Quote'] = verdict
    return incomplete_df
def visualize_quotes(df, is_dark_mode=False):
    """Open a window listing every fragment of df, with quotes highlighted.

    Each row's 'Text' is inserted on its own line; rows whose 'Is Quote' value
    is truthy receive the 'quote' tag (blue background).

    Args:
        df: DataFrame with 'Text' and 'Is Quote' columns.
        is_dark_mode: Use the dark palette when truthy, the light one otherwise.

    NOTE(review): this function ends by calling mainloop() on the new window,
    so it does not return while that loop runs. In this file it is invoked
    from an application whose own mainloop() is already running, which makes
    this a nested event loop -- confirm that is intended.
    """
    window = tk.Toplevel()
    window.title("Text Visualization")
    window.geometry("800x600")

    style = ttk.Style(window)
    style.theme_use('clam')

    container = ttk.Frame(window, padding="20")
    container.pack(fill=tk.BOTH, expand=True)

    # Held in a local so the Font object stays alive for as long as this
    # function is running.
    heading_font = Font(family="Helvetica", size=24, weight="bold")
    heading = ttk.Label(
        container,
        text="Quote Visualization (Identified quotes are highlighted in blue)",
        font=heading_font,
    )
    heading.pack(pady=(0, 20))

    text_box = scrolledtext.ScrolledText(container, width=80, height=30, wrap=tk.WORD, font=("Helvetica", 12))
    text_box.pack(fill=tk.BOTH, expand=True)

    # Colours for each mode, keyed by "is dark".
    palettes = {
        True: {"surface": "#2c2c2c", "ink": "white", "text_bg": "#2c2c2c"},
        False: {"surface": "#f0f0f0", "ink": "black", "text_bg": "white"},
    }

    def set_color_scheme(is_dark):
        colors = palettes[bool(is_dark)]
        style.configure("TFrame", background=colors["surface"])
        style.configure("TLabel", background=colors["surface"], foreground=colors["ink"])
        text_box.config(bg=colors["text_bg"], fg=colors["ink"], insertbackground=colors["ink"])
        text_box.tag_configure('quote', background='#4a86e8', foreground=colors["ink"])
        window.configure(bg=colors["surface"])

    def highlight_text():
        text_box.delete('1.0', tk.END)
        for _, record in df.iterrows():
            line = record['Text'] + "\n"
            if record['Is Quote']:
                text_box.insert(tk.END, line, 'quote')
            else:
                text_box.insert(tk.END, line)

    set_color_scheme(is_dark_mode)
    highlight_text()
    window.mainloop()
class QuoteIdentifierApp:
    """Main window of the quote identifier.

    Lets the user pick a text file, runs quote identification on it in a
    background thread while showing progress, and then opens the
    visualization window. A light and a dark colour scheme are available.
    """

    def __init__(self, master):
        """Configure the window and build its widgets.

        Args:
            master: The Tk root (or toplevel) window that hosts the app.
        """
        self.master = master
        self.master.title("Quote Identifier")
        self.master.geometry("600x450")
        self.master.resizable(False, False)
        self.style = ttk.Style()
        self.style.theme_use('clam')
        self.is_dark_mode = False
        # Path of the selected text file; None until the user picks one.
        self.filepath = None
        self.create_widgets()
        self.set_light_mode()

    def create_widgets(self):
        """Create and lay out the title, buttons, status label and progress bar."""
        self.main_frame = ttk.Frame(self.master, padding="20")
        self.main_frame.pack(fill=tk.BOTH, expand=True)
        # Kept on the instance so the Font object lives as long as the window.
        self.title_font = Font(family="Helvetica", size=24, weight="bold")
        title_label = ttk.Label(self.main_frame, text="Quote Identifier", font=self.title_font)
        title_label.pack(pady=(0, 20))
        btn_frame = ttk.Frame(self.main_frame)
        btn_frame.pack(fill=tk.X, pady=10)
        self.open_file_btn = ttk.Button(btn_frame, text="Open Text File", command=self.open_file, style="AccentButton.TButton")
        self.open_file_btn.pack(side=tk.LEFT, padx=(0, 10))
        self.identify_quotes_btn = ttk.Button(btn_frame, text="Run Identify Quotes", command=self.identify_quotes, style="AccentButton.TButton")
        self.identify_quotes_btn.pack(side=tk.LEFT)
        self.dark_mode_btn = ttk.Button(self.main_frame, text="Toggle Dark Mode", command=self.toggle_dark_mode, style="TButton")
        self.dark_mode_btn.pack(pady=10)
        self.status_label = ttk.Label(self.main_frame, text="Ready", font=("Helvetica", 12))
        self.status_label.pack(pady=10)
        self.progress_bar = ttk.Progressbar(self.main_frame, orient=tk.HORIZONTAL, length=300, mode='determinate')
        self.progress_bar.pack(pady=10)

    def set_light_mode(self):
        """Apply the light colour scheme."""
        self.style.configure("TFrame", background="#f0f0f0")
        self.style.configure("TButton", background="#e0e0e0", foreground="black")
        self.style.configure("AccentButton.TButton", background="#4a86e8", foreground="white")
        self.style.configure("TLabel", background="#f0f0f0", foreground="black")
        self.master.configure(bg="#f0f0f0")
        self.is_dark_mode = False

    def set_dark_mode(self):
        """Apply the dark colour scheme."""
        self.style.configure("TFrame", background="#2c2c2c")
        self.style.configure("TButton", background="#3c3c3c", foreground="white")
        self.style.configure("AccentButton.TButton", background="#4a86e8", foreground="white")
        self.style.configure("TLabel", background="#2c2c2c", foreground="white")
        self.master.configure(bg="#2c2c2c")
        self.is_dark_mode = True

    def toggle_dark_mode(self):
        """Switch between the light and dark colour schemes."""
        if self.is_dark_mode:
            self.set_light_mode()
        else:
            self.set_dark_mode()

    def open_file(self):
        """Ask the user for a .txt file and remember the chosen path."""
        filepath = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
        if filepath:
            self.status_label.config(text=f"File selected: {filepath}")
            self.filepath = filepath
        else:
            self.status_label.config(text="No file selected")

    def _set_controls_enabled(self, enabled):
        """Enable or disable the buttons that must not be used during a run."""
        flag = "!disabled" if enabled else "disabled"
        for button in (self.open_file_btn, self.identify_quotes_btn):
            button.state([flag])

    def identify_quotes(self):
        """Run quote identification on the selected file in a background thread.

        Shows a warning when no file has been selected. While the worker runs,
        the file and run buttons are disabled so a second run cannot be
        started concurrently.
        """
        filepath = getattr(self, 'filepath', None)
        if not filepath:
            messagebox.showwarning("No File Selected", "Please select a text file first.")
            return
        self.status_label.config(text="Processing... Please wait.")
        self.progress_bar['value'] = 0
        self._set_controls_enabled(False)
        self.master.update()

        def process_quotes():
            # Runs on the worker thread; results and errors are handed back to
            # the GUI through after() rather than touching widgets directly.
            # NOTE(review): calling after() from a non-main thread relies on
            # the Tcl/Tk build supporting it -- confirm, or switch to a queue
            # polled from the main thread.
            try:
                df = Process_txt_into_BERT_quotes_input_dataframe(filepath)
                df = self.fill_is_quote_column_with_progress(df)
            except Exception as exc:
                # Report the failure instead of leaving the UI on "Processing...".
                self.master.after(0, lambda err=exc: self.fail_processing(err))
            else:
                self.master.after(0, lambda: self.finish_processing(df))

        threading.Thread(target=process_quotes, daemon=True).start()

    def fill_is_quote_column_with_progress(self, df):
        """Fill df's 'Is Quote' column row by row, reporting progress to the GUI.

        Args:
            df: DataFrame with 'Context' and 'Text' columns; modified in place.

        Returns:
            The same DataFrame with 'Is Quote' set for every row.
        """
        if 'Is Quote' not in df.columns:
            df['Is Quote'] = None
        total_rows = len(df)
        rows = tqdm(df.iterrows(), total=total_rows, desc="Processing rows", unit="row")
        for done, (index, row) in enumerate(rows, start=1):
            # Write under the row's own index label, not its position, so a
            # non-default index does not cause new rows to be appended.
            df.at[index, 'Is Quote'] = predict_quote(row['Context'], row['Text'])
            progress = done / total_rows * 100
            # Bind the value as a default so each callback keeps its own percentage.
            self.master.after(0, lambda p=progress: self.update_progress(p))
        return df

    def update_progress(self, value):
        """Set the progress bar to value (0-100) and refresh the display."""
        self.progress_bar['value'] = value
        self.master.update_idletasks()

    def fail_processing(self, error):
        """Report a failed run to the user and make the controls usable again.

        Args:
            error: The exception raised by the worker.
        """
        self.status_label.config(text="Quote identification failed.")
        self._set_controls_enabled(True)
        messagebox.showerror("Processing Failed", f"Quote identification failed:\n{error}")

    def finish_processing(self, df):
        """Mark the run as complete and open the visualization window.

        Args:
            df: The DataFrame produced by the worker.
        """
        self.progress_bar['value'] = 100
        self.status_label.config(text="Quote identification complete!")
        # Re-enable before opening the viewer, because that call does not
        # return while the viewer's event loop is running.
        self._set_controls_enabled(True)
        visualize_quotes(df, self.is_dark_mode)
def create_gui():
    """Create the root window, attach the application to it and run the event loop."""
    window = tk.Tk()
    # Keep a reference to the application object while the loop is running.
    _application = QuoteIdentifierApp(window)
    window.mainloop()


# Start the GUI only when the file is executed as a script.
if __name__ == "__main__":
    create_gui()

quotation_identifer_model/checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "distilbert-base-uncased",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.30.0",
+  "vocab_size": 30522
+}

quotation_identifer_model/checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e21aed7c35cfefa93118971a420fe14893d95736dac398704f7633168ca47c78
+size 535701498

quotation_identifer_model/checkpoint-1000/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ae8c959e1d9422d571f25beb32f051c1598a120595f2cc561ad9fc73c38e703
+size 267855978

quotation_identifer_model/checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5317a2fc18eacc03c223a72064f1c14c21c19dc84f0c2d0a7daeb726d218ba3e
+size 14244

quotation_identifer_model/checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:241f68f21f4f92d7d38b9008513df0dfc6d00224a7e125958cd1bf69975fb379
+size 1064

quotation_identifer_model/checkpoint-1000/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

quotation_identifer_model/checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6183a921f240b660c30bb64732af34d3523f7d3cce7f8be9dd27ab5b2e1c4f6
+size 4280

sample_text.txt ADDED Viewed

	@@ -0,0 +1,67 @@

+In the heart of Everwood, a quaint village nestled deep within an enchanted forest, magic was a part of everyday life. The village's most extraordinary residents, though, were the students of Everwood Academy for Magical Arts.
+One crisp autumn morning, young Eliza Thorne and her best friend, Milo Garrison, strolled through the cobblestone streets of the village. Eliza, with her bushy brown hair and curious eyes, was always brimming with questions. Milo, a tall boy with tousled blonde hair and an air of mischief, was more interested in adventure.
+"Eliza, do you think Professor Merriwether really knows where the Enchanted Amulet is hidden?" Milo asked, his eyes twinkling with excitement.
+Eliza adjusted her glasses and replied thoughtfully, "Professor Merriwether mentioned it in passing during last week's Transfiguration class. She seemed quite... mysterious about it."
+"That's what makes it so intriguing," Milo said with a grin. "Imagine if we could find it before anyone else!"
+As they reached the cobbled square, their friend, Fiona Bright, joined them. Fiona was known for her fiery red hair and quick wit. She waved enthusiastically.
+"Morning, you two!" Fiona called out. "I overheard the most fascinating rumor at breakfast. Apparently, the Enchanted Amulet can grant a single wish."
+Eliza's eyes widened. "A wish? That sounds incredible! But it must be protected by powerful magic."
+Just then, a familiar figure approached: Professor Merriwether herself, a tall woman with silver hair that shimmered like moonlight. She had a knowing smile on her face.
+"Good morning, students," Professor Merriwether greeted them warmly. "I see you're in high spirits today."
+"Professor Merriwether!" Milo said, barely containing his excitement. "We were just talking about the Enchanted Amulet. Do you think it’s really hidden somewhere in Everwood?"
+Professor Merriwether’s eyes sparkled with amusement. "Ah, the Enchanted Amulet. It’s a legend many have sought after, but few have found. What makes you so interested?"
+Eliza took a deep breath. "We heard it can grant a wish, and we were wondering if we might try to find it."
+Professor Merriwether chuckled softly. "The amulet does indeed hold great power, but it’s not something to be taken lightly. It requires both wisdom and courage to seek it out. Remember, not all wishes are as straightforward as they seem."
+Fiona’s face lit up. "We’re ready for any challenge! Where should we start?"
+Professor Merriwether paused, considering. "Very well. There is an old map in the library that might help you. But be cautious—it has a mind of its own and is known to test those who seek the amulet."
+The trio exchanged excited glances and hurried to the academy’s grand library. Rows of towering bookshelves and ancient tomes surrounded them. They found the map tucked away in a dusty, leather-bound book.
+Eliza spread the map across a table. "It looks like it leads to the Whispering Woods."
+Milo frowned. "I’ve heard those woods are haunted by mischievous sprites."
+Fiona nodded, her eyes gleaming with determination. "Then we’d better be prepared."
+As they ventured into the Whispering Woods, the air grew thick with magic. The trees seemed to whisper secrets, and shadows danced at the edges of their vision. Suddenly, a group of tiny, glowing sprites appeared, blocking their path.
+"To pass, you must answer our riddles," the sprites chimed in unison.
+Milo stepped forward. "We’re ready. What’s your first riddle?"
+One of the sprites fluttered closer and asked, "I have cities, but no houses. I have mountains, but no trees. I have water, but no fish. What am I?"
+Eliza’s eyes lit up. "A map!"
+The sprites giggled and parted, allowing them to continue. They faced more riddles and magical challenges, each one testing their resolve and teamwork.
+Finally, after what seemed like hours, they arrived at a hidden glen where the Enchanted Amulet rested upon an ancient pedestal. The amulet glowed with a soft, inviting light.
+Eliza hesitated. "Remember what Professor Merriwether said about wishes."
+Milo nodded. "Let’s make sure our wish is for something that truly matters."
+Fiona placed her hand on the amulet and spoke softly. "We wish for the wisdom and courage to always do what’s right."
+The amulet’s glow intensified, and a warm, gentle breeze enveloped them. They felt a deep sense of fulfillment and clarity.
+As they made their way back to Everwood, Professor Merriwether awaited them with a proud smile.
+"You have learned much today," she said. "The true magic lies not in the wish itself, but in the journey and the choices you make along the way."
+The trio smiled, their hearts full of newfound wisdom and friendship. They knew that whatever adventures lay ahead, they would face them together, guided by the lessons they had learned.