Spaces:

TangSan003
/

question-answering-api

Sleeping

App Files Files Community

TangSan003 committed 17 days ago

Commit

8516514

0 Parent(s):

Load model

Browse files

Files changed (17) hide show

.gitattributes +1 -0
__pycache__/inference.cpython-310.pyc +0 -0
__pycache__/model.cpython-310.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
app.py +41 -0
finetune_roberta_qa.py +229 -0
inference.py +104 -0
model.py +399 -0
requirements.txt +4 -0
save_model/merges.txt +0 -0
save_model/model.safetensors +3 -0
save_model/special_tokens_map.json +51 -0
save_model/tokenizer.json +0 -0
save_model/tokenizer_config.json +60 -0
save_model/training_args.bin +0 -0
save_model/vocab.json +0 -0
utils.py +220 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.safetensors filter=lfs diff=lfs merge=lfs -text

__pycache__/inference.cpython-310.pyc ADDED Viewed

Binary file (2.59 kB). View file

__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (10.5 kB). View file

__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (5.29 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from inference import InferenceModel
+import traceback
+app = Flask(__name__)
+CORS(app)
+try:
+    model = InferenceModel(path_to_weights="save_model/model.safetensors", huggingface_model=True)
+except Exception as e:
+    print("❌ Lỗi khi load mô hình:")
+    traceback.print_exc()
+    model = None
+@app.route('/pred', methods=['POST'])
+def prediction():
+    payload = request.get_json()
+    # Lấy dữ liệu từ request
+    context = payload.get('context', '')
+    question = payload.get('question', '')
+    # # In ra terminal
+    # print("\n===== Nhận yêu cầu mới =====")
+    # print(f"Context: {context}")
+    # print(f"Question: {question}")
+    # Gọi mô hình
+    prediction = model.inference_model(question, context)
+    answer = prediction["answer"]
+    return jsonify({"answer": answer}), 200
+if __name__ == '__main__':
+    app.run(port=5000, debug=True)
+# Chayj server

finetune_roberta_qa.py ADDED Viewed

	@@ -0,0 +1,229 @@

+import os
+import logging
+os.environ["WANDB_PROJECT"] = "RoBERTa_QA_Finetune"
+import argparse
+from datasets import load_dataset
+from transformers import (
+    RobertaTokenizerFast,
+    DefaultDataCollator,
+    TrainingArguments,
+    Trainer,
+)
+import torch
+from utils import RobertaConfig, ExtractiveQAPreProcesing
+from model import RobertaForQuestionAnswering
+import warnings
+warnings.filterwarnings("ignore")
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="Wav2Vec2 Finetuning Arguments on Librispeech")
+    ### Experiment Logging ###
+    parser.add_argument(
+        "--experiment_name",
+        required=True,
+        type=str
+    )
+    parser.add_argument(
+        "--working_directory",
+        required=True,
+        type=str
+    )
+    parser.add_argument(
+        "--path_to_cache_dir",
+        help="Path to huggingface cache if different from default",
+        default=None,
+        type=str
+    )
+    parser.add_argument(
+        "--num_train_epochs",
+        help="Number of epochs you want to train for",
+        default=3,
+        type=int
+    )
+    parser.add_argument(
+        "--save_steps",
+        help="After how many steps do you want to log a checkpoint",
+        default=500,
+        type=int
+    )
+    parser.add_argument(
+        "--eval_steps",
+        help="After how many steps do you want to evaluate on eval data",
+        default=500,
+        type=int
+    )
+    parser.add_argument(
+        "--logging_steps",
+        help="After how many steps do you want to log to Weights and Biases (if installed)",
+        default=500,
+        type=int
+    )
+    parser.add_argument(
+        "--warmup_steps",
+        help="Number of learning rate warmup steps",
+        default=100,
+        type=int
+    )
+    ### Training Arguments ###
+    parser.add_argument(
+        "--per_device_batch_size",
+        help="Batch size for every gradient accumulation steps",
+        default=2,
+        type=int
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        help="Number of gradient accumulation steps you want",
+        default=2,
+        type=int
+    )
+    parser.add_argument(
+        "--learning_rate",
+        help="Max learning rate that we warmup to",
+        default=2e-5,
+        type=float
+    )
+    parser.add_argument(
+        "--weight_decay",
+        help="Weight decay applied to model parameters during training",
+        default=0.01,
+        type=float
+    )
+    parser.add_argument(
+        "--save_total_limit",
+        help="Max number of checkpoints to save",
+        default=4,
+        type=int
+    )
+    ### Backbone Arguments ###
+    parser.add_argument(
+        "--huggingface_model_name",
+        help="Name for pretrained RoBERTa backbone and Tokenizer",
+        default="deepset/roberta-base-squad2",
+        type=str
+    )
+    parser.add_argument(
+        "--path_to_pretrained_backbone",
+        help="Path to model weights stored from our pretraining to initialize the backbone",
+        default=None,
+        type=str
+    )
+    parser.add_argument(
+        "--pretrained_backbone",
+        help="Do you want want a `pretrained` backbone that we made (need to provide path_to_pretrained_backbone), \
+            `pretrained_huggingface` backbone (then need huggingface_model_name), or `random` initialized backbone",
+        choices=("pretrained", "pretrained_huggingface", "random"),
+        type=str
+    )
+    parser.add_argument('--resume_from_checkpoint', type=str, default=None)
+    parser.add_argument('--model_name_or_path', type=str, default="roberta-base")
+    args = parser.parse_args()
+    return args
+### Load Arguments ###
+args = parse_arguments()
+def load_tokenizer(model_name):
+    try:
+        return RobertaTokenizerFast.from_pretrained(model_name)
+    except Exception as e:
+        logging.error(f"Failed to load tokenizer: {e}")
+        raise
+def load_model(config):
+    try:
+        return RobertaForQuestionAnswering(config)
+    except Exception as e:
+        logging.error(f"Failed to load model: {e}")
+        raise
+# Emit INFO-level progress messages for the rest of the script.
+logging.basicConfig(level=logging.INFO)
+logging.info("----------Loading dataset and tokenizer----------")
+### Load Tokenizer ###
+tokenizer = RobertaTokenizerFast.from_pretrained(args.huggingface_model_name)
+### Load Dataset and Preprocess ###
+dataset = load_dataset("stanfordnlp/coqa")
+# ExtractiveQAPreProcesing is imported from utils; it is applied to batches of examples.
+processor = ExtractiveQAPreProcesing()
+# The raw columns are dropped so only the processor's outputs remain.
+# NOTE: despite the variable name, the data is CoQA, not SQuAD.
+tokenized_squad = dataset.map(processor, batched=True, remove_columns=dataset["train"].column_names)
+# print(tokenized_squad.column_names)
+### Load Model ###
+if args.resume_from_checkpoint is not None:
+    config = RobertaConfig(pretrained_backbone=args.pretrained_backbone,
+                           path_to_pretrained_weights=args.path_to_pretrained_backbone)
+    model = RobertaForQuestionAnswering(config)
+    model.load_state_dict(torch.load(f"{args.resume_from_checkpoint}/training_args.bin", map_location="cpu"))
+else:
+    config = RobertaConfig(pretrained_backbone=args.pretrained_backbone,
+                           path_to_pretrained_weights=args.path_to_pretrained_backbone)
+    model = RobertaForQuestionAnswering(config)
+### Load Default Collator, We padded to longest length so no padding necessary ##
+# NOTE(review): this relies on the preprocessing step already producing equal-length
+# examples; confirm in ExtractiveQAPreProcesing (utils.py).
+data_collator = DefaultDataCollator()
+### Define Training Arguments ###
+# Checkpoints and logs are written to <working_directory>/<experiment_name>.
+training_args = TrainingArguments(
+    output_dir=os.path.join(args.working_directory, args.experiment_name),
+    per_device_train_batch_size=args.per_device_batch_size,
+    gradient_accumulation_steps=args.gradient_accumulation_steps,
+    # evaluation_strategy="steps",
+    num_train_epochs=args.num_train_epochs,
+    # NOTE(review): bf16 is always on, so this presumably requires bf16-capable hardware.
+    bf16=True,
+    save_steps=args.save_steps,
+    # NOTE(review): with evaluation_strategy commented out above, eval_steps may have no effect — confirm.
+    eval_steps=args.eval_steps,
+    logging_steps=args.logging_steps,
+    learning_rate=args.learning_rate,
+    weight_decay=args.weight_decay,
+    warmup_steps=args.warmup_steps,
+    save_total_limit=args.save_total_limit,
+    run_name=args.experiment_name,
+)
+# Wire the model, the tokenized train/validation splits, the tokenizer and the collator together.
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_squad["train"],
+    eval_dataset=tokenized_squad["validation"],
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+)
+### TRAIN MODEL !!! ###
+# trainer.train()
+trainer.train(resume_from_checkpoint="model/RoBERTa/finetune_qa_hf_roberta_backbone/checkpoint-54324")
+### Save Final Model ###
+trainer.save_model("/home/tangsan/AllNlpProject/CoQAChat/model/RoBERTa/save_model")

inference.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import torch
+from transformers import RobertaTokenizerFast
+from utils import RobertaConfig
+from model import RobertaForQuestionAnswering
+from safetensors.torch import load_file
+from datasets import load_dataset
+from pprintpp import pprint
+class InferenceModel:
+    """
+        Quick inference function that works with the models we have trained!
+    """
+    def __init__(self, path_to_weights, huggingface_model=True):
+        ### Init Config with either Huggingface Backbone or our own ###
+        self.config = RobertaConfig(pretrained_backbone="pretrained_huggingface" if huggingface_model else "random")
+        ### Load Tokenizer ###
+        self.tokenizer = RobertaTokenizerFast.from_pretrained(self.config.hf_model_name)
+        ### Load Model ###
+        self.model = RobertaForQuestionAnswering(self.config)
+        weights = load_file(path_to_weights)
+        self.model.load_state_dict(weights)
+        self.model.eval()
+    def inference_model(self,
+                        question,
+                        context):
+        ### Tokenize Text
+        inputs = self.tokenizer(text=question,
+                                text_pair=context,
+                                max_length=self.config.context_length,
+                                truncation="only_second",
+                                return_tensors="pt")
+        pass
+        ### Pass through Model ####
+        with torch.no_grad():
+            start_token_logits, end_token_logits = self.model(**inputs)
+        ### Grab Start and End Token Idx ###
+        start_token_idx = start_token_logits.squeeze().argmax().item()
+        end_token_idx = end_token_logits.squeeze().argmax().item()
+        ### Slice Tokens and then Decode with Tokenizer (+1 because slice is not right inclusive) ###
+        tokens = inputs["input_ids"].squeeze()[start_token_idx:end_token_idx + 1]
+        answer = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
+        prediction = {"start_token_idx": start_token_idx,
+                      "end_token_idx": end_token_idx,
+                      "answer": answer}
+        return prediction
+# Manual smoke test: run one CoQA validation question through the saved model.
+if __name__ == "__main__":
+    dataset = load_dataset("stanfordnlp/coqa")
+    # Use the third validation example.
+    data = dataset["validation"][2]
+    # data = dataset["train"][0]
+    # print("answer:", data["answers"])
+    ### Sample Text ###
+    context = data["story"]
+    print("context:", context)
+    # Fifth question attached to this story.
+    question = data["questions"][4]
+    # Tokenize separately with offsets so predicted token indices can be mapped
+    # back to character positions below.
+    tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")
+    encoded = tokenizer(
+        question,
+        context,
+        max_length=512,
+        truncation="only_second",
+        padding="max_length",
+        return_offsets_mapping=True,
+        return_tensors="pt"
+    )
+    offset_mapping = encoded["offset_mapping"][0].tolist()  # convert to list of tuples
+    input_ids = encoded["input_ids"][0]
+    ### Inference Model ###
+    path_to_weights = "model/RoBERTa/save_model/model.safetensors"
+    inferencer = InferenceModel(path_to_weights=path_to_weights, huggingface_model=True)
+    prediction = inferencer.inference_model(question, context)
+    print("\n----------------------------------")
+    print("results:", prediction)
+    start_token_idx = prediction["start_token_idx"]
+    end_token_idx = prediction["end_token_idx"]
+    # Character span: start offset of the first token to end offset of the last token.
+    # NOTE(review): assumes both predicted indices fall inside the context segment,
+    # since the offsets are used to slice `context` — confirm.
+    start_char = offset_mapping[start_token_idx][0]
+    end_char = offset_mapping[end_token_idx][1]
+    print("Question:", question)
+    print("Recovered answer:", context[start_char:end_char])
+    # test model

model.py ADDED Viewed

	@@ -0,0 +1,399 @@

+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from safetensors.torch import load_file
+from transformers import RobertaModel as HFRobertaModel
+from utils import RobertaConfig
+from pprintpp import pprint
+class RobertaEmbeddings(nn.Module):
+    """
+    Converts our tokens to embedding vectors and then adds positional embeddings (and potentially token type embeddings)
+    to our data! We wont need to token type embeddings until we do our QA finetuning.
+    """
+    def __init__(self, config):
+        super(RobertaEmbeddings, self).__init__()
+        ### Embeddings for Tokens ###
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_dimension, padding_idx=config.pad_token)
+        ### Positional Embeddings ###
+        self.position_embeddings = nn.Embedding(config.context_length, config.embedding_dimension)
+        ### Layernorm and Dropout ###
+        self.layernorm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_p)
+    def forward(self, input_ids):
+        batch_size, seq_length = input_ids.shape
+        ### Convert Tokens to Embeddings ###
+        x = self.word_embeddings(input_ids)
+        ### Add Positional Information ###
+        avail_idx = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
+        pos_embed = self.position_embeddings(avail_idx)
+        x = x + pos_embed
+        x = self.layernorm(x)
+        x = self.dropout(x)
+        return x
+class RobertaAttention(nn.Module):
+    """
+    Regular Self-Attention but in this case we utilize flash_attention
+    incorporated in the F.scaled_dot_product_attention to speed up our training.
+    """
+    def __init__(self, config):
+        super(RobertaAttention, self).__init__()
+        ### Store Config ###
+        self.config = config
+        ### Sanity Checks ###
+        assert config.embedding_dimension % config.num_attention_heads == 0, "Double check embedding dim divisible by number of heads"
+        ### Attention Head Dim ###
+        self.head_dim = config.embedding_dimension // config.num_attention_heads
+        ### Attention Projections ###
+        self.q_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+        self.k_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+        self.v_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+        ### Post Attention Projection ###
+        self.out_proj = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+    def forward(self, x, attention_mask=None):
+        ### Store Shape ###
+        batch, seq_len, embed_dim = x.shape
+        ### Compute Attention with Flash Attention ###
+        q = self.q_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1,
+                                                                                                             2).contiguous()
+        k = self.k_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1,
+                                                                                                             2).contiguous()
+        v = self.v_proj(x).reshape(batch, seq_len, self.config.num_attention_heads, self.head_dim).transpose(1,
+                                                                                                             2).contiguous()
+        ### Compute Attention (Attention Mask has shape Batch x Sequence len x Sequence len) ###
+        attention_out = F.scaled_dot_product_attention(q, k, v,
+                                                       attn_mask=attention_mask,
+                                                       dropout_p=self.config.attention_dropout_p if self.training else 0.0)
+        ### Compute Output Projection ###
+        attention_out = attention_out.transpose(1, 2).flatten(2)
+        attention_out = self.out_proj(attention_out)
+        return attention_out
+class RobertaFeedForward(nn.Module):
+    """
+    Regular MLP module after our attention computation.
+    """
+    def __init__(self, config):
+        super(RobertaFeedForward, self).__init__()
+        hidden_size = config.embedding_dimension * config.mlp_ratio
+        self.intermediate_dense = nn.Linear(config.embedding_dimension, hidden_size)
+        self.activation = nn.GELU()
+        self.intermediate_dropout = nn.Dropout(config.hidden_dropout_p)
+        self.output_dense = nn.Linear(hidden_size, config.embedding_dimension)
+        self.output_dropout = nn.Dropout(config.hidden_dropout_p)
+    def forward(self, x):
+        x = self.intermediate_dense(x)
+        x = self.activation(x)
+        x = self.intermediate_dropout(x)
+        x = self.output_dense(x)
+        x = self.output_dropout(x)
+        return x
+class RobertaEncoderLayer(nn.Module):
+    """
+    Single transformer block stacking together Attention and our FeedForward
+    layers, with normalization and residual connections.
+    """
+    def __init__(self, config):
+        super(RobertaEncoderLayer, self).__init__()
+        self.attention = RobertaAttention(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_p)
+        self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+        self.feed_forward = RobertaFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+    def forward(self, x, attention_mask=None):
+        x = x + self.dropout(self.attention(x, attention_mask=attention_mask))
+        x = self.layer_norm(x)
+        x = x + self.feed_forward(x)
+        x = self.final_layer_norm(x)
+        return x
+class RobertaEncoder(nn.Module):
+    """
+    This will be the stack of all of our transformer blocks
+    """
+    def __init__(self, config):
+        super(RobertaEncoder, self).__init__()
+        self.config = config
+        ### Transformer Layers ###
+        self.layers = nn.ModuleList(
+            [
+                RobertaEncoderLayer(config) for _ in range(config.num_transformer_blocks)
+            ]
+        )
+    def forward(
+            self,
+            x,
+            attention_mask=None,
+    ):
+        batch_size, seq_len, embed_dim = x.shape
+        if attention_mask is not None:
+            ### Make Sure Attention Mask is a Boolean Tensor ###
+            attention_mask = attention_mask.bool()
+            ### Now our Attention Mask is in (Batch x Sequence Length) where we have 0 for tokens we don't want to attend to ###
+            ### F.scaled_dot_product_attention expects a mask of the shape (Batch x ..., x Seq_len x Seq_len) ###
+            ### the "..." in this case is any extra dimensions (such as heads of attention). lets expand our mask to (Batch x 1 x Seq_len x Seq_len) ###
+            ### The 1 in this case refers to the number of heads of attention we want, so it is a dummy index to broadcast over ###
+            ### In each (Seq_len x Seq_len) matrix for every batch, we want False for all columns corresponding to padding tokens ###
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, 1, seq_len, 1)
+        for layer in self.layers:
+            x = layer(x, attention_mask=attention_mask)
+        return x
+class RobertaMLMHead(nn.Module):
+    """
+    The Masked Language model head is a stack of two linear layers with an activation in between!
+    """
+    def __init__(self, config):
+        super(RobertaMLMHead, self).__init__()
+        self.config = config
+        ### Projection Layer for Hidden States ###
+        self.dense = nn.Linear(config.embedding_dimension, config.embedding_dimension)
+        self.layer_norm = nn.LayerNorm(config.embedding_dimension, eps=config.layer_norm_eps)
+        self.activation = nn.GELU()
+        ### Mapping to Vocabulary ###
+        self.decoder = nn.Linear(config.embedding_dimension, config.vocab_size)
+    def forward(self, inputs):
+        ### Pass through Projection/Activation/Norm ###
+        x = self.dense(inputs)
+        x = self.activation(x)
+        x = self.layer_norm(x)
+        ### Prediction of Masked Tokens ###
+        x = self.decoder(x)
+        return x
+class RobertaModel(nn.Module):
+    """
+    Backbone of our model, has to be pretrained via MLM on a ton of data!
+    """
+    def __init__(self, config):
+        super(RobertaModel, self).__init__()
+        self.config = config
+        ### Define all Parts of the Model ###
+        self.embeddings = RobertaEmbeddings(config)
+        self.encoder = RobertaEncoder(config)
+    def forward(self, input_ids, attention_mask=None):
+        embeddings = self.embeddings(input_ids)
+        output = self.encoder(embeddings, attention_mask)
+        return output
+class RobertaForMaskedLM(nn.Module):
+    """
+    This model will perform the masked language modeling task.
+    """
+    def __init__(self, config):
+        super(RobertaForMaskedLM, self).__init__()
+        self.config = config
+        ### Define Model and MLM Head ###
+        self.roberta = RobertaModel(config)
+        self.mlm_head = RobertaMLMHead(config)
+        self.apply(_init_weights_)
+    def forward(self,
+                input_ids,
+                attention_mask=None,
+                labels=None):
+        ### Pass data through model ###
+        hidden_states = self.roberta(input_ids,
+                                     attention_mask)
+        preds = self.mlm_head(hidden_states)
+        ### Compute Loss if Labels are Available ###
+        loss = None
+        if labels is not None:
+            ### Flatten Logits to (B*S x N) and Labels to (B*S) ###
+            preds = preds.flatten(end_dim=1)
+            labels = labels.flatten()
+            loss = F.cross_entropy(preds, labels)
+            return hidden_states, preds, loss
+        else:
+            return hidden_states, preds
+class RobertaForQuestionAnswering(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.load_backbone()
+        self.qa_head = nn.Linear(config.embedding_dimension, 2)
+    def load_backbone(self):
+        if self.config.pretrained_backbone == "pretrained_huggingface":
+            print("Loading Huggingface RoBERTa Model")
+            self.roberta = HFRobertaModel.from_pretrained(self.config.hf_model_name)
+        else:
+            self.roberta = RobertaModel(self.config)
+            if self.config.pretrained_backbone == "pretrained":
+                # state_dict = load_file(self.config.path_to_pretrained_weights)
+                # print(self.config.path_to_pretrained_weights)
+                if self.config.path_to_pretrained_weights is None:
+                    # state_dict = HFRobertaModel.from_pretrained(RobertaConfig.hf_model_name).state_dict()
+                    raise Exception(
+                        "Provide the argument `path_to_pretrained_weights` in the config, else we cant load them!")
+                else:
+                    if not os.path.isfile(self.config.path_to_pretrained_weights):
+                        raise Exception(
+                            f"Provided path to safetensors weights {self.config.path_to_pretrained_weights} is invalid!")
+                    print(f"Loading RobertaModel Backbone from {self.config.path_to_pretrained_weights}")
+                    state_dict = load_file(self.config.path_to_pretrained_weights)
+                    # Filter and rename keys
+                    backbone_keys = {}
+                    for key in state_dict.keys():
+                        if "roberta" in key:
+                            new_key = key.replace("roberta.", "")
+                            backbone_keys[new_key] = state_dict[key]
+                        else:
+                            continue
+                    self.roberta.load_state_dict(backbone_keys)
+    def forward(self,
+                input_ids,
+                attention_mask=None,
+                start_positions=None,
+                end_positions=None):
+        if self.config.pretrained_backbone == "pretrained_huggingface":
+            output = self.roberta(input_ids, attention_mask=attention_mask).last_hidden_state
+        else:
+            output = self.roberta(input_ids, attention_mask=attention_mask)
+        logit = self.qa_head(output)
+        #
+        start_logits, end_logits = logit.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+        if start_positions is not None and end_positions is not None:
+            #
+            if len(start_positions.size()) >1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+            start_loss = F.cross_entropy(start_logits, start_positions, ignore_index=ignored_index)
+            end_loss = F.cross_entropy(end_logits, end_positions, ignore_index=ignored_index)
+            total_loss = (start_loss + end_loss) / 2
+            return total_loss, start_logits, end_logits
+        return start_logits, end_logits
+def _init_weights_(module):
+    """
+    Simple weight intialization taken directly from the huggingface
+    `modeling_roberta.py` implementation!
+    """
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=0.02)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=0.02)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+# Manual smoke test: build the QA model from a local checkpoint and run one
+# forward pass with labels on random token ids.
+if __name__ == "__main__":
+    # NOTE(review): the weights path is a machine-specific absolute path.
+    config = RobertaConfig(pretrained_backbone = "pretrained",
+                           path_to_pretrained_weights="/home/tangsan/AllNlpProject/CoQAChat/model/RoBERTa/finetune_qa_hf_roberta_backbone/checkpoint-27162/model.safetensors")
+    model = RobertaForQuestionAnswering(config=config)
+    # Batch of 4 sequences with 8 random token ids each.
+    rand= torch.randint(0,100,size=(4,8))
+    start_positions=torch.tensor([1,2,3,4])
+    # The last end position (8) equals the sequence length, so forward() treats it as ignored.
+    end_positions=torch.tensor([5,6,7,8])
+    model(rand, start_positions=start_positions, end_positions=end_positions)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+fastapi
+uvicorn
+# app.py imports flask and flask_cors
+flask
+flask-cors
+transformers
+torch
+# imported at module level by inference.py, model.py and utils.py
+safetensors
+datasets
+pprintpp

save_model/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

save_model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf41fc1b853a2a0fc06599fa407a82f96d6aa4e2a6650347d07abc09defccd30
+size 498612792

save_model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

save_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

save_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "full_tokenizer_file": null,
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

save_model/training_args.bin ADDED Viewed

Binary file (5.78 kB). View file

save_model/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

utils.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import torch
+import random
+from typing import Literal
+# Import-time diagnostics: these prints run whenever this module is imported.
+print("CUDA available:", torch.cuda.is_available())
+print("CUDA device count:", torch.cuda.device_count())
+# Only query the device name when a CUDA device is actually available.
+if torch.cuda.is_available():
+    print("GPU name:", torch.cuda.get_device_name(0))
+from datasets.features.video import Example
+from transformers import RobertaTokenizerFast, PretrainedConfig
+from dataclasses import dataclass, asdict
+from datasets import load_dataset
+from pprintpp import pprint
+@dataclass
+class RobertaConfig(PretrainedConfig):
+    ### Tokenizer Config
+    vocab_size: int = 50265
+    start_token: int = 0
+    end_token: int = 2
+    pad_token: int = 2
+    mask_token: int = 50264
+    ### Transformer Config ###
+    embedding_dimension: int = 768
+    num_transformer_blocks: int = 12
+    num_attention_heads: int = 12
+    mlp_ratio: int = 4
+    layer_norm_eps: float = 1e-6
+    hidden_dropout_p: float = 0.1
+    attention_dropout_p: float = 0.1
+    context_length: int = 512
+    ### Masking Config ###
+    masking_prob: float = 0.15
+    ### Huggingface Config ###
+    hf_model_name: str = "deepset/roberta-base-squad2"
+    ### Model Config ###
+    pretrained_backbone: Literal["pretrained", "pretrained_huggingface", "random"] = "pretrained"
+    max_position_embeddings: int = 512
+    path_to_pretrained_weights: str = None
+    ### Added in to_dict() method so this Config is compatible with Huggingface Trainer!!! ###
+    def to_dict(self):
+        return asdict(self)
def random_masking_text(tokens,
                        special_tokens_mask,
                        vocab_size=50264,
                        special_ids=(0, 1, 2, 3, 50264),
                        mask_ratio=0.15,
                        mask_token=50264):
    """
    Apply BERT/RoBERTa-style random masking to a tensor of token ids.

    Every position that is not flagged as special is selected with probability
    ``mask_ratio``. Each selected position is then, independently:
        - replaced with ``mask_token`` with probability 0.8
        - replaced with a random non-special token id with probability 0.1
        - left unchanged with probability 0.1

    Args:
        tokens: Integer tensor of token ids (any rank, typically batch x sequence).
            It is modified IN PLACE; clone it first if the original ids are still needed.
        special_tokens_mask: Tensor (or nested sequence) with the same shape as
            ``tokens``; positions equal to 1 are never selected.
        vocab_size: Random replacement ids are drawn from ``range(vocab_size)``.
        special_ids: Ids excluded from the random replacement candidates.
        mask_ratio: Per-position selection probability.
        mask_token: Id written at positions chosen for masking.

    Returns:
        ``(tokens, labels)`` where ``tokens`` is the same (mutated) tensor object and
        ``labels`` is an int64 tensor holding the original id at every selected
        position and -100 everywhere else (-100 is the default ``ignore_index`` of
        ``torch.nn.CrossEntropyLoss``).

    Raises:
        ValueError: If a random replacement is required but removing ``special_ids``
            from ``range(vocab_size)`` leaves no candidate id.
    """
    # Build every helper tensor on the same device as the input so CUDA batches work.
    device = tokens.device
    special_positions = torch.as_tensor(special_tokens_mask, device=device) == 1

    ### Uniform sample per position; special tokens get 1 so they never fall below the threshold ###
    selection_scores = torch.rand(tokens.shape, device=device)
    selection_scores[special_positions] = 1
    selected = selection_scores < mask_ratio

    ### Labels: original id where selected, -100 elsewhere ###
    labels = torch.full(tokens.shape, -100, dtype=torch.long, device=device)
    labels[selected] = tokens[selected].to(labels.dtype)

    ### Coordinates of the selected positions, one row per position ###
    selected_idx = selected.nonzero()

    ### 80% of the selected positions receive the mask token ###
    use_mask_token = torch.rand(len(selected_idx), device=device) < 0.8
    idx_for_masking = selected_idx[use_mask_token]
    remaining_idx = selected_idx[~use_mask_token]

    ### Half of the remaining 20% receive a random token; the rest are left as they are ###
    use_random_token = torch.rand(len(remaining_idx), device=device) < 0.5
    idx_for_random_filling = remaining_idx[use_random_token]

    ### Fill Mask Tokens ###
    if len(idx_for_masking) > 0:
        # unbind turns the (count, rank) coordinate matrix into one index tensor per
        # dimension, so the assignment works for any input rank.
        tokens[tuple(idx_for_masking.unbind(dim=1))] = mask_token

    ### Fill Random Tokens ###
    if len(idx_for_random_filling) > 0:
        candidate_ids = sorted(set(range(vocab_size)) - set(special_ids))
        if not candidate_ids:
            raise ValueError(
                "No candidate ids left for random replacement: "
                "range(vocab_size) minus special_ids is empty"
            )
        # Draw WITH replacement: each position gets an independent random id, and the
        # draw cannot fail when there are more positions than candidate ids.
        replacement_ids = torch.tensor(
            random.choices(candidate_ids, k=len(idx_for_random_filling)),
            dtype=tokens.dtype,
            device=device,
        )
        tokens[tuple(idx_for_random_filling.unbind(dim=1))] = replacement_ids

    return tokens, labels
def _context_token_bounds(sequence_ids):
    """Return (first, last) token index of the first run of sequence id 1, or (None, None)."""
    first = None
    last = None
    for idx, seq_id in enumerate(sequence_ids):
        if seq_id == 1:
            if first is None:
                first = idx
            last = idx
        elif first is not None:
            # The context run has ended; anything after it is not context.
            break
    return first, last


def _locate_answer_tokens(offsets, sequence_ids, start_char, end_char):
    """Map a character span of the context onto (start_token, end_token); (0, 0) if not locatable."""
    context_start, context_end = _context_token_bounds(sequence_ids)
    if context_start is None:
        # The encoding holds no context tokens at all.
        return 0, 0

    context_start_char = offsets[context_start][0]
    context_end_char = offsets[context_end][-1]
    if start_char < context_start_char or end_char > context_end_char:
        # The span is not fully inside the (possibly truncated) context.
        return 0, 0

    start_token_idx = None
    end_token_idx = None
    for token_idx, ((token_start, token_end), seq_id) in enumerate(zip(offsets, sequence_ids)):
        if seq_id != 1:
            continue
        # Both offsets are treated as inclusive; when several tokens cover the same
        # character the last one wins, which keeps the original labelling behaviour.
        if token_start <= start_char <= token_end:
            start_token_idx = token_idx
        if token_start <= end_char <= token_end:
            end_token_idx = token_idx

    if start_token_idx is None or end_token_idx is None:
        # A span boundary fell between tokens (e.g. in a whitespace gap). Use the same
        # (0, 0) label as the out-of-context case instead of emitting None.
        return 0, 0
    return start_token_idx, end_token_idx


def ExtractiveQAPreProcesing():
    """
    Build the batched preprocessing function for extractive question answering.

    Loads the "deepset/roberta-base-squad2" fast tokenizer once and returns a closure
    that tokenizes (question, story) pairs and converts character-level answer spans
    into token-level ``start_positions`` / ``end_positions``.

    Returns:
        A function ``char2token_mapping(examples)``. ``examples`` is a batch mapping with:
            - "questions": a list (one entry per story) of lists of question strings
            - "story": a list of story strings
            - "answers": a list (one entry per story) of mappings whose "answer_start"
              and "answer_end" lists hold one character offset per question
        It returns the tokenizer output (padded/truncated to 512 tokens, story truncated
        only) with two added keys, "start_positions" and "end_positions". An answer that
        cannot be located inside the encoded story is labelled (0, 0).
    """
    tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")

    def char2token_mapping(examples):
        # Flatten to one row per question, repeating each story once per question.
        questions = [q.strip() for sublist in examples["questions"] for q in sublist]
        stories = []
        for story, sublist in zip(examples["story"], examples["questions"]):
            stories.extend([story] * len(sublist))

        encoded = tokenizer(
            text=questions,
            text_pair=stories,
            max_length=512,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )
        # The offsets are only needed to build the labels, so they are removed from the output.
        offset_mapping = encoded.pop("offset_mapping")

        # Flatten the per-story answer offsets so they line up with the question rows.
        answer_start = []
        answer_end = []
        for answer_group in examples["answers"]:
            answer_start.extend(answer_group["answer_start"])
            answer_end.extend(answer_group["answer_end"])

        starting_token_idxs = []
        ending_token_idxs = []
        for i, offsets in enumerate(offset_mapping):
            start_token_idx, end_token_idx = _locate_answer_tokens(
                offsets, encoded.sequence_ids(i), answer_start[i], answer_end[i]
            )
            starting_token_idxs.append(start_token_idx)
            ending_token_idxs.append(end_token_idx)

        encoded["start_positions"] = starting_token_idxs
        encoded["end_positions"] = ending_token_idxs
        return encoded

    return char2token_mapping
+if __name__ == "__main__":
+    datasets = load_dataset("stanfordnlp/coqa")
+    # print(datasets)
+    processor = ExtractiveQAPreProcesing()
+    data = datasets["train"][:1]
+    print("Raw Data:", data["answers"])
+    result = processor(data)
+    # pprint(processor(data))
+# Train model