File size: 7,395 Bytes
c896641
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"},"kaggle":{"accelerator":"none","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# Install necessary libraries\n!pip install transformers pandas datasets accelerate","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Additional installations for PyTorch and CUDA\n!pip install torch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import re\nimport pandas as pd\nimport torch\nfrom datasets import Dataset\nfrom transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup\nfrom accelerate import Accelerator","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Text cleaning functions\ndef fix_text(text):\n    text = text.replace('&amp;', '&')\n    text = text.replace('&lt;', '<')\n    text = text.replace('&gt;', '>')\n    return text\n\ndef clean_tweet(tweet, allow_new_lines=False):\n    bad_start = ['http:', 'https:']\n    for w in bad_start:\n        tweet = re.sub(f\" {w}\\\\S+\", \"\", tweet)  # removes white space before url\n        tweet = re.sub(f\"{w}\\\\S+ \", \"\", tweet)  # in case a tweet starts with a url\n        tweet = re.sub(f\"\\n{w}\\\\S+ \", \"\", tweet)  # in case the url is on a new line\n        tweet = re.sub(f\"\\n{w}\\\\S+\", \"\", tweet)  # in case the url is alone on a new line\n        tweet = re.sub(f\"{w}\\\\S+\", \"\", tweet)  # any other case?\n    tweet = re.sub(' +', ' ', tweet)  # replace multiple spaces with one space\n    if not allow_new_lines:  # remove new lines\n        tweet = ' '.join(tweet.split())\n    return tweet.strip()\n\ndef boring_tweet(tweet):\n    \"Check if this is a boring tweet\"\n    boring_stuff = ['http', '@', '#']\n    not_boring_words = len([None for w in tweet.split() if all(bs not in w.lower() for bs in boring_stuff)])\n    return not_boring_words < 3","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Load and filter the dataset for a specified party\ndef load_and_filter_data(party):\n    curated_tweets = pd.read_csv('/kaggle/input/curated-tweets/curated_tweets.csv')\n    data = curated_tweets[curated_tweets.Partei == party][['text']].astype(str)\n    return data","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Initialize tokenizer\ndef initialize_tokenizer():\n    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='', eos_token='', pad_token='')\n    return tokenizer","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Prepare dataset for training\ndef prepare_dataset(data, tokenizer):\n    training_examples = f' ' + data['text'] + ''\n    task_df = pd.DataFrame({'text': training_examples})\n    tweet_data = Dataset.from_pandas(task_df)\n\n    def preprocess(example):\n        return tokenizer(example['text'], truncation=True)\n\n    tweet_data = tweet_data.map(preprocess, batched=False)\n    tweet_data = tweet_data.train_test_split(train_size=.8)\n    return tweet_data","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Initialize model and related components\ndef initialize_model_and_components(tokenizer):\n    model = GPT2LMHeadModel.from_pretrained('gpt2')\n    model.resize_token_embeddings(len(tokenizer))\n    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n    return model, data_collator","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set training arguments\ndef set_training_arguments():\n    training_args = TrainingArguments(\n        output_dir=\"/kaggle/working/tweets\",\n        overwrite_output_dir=True,\n        num_train_epochs=3,\n        per_device_train_batch_size=6,\n        per_device_eval_batch_size=6,\n        load_best_model_at_end=True,\n        log_level='info',\n        evaluation_strategy='epoch',\n        save_strategy='epoch',\n        learning_rate=2e-4,\n        warmup_steps=1e2,\n        seed=38,\n        report_to=\"none\",\n    )\n    return training_args","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Train and evaluate the model\ndef train_and_evaluate_model(model, training_args, tweet_data, data_collator):\n    optimizer = AdamW(model.parameters(), lr=2e-4, eps=1e-8)\n    total_steps = len(tweet_data[\"train\"]) * training_args.num_train_epochs\n    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1e2, num_training_steps=total_steps)\n\n    trainer = Trainer(\n        model=model,\n        args=training_args,\n        train_dataset=tweet_data[\"train\"],\n        eval_dataset=tweet_data[\"test\"],\n        data_collator=data_collator,\n        optimizers=(optimizer, scheduler),\n    )\n    trainer.train()\n    trainer.evaluate()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Generate text using the fine-tuned model\ndef generate_text(model, tokenizer, prompt):\n    device = torch.device(\"cuda\")\n    model.eval()\n    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)\n\n    sample_outputs = model.generate(\n        generated,\n        do_sample=True,\n        top_k=20,\n        max_length=70,\n        top_p=0.98,\n        num_return_sequences=10,\n        temperature=0.95\n    )\n\n    for i, sample_output in enumerate(sample_outputs):\n        print(\"{}: {}\\n\\n\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Save the fine-tuned model\ndef save_model(model, tokenizer, output_dir):\n    model.save_pretrained(output_dir)\n    tokenizer.save_pretrained(output_dir)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Main function to execute the workflow\ndef main(party):\n    data = load_and_filter_data(party)\n    tokenizer = initialize_tokenizer()\n    tweet_data = prepare_dataset(data, tokenizer)\n    model, data_collator = initialize_model_and_components(tokenizer)\n    training_args = set_training_arguments()\n    train_and_evaluate_model(model, training_args, tweet_data, data_collator)\n\n    # Generate some example text\n    prompt = \"Die Deutsche Kultur\"\n    generate_text(model, tokenizer, prompt)\n\n    # Save the model\n    save_model(model, tokenizer, \"/kaggle/working/{}_gpt2-finetuned\".format(party))","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Train your desired model\nparty = \"Die Linke\"  # Parties available for training: AfD, FDP, Fraktionslos, SPD, Bündnis 90/Die Grünen, CDU, CSU, Die Linke\nmain(party)","metadata":{},"execution_count":null,"outputs":[]}]}