Summarizer

Runtime error

App Files Files Community

eevaw committed on Oct 21, 2024

Commit

c1dfb85

verified ·

1 Parent(s): f1569c8

Upload app.py

Browse files

Files changed (1) hide show

app.py +284 -0

app.py ADDED Viewed

	@@ -0,0 +1,284 @@

+import transformers
+import datasets
+import torch
+import sentencepiece
+import evaluate
+from datasets import load_dataset
+from transformers import MT5ForConditionalGeneration, T5Tokenizer
+import re
+# Load dataset
+ds = load_dataset("scillm/scientific_papers-archive", split="test")
+# Select the first 1000 examples
+small_ds = ds.select(range(1000))
+# Preprocessing function to remove unwanted references
+def preprocess_text(text):
+    # Remove unwanted references like @xcite
+    text = re.sub(r'@\w+', '', text)  # Remove anything that starts with @
+    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
+    return text
+# Preprocessing function
+def preprocess(examples):
+    # Preprocess articles and summaries
+    articles = [preprocess_text(article) for article in examples["input"]]
+    outputs = [preprocess_text(output) for output in examples["output"]]
+    # Add prefix to the articles
+    inputs = ["summarize: " + article for article in articles]
+    # Tokenize articles
+    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
+    # Tokenize summaries
+    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
+    model_inputs["labels"] = labels["input_ids"]
+    return model_inputs
+# Load mT5 model and tokenizer
+# (`preprocess` reads the module-level `tokenizer` assigned here, so this
+# must run before the `map` call below)
+model_name = "google/mt5-small"  # You can also use other mT5 models
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+model = MT5ForConditionalGeneration.from_pretrained(model_name)
+# Tokenize the smaller dataset in batches with `preprocess`
+tokenized_small_ds = small_ds.map(preprocess, batched=True)
+# Verify that the dataset is correctly tokenized by printing the first example
+print(tokenized_small_ds[0])
+# Split the data into train and test set
+small_ds = ds.train_test_split(test_size=0.2)
+small_ds["train"][0]
+print(small_ds['train'].features)
+print(small_ds.column_names)
+from transformers import T5Tokenizer
+model_name = "google/mt5-small"
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+# Apply preprocessing function to dataset
+tokenized_ds = small_ds.map(preprocess, batched=True)
+from transformers import DataCollatorForSeq2Seq
+data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
+import torch
+torch.cuda.empty_cache()
+nvidia-smi
+!pip install wandb
+import wandb
+wandb.login()
+from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
+import torch
+# Load the model
+model_name = "google/mt5-small"
+model = MT5ForConditionalGeneration.from_pretrained(model_name)
+# Set the device
+device = torch.device("cpu")
+model.to(device)
+# Ensure model parameters are contiguous
+for name, param in model.named_parameters():
+    if not param.is_contiguous():
+        param.data = param.data.contiguous()  # Make the tensor contiguous
+        print(f"Made {name} contiguous.")
+training_args = Seq2SeqTrainingArguments(
+    output_dir='./results',
+    num_train_epochs=10,
+    per_device_train_batch_size=4,  # Pienennä batch-kokoa
+    per_device_eval_batch_size=4,
+    evaluation_strategy='epoch',
+    logging_dir='./logs',
+    predict_with_generate=True
+)
+# Create trainer instance
+trainer = Seq2SeqTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_small_ds.shuffle().select(range(80)),  # Käytetään 800 esimerkkiä koulutukseen
+    eval_dataset=tokenized_small_ds.shuffle().select(range(20, 100)),  # Käytetään 200 esimerkkiä arvioimiseen
+)
+# Kouluta malli
+trainer.train()
+pip install rouge_score
+import evaluate
+rouge = evaluate.load("rouge")
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    # Decode predictions and labels (remove special tokens)
+    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+    # Replace -100 in labels (ignore index) with the padding token id
+    labels[labels == -100] = tokenizer.pad_token_id
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+    # Compute ROUGE scores using the `evaluate` library
+    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
+    return {
+        "rouge1": rouge_output["rouge1"],
+        "rouge2": rouge_output["rouge2"],
+        "rougeL": rouge_output["rougeL"],
+    }
+# Update trainer to include custom metrics
+# (assigned here because `compute_metrics` is defined after the trainer was built)
+trainer.compute_metrics = compute_metrics
+# Evaluate the model and print the resulting metrics
+eval_result = trainer.evaluate()
+print(eval_result)
+# Save the fine-tuned model and its tokenizer into the same directory
+trainer.save_model("fine-tuned-mt5")
+tokenizer.save_pretrained("fine-tuned-mt5")
+# Load required libraries
+from transformers import T5Tokenizer, MT5ForConditionalGeneration
+# Load the fine-tuned tokenizer and model back from the directory saved above
+model_name = "fine-tuned-mt5"
+new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
+new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
+from transformers import pipeline
+import torch
+# Syötteesi
+# Restructured input
+text = (
+    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
+    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
+    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
+    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
+    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
+    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
+    "Please provide a summary."
+)
+# Määrittele laite (GPU tai CPU)
+device = 0 if torch.cuda.is_available() else -1
+# Lataa tiivistämispipeline
+summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)
+# Tiivistä teksti
+summary = summarizer(text,
+                     max_length=120,
+                     min_length=30,
+                     do_sample=False,
+                     num_beams=10,
+                     repetition_penalty=5.0,
+                     no_repeat_ngram_size=2,
+                     length_penalty=1.0)[0]["summary_text"]
+# Clean the summary by removing the <extra_id_0> token
+import re
+# Regular expression to match both <extra_id_X> and <id_XX>
+pattern = r"<(extra_id_\d+|id_\d+)>"
+# Replace all matches with a space
+cleaned_summary = re.sub(pattern, " ", summary).strip()
+print(cleaned_summary)
+# Niinan koodi
+!pip install gradio PyMuPDF
+import gradio as gr
+from transformers import T5Tokenizer, MT5ForConditionalGeneration
+import fitz  # PyMuPDF
+# Load the fine-tuned tokenizer and model
+model_name = "fine-tuned-mt5"
+new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
+new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
+# Function to extract text from PDF using PyMuPDF
+def extract_text_from_pdf(pdf_file):
+    text = ""
+    # Open the PDF file
+    with fitz.open(pdf_file) as doc:
+        for page in doc:
+            text += page.get_text()  # Extract text from each page
+    return text
+# Summarization function
+def summarize_pdf(pdf_file, max_summary_length):
+    # Extract text from the PDF
+    input_text = extract_text_from_pdf(pdf_file)
+    # Tokenize the input to check length
+    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt')
+    try:
+        # Generate the summary
+        summary_ids = new_model.generate(
+            tokenized_input,
+            max_length=max_summary_length,
+            min_length=30,
+            num_beams=15,
+            repetition_penalty=5.0,
+            no_repeat_ngram_size=2
+        )
+        # Decode the generated summary
+        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        # Clean up the summary to remove unwanted tokens
+        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()
+        # Ensure the summary ends with a complete sentence
+        if cleaned_summary:
+            last_period_index = cleaned_summary.rfind('.')
+            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
+                cleaned_summary = cleaned_summary[:last_period_index + 1]
+            else:
+                cleaned_summary = cleaned_summary.strip()
+        return cleaned_summary if cleaned_summary else "No valid summary generated."
+    except Exception as e:
+        return str(e)  # Return the error message for debugging
+# Define the Gradio interface
+interface = gr.Interface(
+    fn=summarize_pdf,
+    inputs=[
+        gr.File(label="Upload PDF"),
+        gr.Slider(50, 300, step=10, label="Max summary length")
+    ],
+    outputs="textbox",  # A textbox for the output summary
+    title="PDF Text Summarizer",
+    description="Upload a PDF file to summarize its content."
+)
+# Launch the interface
+# Launch the interface with debug mode enabled
+interface.launch(debug=True)