eevaw commited on
Commit
c1dfb85
·
verified ·
1 Parent(s): f1569c8

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +284 -0
app.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ import datasets
3
+ import torch
4
+ import sentencepiece
5
+ import evaluate
6
+
7
+
8
+ from datasets import load_dataset
9
+ from transformers import MT5ForConditionalGeneration, T5Tokenizer
10
+ import re
11
+
12
# Load the test split of the scientific-papers dataset from the Hugging Face Hub.
ds = load_dataset("scillm/scientific_papers-archive", split="test")

# Keep at most the first 1000 examples so tokenization and training stay cheap.
# Clamping to len(ds) avoids an IndexError if the split has fewer rows.
small_ds = ds.select(range(min(1000, len(ds))))
17
+
18
+ # Preprocessing function to remove unwanted references
19
def preprocess_text(text):
    """Strip ``@``-prefixed placeholders and normalize whitespace.

    Args:
        text: Raw article or summary text. ``None`` or an empty string is
            treated as empty input.

    Returns:
        The text with every ``@word`` run (e.g. ``@xcite``) removed, each run
        of whitespace collapsed to a single space, and leading/trailing
        whitespace trimmed. Returns ``""`` for empty or missing input.
    """
    if not text:
        # A missing field would otherwise make re.sub raise TypeError.
        return ""
    # NOTE: this removes *any* "@word" run, so e.g. the domain part of an
    # e-mail address is stripped as well.
    without_refs = re.sub(r'@\w+', '', text)
    # Removing placeholders leaves gaps; collapse all whitespace runs.
    return re.sub(r'\s+', ' ', without_refs).strip()
24
+
25
+ # Preprocessing function
26
def preprocess(examples):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``"input"`` column (articles) and an
            ``"output"`` column (reference summaries), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed articles (padded/truncated to
        1024 tokens) with an added ``"labels"`` entry: the summary token ids
        (padded/truncated to 128 tokens) where padding is replaced by -100.
    """
    # Clean articles and summaries the same way.
    articles = [preprocess_text(article) for article in examples["input"]]
    outputs = [preprocess_text(output) for output in examples["output"]]

    # Task prefix for the T5-style text-to-text format.
    inputs = ["summarize: " + article for article in articles]

    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")

    # Labels are padded to a fixed length; mask the padding with -100 (the
    # loss ignore index) so the model is not trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs
43
+
44
# Load the mT5 checkpoint and its tokenizer.
model_name = "google/mt5-small"  # Other mT5 checkpoints can be used as well
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the small subset.
tokenized_small_ds = small_ds.map(preprocess, batched=True)

# Verify that the dataset is correctly tokenized.
print(tokenized_small_ds[0])

# Split the *subset* into train and test sets. The original split the full
# `ds`, which silently discarded the 1000-example selection.
small_ds = small_ds.train_test_split(test_size=0.2)

# Inspect the split. A bare expression only displays in a notebook, so print.
print(small_ds["train"][0])
print(small_ds['train'].features)
print(small_ds.column_names)

# Apply the preprocessing function to both splits.
tokenized_ds = small_ds.map(preprocess, batched=True)

from transformers import DataCollatorForSeq2Seq

# The collator expects the model object (not its name) so it can build
# decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

import torch
torch.cuda.empty_cache()

# `nvidia-smi` is a shell command, not Python; run it only when available.
import shutil
import subprocess

if shutil.which("nvidia-smi"):
    subprocess.run(["nvidia-smi"], check=False)
80
+
81
# Dependency: wandb. Install it before running (e.g. `pip install wandb` or
# requirements.txt); the notebook magic `!pip install` is invalid in a .py file.
import wandb
wandb.login()
from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Load the model
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Set the device
# NOTE(review): the Trainer places the model on its own device; confirm
# whether pinning to CPU here is intentional.
device = torch.device("cpu")
model.to(device)

# Ensure model parameters are contiguous
# (presumably to avoid errors when the checkpoint is saved - TODO confirm).
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()  # Make the tensor contiguous
        print(f"Made {name} contiguous.")

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True
)

# Shuffle once and take disjoint slices. Two independent shuffles (as in the
# original) let the same examples land in both train and eval sets.
shuffled_small_ds = tokenized_small_ds.shuffle(seed=42)

# Create trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_small_ds.select(range(80)),  # 80 examples for training
    eval_dataset=shuffled_small_ds.select(range(80, 160)),  # 80 different examples for evaluation
)

# Train the model
trainer.train()
120
+
121
# Dependency: rouge_score (backend of the "rouge" metric). Install it before
# running, e.g. `pip install rouge_score`; a bare `pip install` line is a
# syntax error in a .py file.
import evaluate
rouge = evaluate.load("rouge")
124
+
125
def compute_metrics(eval_pred):
    """Compute ROUGE scores for generated summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as
            passed by the trainer during evaluation.

    Returns:
        Dict with the ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` scores.
    """
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs; the token ids come first.
        predictions = predictions[0]

    # -100 is the ignore index, not a real token id, so it must be replaced
    # before decoding. np.where builds new arrays instead of mutating the
    # caller's data in place.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels (special tokens removed).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using the `evaluate` library.
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }
143
+
144
# Attach the custom metrics to the already-constructed trainer.
trainer.compute_metrics = compute_metrics

# Run evaluation and show the resulting metrics.
eval_result = trainer.evaluate()
print(eval_result)

# Persist the fine-tuned model and its tokenizer side by side.
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to("fine-tuned-mt5")
154
+
155
# Load required libraries
from transformers import T5Tokenizer, MT5ForConditionalGeneration

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

from transformers import pipeline
import torch

# Example input (restructured as a numbered list)
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)

# Select the device (first GPU if available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1

# Build the summarization pipeline
summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)

# Summarize the text. The model was fine-tuned on inputs prefixed with
# "summarize: ", so the same prefix is applied at inference time.
summary = summarizer(
    "summarize: " + text,
    max_length=120,
    min_length=30,
    do_sample=False,
    num_beams=10,
    repetition_penalty=5.0,
    no_repeat_ngram_size=2,
    length_penalty=1.0,
)[0]["summary_text"]

# Clean the summary by removing sentinel tokens.
import re

# Regular expression to match both <extra_id_X> and <id_XX>
pattern = r"<(extra_id_\d+|id_\d+)>"

# Replace each match with a space, then collapse the whitespace runs that
# the replacement leaves behind.
cleaned_summary = re.sub(r"\s+", " ", re.sub(pattern, " ", summary)).strip()

print(cleaned_summary)
207
+
208
+
209
# Niina's code
# Dependencies: gradio and PyMuPDF. Install them before running (e.g.
# `pip install gradio PyMuPDF` or requirements.txt); the notebook magic
# `!pip install` is invalid in a .py file.

import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
220
+
221
+ # Function to extract text from PDF using PyMuPDF
222
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Path to the PDF (``str`` or path-like), or an uploaded-file
            object that exposes the path through a ``name`` attribute.

    Returns:
        The text of all pages joined in page order.
    """
    import os

    # Paths are used as-is (a pathlib.Path's `.name` is only the file name);
    # for other objects, such as a temp-file wrapper, use the `.name` path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", pdf_file)

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
229
+
230
+ # Summarization function
231
def _strip_sentinel_tokens(summary):
    """Remove ``<extra_id_N>`` sentinel tokens and collapse whitespace."""
    import re

    return " ".join(re.sub(r"<extra_id_\d+>", " ", summary).split())


def _trim_to_last_sentence(text):
    """Cut *text* after its last period; return it unchanged if it has none."""
    last_period_index = text.rfind('.')
    if last_period_index == -1:
        return text
    return text[:last_period_index + 1]


def summarize_pdf(pdf_file, max_summary_length):
    """Summarize the text of an uploaded PDF with the fine-tuned model.

    Args:
        pdf_file: The uploaded PDF as passed by the file input, or ``None``
            when nothing was uploaded.
        max_summary_length: Upper bound on the generated summary length in
            tokens; converted to ``int`` before use.

    Returns:
        The cleaned summary, trimmed to end on a complete sentence, or a
        short message when there is no upload, no extractable text, or no
        usable summary. If extraction or generation raises, the error
        message is returned instead so it is shown in the output box.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    # Broad catch is deliberate: this is the UI boundary and the message is
    # surfaced to the user for debugging.
    try:
        input_text = extract_text_from_pdf(pdf_file)
        if not input_text.strip():
            return "No text could be extracted from the PDF."

        # Use the same task prefix and input length limit as in training;
        # without truncation a whole PDF can exceed what the model handles.
        tokenized_input = new_tokenizer.encode(
            "summarize: " + input_text,
            return_tensors='pt',
            max_length=1024,
            truncation=True,
        )

        # Generate the summary. The slider may deliver a float.
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=int(max_summary_length),
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )

        # Decode, strip leftover sentinel tokens, and end on a full sentence.
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        cleaned_summary = _trim_to_last_sentence(_strip_sentinel_tokens(summary))

        return cleaned_summary if cleaned_summary else "No valid summary generated."

    except Exception as e:
        return str(e)  # Return the error message for debugging
269
+
270
# Define the Gradio interface
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[
        # Restrict uploads to PDFs, the only format the extractor handles.
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Slider(50, 300, step=10, label="Max summary length")
    ],
    outputs="textbox",  # A textbox for the output summary
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content."
)

# Launch the interface with debug mode enabled
interface.launch(debug=True)