File size: 2,405 Bytes
f7487a3
 
 
b4a053e
 
 
f7487a3
 
 
b4a053e
 
 
 
 
 
f7487a3
b4a053e
f7487a3
 
 
 
 
 
 
b4a053e
f7487a3
 
b4a053e
 
f7487a3
b4a053e
 
f7487a3
b4a053e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7487a3
 
b4a053e
 
 
 
 
 
f7487a3
b4a053e
f7487a3
b4a053e
f7487a3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import torch
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz  # PyMuPDF

# Checkpoint identifier handed to both ``from_pretrained`` calls below
# (presumably a Hugging Face Hub repo id -- confirm).
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'
# Loaded once when the module is imported; ``summarize_text`` reads these
# module-level objects on every call.
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output in page order,
        with no separator inserted between pages. Empty when the document
        has no pages or none of them yields text.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with ``+=`` for every page.
        return "".join(page.get_text() for page in doc)

def _generate_ids(input_ids, max_length, min_length):
    """Run ``model.generate`` with the beam-search settings shared by both passes."""
    return model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )


def summarize_text(text, max_length, show_length):
    """Summarize ``text`` and fit the result to ``max_length`` words.

    The text is prefixed with ``"summarize: "``, truncated/padded to 512
    tokens and run through the module-level ``model``. The decoded summary is
    then adjusted by whitespace-separated word count:

    * longer than ``max_length`` words: cut down to ``max_length`` words;
    * shorter: a second generation pass is run on the summary itself and the
      words it yields beyond the current length are appended, up to
      ``max_length`` words in total.

    Args:
        text: Source text to summarize.
        max_length: Target summary length in words. Coerced with ``int`` so
            a float value (e.g. from a UI slider) is accepted.
        show_length: When true, append a ``(Summary length: N words)`` line.

    Returns:
        The summary string, or a short notice when ``text`` is empty or
        whitespace-only (nothing is sent to the model in that case).
    """
    # Slicing and generation limits below need a real int.
    max_length = int(max_length)

    # Nothing to summarize (e.g. a PDF with no extractable text): skip the
    # model rather than summarizing the bare prompt prefix.
    if not text or not text.strip():
        return "No text found to summarize."

    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding='max_length'
    )

    # Generation limits are in tokens, so leave headroom over the word
    # target; the word-level trim below enforces the actual cap.
    summary_ids = _generate_ids(inputs, max_length + 20, 10)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary_words = summary.split()
    word_count = len(summary_words)
    if word_count > max_length:
        summary = ' '.join(summary_words[:max_length])
    elif word_count < max_length:
        # Second pass: feed the summary back in and force exactly
        # ``max_length`` tokens, then borrow the words past the current end.
        additional_tokens = _generate_ids(
            tokenizer.encode(" ".join(summary_words), return_tensors='pt'),
            max_length,
            max_length,
        )
        additional_summary = tokenizer.decode(additional_tokens[0], skip_special_tokens=True)
        extra_words = additional_summary.split()[word_count:max_length]
        # Only append when there is something to add, to avoid leaving a
        # dangling trailing space on the summary.
        if extra_words:
            summary += ' ' + ' '.join(extra_words)

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"

    return summary

def handle_pdf(pdf, max_length, show_length):
    """Extract the text of an uploaded PDF and return its summary.

    Args:
        pdf: The uploaded file. Either an object exposing the file path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        max_length: Forwarded to ``summarize_text``.
        show_length: Forwarded to ``summarize_text``.

    Returns:
        The string produced by ``summarize_text``, or a short prompt asking
        for a file when ``pdf`` is ``None``.
    """
    # Submitting the form without a file would otherwise fail on ``pdf.name``.
    if pdf is None:
        return "Please upload a PDF file."

    # NOTE(review): some Gradio versions hand over the path string itself
    # instead of a file wrapper -- accept both; confirm against the installed
    # version.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    text = extract_text_from_pdf(pdf_path)
    return summarize_text(text, max_length, show_length)

# Gradio UI definition: the three input components below supply, in order,
# the ``pdf``, ``max_length`` and ``show_length`` arguments of ``handle_pdf``;
# its returned string is displayed in the output textbox.
interface = gr.Interface(
    fn=handle_pdf,
    inputs=[
        # NOTE(review): the accepted ``type`` values differ between Gradio
        # major versions -- confirm 'file' is valid for the pinned version.
        gr.File(label='Upload PDF', type='file'),
        # Word target for the summary: 10 to 150 in steps of 1.
        gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
        gr.Checkbox(label='Show summary length', value=False)
    ],
    outputs=gr.Textbox(label='Summarized Text'),
    title='PDF Text Summarizer using T5-finetuned-dialogue_sumxx'
)

# Executed unconditionally at module level (no ``__main__`` guard), so
# importing this file also starts the app.
interface.launch()