File size: 2,406 Bytes
898d64f
cbbfdce
21b05f2
599f557
6b2f148
 
2a0f339
5d68fbb
cbbfdce
599f557
 
 
 
 
 
 
6b2f148
5797bd3
 
 
 
 
 
 
543501a
5797bd3
 
599f557
 
5797bd3
90c4eb5
 
5797bd3
543501a
6b2f148
 
90c4eb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b2f148
 
 
 
 
cbbfdce
599f557
 
 
 
fd7fbfb
599f557
fd7fbfb
599f557
6b2f148
599f557
fd7fbfb
 
599f557
fd7fbfb
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torch
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz  # PyMuPDF

# Identifier handed to `from_pretrained` for both the model and the tokenizer
# (presumably a Hugging Face Hub repo id rather than a local directory —
# TODO confirm).
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'
# Loaded once at import time; the functions below use these module-level
# objects directly instead of reloading them per request.
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF document to open with PyMuPDF.

    Returns:
        The text of all pages joined back to back, with no separator added
        between pages. An empty document yields an empty string.
    """
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        # Single join instead of repeated ``+=`` so the cost stays linear in
        # the amount of text; the generator is consumed before the document
        # is closed.
        return "".join(page.get_text() for page in doc)

def summarize_text(text: str, max_length: int, show_length: bool) -> str:
    """Summarize ``text`` with the module-level T5 model, targeting ``max_length`` words.

    The first generation pass produces a summary. If it has more than
    ``max_length`` whitespace-separated words it is cut to exactly that many.
    If it has fewer, the summary itself is run through the model a second
    time and the words of that second output, from the position where the
    first summary ended up to ``max_length``, are appended to it.

    Args:
        text: Source text to summarize. Only the first 512 tokens (including
            the ``"summarize: "`` prefix) are fed to the model.
        max_length: Target number of words. Integral floats are accepted and
            coerced to ``int``.
        show_length: When true, a ``(Summary length: N words)`` footer is
            appended to the returned text.

    Returns:
        The summary string, optionally followed by the word-count footer.
    """
    # The value is used as a slice bound and as a generation length, both of
    # which require a real int; coerce defensively in case the UI layer
    # delivers the slider value as a float.
    max_length = int(max_length)

    # A single sequence needs no padding: padding it to 512 tokens only makes
    # the encoder process pad positions, so just truncate.
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

    # Generation lengths are in tokens, not words, hence the +20 headroom
    # before the word-level trimming below.
    summary_ids = model.generate(
        inputs,
        max_length=max_length + 20,
        min_length=10,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    summary_words = summary.split()
    if len(summary_words) > max_length:
        # Too long: keep only the first ``max_length`` words.
        summary = ' '.join(summary_words[:max_length])
    elif len(summary_words) < max_length:
        # Too short: re-run the model on the summary, forcing exactly
        # ``max_length`` tokens, and borrow the words that lie past the end
        # of the first summary.
        additional_tokens = model.generate(
            tokenizer.encode(" ".join(summary_words), return_tensors='pt'),
            max_length=max_length,
            min_length=max_length,
            num_beams=5,
            no_repeat_ngram_size=2,
            early_stopping=True,
        )
        additional_summary = tokenizer.decode(
            additional_tokens[0], skip_special_tokens=True
        )
        extra_words = additional_summary.split()[len(summary_words):max_length]
        # Skip the append when nothing was gained, so the summary does not
        # end with a stray trailing space.
        if extra_words:
            summary += ' ' + ' '.join(extra_words)

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"

    return summary

def handle_pdf(pdf, max_length, show_length):
    """Extract the text of an uploaded PDF and return its summary.

    Args:
        pdf: The uploaded file as delivered by the UI: either an object
            exposing the file's path as ``.name`` or a plain path string.
            ``None`` means nothing was uploaded.
        max_length: Target summary length in words, forwarded to
            ``summarize_text``.
        show_length: Whether to append the word-count footer, forwarded to
            ``summarize_text``.

    Returns:
        The summary text, or a short prompt asking for a file when ``pdf``
        is ``None``.
    """
    # Submitting the form without a file passes None; answer with a message
    # instead of failing on ``None.name``.
    if pdf is None:
        return "Please upload a PDF file."

    # Accept both a file-like wrapper (path in ``.name``) and a bare path
    # string, so the handler works whichever form the file component yields.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    text = extract_text_from_pdf(pdf_path)
    return summarize_text(text, max_length, show_length)

# Gradio UI: the three input components are passed, in order, as the
# `pdf`, `max_length` and `show_length` arguments of `handle_pdf`, and the
# string it returns is shown in a single text box.
interface = gr.Interface(
    fn=handle_pdf,
    inputs=[
        # NOTE(review): `type='file'` looks specific to older Gradio
        # releases — confirm against the Gradio version this app pins.
        gr.File(label='Upload PDF', type='file'),
        # Target summary length; `summarize_text` counts it in words.
        gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
        # When ticked, the word count is appended to the output text.
        gr.Checkbox(label='Show summary length', value=False)
    ],
    outputs=gr.Textbox(label='Summarized Text'),
    title='PDF Text Summarizer using T5-finetuned-dialogue_sumxx'
)

# Launch the app whenever this module is executed (there is no
# `__main__` guard, so importing the module also launches it).
interface.launch()