File size: 3,808 Bytes
898d64f
cbbfdce
21b05f2
a758efc
6b2f148
948b6c3
6b2f148
2a0f339
5d68fbb
cbbfdce
599f557
948b6c3
 
 
 
 
 
 
 
 
599f557
 
 
 
 
 
6b2f148
948b6c3
 
 
 
 
 
 
 
 
 
 
5797bd3
 
 
 
 
 
 
543501a
5797bd3
 
948b6c3
a758efc
5797bd3
90c4eb5
 
5797bd3
543501a
6b2f148
 
90c4eb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b2f148
 
 
 
 
cbbfdce
a758efc
948b6c3
a758efc
948b6c3
 
a758efc
 
948b6c3
 
 
 
 
 
 
a758efc
 
 
 
 
599f557
948b6c3
fd7fbfb
a758efc
fd7fbfb
a758efc
 
 
6b2f148
599f557
fd7fbfb
 
a758efc
fd7fbfb
 
948b6c3
fd7fbfb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import torch
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import fitz  

# Checkpoint identifier handed to both from_pretrained() calls below.
model_path = 'Sibinraj/T5-finetuned-dialogue_sumxx'

# Loaded once at import time and kept at module level; the model is loaded
# first, then the tokenizer, both from the same checkpoint.
model, tokenizer = (
    T5ForConditionalGeneration.from_pretrained(model_path),
    T5Tokenizer.from_pretrained(model_path),
)

def extract_text_from_pdf(pdf_path):
    """
    Reads a PDF and returns the text of all its pages as a single string.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of every page, concatenated in page order with no
            separator inserted between pages.
    """
    # The document is closed as soon as every page's text has been collected.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

def summarize_text(text, max_length, show_length):
    """
    Summarizes the given text using a T5 model.

    The text is prefixed with "summarize: ", truncated to 512 input tokens and
    decoded with a 5-beam search. The decoded summary is then capped at
    ``max_length`` whitespace-separated words; a summary that comes out
    shorter than that is returned as generated rather than being padded.

    Args:
        text (str): The text to summarize.
        max_length (int): The maximum length of the summary, in words.
        show_length (bool): Whether to append the summary's word count.

    Returns:
        str: The summarized text, followed by a "(Summary length: N words)"
            note when ``show_length`` is true.

    Raises:
        ValueError: If ``text`` is empty/whitespace-only or ``max_length``
            is smaller than 1.
    """
    if not text or not text.strip():
        raise ValueError("No text was provided to summarize.")

    # The limit is used as a slice bound below, so it must be a real int even
    # if the caller hands over a float such as 50.0.
    max_length = int(max_length)
    if max_length < 1:
        raise ValueError("max_length must be at least 1.")

    # No padding: a single sequence gains nothing from being padded to 512
    # tokens, and the explicit attention mask keeps generate() from guessing.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

    summary_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        # generate() counts tokens while the cap below counts words, so leave
        # some headroom before trimming.
        max_length=max_length + 20,
        min_length=10,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Enforce the word limit; shorter summaries are left untouched.
    summary_words = summary.split()
    if len(summary_words) > max_length:
        summary = ' '.join(summary_words[:max_length])

    if show_length:
        summary_length = len(summary.split())
        summary = f"{summary}\n\n(Summary length: {summary_length} words)"

    return summary

def handle_input(input_type, text, pdf, max_length, show_length):
    """
    Handles the user input based on the selected input type.

    Args:
        input_type (str): The type of input, either 'Text' or 'PDF'.
        text (str): The text input; used when ``input_type`` is 'Text'.
        pdf (str | file-like): The uploaded PDF, given either as a path string
            or as an object whose ``.name`` attribute is the path; used when
            ``input_type`` is 'PDF'.
        max_length (int): The maximum length of the summary.
        show_length (bool): Whether to show the length of the summary.

    Returns:
        str: The summarized text.

    Raises:
        ValueError: If 'PDF' is selected but no file was uploaded, or if
            ``input_type`` is neither 'Text' nor 'PDF' (including ``None``
            when nothing was selected).
    """
    if input_type == 'Text':
        return summarize_text(text, max_length, show_length)

    if input_type == 'PDF':
        if pdf is None:
            raise ValueError("Please upload a PDF file to summarize.")
        # Accept both shapes an upload can arrive in: a path string (converted
        # to a plain str) or a file-like object exposing the path as .name.
        pdf_path = str(pdf) if isinstance(pdf, str) else pdf.name
        extracted_text = extract_text_from_pdf(pdf_path)
        return summarize_text(extracted_text, max_length, show_length)

    # Previously this fell through and returned None without any feedback.
    raise ValueError("Please select an input type: 'Text' or 'PDF'.")

# Input widgets, listed in the same order as handle_input's parameters:
# input type, text, PDF upload, max length, show-length flag.
input_components = [
    gr.Radio(['Text', 'PDF'], label='Input Type', type='value'),
    gr.Textbox(
        lines=10,
        placeholder='Enter Text Here...',
        label='Input Text',
        visible=True,
    ),
    gr.File(label='Upload PDF', type='filepath', visible=True),
    gr.Slider(minimum=10, maximum=150, step=1, label='Max Length'),
    gr.Checkbox(label='Show summary length', value=False),
]

# Single text box that receives whatever handle_input returns.
summary_output = gr.Textbox(label='Summarized Text')

# Wire the widgets to the handler.
interface = gr.Interface(
    fn=handle_input,
    inputs=input_components,
    outputs=summary_output,
    title='Text or PDF Summarizer using T5-finetuned-dialogue_sumxx',
)

# Start the app; this runs whenever the module is executed or imported.
interface.launch()