File size: 2,498 Bytes
44e21a6
 
 
f4067be
 
 
 
 
 
 
 
341ccc1
 
f4067be
 
 
44e21a6
2bb61b8
06bc437
 
 
 
 
 
 
2bb61b8
12b0ed7
f4067be
 
06bc437
f4067be
 
341ccc1
1aa90a2
f4067be
 
 
 
e0cbe77
f4067be
1aa90a2
f4067be
 
 
1aa90a2
f4067be
 
1aa90a2
f4067be
 
 
 
 
367a8a1
f4067be
12b0ed7
 
 
 
7044543
f4067be
 
12b0ed7
 
 
 
6e4c777
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from pptx import Presentation
import re
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F
from transformers import pipeline

# All inference in this script runs on the CPU.
device = torch.device("cpu")

# Classifier: the tokenizer and the sequence-classification weights are both
# loaded from the same Hugging Face repository id.
_CLASSIFIER_REPO = "Ahmed235/roberta_classification"
tokenizer = AutoTokenizer.from_pretrained(_CLASSIFIER_REPO)
model = AutoModelForSequenceClassification.from_pretrained(_CLASSIFIER_REPO).to(device)

# Summarizer: a separate checkpoint wrapped in a transformers pipeline.
summarizer = pipeline(task="summarization", model="Falconsai/text_summarization")

def extract_text_from_pptx(file_path):
    """Collect the text of every text-bearing shape in a presentation.

    Slides are visited in the order ``presentation.slides`` yields them, and
    within each slide the shapes in the order ``slide.shapes`` yields them.
    Shapes that expose no ``text`` attribute are skipped.

    Args:
        file_path: Passed unchanged to ``Presentation``.

    Returns:
        The collected shape texts joined with newline characters; an empty
        string when no shape contributed any text.
    """
    deck = Presentation(file_path)
    shape_texts = [
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "\n".join(shape_texts)

def predict_pptx_content(file_path):
    """Classify and summarize the text found in a PowerPoint file.

    The text is pulled out with ``extract_text_from_pptx``, whitespace is
    normalized, and the result is fed both to the module-level classifier
    (``tokenizer`` / ``model``) and to the module-level ``summarizer``.

    Args:
        file_path: Location of the .pptx file; forwarded to
            ``extract_text_from_pptx``.

    Returns:
        A dict with three keys:
        ``"Predicted Label"`` - the ``id2label`` entry of the top-scoring class,
        ``"Evaluation"`` - a sentence embedding that label and its softmax
        probability,
        ``"Summary"`` - the summarizer's ``summary_text`` for the same text.
    """
    # Every run of whitespace (including the newlines that separate shapes)
    # collapses to a single space.
    slide_text = re.sub(r'\s+', ' ', extract_text_from_pptx(file_path))

    # Encode for the classifier and place each tensor on the model's device.
    encoded = tokenizer(slide_text, truncation=True, padding=True, return_tensors="pt")
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = F.softmax(logits, dim=1)

    # The winning class is chosen on the raw logits; its probability is then
    # read from the softmax output for the single input row.
    best_id = torch.argmax(logits, dim=1).item()
    label = model.config.id2label[best_id]
    confidence = class_probs[0][best_id].item()

    # Deterministic (non-sampled) summary of the same normalized text.
    summaries = summarizer(slide_text, max_length=80, min_length=30, do_sample=False)
    summary_text = summaries[0]['summary_text']

    return {
        "Predicted Label": label,
        "Evaluation": f"Evaluate the topic according to {label} is: {confidence}",
        "Summary": summary_text,
    }

# Define the Gradio interface
def _prediction_to_outputs(file_path):
    """Adapt ``predict_pptx_content`` to the interface's three text outputs.

    ``predict_pptx_content`` returns a single dict, whereas an interface with
    three output components expects three values in component order. Gradio
    also interprets a returned dict as a component-to-value mapping, so the
    dict cannot be handed back directly.

    Args:
        file_path: Path of the uploaded file, as supplied by the ``gr.File``
            input.

    Returns:
        A ``(predicted label, evaluation, summary)`` tuple, in the same order
        as the output components declared below.
    """
    prediction = predict_pptx_content(file_path)
    return (
        prediction["Predicted Label"],
        prediction["Evaluation"],
        prediction["Summary"],
    )


iface = gr.Interface(
    fn=_prediction_to_outputs,
    inputs=gr.File(type="filepath", label="Upload PowerPoint (.pptx) file"),
    outputs=[
        gr.Textbox(label="Predicted Label"),
        gr.Textbox(label="Evaluation"),
        gr.Textbox(label="Summary"),
    ],
    # live=False runs the analysis only when the user submits; live=True would
    # re-run it automatically whenever the input changes.
    live=False,
    title="<h1 style='color: lightgreen; text-align: center;'>PPTX Analyzer</h1>",
)

# Deploy the Gradio interface
iface.launch(share=True)