from pptx import Presentation
import re
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import torch
import torch.nn.functional as F

# Load the pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Ahmed235/roberta_classification")
model = AutoModelForSequenceClassification.from_pretrained("Ahmed235/roberta_classification")
device = torch.device("cpu")
model = model.to(device)  # Move the model to the CPU

# Create a summarization pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

def extract_text_from_pptx(file_path):
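    """Extract and concatenate the text from every shape on every slide of a .pptx file."""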
    presentation = Presentation(file_path)
    text = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text.append(shape.text)
    return "\n".join(text)

def predict_pptx_content(file_path):
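    """Classify the deck's topic and return the predicted label, a confidence note, and a summary."""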
    try:
        extracted_text = extract_text_from_pptx(file_path)
        cleaned_text = re.sub(r'\s+', ' ', extracted_text)

        # Tokenize and encode the cleaned text
        input_encoding = tokenizer(cleaned_text, truncation=True, padding=True, return_tensors="pt")
        input_encoding = {key: val.to(device) for key, val in input_encoding.items()}  # Move input tensor to CPU

        # Perform inference
        with torch.no_grad():
            outputs = model(**input_encoding)
            logits = outputs.logits

        probabilities = F.softmax(logits, dim=1)

        predicted_label_id = torch.argmax(logits, dim=1).item()
        predicted_label = model.config.id2label[predicted_label_id]
        predicted_probability = probabilities[0][predicted_label_id].item()

        # Summarize the cleaned text; truncation guards against decks that
        # exceed the summarization model's input window
        summary = summarizer(cleaned_text, max_length=80, min_length=30, do_sample=False, truncation=True)[0]['summary_text']

        evaluation = f"Confidence that the deck is about '{predicted_label}': {predicted_probability:.4f}"

        return predicted_label, evaluation, summary

    except Exception as e:
        # Log the error details and surface the message in the UI
        print(f"Error in predict_pptx_content: {e}")
        return f"Error: {e}", "", ""
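# A quick way to sanity-check the pipeline without the UI is to call the function
# directly, e.g. print(predict_pptx_content("slides/demo.pptx")) with a
# hypothetical path to a local .pptx file.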

# Define the Gradio interface
iface = gr.Interface(
    fn=predict_pptx_content,
    inputs=gr.File(type="filepath", label="Upload PowerPoint (.pptx) file"),
    outputs=["text", "text", "text"],  # Predicted Label, Evaluation, Summary
    live=False,  # Change to True for one-time analysis
    title="<h1 style='color: lightgreen; text-align: center;'>HackTalk Analyzer</h1>",
)

# Launch the Gradio interface with a temporary public share link
iface.launch(share=True)