File size: 2,171 Bytes
98c0f54
18332e8
d6ff263
bbeaa3a
e993aed
09ca2da
18332e8
 
e025c42
09ca2da
2bb61b8
c4d5545
 
 
 
 
 
 
2bb61b8
d6ff263
 
 
 
12b0ed7
95d05cb
8cb1867
b246175
d6ff263
 
 
 
18332e8
d6ff263
98c0f54
18332e8
 
1b6c7fd
c1117f1
9ad6938
2e63d67
 
 
ae2295f
9ad6938
 
9a8d31d
fc6af31
9a8d31d
2e63d67
b093982
95d05cb
 
12b0ed7
 
 
 
e36289c
d8e3fd6
d6ff263
bcb2ab6
12b0ed7
 
 
124a463
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
from transformers import pipeline
from pptx import Presentation
import re
import json

# Hugging Face pipelines are loaded once at module import time (models are
# downloaded from the Hub on first run); both are reused across requests.
# Create a text classification pipeline
classifier = pipeline("text-classification", model="Ahmed235/roberta_classification", tokenizer="Ahmed235/roberta_classification")
# Summarization pipeline used to condense the extracted slide text.
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

def extract_text_from_pptx(file_path):
    """Extract all text from every shape on every slide of a .pptx file.

    Args:
        file_path: Path to the PowerPoint (.pptx) file.

    Returns:
        A single string with each shape's text joined by newlines.
    """
    presentation = Presentation(file_path)
    text = []
    # Fix: the original enumerated slides but never used the slide number.
    for slide in presentation.slides:
        for shape in slide.shapes:
            # Shapes without a text attribute (pictures, charts, ...) are skipped.
            if hasattr(shape, "text"):
                text.append(shape.text)
    return "\n".join(text)

def limit_text_length(text, max_length=512):
    """Return *text* truncated to at most *max_length* characters.

    Args:
        text: The input string.
        max_length: Maximum number of characters to keep (default 512,
            matching the classifier's context window).

    Returns:
        The original string if it already fits, otherwise its prefix of
        length *max_length*.
    """
    if len(text) <= max_length:
        return text
    return text[:max_length]

def predict_pptx_content(file_path):
    """Classify and summarize the text content of a PowerPoint file.

    Args:
        file_path: Path to the uploaded .pptx file.

    Returns:
        A JSON-formatted string containing "predicted_label", "evaluation"
        (the classifier's confidence score) and "summary" — or an "error"
        key when extraction or inference fails.
    """
    try:
        extracted_text = extract_text_from_pptx(file_path)
        # Collapse all whitespace runs (incl. the newlines inserted between
        # shapes) into single spaces.
        cleaned_text = re.sub(r'\s+', ' ', extracted_text).strip()

        # Guard: an empty deck would otherwise crash the pipelines below.
        if not cleaned_text:
            return json.dumps({"error": "No text found in the presentation."}, indent=3)

        # Classifier input must fit the model's context window.
        limited_text = limit_text_length(cleaned_text)
        result = classifier(limited_text)

        predicted_label = result[0]['label']
        predicted_probability = result[0]['score']

        # NOTE(review): the summarizer receives the full cleaned text, which
        # may exceed the summarization model's input limit for very large
        # decks — confirm and truncate if needed.
        summary = summarizer(cleaned_text, max_length=1000, min_length=30, do_sample=False)[0]['summary_text']

        output = {
            "predicted_label": predicted_label,
            "evaluation": predicted_probability,
            "summary": summary,
        }

        # Fix: return the JSON string itself. The single Gradio Textbox
        # output expects a plain string; the original wrapped it in a dict,
        # which the UI rendered via str(dict).
        return json.dumps(output, indent=3)

    except Exception as e:
        # Top-level boundary for the Gradio worker: log the failure and
        # surface it in the UI instead of crashing the request.
        print(f"Error in predict_pptx_content: {e}")
        return json.dumps({"error": str(e)}, indent=3)

# Define the Gradio interface.
iface = gr.Interface(
    fn=predict_pptx_content,
    inputs=gr.File(type="filepath", label="Upload PowerPoint (.pptx) file"),
    # Fix: gr.Textbox's first positional argument is the initial *value*,
    # not the label — the original pre-filled the box with the text "output".
    outputs=gr.Textbox(label="output"),
    live=False,
    title="<h1 style='color: lightgreen; text-align: center;'>HackTalk Analyzer</h1>",
)

# Deploy the Gradio interface with a public share link.
iface.launch(share=True)