File size: 2,741 Bytes
44e21a6 f4067be 341ccc1 f4067be 44e21a6 2bb61b8 06bc437 2bb61b8 12b0ed7 95d05cb 06bc437 95d05cb 1aa90a2 95d05cb e0cbe77 95d05cb 1aa90a2 95d05cb 1aa90a2 95d05cb 1aa90a2 95d05cb 367a8a1 95d05cb 12b0ed7 7044543 f4067be bcb2ab6 12b0ed7 6e4c777 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
from pptx import Presentation
import re
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F
from transformers import pipeline
# Hugging Face checkpoints used by the two inference stages below.
CLASSIFIER_CHECKPOINT = "Ahmed235/roberta_classification"
SUMMARIZER_CHECKPOINT = "Falconsai/text_summarization"

# Sequence-classification tokenizer and model; inference is pinned to the CPU.
device = torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(CLASSIFIER_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CLASSIFIER_CHECKPOINT)
model = model.to(device)

# Summarization pipeline that produces the "Summary" field of a prediction.
summarizer = pipeline("summarization", model=SUMMARIZER_CHECKPOINT)
def extract_text_from_pptx(file_path):
    """Return the text of the shapes on every slide of a presentation.

    Args:
        file_path: Handed straight to ``Presentation``; presumably the path
            of a .pptx file (confirm against the caller).

    Returns:
        The ``text`` of each shape that exposes a ``text`` attribute, in
        slide order and then shape order, joined with newlines. An empty
        string when no shape qualifies.
    """
    presentation = Presentation(file_path)
    # NOTE(review): only the shapes listed directly in ``slide.shapes`` are
    # inspected; anything that does not expose ``text`` itself is skipped.
    # Confirm whether grouped shapes or tables need to be walked as well.
    return "\n".join(
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
def predict_pptx_content(file_path):
    """Classify and summarize the text found in a PowerPoint file.

    Args:
        file_path: Forwarded unchanged to ``extract_text_from_pptx``.

    Returns:
        On success, a dict with the keys ``"Predicted Label"``,
        ``"Evaluation"`` and ``"Summary"``. On failure (including a
        presentation that yields no text), a dict with the single key
        ``"error"`` holding a message.
    """
    try:
        extracted_text = extract_text_from_pptx(file_path)
        # Collapse every run of whitespace (including newlines) to one space.
        cleaned_text = re.sub(r'\s+', ' ', extracted_text)

        # Nothing to classify or summarize: report it instead of feeding an
        # empty string to both models and returning a meaningless result.
        if not cleaned_text.strip():
            return {"error": "No text found in the presentation."}

        # Tokenize and encode the cleaned text
        input_encoding = tokenizer(cleaned_text, truncation=True, padding=True, return_tensors="pt")
        # Keep the input tensors on the same device as the model.
        input_encoding = {key: val.to(device) for key, val in input_encoding.items()}

        # Perform inference without tracking gradients
        with torch.no_grad():
            outputs = model(**input_encoding)

        logits = outputs.logits
        probabilities = F.softmax(logits, dim=1)
        # A single text is encoded, so the batch has one row and .item() is valid.
        predicted_label_id = torch.argmax(logits, dim=1).item()
        predicted_label = model.config.id2label[predicted_label_id]
        predicted_probability = probabilities[0][predicted_label_id].item()

        # Summarize the cleaned text. truncation=True mirrors the classifier
        # tokenizer call above so that a long presentation is cut to the
        # summarization model's input limit instead of being passed whole.
        summary = summarizer(
            cleaned_text,
            max_length=80,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']

        prediction = {
            "Predicted Label": predicted_label,
            "Evaluation": f"Evaluate the topic according to {predicted_label} is: {predicted_probability}",
            "Summary": summary
        }
        return prediction
    except Exception as e:
        # UI boundary: log the error details and report them to the caller
        # as data rather than letting the exception propagate.
        print(f"Error in predict_pptx_content: {e}")
        return {"error": str(e)}
# Define the Gradio interface.
# predict_pptx_content returns ONE value: a dict holding either the prediction
# fields ("Predicted Label", "Evaluation", "Summary") or a single "error" key.
# The number of output components must match the number of returned values,
# so the dict is rendered through a single JSON component.
iface = gr.Interface(
    fn=predict_pptx_content,
    inputs=gr.File(type="filepath", label="Upload PowerPoint (.pptx) file"),
    outputs=gr.JSON(label="Analysis"),
    live=False,  # Run only on submit; True would re-run whenever the input changes
    title="<h1 style='color: lightgreen; text-align: center;'>HackTalk Analyzer</h1>",
)
# Deploy the Gradio interface
iface.launch(share=True)
|