Souha Ben Hassine committed on
Commit eebb5f5 · 1 Parent(s): c32f512

initial commit

Files changed (1)
  1. app.py +9 -50
app.py CHANGED
@@ -1,6 +1,5 @@
 import torch
-from torch.utils.data import Dataset
-from transformers import AutoModel, AutoTokenizer, AutoConfig
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from torchvision import transforms, models
 import torch.nn as nn
 import os
@@ -8,16 +7,15 @@ import json
 import cv2
 from PIL import Image
 import gradio as gr
-import torch

 class MultimodalRiskBehaviorModel(nn.Module):
     def __init__(self, text_model_name="bert-base-uncased", hidden_dim=512, dropout=0.3):
         super(MultimodalRiskBehaviorModel, self).__init__()

-        # Text model
+        # Text model using AutoModelForSequenceClassification
         self.text_model_name = text_model_name
-        self.text_model = AutoModel.from_pretrained(text_model_name)
-
+        self.text_model = AutoModelForSequenceClassification.from_pretrained(text_model_name, num_labels=1)
+
         # Visual model (ResNet50)
         self.visual_model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

@@ -38,10 +36,10 @@ class MultimodalRiskBehaviorModel(nn.Module):
         attention_mask = encoding['attention_mask'].squeeze(1).to(device)

         # Text embeddings from BERT
-        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+        text_features = self.text_model(input_ids=input_ids, attention_mask=attention_mask).logits

         # Visual features from ResNet50
-        frames = frames.to(device)  # Move frames to the same device as the model
+        frames = frames.to(device)

         batch_size, num_frames, channels, height, width = frames.size()
         frames = frames.view(batch_size * num_frames, channels, height, width)
@@ -55,56 +53,17 @@ class MultimodalRiskBehaviorModel(nn.Module):

         # Pass through the classifier
         x = self.dropout(torch.relu(self.fc1(combined_features)))
-        output = torch.sigmoid(self.fc2(x))  # Sigmoid for binary classification
+        output = torch.sigmoid(self.fc2(x))

         return output
-    def save_pretrained(self, save_directory):
-        os.makedirs(save_directory, exist_ok=True)
-        torch.save(self.state_dict(), os.path.join(save_directory, 'pytorch_model.bin'))
-        config = {
-            "text_model_name": self.text_model_name,
-            "hidden_dim": self.fc1.out_features
-        }
-        with open(os.path.join(save_directory, 'config.json'), 'w') as f:
-            json.dump(config, f)
-    @classmethod
-    def from_pretrained(cls, load_directory, map_location=None):
-        if os.path.exists(load_directory):
-            # Handling local paths
-            config_path = os.path.join(load_directory, 'config.json')
-            state_dict_path = os.path.join(load_directory, 'pytorch_model.bin')
-
-            if not os.path.exists(config_path):
-                raise FileNotFoundError(f"{config_path} not found.")
-            if not os.path.exists(state_dict_path):
-                raise FileNotFoundError(f"{state_dict_path} not found.")
-
-            with open(config_path, 'r') as f:
-                config_dict = json.load(f)
-            model = cls(text_model_name=config_dict["text_model_name"], hidden_dim=config_dict["hidden_dim"])
-            state_dict = torch.load(state_dict_path, map_location=map_location)
-            model.load_state_dict(state_dict)
-
-        else:
-            # Handling Hugging Face Hub paths
-            config = AutoConfig.from_pretrained(load_directory)
-
-            # Use AutoModel to load a pre-trained model with weights
-            hf_model = AutoModel.from_pretrained(load_directory, config=config)
-            model = cls(text_model_name=config.name_or_path, hidden_dim=hf_model.config.hidden_size)
-            model.text_model = hf_model
-
-        return model
-

+# Loading the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained('Souha-BH/BERT_Resnet50')
-model = MultimodalRiskBehaviorModel.from_pretrained('Souha-BH/BERT_Resnet50')  # if cpu add arg map_location='cpu'
-
+model = MultimodalRiskBehaviorModel.from_pretrained('Souha-BH/BERT_Resnet50', map_location='cpu')
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)


-
 # Function to load frames from a video
 def load_frames_from_video(video_path, transform, num_frames=10):
     cap = cv2.VideoCapture(video_path)
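
For context on the text-branch change above: AutoModel returns token embeddings, and the old code took the 768-dimensional [CLS] vector, while AutoModelForSequenceClassification with num_labels=1 returns a single logit per example. Below is a minimal standalone sketch of that shape difference, assuming the default bert-base-uncased text model; the snippet is illustrative and not part of the committed file.

import torch
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
enc = tok("a sample video caption", return_tensors="pt")

# Old text branch: [CLS] embedding from the base encoder -> shape (1, 768)
base = AutoModel.from_pretrained("bert-base-uncased")
with torch.no_grad():
    cls_vec = base(**enc).last_hidden_state[:, 0, :]
print(cls_vec.shape)   # torch.Size([1, 768])

# New text branch: sequence-classification head with num_labels=1 -> shape (1, 1)
clf = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
with torch.no_grad():
    logit = clf(**enc).logits
print(logit.shape)     # torch.Size([1, 1])

If the classifier concatenates text and visual features before fc1, the text contribution shrinks from 768 values to 1, though that part of the file is outside this diff.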
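
The rest of app.py (the body of load_frames_from_video and the Gradio interface) is not shown in this commit, so the exact inference flow is not confirmed. The rough usage sketch below rests on stated assumptions: the forward signature model(encoding, frames), the preprocessing transform, the tokenizer settings, and load_frames_from_video returning a stacked (num_frames, C, H, W) tensor are all assumptions for illustration.

from torchvision import transforms

# Assumed ImageNet-style preprocessing for the ResNet50 branch (not confirmed by the diff)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Hypothetical video path; assumes load_frames_from_video returns transformed frames as one tensor
frames = load_frames_from_video("example.mp4", transform, num_frames=10)
frames = frames.unsqueeze(0)  # add a batch dimension -> (1, num_frames, C, H, W)

encoding = tokenizer(
    "caption describing the video",
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=128,
)

model.eval()
with torch.no_grad():
    prob = model(encoding, frames)  # sigmoid output from the classifier head
print(float(prob))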