Commit · b41fa6a
Parent(s):
75d7cea
Update app.py
app.py CHANGED
@@ -5,7 +5,7 @@ from torchvision import transforms
 from torchvision.transforms import v2
 # For ML Model
 import transformers
-from transformers import VivitImageProcessor, VivitConfig, VivitModel
+from transformers import VivitImageProcessor, VivitConfig, VivitModel, VivitForVideoClassification
 from transformers import set_seed
 # For Data Loaders
 import datasets
@@ -47,7 +47,8 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MODEL_TRANSFORMER = 'google/vivit-b-16x2'
 # Set Paths
 #model_path = 'vivit_pytorch_loss051.pt'
-
+model_path_2_pytorch = 'vivit_pytorch_GPU_6_acc087.pt'
+model_path_2_transformer = ''
 data_path = 'signs'
 
 # Custom CSS to control output video size
@@ -225,7 +226,9 @@ class SignClassificationModel(torch.nn.Module):
         return reduced_tensor
 
 # Load the model
-model_pretrained = torch.load(model_path, map_location=device, weights_only=False) #torch.device('cpu')
+#model_pretrained = torch.load(model_path, map_location=device, weights_only=False) #torch.device('cpu')
+#model_pretrained_2 = torch.load(model_path_2, map_location=device, weights_only=False)
+model_pretrained_2 = VivitForVideoClassification.from_pretrained(model_path_2_transformer)
 
 # Evaluation Function
 def prod_function(model_pretrained, prod_ds):
@@ -307,7 +310,8 @@ def translate_sign_language(gesture):
     #prod_video = np.random.randint(0, 255, (32, 225, 225, 3), dtype=np.uint8)
 
     # Run ML Model
-    predicted_prod_label = prod_function(model_pretrained, prod_ds)
+    #predicted_prod_label = prod_function(model_pretrained, prod_ds)
+    predicted_prod_label = prod_function(model_pretrained_2, prod_ds)
 
     # Identify the hand gesture
     predicted_prod_label = predicted_prod_label.squeeze(0)
@@ -347,7 +351,7 @@ with gr.Blocks(css=custom_css) as demo:
     # text_output = gr.Textbox(label="Translation in English")
 
     with gr.Row():
-        with gr.Column(scale=1, variant="panel"):
+        with gr.Column(scale=1.25, variant="panel"):
            with gr.Row(height=350, variant="panel"):
                # Add webcam input for sign language video capture
                video_input = gr.Video(sources=["webcam"], format="mp4", label="Gesture")
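The substantive change is the loading path: the pickled-checkpoint load via torch.load is commented out in favor of VivitForVideoClassification.from_pretrained. A minimal sketch of how that path is typically used, assuming a hypothetical checkpoint directory ./vivit_finetuned (the commit itself leaves model_path_2_transformer as an empty string) and a dummy 32-frame clip sized for 'google/vivit-b-16x2':

import torch
from transformers import VivitImageProcessor, VivitForVideoClassification

# Hypothetical checkpoint directory saved earlier with save_pretrained();
# the commit leaves model_path_2_transformer = '' unfilled.
checkpoint_dir = "./vivit_finetuned"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# from_pretrained restores config and weights from a directory,
# unlike torch.load, which unpickles a single .pt file.
processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2")
model = VivitForVideoClassification.from_pretrained(checkpoint_dir).to(device).eval()

# Dummy clip: 32 RGB frames of 224x224, the shape vivit-b-16x2 expects.
frames = [torch.randint(0, 255, (224, 224, 3), dtype=torch.uint8).numpy() for _ in range(32)]
inputs = processor(frames, return_tensors="pt").to(device)

with torch.no_grad():
    logits = model(**inputs).logits        # shape: (1, num_labels)
predicted_label = logits.argmax(-1).item() # index of the most likely sign class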