SoumyaJ committed on
Commit
739efb1
Β·
verified Β·
1 Parent(s): d03b7ee

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +173 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import os
4
+ import subprocess
5
+ import json
6
+ from threading import Thread
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ import spaces
9
+ import moviepy.editor as mp
10
+ import time
11
+ import langdetect
12
+ import uuid
13
+ from dotenv import load_dotenv
14
+
15
load_dotenv()

# Read the Hugging Face token from the environment (.env supported via
# python-dotenv). NOTE(review): HF_TOKEN is read but never passed to the
# from_pretrained calls below — confirm whether gated-model auth is needed.
HF_TOKEN = os.getenv("HF_TOKEN")
print("Starting the program...")

# Chat model used for summarization; loaded once at module import time.
model_path = "internlm/internlm2_5-7b-chat"
print(f"Loading model {model_path}...")
# trust_remote_code is required: InternLM ships custom modeling code.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# fp16 weights moved to the GPU — assumes CUDA is available at startup.
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
model = model.eval()
print("Model successfully loaded.")
26
+
27
def generate_unique_filename(extension):
    """Return a collision-resistant random filename ending in *extension*."""
    unique_stem = uuid.uuid4()
    return "{}{}".format(unique_stem, extension)
29
+
30
def cleanup_files(*files):
    """Delete every existing path in *files*, logging each removal.

    Falsy entries (None, "") and non-existent paths are skipped silently.
    """
    for file in files:
        if not file:
            continue
        if not os.path.exists(file):
            continue
        os.remove(file)
        print(f"Removed file: {file}")
35
+
36
@spaces.GPU(duration=90)
def transcribe_audio(file_path):
    """Transcribe an audio or video file with the insanely-fast-whisper CLI.

    Video inputs (.mp4/.avi/.mov/.flv) first have their audio track extracted
    to a temporary .wav via moviepy. The CLI writes a JSON transcript which is
    parsed and flattened into a single string.

    Returns:
        str: the transcript text ("text" key if present, else joined "chunks").
    Raises:
        Exception: on audio extraction failure.
        subprocess.CalledProcessError: if the whisper CLI exits non-zero.
        json.JSONDecodeError: if the transcript file is not valid JSON.
    Temporary files are removed even when an error is raised.
    """
    print(f"Starting transcription of file: {file_path}")
    temp_audio = None
    output_file = None
    try:
        if file_path.endswith(('.mp4', '.avi', '.mov', '.flv')):
            print("Video file detected. Extracting audio...")
            try:
                video = mp.VideoFileClip(file_path)
                try:
                    temp_audio = generate_unique_filename(".wav")
                    video.audio.write_audiofile(temp_audio)
                finally:
                    # Release the ffmpeg readers moviepy holds open;
                    # the original leaked them on every call.
                    video.close()
                file_path = temp_audio
            except Exception as e:
                print(f"Error extracting audio from video: {e}")
                raise

        print(f"Does the file exist? {os.path.exists(file_path)}")
        print(f"File size: {os.path.getsize(file_path) if os.path.exists(file_path) else 'N/A'} bytes")

        output_file = generate_unique_filename(".json")
        command = [
            "insanely-fast-whisper",
            "--file-name", file_path,
            "--device-id", "0",
            "--model-name", "openai/whisper-large-v3",
            "--task", "transcribe",
            "--timestamp", "chunk",
            "--transcript-path", output_file
        ]
        print(f"Executing command: {' '.join(command)}")
        try:
            result = subprocess.run(command, check=True, capture_output=True, text=True)
            print(f"Standard output: {result.stdout}")
            print(f"Error output: {result.stderr}")
        except subprocess.CalledProcessError as e:
            print(f"Error running insanely-fast-whisper: {e}")
            print(f"Standard output: {e.stdout}")
            print(f"Error output: {e.stderr}")
            raise

        print(f"Reading transcription file: {output_file}")
        try:
            with open(output_file, "r") as f:
                transcription = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            # Use a context manager here: the original opened the file
            # without closing it, leaking the handle.
            with open(output_file, "r") as f:
                print(f"File content: {f.read()}")
            raise

        if "text" in transcription:
            result = transcription["text"]
        else:
            result = " ".join([chunk["text"] for chunk in transcription.get("chunks", [])])

        print("Transcription completed.")
        return result
    finally:
        # The original only cleaned up on success, orphaning temp files on
        # any failure; cleanup_files skips None/missing paths safely.
        cleanup_files(output_file, temp_audio)
97
+
98
@spaces.GPU(duration=90)
def generate_summary_stream(transcription):
    """Produce a 200-300 word summary of *transcription* in its own language.

    Detects the transcript language with langdetect, prompts the chat model,
    and returns the full summary string (despite the name, no streaming).
    """
    print("Starting summary generation...")
    print(f"Transcription length: {len(transcription)} characters")

    detected_language = langdetect.detect(transcription)

    prompt = f"""Summarize the following video transcription in 200-300 words.
    The summary should be in the same language as the transcription, which is detected as {detected_language}.
    Please ensure that the summary captures the main points and key ideas of the transcription:
    {transcription[:300000]}..."""

    summary, _history = model.chat(tokenizer, prompt, history=[])
    print(f"Final summary generated: {summary[:100]}...")
    print("Summary generation completed.")
    return summary
114
+
115
def process_uploaded_video(video_path):
    """Transcribe an uploaded video file.

    Returns a (text, None) pair: *text* is the transcription on success, or
    a "Processing error: ..." message if anything raised along the way.
    """
    print(f"Processing uploaded video: {video_path}")
    try:
        print("Starting transcription...")
        text = transcribe_audio(video_path)
        print(f"Transcription completed. Length: {len(text)} characters")
    except Exception as e:
        print(f"Error processing video: {e}")
        return f"Processing error: {str(e)}", None
    return text, None
125
+
126
+ print("Setting up Gradio interface...")
127
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
128
+ gr.Markdown(
129
+ """
130
+ # πŸŽ₯ Video Transcription and Smart Summary
131
+
132
+ Upload a video to get a transcription and AI-generated summary.
133
+ """
134
+ )
135
+
136
+ with gr.Tabs():
137
+ with gr.TabItem("πŸ“€ Video Upload"):
138
+ video_input = gr.Video(label="Drag and drop or click to upload")
139
+ video_button = gr.Button("πŸš€ Process Video", variant="primary")
140
+
141
+
142
+ with gr.Row():
143
+ with gr.Column():
144
+ transcription_output = gr.Textbox(label="πŸ“ Transcription", lines=10, show_copy_button=True)
145
+ with gr.Column():
146
+ summary_output = gr.Textbox(label="πŸ“Š Summary", lines=10, show_copy_button=True)
147
+
148
+ summary_button = gr.Button("πŸ“ Generate Summary", variant="secondary")
149
+
150
+ gr.Markdown(
151
+ """
152
+ ### How to use:
153
+ 1. Upload a video.
154
+ 2. Click 'Process' to get the transcription.
155
+ 3. Click 'Generate Summary' to get a summary of the content.
156
+
157
+ *Note: Processing may take a few minutes depending on the video length.*
158
+ """
159
+ )
160
+
161
+ def process_video_and_update(video):
162
+ if video is None:
163
+ return "No video uploaded.", "Please upload a video."
164
+ print(f"Video received: {video}")
165
+ transcription, _ = process_uploaded_video(video)
166
+ print(f"Returned transcription: {transcription[:100] if transcription else 'No transcription generated'}...")
167
+ return transcription or "Transcription error", ""
168
+
169
+ video_button.click(process_video_and_update, inputs=[video_input], outputs=[transcription_output, summary_output])
170
+ summary_button.click(generate_summary_stream, inputs=[transcription_output], outputs=[summary_output])
171
+
172
+ print("Launching Gradio interface...")
173
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ insanely-fast-whisper
2
+ accelerate
3
+ bitsandbytes
4
+ scipy
5
+ sentencepiece
6
+ spaces
7
+ tiktoken
8
+ pytest
9
+ torch
10
+ transformers
11
+ moviepy
12
+ langdetect
13
+ python-dotenv