File size: 6,539 Bytes
ca42f91
 
 
 
 
c381344
ca42f91
 
 
 
 
 
1b80071
1b8e760
 
 
ca42f91
 
 
 
 
 
 
 
 
 
 
 
 
10c807d
 
 
ca42f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b8e760
ca42f91
6f27cd5
 
d39d53c
29468b0
1b8e760
 
 
 
 
 
 
 
2cac84b
1b8e760
 
2cac84b
 
29468b0
 
076ac27
 
 
 
 
4bccf04
ca42f91
 
 
 
 
 
 
 
4bccf04
 
 
 
 
 
 
076ac27
4bccf04
 
 
 
 
 
 
ca42f91
 
4bccf04
ca42f91
 
 
 
 
 
 
 
 
c381344
ca42f91
c381344
 
 
 
ca42f91
1b8e760
 
c381344
4dc2299
c381344
ca42f91
c381344
ca42f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739efb1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import gradio as gr
import torch
import os
import subprocess
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM,pipeline
import spaces
import moviepy.editor as mp
import time
import langdetect
import uuid
from dotenv import load_dotenv
import whisper
from pathlib import Path
import numpy as np
from scipy.io import wavfile

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Hugging Face access token taken from the environment; None when unset.
# NOTE(review): not referenced anywhere else in the visible code.
HF_TOKEN = os.getenv("HF_TOKEN")
print("Starting the program...")

model_path = "internlm/internlm2_5-7b-chat"
print(f"Loading model {model_path}...")
# The InternLM chat-model load below is commented out, so the "Loading model"
# and "Model successfully loaded." messages around it do not correspond to an
# actual model load.
#tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
#model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
#model = model.eval()
print("Model successfully loaded.")

# Module-level Whisper "base" model, loaded once at import time.
# NOTE(review): transcribe_audio loads its own "base.en" model and does not
# read this global -- confirm whether this load is still needed.
model = whisper.load_model("base")
print("Model successfully loaded.")

def generate_unique_filename(extension):
    """Return a random UUID4-based file name that ends with ``extension``.

    ``extension`` is appended verbatim, so callers include the leading dot
    themselves (for example ``".wav"``).
    """
    return "{}{}".format(uuid.uuid4(), extension)

def cleanup_files(*files):
    """Delete every given path that exists on disk, logging each removal.

    Falsy entries (``None``, ``""``) and paths that do not exist are skipped.
    """
    # Lazy generator: each path is checked right before it is removed, so a
    # path passed twice is only deleted once.
    present = (path for path in files if path and os.path.exists(path))
    for path in present:
        os.remove(path)
        print(f"Removed file: {path}")

# Extensions (lower-case) treated as video containers whose audio track is
# extracted to a temporary WAV file before transcription.
_VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.flv')

# Whisper models already loaded, keyed by model name, so repeated requests do
# not reload the weights from disk.
_WHISPER_MODELS = {}


def _get_whisper_model(name):
    """Return the Whisper model called ``name``, loading it on first use only."""
    if name not in _WHISPER_MODELS:
        _WHISPER_MODELS[name] = whisper.load_model(name)
    return _WHISPER_MODELS[name]


def _extract_audio(video_path, audio_path):
    """Write the audio track of ``video_path`` to ``audio_path``."""
    video = mp.VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Always release the reader resources held by the clip.
        video.close()


def transcribe_audio(file_path):
    """Transcribe the speech in a media file with Whisper and return the text.

    Files whose extension (case-insensitive) is one of ``_VIDEO_EXTENSIONS``
    first have their audio track extracted to a temporary WAV file; any other
    file is handed to Whisper directly. The temporary file is removed whether
    or not transcription succeeds.

    Args:
        file_path: Path of the uploaded media file.

    Returns:
        The transcribed text.

    Raises:
        FileNotFoundError: If ``file_path`` is empty or does not exist.
        ValueError: If a video file has no audio track.
        Exception: Errors from audio extraction or Whisper are logged and
            re-raised unchanged.
    """
    import tempfile

    print(f"Starting transcription of file: {file_path}")
    if not file_path or not os.path.exists(file_path):
        raise FileNotFoundError(f"Input file not found: {file_path}")
    file_path = os.fspath(file_path)
    print(f"File size: {os.path.getsize(file_path)} bytes")

    temp_audio = None
    try:
        audio_source = file_path
        if file_path.lower().endswith(_VIDEO_EXTENSIONS):
            print("Video file detected. Extracting audio...")
            # Absolute path so the file is written and read at the same
            # location regardless of the current working directory.
            temp_audio = os.path.join(
                tempfile.gettempdir(), generate_unique_filename(".wav")
            )
            try:
                _extract_audio(file_path, temp_audio)
            except Exception as e:
                print(f"Error extracting audio from video: {e}")
                raise
            print(f"temp_audio : {temp_audio}")
            audio_source = temp_audio

        transcription = _get_whisper_model("base.en").transcribe(audio_source)
        print(f"transcription {transcription}")

        if "text" in transcription:
            result = transcription["text"]
        else:
            # Fallback for chunk-style results that carry no top-level text.
            result = " ".join(
                chunk["text"] for chunk in transcription.get("chunks", [])
            )
    finally:
        # Remove the extracted audio even when extraction or transcription fails.
        if temp_audio:
            cleanup_files(temp_audio)

    print("Transcription completed.")
    return result

# Holds the summarization pipeline once it has been built, so the model is
# not re-created on every button click.
_SUMMARIZER_CACHE = []


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    if not _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE.append(pipeline("summarization"))
    return _SUMMARIZER_CACHE[0]


def generate_summary_stream(transcription):
    """Summarize a transcription with the default summarization pipeline.

    Args:
        transcription: Text to summarize (the content of the transcription
            textbox).

    Returns:
        The generated summary text, or a short notice when there is no text
        to summarize.
    """
    print("Starting summary generation...")
    # Guard against an empty textbox: there is nothing to feed the model.
    if not transcription or not transcription.strip():
        print("No transcription text to summarize.")
        return "No transcription available to summarize."
    print(f"Transcription length: {len(transcription)} characters")

    summarizer = _get_summarizer()
    # truncation=True keeps transcripts longer than the model's input limit
    # from failing; the excess text is cut off instead.
    summary = summarizer(
        transcription,
        max_length=500,
        min_length=250,
        do_sample=False,
        truncation=True,
    )
    print("Summary generation completed.")
    return summary[0]['summary_text']

def process_uploaded_video(video_path):
    """Transcribe an uploaded video and report failures as text.

    Returns a ``(text, None)`` pair. ``text`` is the transcription on
    success, or a ``"Processing error: ..."`` message when transcription
    raised; the second element is always ``None``.
    """
    print(f"Processing uploaded video: {video_path}")
    try:
        print("Starting transcription...")
        text = transcribe_audio(video_path)
        print(f"Transcription completed. Length: {len(text)} characters")
    except Exception as err:
        # Turn any failure into a message the UI can display.
        print(f"Error processing video: {err}")
        return f"Processing error: {str(err)}", None
    return text, None

# Build the Gradio UI: a video upload tab, side-by-side transcription and
# summary textboxes, and the two buttons that drive them.
print("Setting up Gradio interface...")
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎥 Video Transcription and Smart Summary
        
        Upload a video to get a transcription and AI-generated summary.
        """
    )

    with gr.Tabs():
        with gr.TabItem("📤 Video Upload"):
            video_input = gr.Video(label="Drag and drop or click to upload")
            video_button = gr.Button("🚀 Process Video", variant="primary")

    with gr.Row():
        with gr.Column():
            transcription_output = gr.Textbox(label="📝 Transcription", lines=10, show_copy_button=True)
        with gr.Column():
            summary_output = gr.Textbox(label="📊 Summary", lines=10, show_copy_button=True)

    summary_button = gr.Button("📝 Generate Summary", variant="secondary")

    gr.Markdown(
        """
        ### How to use:
        1. Upload a video.
        2. Click 'Process Video' to get the transcription.
        3. Click 'Generate Summary' to get a summary of the content.
        
        *Note: Processing may take a few minutes depending on the video length.*
        """
    )

    def process_video_and_update(video):
        """Transcribe the uploaded video and reset the summary box.

        Returns a ``(transcription_text, summary_text)`` pair matching the
        two output textboxes wired to the button below.
        """
        if video is None:
            return "No video uploaded.", "Please upload a video."
        print(f"Video received: {video}")
        transcription, _ = process_uploaded_video(video)
        print(f"Returned transcription: {transcription[:100] if transcription else 'No transcription generated'}...")
        # Clear any previous summary so it cannot be mistaken for this video's.
        return transcription or "Transcription error", ""

    video_button.click(process_video_and_update, inputs=[video_input], outputs=[transcription_output, summary_output])
    summary_button.click(generate_summary_stream, inputs=[transcription_output], outputs=[summary_output])

print("Launching Gradio interface...")
demo.launch()