--[[

  Transcript.lua - Speech transcription data model

]]--

-- Collection of transcript segments. Keeps three views of the segments:
--   init_data     - every segment added via add_segment
--   filtered_data - segments matching the current search string
--   data          - the filtered segments in their current (sorted) order
Transcript = Polo {
  -- Preferred left-to-right order for known columns.
  COLUMN_ORDER = {"id", "seek", "start", "end", "text", "score", "file"},

  -- Columns reported as hidden by TranscriptSegment.default_hide.
  DEFAULT_HIDE = {
    seek = true,
    temperature = true,
    tokens = true,
    avg_logprob = true,
    compression_ratio = true,
    no_speech_prob = true
  },

  init = function(self)
    self:clear()
  end
}

-- Returns the item's project position minus the take's start offset, i.e.
-- the value added to segment/word times when placing markers and notes.
Transcript.calculate_offset = function (item, take)
  return (
    reaper.GetMediaItemInfo_Value(item, 'D_POSITION')
    - reaper.GetMediaItemTakeInfo_Value(take, 'D_STARTOFFS'))
end

-- Strict "less than" used by Transcript:sort. Unlike a bare `<` it never
-- raises: missing values are treated as '', values of different types are
-- ordered by type (strings, then numbers, then everything else), and values
-- that are neither numbers nor strings are compared by their tostring() form.
Transcript._sort_less = function (a, b)
  if a == nil then a = '' end
  if b == nil then b = '' end

  local type_a, type_b = type(a), type(b)

  if type_a ~= type_b then
    local rank = { string = 1, number = 2 }
    local rank_a, rank_b = rank[type_a] or 3, rank[type_b] or 3
    if rank_a ~= rank_b then
      return rank_a < rank_b
    end
    return type_a < type_b
  end

  if type_a == 'number' or type_a == 'string' then
    return a < b
  end

  return tostring(a) < tostring(b)
end

-- Resets all segment lists and the search string.
function Transcript:clear()
  self.init_data = {}
  self.filtered_data = {}
  self.data = {}
  self.search = ''
end

-- Returns the ordered list of column names, derived from the data keys of
-- the first segment (keys starting with '_' are skipped) plus "score".
-- Returns an empty table when no segments have been added.
function Transcript:get_columns()
  if #self.init_data == 0 then
    return {}
  end

  local columns = {"score"}
  local row = self.init_data[1]
  for k, _ in pairs(row.data) do
    if k:sub(1, 1) ~= '_' then
      table.insert(columns, k)
    end
  end

  return self:_sort_columns(columns)
end

-- Orders columns: names listed in COLUMN_ORDER come first (in that order),
-- followed by any remaining names sorted alphabetically.
function Transcript:_sort_columns(columns)
  local present = {}
  for _, column in ipairs(columns) do
    present[column] = true
  end

  local result = {}
  local known = {}
  -- ipairs: the iteration order here defines the resulting column order.
  for _, column in ipairs(self.COLUMN_ORDER) do
    known[column] = true
    if present[column] then
      table.insert(result, column)
    end
  end

  local extra_columns = {}
  for _, column in ipairs(columns) do
    if not known[column] then
      table.insert(extra_columns, column)
    end
  end

  table.sort(extra_columns)
  for _, column in ipairs(extra_columns) do
    table.insert(result, column)
  end

  return result
end

-- Appends a segment to the unfiltered list. Call update() afterwards to
-- refresh the filtered/visible lists.
function Transcript:add_segment(segment)
  table.insert(self.init_data, segment)
end

-- True when at least one segment has been added.
function Transcript:has_segments()
  return #self.init_data > 0
end

-- Returns the currently visible (filtered and possibly sorted) segments.
function Transcript:get_segments()
  return self.data
end

-- Sorts a copy of the filtered segments by `column` and stores it in
-- self.data. `ascending` selects the direction; any falsy value sorts
-- descending.
function Transcript:sort(column, ascending)
  -- Copy with a loop rather than {table.unpack(...)}, which is bounded by
  -- the Lua stack size.
  local sorted = {}
  for i, segment in ipairs(self.filtered_data) do
    sorted[i] = segment
  end

  table.sort(sorted, function (a, b)
    local a_val, b_val = a:get(column), b:get(column)
    if not ascending then
      a_val, b_val = b_val, a_val
    end
    return Transcript._sort_less(a_val, b_val)
  end)

  self.data = sorted
end

-- Returns { segments = { ... } } built from the visible segments, in order.
function Transcript:to_table()
  local segments = {}
  for _, segment in ipairs(self.data) do
    table.insert(segments, segment:to_table())
  end
  return {segments = segments}
end

-- JSON encoding of to_table().
function Transcript:to_json()
  return json.encode(self:to_table())
end

-- Recomputes filtered_data and data from init_data and the search string.
-- The search is a literal substring match over every column value; it is
-- case-sensitive only when the search string contains an upper-case letter.
function Transcript:update()
  if #self.init_data == 0 then
    self:clear()
    return
  end

  if #self.search > 0 then
    local columns = self:get_columns()
    local search = self.search
    local search_lower = search:lower()
    local match_case = (search ~= search_lower)
    local needle = match_case and search or search_lower

    local function matches(segment)
      for _, column in ipairs(columns) do
        local value = segment.data[column]
        -- Skip missing values; tostring(nil) would otherwise make every
        -- segment match searches such as "nil".
        if value ~= nil then
          local haystack = tostring(value)
          if not match_case then
            haystack = haystack:lower()
          end
          -- Plain find: the search string is user text, not a Lua pattern.
          if haystack:find(needle, 1, true) then
            return true
          end
        end
      end
      return false
    end

    self.filtered_data = {}
    for _, segment in ipairs(self.init_data) do
      if matches(segment) then
        table.insert(self.filtered_data, segment)
      end
    end
  else
    self.filtered_data = self.init_data
  end

  self.data = self.filtered_data
end

-- Adds one project marker (or region when `regions` is true) per visible
-- segment, or per word when `words` is true. `proj` defaults to 0. The
-- requested marker index is the segment's 'id', falling back to its position
-- in the list; words share their segment's requested index.
function Transcript:create_markers(proj, regions, words)
  proj = proj or 0
  regions = regions or false

  for i, segment in ipairs(self.data) do
    local offset = self.calculate_offset(segment.item, segment.take)
    local want_index = segment:get('id', i)
    local color = 0

    if words then
      for _, word in ipairs(segment.words) do
        reaper.AddProjectMarker2(
          proj, regions, word.start + offset, word.end_ + offset,
          word.word, want_index, color)
      end
    else
      reaper.AddProjectMarker2(
        proj, regions, segment.start + offset, segment.end_ + offset,
        segment.text, want_index, color)
    end
  end
end

-- Inserts a new track named 'Speech' at index 0 and fills it with empty
-- items carrying the text of each visible segment (or each word when `words`
-- is true) as item notes. The edit cursor position is restored afterwards.
function Transcript:create_notes_track(words)
  local cur_pos = reaper.GetCursorPosition()
  local index = 0

  reaper.InsertTrackAtIndex(index, false)
  local track = reaper.GetTrack(0, index)
  reaper.SetOnlyTrackSelected(track)
  reaper.GetSetMediaTrackInfo_String(track, 'P_NAME', 'Speech', true)

  for _, segment in ipairs(self.data) do
    local offset = self.calculate_offset(segment.item, segment.take)

    if words then
      for _, word in ipairs(segment.words) do
        self:_create_note(word.start + offset, word.end_ + offset, word.word, false)
      end
    else
      self:_create_note(segment.start + offset, segment.end_ + offset, segment.text, true)
    end
  end

  reaper.SetEditCurPos(cur_pos, true, true)
end

-- Creates an empty item spanning [start, end_] and sets its notes to `text`.
function Transcript:_create_note(start, end_, text, stretch)
  local item = self:_create_empty_item(start, end_)
  self:_set_note_text(item, text, stretch)
end

-- Inserts an empty item via a REAPER action, then moves and resizes it to
-- [start, end_]. Relies on the new item being the first selected item after
-- the action runs.
function Transcript:_create_empty_item(start, end_)
  self:_insert_empty_item()
  local item = reaper.GetSelectedMediaItem(0, 0)
  reaper.SelectAllMediaItems(0, false)
  reaper.SetMediaItemPosition(item, start, true)
  reaper.SetMediaItemLength(item, end_ - start, true)
  return item
end

-- Runs REAPER action 40142 (presumably "Insert empty item" - TODO confirm).
function Transcript:_insert_empty_item()
  reaper.Main_OnCommand(40142, 0)
end

-- Writes `text` (trimmed of surrounding whitespace) into the item's state
-- chunk as a NOTES sub-chunk, optionally followed by an IMGRESOURCEFLAGS line
-- when `stretch` is true.
-- NOTE(review): the NOTES layout ("<NOTES", one "|"-prefixed line per text
-- line, ">") follows REAPER's item state chunk format - confirm against a
-- chunk saved by REAPER itself.
function Transcript:_set_note_text(item, text, stretch)
  local _, chunk = reaper.GetItemStateChunk(item, "", false)

  local trimmed = text:match("^%s*(.-)%s*$")
  -- Every line of a note must start with '|' inside the chunk.
  local note_lines = trimmed:gsub("\r?\n", "\n|")
  local notes_chunk = ("<NOTES\n|%s\n>\n"):format(note_lines)
  local flags_chunk = (stretch and "IMGRESOURCEFLAGS 11\n" or "")
  local insertion = notes_chunk .. flags_chunk .. '>'

  -- A function replacement is used so that '%' characters in the note text
  -- are inserted literally instead of being read as gsub escapes.
  chunk = chunk:gsub('>', function () return insertion end)

  reaper.SetItemStateChunk(item, chunk, false)
end

-- A single transcript segment: a `data` table of column values plus the
-- REAPER `item` and `take` it belongs to, and optionally a list of words.
TranscriptSegment = Polo {
  -- Maps convenience field names on the segment to keys in `data`.
  -- `end_` is used because `end` is a Lua keyword.
  _proxy_fields = {
    start = 'start',
    end_ = 'end',
    text = 'text',
  }
}

-- Field lookup: proxied names read from o.data, everything else falls back
-- to the TranscriptSegment class table.
TranscriptSegment.__index = function(o, key)
  local proxy_target = TranscriptSegment._proxy_fields[key]
  if proxy_target then
    return o.data[proxy_target]
  else
    return TranscriptSegment[key]
  end
end

-- Validates required fields, takes a shallow copy of `data`, and records the
-- source file name (without extension) in data.file.
function TranscriptSegment:init()
  assert(self.data, 'missing data')
  assert(self.item, 'missing item')
  assert(self.take, 'missing take')

  self.data = self._copy(self.data)
  self.data['file'] = self:get_file()
end

-- Builds segments from one Whisper-style segment table. The input is not
-- modified: its text is trimmed on a copy and its `words` (if any) are
-- converted to TranscriptWord objects. Returns a list containing the single
-- resulting TranscriptSegment.
TranscriptSegment.from_whisper = function(segment, item, take)
  local words = segment.words

  segment = TranscriptSegment._copy(segment)
  segment.text = segment.text:match("^%s*(.-)%s*$")
  segment.words = nil

  local transcript_words = nil
  if words then
    transcript_words = {}
    for _, word in ipairs(words) do
      table.insert(transcript_words, TranscriptWord.new({
        word = word.word:match("^%s*(.-)%s*$"),
        probability = word.probability,
        start = word.start,
        end_ = word['end']
      }))
    end
  end

  return {
    TranscriptSegment.new({
      data = segment,
      item = item,
      take = take,
      words = transcript_words
    })
  }
end

-- True when `column` is listed in Transcript.DEFAULT_HIDE, false otherwise.
TranscriptSegment.default_hide = function(column)
  return Transcript.DEFAULT_HIDE[column] or false
end

-- Replaces words[index1] and words[index2] with a single word whose text is
-- the concatenation of both, spanning word1.start..word2.end_, with the mean
-- of the two probabilities. The merged word is placed at index1.
-- NOTE(review): the removal order assumes index1 < index2 - verify callers.
TranscriptSegment.merge_words = function(words, index1, index2)
  local word1 = words[index1]
  local word2 = words[index2]

  local new_word = TranscriptWord.new {
    word = word1.word .. word2.word,
    start = word1.start,
    end_ = word2.end_,
    probability = (word1.probability + word2.probability) / 2
  }

  table.remove(words, index2)
  table.remove(words, index1)
  table.insert(words, index1, new_word)
end

-- Replaces words[index] with two words: the text is split at the middle
-- character (the first half gets floor(n/2) characters) and the time span is
-- split at its midpoint. Both halves keep the original probability.
TranscriptSegment.split_word = function(words, index)
  local word = words[index]
  local text = word.word

  -- Byte position where the second half starts. Splitting on a character
  -- boundary keeps multibyte UTF-8 characters intact; if the text is not
  -- valid UTF-8 (utf8.len returns nil) fall back to a byte-based split.
  local length = utf8.len(text)
  local split_at
  if length then
    split_at = utf8.offset(text, math.floor(length / 2) + 1)
  else
    split_at = math.floor(#text / 2) + 1
  end

  local midpoint = word.start + (word.end_ - word.start) / 2

  local new_word1 = TranscriptWord.new {
    word = text:sub(1, split_at - 1),
    start = word.start,
    end_ = midpoint,
    probability = word.probability
  }
  local new_word2 = TranscriptWord.new {
    word = text:sub(split_at),
    start = midpoint,
    end_ = word.end_,
    probability = word.probability
  }

  table.remove(words, index)
  table.insert(words, index, new_word2)
  table.insert(words, index, new_word1)
end

-- Shallow copy of a table.
TranscriptSegment._copy = function(data)
  local result = {}
  for k, v in pairs(data) do
    result[k] = v
  end
  return result
end

-- Mean of the word scores, or 0.0 when the segment has no words.
function TranscriptSegment:score()
  if not self.words or #self.words == 0 then
    return 0.0
  end

  local score = 0.0
  for _, word in ipairs(self.words) do
    score = score + word:score()
  end
  return score / #self.words
end

-- Returns the value of `column`: the computed score for 'score', otherwise
-- the stored data value, or `default` when no value is stored.
function TranscriptSegment:get(column, default)
  if column == 'score' then
    return self:score()
  end

  local value = self.data[column]
  -- Compare against nil so that a stored `false` is returned as-is.
  if value ~= nil then
    return value
  end
  return default
end

-- Replaces the segment's words and regenerates its text from them.
function TranscriptSegment:set_words(words)
  self.words = words
  self:update_text()
end

-- Rebuilds data.text by joining the words, in order, with single spaces.
function TranscriptSegment:update_text()
  local text_chunks = {}
  for _, word in ipairs(self.words) do
    table.insert(text_chunks, word.word)
  end
  self.data['text'] = table.concat(text_chunks, ' ')
end

-- Returns the file name of the take's media source with the directory part
-- removed, and with the extension removed unless `include_extensions` is
-- true. Returns '' when the take has no source. The lookup runs inside
-- app:trap; '' is also what remains if the callback does not complete.
function TranscriptSegment:get_file(include_extensions)
  include_extensions = include_extensions or false

  local file = ''

  app:trap(function ()
    local source = reaper.GetMediaItemTake_Source(self.take)
    if source then
      local source_path = reaper.GetMediaSourceFileName(source)
      file = source_path:gsub(".*[\\/](.*)", "%1")
      if not include_extensions then
        file = file:gsub("(.*)[.].*", "%1")
      end
    end
  end)

  return file
end

-- Same as get_file(true).
function TranscriptSegment:get_file_with_extension()
  return self:get_file(true)
end

-- Selects the segment's item and moves the edit cursor to the segment start
-- (or to the start of words[word_index] when given). Issues the play action
-- once if playback is already running, or if `autoplay` is set and playback
-- is stopped.
function TranscriptSegment:navigate(word_index, autoplay)
  local start = self.start
  if word_index then
    start = self.words[word_index].start
  end

  local offset = start - reaper.GetMediaItemTakeInfo_Value(self.take, 'D_STARTOFFS')
  self:_navigate_to_media_item(self.item)
  reaper.MoveEditCursor(offset, false)

  -- Bit 0 of the play state is set while playing. Decide once so the play
  -- action is never issued twice in a row.
  local playing = (reaper.GetPlayState() & 1) == 1
  if playing or autoplay then
    self:_transport_play()
  end
end

-- Makes `item` the only selected item and moves the cursor to its start.
function TranscriptSegment:_navigate_to_media_item(item)
  reaper.SelectAllMediaItems(0, false)
  reaper.SetMediaItemSelected(item, true)
  self:_move_cursor_to_start_of_items()
end

-- Runs REAPER action 41173 (presumably "Move cursor to start of items" -
-- TODO confirm).
function TranscriptSegment:_move_cursor_to_start_of_items()
  reaper.Main_OnCommand(41173, 0)
end

-- Runs REAPER action 1007 (presumably "Transport: Play" - TODO confirm).
function TranscriptSegment:_transport_play()
  reaper.Main_OnCommand(1007, 0)
end

-- JSON encoding of to_table().
function TranscriptSegment:to_json()
  return json.encode(self:to_table())
end

-- Returns a shallow copy of the segment data, with a `words` list of word
-- tables added when the segment has words.
function TranscriptSegment:to_table()
  local result = self._copy(self.data)
  if self.words then
    result['words'] = {}
    for _, word in ipairs(self.words) do
      table.insert(result['words'], word:to_table())
    end
  end
  return result
end

-- Sets the time range passed to GetSet_LoopTimeRange to the segment's span,
-- shifted by `offset` (default 0).
function TranscriptSegment:select_in_timeline(offset)
  offset = offset or 0
  local start = self.start + offset
  local end_ = self.end_ + offset
  reaper.GetSet_LoopTimeRange(true, true, start, end_, false)
end

-- A single word: `word` text, `start`/`end_` times and `probability`.
TranscriptWord = Polo {}

-- Validates that all required fields were supplied.
function TranscriptWord:init()
  assert(self.word, 'missing word')
  assert(self.start, 'missing start')
  assert(self.end_, 'missing end_')
  assert(self.probability, 'missing probability')
end

-- Returns a new TranscriptWord with the same field values.
function TranscriptWord:copy()
  return TranscriptWord.new {
    word = self.word,
    start = self.start,
    end_ = self.end_,
    probability = self.probability,
  }
end

-- The word's score is its probability.
function TranscriptWord:score()
  return self.probability
end

-- Plain-table form of the word; `end_` is exported under the key 'end'.
function TranscriptWord:to_table()
  return {
    word = self.word,
    start = self.start,
    ['end'] = self.end_,
    probability = self.probability,
  }
end

-- Sets the time range passed to GetSet_LoopTimeRange to the word's span,
-- shifted by `offset` (default 0).
function TranscriptWord:select_in_timeline(offset)
  offset = offset or 0
  local start = self.start + offset
  local end_ = self.end_ + offset
  reaper.GetSet_LoopTimeRange(true, true, start, end_, false)
end