MT3

Running

App Files Files Community

Hmjz100 committed on May 30, 2024

Commit

d017266

verified ·

1 Parent(s): 7bd3414

Upload app.py

Browse files

Files changed (1) hide show

app.py +84 -84

app.py CHANGED Viewed

@@ -5,8 +5,8 @@ import pytz
 from pathlib import Path
 def current_time():
-    current = datetime.datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y年-%m月-%d日 %H时:%M分:%S秒")
-    return current
 print(f"[{current_time()}] 开始部署空间...")
@@ -83,58 +83,58 @@ SAMPLE_RATE = 16000
 SF2_PATH = 'SGM-v2.01-Sal-Guit-Bass-V1.3.sf2'
 def upload_audio(audio, sample_rate):
-    return note_seq.audio_io.wav_data_to_samples_librosa(
-        audio, sample_rate=sample_rate)
 print(f"[{current_time()}] 日志：开始包装模型...")
 class InferenceModel(object):
-    """音乐转录的 T5X 模型包装器。"""
-    def __init__(self, checkpoint_path, model_type='mt3'):
-        if model_type == 'ismir2021':
-            num_velocity_bins = 127
-            self.encoding_spec = note_sequences.NoteEncodingSpec
-            self.inputs_length = 512
-        elif model_type == 'mt3':
-            num_velocity_bins = 1
-            self.encoding_spec = note_sequences.NoteEncodingWithTiesSpec
-            self.inputs_length = 256
-        else:
-            raise ValueError('unknown model_type: %s' % model_type)
-        gin_files = ['/home/user/app/mt3/gin/model.gin',
-                    '/home/user/app/mt3/gin/mt3.gin']
-        self.batch_size = 8
-        self.outputs_length = 1024
-        self.sequence_length = {'inputs': self.inputs_length,
-                                'targets': self.outputs_length}
-        self.partitioner = t5x.partitioning.PjitPartitioner(
-                model_parallel_submesh=None, num_partitions=1)
-        print(f"[{current_time()}] 日志：构建编解码器")
-        self.spectrogram_config = spectrograms.SpectrogramConfig()
-        self.codec = vocabularies.build_codec(
-                vocab_config=vocabularies.VocabularyConfig(
-                num_velocity_bins=num_velocity_bins)
-                )
-        self.vocabulary = vocabularies.vocabulary_from_codec(self.codec)
-        self.output_features = {
-                'inputs': seqio.ContinuousFeature(dtype=tf.float32, rank=2),
-                'targets': seqio.Feature(vocabulary=self.vocabulary),
-        }
-        print(f"[{current_time()}] 日志：创建 T5X 模型")
-        self._parse_gin(gin_files)
-        self.model = self._load_model()
-        print(f"[{current_time()}] 日志：恢复模型检查点")
-        self.restore_from_checkpoint(checkpoint_path)
-    @property
-    def input_shapes(self):
 		return {
 			'encoder_input_tokens': (self.batch_size, self.inputs_length),
 			'decoder_input_tokens': (self.batch_size, self.outputs_length)
@@ -144,10 +144,10 @@ class InferenceModel(object):
 		"""解析用于训练模型的 gin 文件。"""
 		print(f"[{current_time()}] 日志：解析 gin 文件")
 		gin_bindings = [
-            'from __gin__ import dynamic_registration',
-            'from mt3 import vocabularies',
-            'VOCAB_CONFIG=@vocabularies.VocabularyConfig()',
-            'vocabularies.VocabularyConfig.num_velocity_bins=%NUM_VELOCITY_BINS'
 		]
 		with gin.unlock_config():
 			gin.parse_config_files_and_bindings(gin_files, gin_bindings, finalize_config=False)
@@ -158,11 +158,11 @@ class InferenceModel(object):
 		model_config = gin.get_configurable(network.T5Config)()
 		module = network.Transformer(config=model_config)
 		return models.ContinuousInputsEncoderDecoderModel(
-            module=module,
-            input_vocabulary=self.output_features['inputs'].vocabulary,
-            output_vocabulary=self.output_features['targets'].vocabulary,
-            optimizer_def=t5x.adafactor.Adafactor(decay_rate=0.8, step_offset=0),
-            input_depth=spectrograms.input_depth(self.spectrogram_config))
 	def restore_from_checkpoint(self, checkpoint_path):
@@ -175,12 +175,12 @@ class InferenceModel(object):
 			partitioner=self.partitioner)
 		restore_checkpoint_cfg = t5x.utils.RestoreCheckpointConfig(
-            path=checkpoint_path, mode='specific', dtype='float32')
 		train_state_axes = train_state_initializer.train_state_axes
 		self._predict_fn = self._get_predict_fn(train_state_axes)
 		self._train_state = train_state_initializer.from_checkpoint_or_scratch(
-            [restore_checkpoint_cfg], init_rng=jax.random.PRNGKey(0))
 	@functools.lru_cache()
 	def _get_predict_fn(self, train_state_axes):
@@ -189,11 +189,11 @@ class InferenceModel(object):
 		def partial_predict_fn(params, batch, decode_rng):
 			return self.model.predict_batch_with_aux(params, batch, decoder_params={'decode_rng': None})
 		return self.partitioner.partition(
-            partial_predict_fn,
-            in_axis_resources=(
-                    train_state_axes.params,
-                    t5x.partitioning.PartitionSpec('data',), None),
-            out_axis_resources=t5x.partitioning.PartitionSpec('data',)
 		)
 	def predict_tokens(self, batch, seed=0):
@@ -252,16 +252,16 @@ class InferenceModel(object):
 	def preprocess(self, ds):
 		pp_chain = [
 				functools.partial(
-                    t5.data.preprocessors.split_tokens_to_inputs_length,
-                    sequence_length=self.sequence_length,
-                    output_features=self.output_features,
-                    feature_key='inputs',
-                    additional_feature_keys=['input_times']),
 				# 在训练期间进行缓存。
 				preprocessors.add_dummy_targets,
 				functools.partial(
-                    preprocessors.compute_spectrograms,
-                    spectrogram_config=self.spectrogram_config)
 		]
 		for pp in pp_chain:
 			ds = pp(ds)
@@ -273,10 +273,10 @@ class InferenceModel(object):
 		# 向下取整到最接近的符号化时间步。
 		start_time -= start_time % (1 / self.codec.steps_per_second)
 		return {
-            'est_tokens': tokens,
-            'start_time': start_time,
-            # 内部 MT3 代码期望原始输入，这里不使用。
-            'raw_inputs': []
 		}
 	@staticmethod
@@ -308,11 +308,11 @@ article = "<p style='text-align: center'>出错了？试试把文件转换为MP3
 examples=[['canon.flac'], ['download.wav']]
 gr.Interface(
-    inference,
-    gr.Audio(type="filepath", label="输入"),
-    outputs=gr.File(label="输出"),
-    title=title,
-    description=description,
-    article=article,
-    examples=examples
 ).launch(server_port=7861)

 from pathlib import Path
 def current_time():
+	current = datetime.datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y年-%m月-%d日 %H时:%M分:%S秒")
+	return current
 print(f"[{current_time()}] 开始部署空间...")
 SF2_PATH = 'SGM-v2.01-Sal-Guit-Bass-V1.3.sf2'
 def upload_audio(audio, sample_rate):
+	return note_seq.audio_io.wav_data_to_samples_librosa(
+		audio, sample_rate=sample_rate)
 print(f"[{current_time()}] 日志：开始包装模型...")
 class InferenceModel(object):
+	"""音乐转录的 T5X 模型包装器。"""
+	def __init__(self, checkpoint_path, model_type='mt3'):
+		if model_type == 'ismir2021':
+			num_velocity_bins = 127
+			self.encoding_spec = note_sequences.NoteEncodingSpec
+			self.inputs_length = 512
+		elif model_type == 'mt3':
+			num_velocity_bins = 1
+			self.encoding_spec = note_sequences.NoteEncodingWithTiesSpec
+			self.inputs_length = 256
+		else:
+			raise ValueError('unknown model_type: %s' % model_type)
+		gin_files = ['/home/user/app/mt3/gin/model.gin',
+					'/home/user/app/mt3/gin/mt3.gin']
+		self.batch_size = 8
+		self.outputs_length = 1024
+		self.sequence_length = {'inputs': self.inputs_length,
+								'targets': self.outputs_length}
+		self.partitioner = t5x.partitioning.PjitPartitioner(
+				model_parallel_submesh=None, num_partitions=1)
+		print(f"[{current_time()}] 日志：构建编解码器")
+		self.spectrogram_config = spectrograms.SpectrogramConfig()
+		self.codec = vocabularies.build_codec(
+				vocab_config=vocabularies.VocabularyConfig(
+				num_velocity_bins=num_velocity_bins)
+				)
+		self.vocabulary = vocabularies.vocabulary_from_codec(self.codec)
+		self.output_features = {
+				'inputs': seqio.ContinuousFeature(dtype=tf.float32, rank=2),
+				'targets': seqio.Feature(vocabulary=self.vocabulary),
+		}
+		print(f"[{current_time()}] 日志：创建 T5X 模型")
+		self._parse_gin(gin_files)
+		self.model = self._load_model()
+		print(f"[{current_time()}] 日志：恢复模型检查点")
+		self.restore_from_checkpoint(checkpoint_path)
+	@property
+	def input_shapes(self):
 		return {
 			'encoder_input_tokens': (self.batch_size, self.inputs_length),
 			'decoder_input_tokens': (self.batch_size, self.outputs_length)
 		"""解析用于训练模型的 gin 文件。"""
 		print(f"[{current_time()}] 日志：解析 gin 文件")
 		gin_bindings = [
+			'from __gin__ import dynamic_registration',
+			'from mt3 import vocabularies',
+			'VOCAB_CONFIG=@vocabularies.VocabularyConfig()',
+			'vocabularies.VocabularyConfig.num_velocity_bins=%NUM_VELOCITY_BINS'
 		]
 		with gin.unlock_config():
 			gin.parse_config_files_and_bindings(gin_files, gin_bindings, finalize_config=False)
 		model_config = gin.get_configurable(network.T5Config)()
 		module = network.Transformer(config=model_config)
 		return models.ContinuousInputsEncoderDecoderModel(
+			module=module,
+			input_vocabulary=self.output_features['inputs'].vocabulary,
+			output_vocabulary=self.output_features['targets'].vocabulary,
+			optimizer_def=t5x.adafactor.Adafactor(decay_rate=0.8, step_offset=0),
+			input_depth=spectrograms.input_depth(self.spectrogram_config))
 	def restore_from_checkpoint(self, checkpoint_path):
 			partitioner=self.partitioner)
 		restore_checkpoint_cfg = t5x.utils.RestoreCheckpointConfig(
+			path=checkpoint_path, mode='specific', dtype='float32')
 		train_state_axes = train_state_initializer.train_state_axes
 		self._predict_fn = self._get_predict_fn(train_state_axes)
 		self._train_state = train_state_initializer.from_checkpoint_or_scratch(
+			[restore_checkpoint_cfg], init_rng=jax.random.PRNGKey(0))
 	@functools.lru_cache()
 	def _get_predict_fn(self, train_state_axes):
 		def partial_predict_fn(params, batch, decode_rng):
 			return self.model.predict_batch_with_aux(params, batch, decoder_params={'decode_rng': None})
 		return self.partitioner.partition(
+			partial_predict_fn,
+			in_axis_resources=(
+					train_state_axes.params,
+					t5x.partitioning.PartitionSpec('data',), None),
+			out_axis_resources=t5x.partitioning.PartitionSpec('data',)
 		)
 	def predict_tokens(self, batch, seed=0):
 	def preprocess(self, ds):
 		pp_chain = [
 				functools.partial(
+					t5.data.preprocessors.split_tokens_to_inputs_length,
+					sequence_length=self.sequence_length,
+					output_features=self.output_features,
+					feature_key='inputs',
+					additional_feature_keys=['input_times']),
 				# 在训练期间进行缓存。
 				preprocessors.add_dummy_targets,
 				functools.partial(
+					preprocessors.compute_spectrograms,
+					spectrogram_config=self.spectrogram_config)
 		]
 		for pp in pp_chain:
 			ds = pp(ds)
 		# 向下取整到最接近的符号化时间步。
 		start_time -= start_time % (1 / self.codec.steps_per_second)
 		return {
+			'est_tokens': tokens,
+			'start_time': start_time,
+			# 内部 MT3 代码期望原始输入，这里不使用。
+			'raw_inputs': []
 		}
 	@staticmethod
 examples=[['canon.flac'], ['download.wav']]
 gr.Interface(
+	inference,
+	gr.Audio(type="filepath", label="输入"),
+	outputs=gr.File(label="输出"),
+	title=title,
+	description=description,
+	article=article,
+	examples=examples
 ).launch(server_port=7861)