David Thomas
committed on
Commit
·
8ee452e
1
Parent(s):
386449f
ui tweaks
Browse files- app.py +102 -9
- extract_feature_print.py +0 -0
- main.py +6 -0
- utils.py +1 -0
app.py
CHANGED
@@ -221,6 +221,7 @@ from vc_infer_pipeline import VC
|
|
221 |
from config import Config
|
222 |
|
223 |
config = Config()
|
|
|
224 |
# from trainset_preprocess_pipeline import PreProcess
|
225 |
logging.getLogger("numba").setLevel(logging.WARNING)
|
226 |
|
@@ -248,6 +249,91 @@ index_paths = ["./logs/joel/added_IVF479_Flat_nprobe_1.index","./logs/jenny/adde
|
|
248 |
file_index=None
|
249 |
|
250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
|
252 |
def vc_single(
|
253 |
sid,
|
@@ -1556,9 +1642,8 @@ with gr.Blocks(theme=gr.themes.Base(), title='RVC RULE1 v1') as app:
|
|
1556 |
# face.upload(fn=success_message,inputs=[face], outputs=[preview, faces])
|
1557 |
with gr.Row():
|
1558 |
animation = gr.Video(type='filepath')
|
1559 |
-
refresh_button2.click(fn=change_choices2, inputs=[], outputs=[input_audio0, animation])
|
1560 |
-
|
1561 |
-
# animate_button = gr.Button('Animate')
|
1562 |
|
1563 |
with gr.Column():
|
1564 |
with gr.Accordion("Index Settings", open=False):
|
@@ -1584,12 +1669,20 @@ with gr.Blocks(theme=gr.themes.Base(), title='RVC RULE1 v1') as app:
|
|
1584 |
value=0.66,
|
1585 |
interactive=True,
|
1586 |
)
|
1587 |
-
|
1588 |
-
|
1589 |
-
|
1590 |
-
|
1591 |
-
|
1592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1593 |
with gr.Accordion("Advanced Settings", open=False):
|
1594 |
f0method0 = gr.Radio(
|
1595 |
label="Optional: Change the Pitch Extraction Algorithm.\nExtraction methods are sorted from 'worst quality' to 'best quality'.\nmangio-crepe may or may not be better than rmvpe in cases where 'smoothness' is more important, but rmvpe is the best overall.",
|
|
|
221 |
from config import Config
|
222 |
|
223 |
config = Config()
|
224 |
+
cpt=None
|
225 |
# from trainset_preprocess_pipeline import PreProcess
|
226 |
logging.getLogger("numba").setLevel(logging.WARNING)
|
227 |
|
|
|
249 |
file_index=None
|
250 |
|
251 |
|
252 |
+
# Define a function to calculate a "similarity score" to identify potential copyright infringement
def calculate_similarity_score(
    audio0,
    index_file,
    sid0,
    version="v1",
    #protect=0.3,
    index_rate=0.67,
    #pitch
): # ,file_index,file_big_npy
    """Compare input audio against a voice model's feature set and report a similarity score.

    Extracts HuBERT features from the audio at ``audio0``, searches them
    against the FAISS index built for the selected voice model, and returns
    a textual score. A search distance below the median self-distance of the
    model's own stored features is flagged as potential unauthorized use.

    Parameters:
        audio0: path to the audio file to analyse.
        index_file: path to the FAISS index for the voice model, or None.
        sid0: voice model filename ("joel.pth" or "jenny.pth"); any other
            value yields no stored-feature file and the function returns None.
        version: selects the HuBERT output layer — 9 for "v1", 12 otherwise.
        index_rate: kept for interface compatibility; 0 skips the search.

    Returns:
        A string "Score <distance>, <distances>" on success, or None when the
        audio cannot be loaded or no comparison could be made.
    """
    # Map the selected voice model to its stored feature matrix.
    # BUG FIX: big_npy was previously unbound for unknown sid0 values and
    # raised UnboundLocalError further down; default it to None instead.
    big_npy = None
    if sid0 == "joel.pth":
        big_npy = "./logs/joel/total_fea.npy"
    elif sid0 == "jenny.pth":
        big_npy = "./logs/jenny/total_fea.npy"

    try:
        audio = load_audio(audio0, 16000, DoFormant, Quefrency, Timbre)
        logging.log(logging.INFO, "audio loaded")
        # Normalise so the peak amplitude is at most ~0.95 of full scale.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
    except TypeError as e:
        print(e)
        return None

    feats = torch.from_numpy(audio)
    # Lazily load the global HuBERT model on first use.
    if hubert_model is None:
        load_hubert()
    model = hubert_model
    # Match the feature dtype to the model precision.
    if config.is_half:
        feats = feats.half()
    else:
        feats = feats.float()
    if feats.dim() == 2:  # double channels: average to mono
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).to(config.device).fill_(False)
    inputs = {
        "source": feats.to(config.device),
        "padding_mask": padding_mask,
        "output_layer": 9 if version == "v1" else 12,
    }
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
    #if protect < 0.5 and pitch != None and pitchf != None:
    #    feats0 = feats.clone()

    if index_file is not None and big_npy is not None and index_rate != 0:
        # FAISS requires float32 input; the half-precision path produces
        # float16 features, so cast once (no-op when already float32).
        # (Replaces the original's redundant double astype("float32").)
        extracted_feats = feats[0].cpu().numpy().astype("float32")

        # Convert the big_npy file to a numpy array and match the type
        # to the extracted features.
        big_npy = np.load(big_npy)
        if config.is_half:
            big_npy = big_npy.astype("float32")

        # Use the extracted features and the big_npy file to estimate whether
        # the audio vocalist is the same as the one in the big_npy file: the
        # median nearest-neighbour self-distance of the stored features acts
        # as the detection threshold.
        index = faiss.read_index(index_file)
        D, _ = index.search(big_npy, k=1)  # nearest match for each stored feature
        distances = np.sqrt(D[:, 0])  # use L2 distance
        threshold = np.percentile(distances, 50)  # set threshold to exclude outliers
        score, _ = index.search(extracted_feats, k=1)
        if score[0][0] < threshold:
            print("Potential unauthorized use detected!")

        return f"Score {score[0][0]}, {distances}"
    # No index / stored features available (unknown voice or index_rate == 0).
    return None
|
336 |
+
|
337 |
|
338 |
def vc_single(
|
339 |
sid,
|
|
|
1642 |
# face.upload(fn=success_message,inputs=[face], outputs=[preview, faces])
|
1643 |
with gr.Row():
|
1644 |
animation = gr.Video(type='filepath')
|
1645 |
+
refresh_button2.click(fn=change_choices2, inputs=[], outputs=[input_audio0, animation]) # with gr.Row():
|
1646 |
+
animate_button = gr.Button('Animate')
|
|
|
1647 |
|
1648 |
with gr.Column():
|
1649 |
with gr.Accordion("Index Settings", open=False):
|
|
|
1669 |
value=0.66,
|
1670 |
interactive=True,
|
1671 |
)
|
1672 |
+
with gr.Row():
|
1673 |
+
vc_output2 = gr.Audio(
|
1674 |
+
label="Output Audio (Click on the Three Dots in the Right Corner to Download)",
|
1675 |
+
type='filepath',
|
1676 |
+
interactive=False,
|
1677 |
+
)
|
1678 |
+
with gr.Row():
|
1679 |
+
# Create a new button to calculate the similarity score
|
1680 |
+
similarity_button = gr.Button("Calculate Similarity Score", variant="primary")
|
1681 |
+
with gr.Row():
|
1682 |
+
similarity_score = gr.Textbox(label="Similarity Score", type="text", interactive=False)
|
1683 |
+
similarity_button.click(fn=calculate_similarity_score, inputs=[input_audio0, file_index1, sid0], outputs=[similarity_score])
|
1684 |
+
print(file_index1)
|
1685 |
+
#animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
|
1686 |
with gr.Accordion("Advanced Settings", open=False):
|
1687 |
f0method0 = gr.Radio(
|
1688 |
label="Optional: Change the Pitch Extraction Algorithm.\nExtraction methods are sorted from 'worst quality' to 'best quality'.\nmangio-crepe may or may not be better than rmvpe in cases where 'smoothness' is more important, but rmvpe is the best overall.",
|
extract_feature_print.py
ADDED
File without changes
|
main.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" Main file to run the application. """
|
2 |
+
import uvicorn
|
3 |
+
from app.app import app
|
4 |
+
|
5 |
+
if __name__ == "__main__":
|
6 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
utils.py
CHANGED
@@ -150,3 +150,4 @@ def load_audio(file, sr, DoFormant, Quefrency, Timbre):
|
|
150 |
converted = False
|
151 |
|
152 |
return np.frombuffer(out, np.float32).flatten()
|
|
|
|
150 |
converted = False
|
151 |
|
152 |
return np.frombuffer(out, np.float32).flatten()
|
153 |
+
|