File size: 3,978 Bytes
2d8da09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re
from pathlib import Path
import pandas as pd
# Command-line interface of the script: a single option that points at the
# directory produced by the segmentation step.
parser = argparse.ArgumentParser(
    description="Compare alignment segments generated with different window sizes",
)
# NOTE(review): because the option is required, argparse never falls back to
# the "output" default below - confirm which of the two is intended.
parser.add_argument(
    "--base_dir",
    required=True,
    type=str,
    default="output",
    help=(
        "Path to directory with 'logs' and 'segments' folders "
        "generated during the segmentation step"
    ),
)
def readlines(file):
    """Return every line of ``file`` as a list, line endings included."""
    with open(file, "r") as f:
        return f.readlines()


def segment_duration_min(line):
    """Return the duration, in minutes, encoded at the start of a segment line.

    The text before the first ``|`` is split on whitespace; its first two
    tokens are parsed as floats (start and end, presumably in seconds) and
    their difference is divided by 60.

    Raises:
        IndexError: if fewer than two tokens precede the first ``|``.
        ValueError: if either token is not a valid float.
    """
    info = line.split("|")[0].split()
    return (float(info[1]) - float(info[0])) / 60


def group_alignment_files(segments_dir):
    """Group ``*_segments.txt`` files in ``segments_dir`` by their base name.

    The base name is the file name with a leading ``<digits>_`` prefix removed,
    so files that differ only in that prefix end up in the same group.

    Returns:
        dict mapping base name -> list of ``pathlib.Path`` objects.
    """
    groups = {}
    # Sorted so that the reference file of each group and the order of the
    # summary rows do not depend on directory listing order.
    for file in sorted(Path(segments_dir).glob("*_segments.txt")):
        base_name = re.sub(r"^\d+_", "", file.name)
        groups.setdefault(base_name, []).append(file)
    return groups


def verify_part(alignment_files, output_file):
    """Write the lines shared by all ``alignment_files`` to ``output_file``.

    The first file is the reference. A reference line is kept only when every
    other file has the identical line at the same position; a file that is too
    short to have that position counts as a mismatch.

    The first line contributes no duration (presumably it is a header rather
    than a segment - TODO confirm against the segmentation output format), but
    it is still counted in the segment totals, as before.

    Returns:
        dict with the keys "Original number of segments", "Verified segments",
        "Original Duration, min" and "Verified Duration, min".
    """
    all_alignments = [readlines(path) for path in alignment_files]
    reference, others = all_alignments[0], all_alignments[1:]

    part_stats = {
        "Original number of segments": len(reference),
        "Verified segments": 0,
        "Original Duration, min": 0,
        "Verified Duration, min": 0,
    }
    with open(output_file, "w") as f:
        for i, line in enumerate(reference):
            duration = 0 if i == 0 else segment_duration_min(line)
            part_stats["Original Duration, min"] += duration
            # The length check prevents an IndexError on shorter files.
            if all(i < len(other) and other[i] == line for other in others):
                f.write(line)
                part_stats["Verified segments"] += 1
                part_stats["Verified Duration, min"] += duration
    return part_stats


def build_summary(stats):
    """Turn the per-part statistics into the summary table, plus a "Total" row.

    Derived columns are computed from the unrounded durations and rounded to
    whole minutes / whole percent afterwards. The percentage of dropped
    duration is relative to the original duration; it is reported as 0 when
    the original duration is 0.
    """
    table = pd.DataFrame.from_dict(stats, orient="index").reset_index()

    summed_columns = [
        "Original number of segments",
        "Verified segments",
        "Original Duration, min",
        "Verified Duration, min",
    ]
    total_row = pd.DataFrame(
        {"index": ["Total"], **{column: [table[column].sum()] for column in summed_columns}},
        index=["Total"],
    )
    # The total is appended before the derived columns are computed so that its
    # percentage and misalignment flag are derived from the summed values.
    table = pd.concat([table, total_row])

    original = table["Original Duration, min"]
    verified = table["Verified Duration, min"]
    dropped = original - verified

    table["Number dropped"] = table["Original number of segments"] - table["Verified segments"]
    table["Duration of dropped, min"] = dropped.round()
    table["% dropped, min"] = (dropped / original * 100).where(original > 0, 0.0).round()
    table["Misalignment present"] = table["Number dropped"] > 0
    table["Original Duration, min"] = original.round()
    table["Verified Duration, min"] = verified.round()
    return table


def verify_segments(base_dir):
    """Compare the alignments of every part and write the verified segments.

    Reads ``<base_dir>/segments/*_segments.txt``, writes the lines that agree
    across all files of a part to ``<base_dir>/verified_segments/<part>``,
    saves the summary to ``<base_dir>/alignment_summary.csv`` and prints it.

    Args:
        base_dir: directory that contains the ``segments`` folder.

    Returns:
        The summary ``pandas.DataFrame`` that was written to the CSV file.

    Raises:
        ValueError: if the ``segments`` folder is missing or holds no
            ``*_segments.txt`` files.
    """
    segments_dir = os.path.join(base_dir, "segments")
    if not os.path.exists(segments_dir):
        raise ValueError(f"'segments' directory was not found at {base_dir}.")

    all_alignment_files = group_alignment_files(segments_dir)
    if not all_alignment_files:
        raise ValueError(f"No '*_segments.txt' files were found at {segments_dir}.")

    verified_dir = os.path.join(base_dir, "verified_segments")
    os.makedirs(verified_dir, exist_ok=True)

    stats = {
        part: verify_part(alignment_files, os.path.join(verified_dir, part))
        for part, alignment_files in all_alignment_files.items()
    }

    summary = build_summary(stats)
    stats_file = os.path.join(base_dir, "alignment_summary.csv")
    summary.to_csv(stats_file, index=False)
    print(summary)
    print(f"Alignment summary saved to {stats_file}")
    return summary


if __name__ == "__main__":
    args = parser.parse_args()
    verify_segments(args.base_dir)
|