### A simpler rendition of reconstructions.
### Lists only xorbs and a "dedupe factor" for each xorb,
### where dedupe-factor 1 == no dedupe,
###       2 == 1 chunk shared,
###       3 == 2 chunks shared,
### etc.
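### Illustrative output record (shape only; the values below are invented,
### not from a real run):
###   {"xorb_id": "<hash>", "dedupe_factor": 2.0, "repo": "models/org/name"}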
import json
import sys

import list_reconstructions
import list_repos


def list_xorbs(repos):
    # first build up a mapping of {xorb_id: [(start, end, repo), ...]}
    xorbs = {}
    reconstructions = list_reconstructions.list_reconstructions(repos)
    for term in reconstructions:
        if term["xorb_id"] not in xorbs:
            xorbs[term["xorb_id"]] = []
        path_parts = term["file_path"].split("/")
        if path_parts[0] != "datasets" and path_parts[0] != "spaces":
            # model file paths omit the "models" prefix (e.g. "org/name/file.bin"),
            # so add it back to get a uniform "<type>/<org>/<name>" repo id
            path_parts.insert(0, "models")
        repo = "/".join(path_parts[:3])
        xorbs[term["xorb_id"]].append((term["start"], term["end"], repo))
    # then walk the lists and compute each xorb's dedupe factor
    output = []
    for xorb_id, chunks in xorbs.items():
        min_chunk_idx = float("inf")
        max_chunk_idx = float("-inf")
        xorb_repos = set()
        dedupe_factor = 0
        for chunk in chunks:
            min_chunk_idx = min(min_chunk_idx, chunk[0])
            max_chunk_idx = max(max_chunk_idx, chunk[1])
            xorb_repos.add(chunk[2])
        # count how many (start, end) ranges reference each chunk index,
        # then average over the xorb's full chunk count
        for i in range(min_chunk_idx, max_chunk_idx):
            ref_count = 0
            for chunk in chunks:
                if chunk[0] <= i < chunk[1]:
                    ref_count += 1
            dedupe_factor += ref_count
        if max_chunk_idx != 0:
            dedupe_factor /= float(max_chunk_idx)
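        # worked example (hypothetical numbers): chunks
        # [(0, 4, "models/a/b"), (0, 4, "models/c/d")] give ref_count == 2
        # at every index 0..3, so dedupe_factor == 8 / 4 == 2.0, i.e. each
        # chunk in the xorb is referenced twice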
        # emit one record per referencing repo so results can be grouped
        # by repo downstream
        for repo in xorb_repos:
            output.append({
                "xorb_id": xorb_id,
                "dedupe_factor": dedupe_factor,
                "repo": repo,
            })
    return output


if __name__ == "__main__":
    json.dump(list_xorbs(list_repos.list_repos()), sys.stdout,
              sort_keys=True, indent=4)
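
# Usage sketch (assumes list_repos.py and list_reconstructions.py sit beside
# this script; the file name "list_xorbs.py" here is an assumption):
#   python list_xorbs.py > xorb_dedupe.json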