# xet-repo-data-collection / list_xorbs.py
### A simpler rendition of reconstructions.
### Lists only xorbs and a "dedupe factor" for each xorb: the average number
### of references per chunk, where
###   dedupe-factor 1 == no dedupe (each chunk referenced once),
###   2 == each chunk shared by one extra reference on average,
###   3 == each chunk shared by two extra references on average,
###   etc.
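### Worked example (hypothetical numbers): a xorb with 100 chunks, each
### referenced by exactly two files, gets dedupe factor (100 * 2) / 100 == 2.0;
### if every chunk is referenced by only one file, the factor is 1.0.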
import json
import sys
import list_reconstructions
import list_repos


def list_xorbs(repos):
    # first build up a mapping of {xorb_id: [(start, end, repo), ...]}
    xorbs = {}
    reconstructions = list_reconstructions.list_reconstructions(repos)
    for term in reconstructions:
        if term["xorb_id"] not in xorbs:
            xorbs[term["xorb_id"]] = []
        path_parts = term["file_path"].split("/")
        if path_parts[0] != "datasets" and \
           path_parts[0] != "spaces":
            # models omit the "models" part from the file path
            path_parts.insert(0, "models")
        repo = "/".join(path_parts[:3])
        xorbs[term["xorb_id"]].append((term["start"], term["end"], repo))
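    # A hypothetical entry at this point (xorb id, chunk ranges, and repo names
    # are made up for illustration):
    #   {"abc123": [(0, 12, "models/org/model-a"), (0, 12, "datasets/org/dataset-b")]}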

    # then walk the lists and compute the dedupe factor for each xorb
    output = []
    for xorb_id, chunks in xorbs.items():
        min_chunk_idx = float("inf")
        max_chunk_idx = float("-inf")
        xorb_repos = set()
        dedupe_factor = 0
        for chunk in chunks:
            min_chunk_idx = min(min_chunk_idx, chunk[0])
            max_chunk_idx = max(max_chunk_idx, chunk[1])
            xorb_repos.add(chunk[2])
        xorb_repos = list(xorb_repos)
        # count how many terms reference each chunk index in the xorb...
        for i in range(min_chunk_idx, max_chunk_idx):
            ref_count = 0
            for chunk in chunks:
                if chunk[0] <= i < chunk[1]:
                    ref_count += 1
            dedupe_factor += ref_count
        # ...then normalize by the chunk count to get the average number of
        # references per chunk
        if max_chunk_idx != 0:
            dedupe_factor /= float(max_chunk_idx)
        # emit one record per (xorb, repo) pair
        for repo in xorb_repos:
            output.append({
                "xorb_id": xorb_id,
                "dedupe_factor": dedupe_factor,
                "repo": repo
            })
    return output


if __name__ == "__main__":
    json.dump(list_xorbs(list_repos.list_repos()), sys.stdout, sort_keys=True, indent=4)
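
### Usage sketch (assumes list_repos.py and list_reconstructions.py sit next to
### this script and expose list_repos() / list_reconstructions(), as imported
### above):
###   python list_xorbs.py > xorbs.json
### Each output record looks roughly like this (values are illustrative):
###   {"dedupe_factor": 1.37, "repo": "models/org/name", "xorb_id": "..."}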