# xet-repo-data-collection / list_xorbs.py
### A simpler rendition of reconstructions.
### Lists only xorbs and a "dedupe factor" for each xorb: the average number
### of references per chunk, where
###   dedupe-factor 1 == no dedupe (each chunk referenced once),
###   2 == each chunk shared by one extra reference on average,
###   3 == each chunk shared by two extra references on average,
###   etc.
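### Worked example (hypothetical numbers): a xorb with 100 chunks, each
### referenced by exactly two files, gets dedupe factor (100 * 2) / 100 == 2.0;
### if every chunk is referenced by only one file, the factor is 1.0.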
import json
import sys
import list_reconstructions
import list_repos


def list_xorbs(repos):
    # first build up a mapping of {xorb_id: [(start, end, repo), ...]}
    xorbs = {}
    reconstructions = list_reconstructions.list_reconstructions(repos)
    for term in reconstructions:
        if term["xorb_id"] not in xorbs:
            xorbs[term["xorb_id"]] = []
        path_parts = term["file_path"].split("/")
        if path_parts[0] != "datasets" and \
           path_parts[0] != "spaces":
            # models omit the "models" part from the file path
            path_parts.insert(0, "models")
        repo = "/".join(path_parts[:3])
        xorbs[term["xorb_id"]].append((term["start"], term["end"], repo))
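    # A hypothetical entry at this point (xorb id, chunk ranges, and repo names
    # are made up for illustration):
    #   {"abc123": [(0, 12, "models/org/model-a"), (0, 12, "datasets/org/dataset-b")]}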

    # then walk the lists and compute the dedupe factor for each xorb
    output = []
    for xorb_id, chunks in xorbs.items():
        min_chunk_idx = float("inf")
        max_chunk_idx = float("-inf")
        xorb_repos = set()
        dedupe_factor = 0
        for chunk in chunks:
            min_chunk_idx = min(min_chunk_idx, chunk[0])
            max_chunk_idx = max(max_chunk_idx, chunk[1])
            xorb_repos.add(chunk[2])
        xorb_repos = list(xorb_repos)
        # count how many terms reference each chunk index in the xorb...
        for i in range(min_chunk_idx, max_chunk_idx):
            ref_count = 0
            for chunk in chunks:
                if chunk[0] <= i < chunk[1]:
                    ref_count += 1
            dedupe_factor += ref_count
        # ...then normalize by the chunk count to get the average number of
        # references per chunk
        if max_chunk_idx != 0:
            dedupe_factor /= float(max_chunk_idx)
        # emit one record per (xorb, repo) pair
        for repo in xorb_repos:
            output.append({
                "xorb_id": xorb_id,
                "dedupe_factor": dedupe_factor,
                "repo": repo
            })
    return output


if __name__ == "__main__":
    json.dump(list_xorbs(list_repos.list_repos()), sys.stdout, sort_keys=True, indent=4)
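
### Usage sketch (assumes list_repos.py and list_reconstructions.py sit next to
### this script and expose list_repos() / list_reconstructions(), as imported
### above):
###   python list_xorbs.py > xorbs.json
### Each output record looks roughly like this (values are illustrative):
###   {"dedupe_factor": 1.37, "repo": "models/org/name", "xorb_id": "..."}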