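"""Lightweight HTTP backend for an open-source-metrics dashboard.

Serves static files plus four JSON endpoints (/initialize, /retrievePipInstalls,
/retrieveStars, /retrieveIssues) backed by the open-source-metrics datasets on
the Hugging Face Hub.
"""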
import collections
import json
import os
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import parse_qs, urlparse

import numpy as np
from datasets import DatasetDict, load_dataset
from huggingface_hub import login

# Authenticate against the Hugging Face Hub. HF_TOKEN must be set in the
# environment (typically a Space secret). `login` persists the token, replacing
# the removed `set_access_token` / `HfFolder.save_token` pair.
HF_TOKEN = os.environ['HF_TOKEN']
login(token=HF_TOKEN)
# Load the three metrics datasets; each is a DatasetDict with one split per
# tracked library. Sorting up front keeps every later time series in order.
datasets = {
    "stars": load_dataset("open-source-metrics/stars").sort('dates'),
    "issues": load_dataset("open-source-metrics/issues").sort('dates'),
    "pip": load_dataset("open-source-metrics/pip").sort('day'),
}
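# Column layout, as inferred from the handlers below: 'stars' and 'issues' rows
# carry a 'dates' timestamp; 'pip' rows carry 'day' and 'num_downloads'.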
def _range(example, index):
    # Attach a per-split running index: row i of a date-sorted split means
    # "i + 1 events so far", which is what the frontend plots for stars/issues.
    example['range'] = index
    return example


# `with_indices=True` avoids mutable global state, which would break the
# fingerprint-based caching that `datasets` applies to .map() results.
datasets['stars'] = datasets['stars'].map(_range, with_indices=True)
datasets['issues'] = datasets['issues'].map(_range, with_indices=True)
# Optional global downsampling, kept for reference but currently disabled:
# datasets = {
#     k1: DatasetDict({
#         k2: v2.select(range(0, len(v2), max(1, int(len(v2) / 1000)))) for k2, v2 in v1.items()
#     }) for k1, v1 in datasets.items()
# }
def link_values(library_names, returned_values):
    """Forward-fill gaps: when a library has no entry for a given date, reuse
    its most recent known value. Dates must be iterated in ascending order."""
    previous_values = {library_name: None for library_name in library_names}
    for library_name in library_names:
        for date in returned_values.keys():
            if library_name not in returned_values[date]:
                returned_values[date][library_name] = previous_values[library_name]
            else:
                previous_values[library_name] = returned_values[date][library_name]
    return returned_values
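# For example (illustrative values): with library_names=['a'] and input
# {1: {'a': 5}, 2: {}, 3: {'a': 7}}, the gap at key 2 is forward-filled,
# yielding {1: {'a': 5}, 2: {'a': 5}, 3: {'a': 7}}.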
def running_mean(x, N, total_length=-1):
    """Rolling mean of `x` over a window of size `N`, left-padded with zeros up
    to `total_length`. Currently unused; kept for smoothing noisy series."""
    cumsum = np.cumsum(np.insert(x, 0, 0))
    to_pad = max(total_length - len(cumsum), 0)
    return np.pad(cumsum[N:] - cumsum[:-N], (to_pad, 0)) / float(N)
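# Sanity check (hand-computed): running_mean([1, 2, 3, 4], 2) -> [1.5, 2.5, 3.5];
# with total_length=6, one zero is prepended: [0.0, 1.5, 2.5, 3.5].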
class RequestHandler(SimpleHTTPRequestHandler):
    def do_GET(self):
        print(self.path)

        if self.path == "/":
            self.path = "index.html"
            return SimpleHTTPRequestHandler.do_GET(self)

        if self.path.startswith("/initialize"):
            # Report the union of tracked libraries, taken from whichever
            # dataset covers the most splits.
            dataset_keys = {k: set(v.keys()) for k, v in datasets.items()}
            dataset_with_most_splits = max(dataset_keys.values(), key=len)

            warnings = []
            for k, v in dataset_keys.items():
                if len(v) < len(dataset_with_most_splits):
                    warnings.append(
                        f"The {k} dataset does not contain all splits. Missing: {dataset_with_most_splits - v}"
                    )

            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            # TODO: Send and display warnings
            self.wfile.write(json.dumps(sorted(dataset_with_most_splits)).encode("utf-8"))
            return
if self.path.startswith("/retrievePipInstalls"): | |
url = urlparse(self.path) | |
query = parse_qs(url.query) | |
library_names = query.get("input", None)[0] | |
library_names = library_names.split(',') | |
if 'Cumulated' in library_names: | |
dataset_keys = {k: set(v.keys()) for k, v in datasets.items()} | |
dataset_with_most_splits = max([d for d in dataset_keys.values()], key=len) | |
library_names = list(dataset_with_most_splits) | |
returned_values = {} | |
for library_name in library_names: | |
for i in datasets['pip'][library_name]: | |
if i['day'] in returned_values: | |
returned_values[i['day']]['Cumulated'] += i['num_downloads'] | |
else: | |
returned_values[i['day']] = {'Cumulated': i['num_downloads']} | |
library_names = ['Cumulated'] | |
else: | |
returned_values = {} | |
for library_name in library_names: | |
for i in datasets['pip'][library_name]: | |
if i['day'] in returned_values: | |
returned_values[i['day']][library_name] = i['num_downloads'] | |
else: | |
returned_values[i['day']] = {library_name: i['num_downloads']} | |
for library_name in library_names: | |
for i in returned_values.keys(): | |
if library_name not in returned_values[i]: | |
returned_values[i][library_name] = None | |
returned_values = collections.OrderedDict(sorted(returned_values.items())) | |
output = {l: [k[l] for k in returned_values.values()] for l in library_names} | |
output['day'] = list(returned_values.keys()) | |
self.send_response(200) | |
self.send_header("Content-Type", "application/json") | |
self.end_headers() | |
self.wfile.write(json.dumps(output).encode("utf-8")) | |
return SimpleHTTPRequestHandler | |
if self.path.startswith("/retrieveStars"): | |
url = urlparse(self.path) | |
query = parse_qs(url.query) | |
library_names = query.get("input", None)[0] | |
library_names = library_names.split(',') | |
returned_values = {} | |
dataset_dict = datasets['stars'] | |
for library_name in library_names: | |
dataset = dataset_dict[library_name] | |
for i in dataset: | |
if i['dates'] in returned_values: | |
returned_values[i['dates']][library_name] = i['range'] | |
else: | |
returned_values[i['dates']] = {library_name: i['range']} | |
returned_values = collections.OrderedDict(sorted(returned_values.items())) | |
returned_values = link_values(library_names, returned_values) | |
output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names} | |
output['day'] = list(returned_values.keys())[::-1] | |
# Trim down to a smaller number of points. | |
output = {k: [v for i, v in enumerate(value) if i % int(len(value) / 100) == 0] for k, value in output.items()} | |
self.send_response(200) | |
self.send_header("Content-Type", "application/json") | |
self.end_headers() | |
self.wfile.write(json.dumps(output).encode("utf-8")) | |
return SimpleHTTPRequestHandler | |
if self.path.startswith("/retrieveIssues"): | |
url = urlparse(self.path) | |
query = parse_qs(url.query) | |
library_names = query.get("input", None)[0] | |
library_names = library_names.split(',') | |
returned_values = {} | |
dataset_dict = datasets['issues'] | |
for library_name in library_names: | |
dataset = dataset_dict[library_name] | |
for i in dataset: | |
if i['dates'] in returned_values: | |
returned_values[i['dates']][library_name] = i['range'] | |
else: | |
returned_values[i['dates']] = {library_name: i['range']} | |
returned_values = collections.OrderedDict(sorted(returned_values.items())) | |
returned_values = link_values(library_names, returned_values) | |
output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names} | |
output['day'] = list(returned_values.keys())[::-1] | |
# Trim down to a smaller number of points. | |
output = {k: [v for i, v in enumerate(value) if i % int(len(value) / 100) == 0] for k, value in output.items()} | |
self.send_response(200) | |
self.send_header("Content-Type", "application/json") | |
self.end_headers() | |
self.wfile.write(json.dumps(output).encode("utf-8")) | |
return SimpleHTTPRequestHandler | |
        # Fall through to static file serving for everything else.
        return SimpleHTTPRequestHandler.do_GET(self)


server = ThreadingHTTPServer(("", 7860), RequestHandler)
print("Running on port 7860")
server.serve_forever()
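# Illustrative local usage (split names here are hypothetical; /initialize
# returns the real ones):
#   curl 'http://localhost:7860/initialize'
#   curl 'http://localhost:7860/retrieveStars?input=transformers,datasets'
#   curl 'http://localhost:7860/retrievePipInstalls?input=Cumulated'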