import collections
import json
import os
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import parse_qs, urlparse

import numpy as np
from datasets import DatasetDict, load_dataset
from huggingface_hub import HfFolder, set_access_token
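
# Authenticate against the Hugging Face Hub with the token provided through the
# environment (`set_access_token` exists in the older huggingface_hub releases
# this file targets).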
HF_TOKEN = os.environ['HF_TOKEN']
set_access_token(HF_TOKEN)
HfFolder.save_token(HF_TOKEN)
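
# One dataset per metric, each with one split per tracked library; rows are
# pre-sorted by date so the cumulative counters computed below are chronological.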
datasets = {
    "stars": load_dataset("open-source-metrics/stars").sort('dates'),
    "issues": load_dataset("open-source-metrics/issues").sort('dates'),
    "pip": load_dataset("open-source-metrics/pip").sort('day'),
}

# Tag each row with a monotonically increasing counter so that the n-th
# star/issue event of a library carries a cumulative count of n.
val = 0


def _range(e):
    global val
    e['range'] = val
    val += 1
    return e


stars = {}
for k, v in datasets['stars'].items():
    stars[k] = v.map(_range)
    val = 0  # restart the counter for the next library's split

issues = {}
for k, v in datasets['issues'].items():
    issues[k] = v.map(_range)
    val = 0  # restart the counter for the next library's split

datasets['stars'] = DatasetDict(**stars)
datasets['issues'] = DatasetDict(**issues)

# datasets = {
#     k1: DatasetDict({
#         k2: v2.select(range(0, len(v2), max(1, int(len(v2) / 1000))))
#         for k2, v2 in v1.items()
#     }) for k1, v1 in datasets.items()
# }


def link_values(library_names, returned_values):
    # Forward-fill: a date with no entry for a library inherits that library's
    # most recent value (None until the library's first data point).
    previous_values = {library_name: None for library_name in library_names}
    for library_name in library_names:
        for i in returned_values.keys():
            if library_name not in returned_values[i]:
                returned_values[i][library_name] = previous_values[library_name]
            else:
                previous_values[library_name] = returned_values[i][library_name]
    return returned_values


def running_mean(x, N, total_length=-1):
    # Moving average over a window of N points, left-padded with zeros up to
    # `total_length`. Currently unused in this file.
    cumsum = np.cumsum(np.insert(x, 0, 0))
    to_pad = max(total_length - len(cumsum), 0)
    return np.pad(cumsum[N:] - cumsum[:-N], (to_pad, 0)) / float(N)
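

# HTTP layer: "/" serves index.html, "/initialize" lists the available library
# splits, and the "/retrieve*" endpoints return JSON series ready for plotting.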
class RequestHandler(SimpleHTTPRequestHandler):
    def do_GET(self):
        print(self.path)

        if self.path == "/":
            self.path = "index.html"
            return SimpleHTTPRequestHandler.do_GET(self)

        if self.path.startswith("/initialize"):
            dataset_keys = {k: set(v.keys()) for k, v in datasets.items()}
            dataset_with_most_splits = max(dataset_keys.values(), key=len)

            warnings = []
            for k, v in dataset_keys.items():
                if len(v) < len(dataset_with_most_splits):
                    # `append`, not `extend`: extending with a string would add
                    # it character by character.
                    warnings.append(f"The {k} dataset does not contain all splits. Missing: {dataset_with_most_splits - v}")

            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()

            # TODO: Send and display warnings
            self.wfile.write(json.dumps(sorted(dataset_with_most_splits)).encode("utf-8"))
            return
if self.path.startswith("/retrievePipInstalls"):
url = urlparse(self.path)
query = parse_qs(url.query)
library_names = query.get("input", None)[0]
library_names = library_names.split(',')
if 'Cumulated' in library_names:
dataset_keys = {k: set(v.keys()) for k, v in datasets.items()}
dataset_with_most_splits = max([d for d in dataset_keys.values()], key=len)
library_names = list(dataset_with_most_splits)
returned_values = {}
for library_name in library_names:
for i in datasets['pip'][library_name]:
if i['day'] in returned_values:
returned_values[i['day']]['Cumulated'] += i['num_downloads']
else:
returned_values[i['day']] = {'Cumulated': i['num_downloads']}
library_names = ['Cumulated']
else:
returned_values = {}
for library_name in library_names:
for i in datasets['pip'][library_name]:
if i['day'] in returned_values:
returned_values[i['day']][library_name] = i['num_downloads']
else:
returned_values[i['day']] = {library_name: i['num_downloads']}
for library_name in library_names:
for i in returned_values.keys():
if library_name not in returned_values[i]:
returned_values[i][library_name] = None
returned_values = collections.OrderedDict(sorted(returned_values.items()))
output = {l: [k[l] for k in returned_values.values()] for l in library_names}
output['day'] = list(returned_values.keys())
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps(output).encode("utf-8"))
return SimpleHTTPRequestHandler
if self.path.startswith("/retrieveStars"):
url = urlparse(self.path)
query = parse_qs(url.query)
library_names = query.get("input", None)[0]
library_names = library_names.split(',')
returned_values = {}
dataset_dict = datasets['stars']
for library_name in library_names:
dataset = dataset_dict[library_name]
for i in dataset:
if i['dates'] in returned_values:
returned_values[i['dates']][library_name] = i['range']
else:
returned_values[i['dates']] = {library_name: i['range']}
returned_values = collections.OrderedDict(sorted(returned_values.items()))
returned_values = link_values(library_names, returned_values)
output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
output['day'] = list(returned_values.keys())[::-1]
# Trim down to a smaller number of points.
output = {k: [v for i, v in enumerate(value) if i % int(len(value) / 100) == 0] for k, value in output.items()}
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps(output).encode("utf-8"))
return SimpleHTTPRequestHandler
if self.path.startswith("/retrieveIssues"):
url = urlparse(self.path)
query = parse_qs(url.query)
library_names = query.get("input", None)[0]
library_names = library_names.split(',')
returned_values = {}
dataset_dict = datasets['issues']
for library_name in library_names:
dataset = dataset_dict[library_name]
for i in dataset:
if i['dates'] in returned_values:
returned_values[i['dates']][library_name] = i['range']
else:
returned_values[i['dates']] = {library_name: i['range']}
returned_values = collections.OrderedDict(sorted(returned_values.items()))
returned_values = link_values(library_names, returned_values)
output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
output['day'] = list(returned_values.keys())[::-1]
# Trim down to a smaller number of points.
output = {k: [v for i, v in enumerate(value) if i % int(len(value) / 100) == 0] for k, value in output.items()}
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps(output).encode("utf-8"))
return SimpleHTTPRequestHandler
return SimpleHTTPRequestHandler.do_GET(self)
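

# Example requests once the server is up (library names are illustrative):
#   GET /initialize
#   GET /retrievePipInstalls?input=transformers,datasets
#   GET /retrieveStars?input=transformers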
server = ThreadingHTTPServer(("", 7860), RequestHandler)
print("Running on port 7860")
server.serve_forever()