import json
from datetime import datetime, timedelta
import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download, list_repo_tree
import config
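

# Load the raw human-rewriting dataset from the Hugging Face Hub as a pandas DataFrame.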
def load_raw_rewriting_as_pandas():
return load_dataset(
config.HF_RAW_DATASET_NAME, split=config.HF_RAW_DATASET_SPLIT, token=config.HF_TOKEN, cache_dir=config.CACHE_DIR
).to_pandas()
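

# Load the full-commit dataset; its "message" column is renamed to "reference"
# so it can act as the reference commit message downstream.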
def load_full_commit_as_pandas():
return (
load_dataset(
path=config.HF_FULL_COMMITS_DATASET_NAME,
name=config.HF_FULL_COMMITS_DATASET_SUBNAME,
split=config.HF_FULL_COMMITS_DATASET_SPLIT,
cache_dir=config.CACHE_DIR,
)
.to_pandas()
.rename(columns={"message": "reference"})
)
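

# Milliseconds between the earliest and latest entries in a message's edit
# history (0 for an empty history).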
def edit_time_from_history(history_str):
history = json.loads(history_str)
if len(history) == 0:
return 0
    timestamps = [datetime.fromisoformat(e["ts"]) for e in history]
delta = max(timestamps) - min(timestamps)
return delta // timedelta(milliseconds=1)
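

# Milliseconds between loading and submitting a message; None when the
# timestamps are inconsistent (negative delta).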
def edit_time_from_timestamps(row):
loaded_ts = datetime.fromisoformat(row["loaded_ts"])
submitted_ts = datetime.fromisoformat(row["submitted_ts"])
delta = submitted_ts - loaded_ts
result = delta // timedelta(milliseconds=1)
return result if result >= 0 else None
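

# Combine the manual rewriting data with the file modifications ("mods") from the
# full-commit dataset, keyed by (hash, repo), with both edit-time estimates attached.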
def load_processed_rewriting_as_pandas():
    manual_rewriting = load_raw_rewriting_as_pandas()[
        [
            "hash",
            "repo",
            "commit_msg_start",
            "commit_msg_end",
            "session",
            "commit_msg_history",
            "loaded_ts",
            "submitted_ts",
        ]
    ].copy()  # copy the slice so the column assignments below do not write to a view
manual_rewriting["edit_time_hist"] = manual_rewriting["commit_msg_history"].apply(edit_time_from_history)
manual_rewriting["edit_time"] = manual_rewriting.apply(edit_time_from_timestamps, axis=1)
    manual_rewriting.drop(columns=["commit_msg_history", "loaded_ts", "submitted_ts"], inplace=True)
manual_rewriting.set_index(["hash", "repo"], inplace=True)
mods_dataset = load_full_commit_as_pandas()[["hash", "repo", "mods"]]
mods_dataset.set_index(["hash", "repo"], inplace=True)
return manual_rewriting.join(other=mods_dataset, how="left").reset_index()
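

# Load the synthetic dataset ("all_pairs_with_metrics" configuration) as a pandas DataFrame.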
def load_synthetic_as_pandas():
return load_dataset(
config.HF_SYNTHETIC_DATASET_NAME,
"all_pairs_with_metrics",
split=config.HF_SYNTHETIC_DATASET_SPLIT,
token=config.HF_TOKEN,
cache_dir=config.CACHE_DIR,
).to_pandas()
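

# Attach model predictions to the full-commit dataset: download every prediction
# file for the configured model, keep one random prediction per (hash, repo),
# and left-join the result onto the commits.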
def load_full_commit_with_predictions_as_pandas():
full_dataset = load_full_commit_as_pandas()
predictions_paths = []
    for prediction_file in list_repo_tree(
        repo_id=config.HF_PREDICTIONS_DATASET_NAME,
        # Hub repo paths always use forward slashes, independent of the local OS
        path_in_repo=f"commit_message_generation/predictions/{config.HF_PREDICTIONS_MODEL}",
        repo_type="dataset",
    ):
predictions_paths.append(
            hf_hub_download(
                repo_id=config.HF_PREDICTIONS_DATASET_NAME,
                filename=prediction_file.path,
                repo_type="dataset",
                cache_dir=config.CACHE_DIR,
            )
)
    dfs = [pd.read_json(path, orient="records", lines=True) for path in predictions_paths]
predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
    # Shuffle, then drop duplicate (hash, repo) keys so each commit keeps one random prediction
    predictions_dataset = predictions_dataset.sample(frac=1, random_state=config.RANDOM_STATE).set_index(
        ["hash", "repo"]
    )[["prediction"]]
    predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep="first")]
    dataset = full_dataset.join(other=predictions_dataset, on=["hash", "repo"])
    return dataset.reset_index(drop=True)
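

# Minimal usage sketch, assuming config points at valid Hub datasets and, where
# needed, a valid HF_TOKEN; previews the processed rewriting data.
if __name__ == "__main__":
    processed = load_processed_rewriting_as_pandas()
    print(processed[["hash", "repo", "edit_time", "edit_time_hist"]].head())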