# LibreTranslate_Kabyle / upload_suggestions.py
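"""Sync LibreTranslate translation suggestions to a Hugging Face dataset repo.

Reads the local suggestions.db SQLite database, merges new rows into the
suggestions.json already published on the Hub, writes JSON and CSV copies
to /tmp, and uploads each file only when its checksum has changed.
"""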
import os
import sqlite3
import json
import csv
import hashlib
from datetime import datetime
from huggingface_hub import HfApi, hf_hub_download
# Settings
HF_TOKEN = os.environ.get("HF_TOKEN")
REPO_ID = "axxam/libretranslate-suggestions"
DEST_JSON_PATH_IN_REPO = "suggestions.json"
DEST_CSV_PATH_IN_REPO = "suggestions.csv"
REPO_TYPE = "dataset"
JSON_OUTPUT_PATH = "/tmp/suggestions.json"
CSV_OUTPUT_PATH = "/tmp/suggestions.csv"
CHECKSUM_FILE_JSON = "/tmp/.last_suggestions_json_checksum"
CHECKSUM_FILE_CSV = "/tmp/.last_suggestions_csv_checksum"
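# Candidate locations of the LibreTranslate suggestions database,
# depending on how the container or user environment is set up.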
possible_paths = [
"/app/db/suggestions.db",
"/app/suggestions.db",
"/root/.local/share/db/suggestions.db",
"/home/libretranslate/.local/share/db/suggestions.db"
]
def find_db():
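    """Return the first existing suggestions.db path, or None if none is found."""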
print(f"Running in CWD: {os.getcwd()}")
for path in possible_paths:
if os.path.exists(path):
print(f"Found suggestions.db at {path}")
return path
print("suggestions.db not found in any known path.")
return None
def extract_suggestions(db_path):
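    """Read all rows from the suggestions table and return them as dicts.

    Each row gets an MD5-based ID derived from its fields, so the same
    suggestion maps to the same ID across runs.
    """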
suggestions = []
try:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("SELECT q, s, source, target FROM suggestions")
rows = cursor.fetchall()
conn.close()
for row in rows:
            if any(field is None for field in row):
                continue  # a NULL column would break the ID hash below
            # MD5 of the concatenated fields gives a stable, dedupable ID.
            unique_id = hashlib.md5("".join(row).encode()).hexdigest()
suggestions.append({
"id": unique_id,
"source_text": row[0],
"suggested_text": row[1],
"source_lang": row[2],
"target_lang": row[3],
"timestamp": datetime.now().isoformat()
})
except sqlite3.Error as e:
print(f"SQLite error: {e}")
return suggestions
def download_existing_json():
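    """Fetch the published suggestions.json into /tmp; return its path or None.

    The file lands at /tmp/suggestions.json, which merge_with_existing later
    overwrites with the merged result.
    """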
try:
path = hf_hub_download(
repo_id=REPO_ID,
repo_type=REPO_TYPE,
filename=DEST_JSON_PATH_IN_REPO,
token=HF_TOKEN,
local_dir="/tmp"
)
print("Downloaded existing suggestions from Hugging Face.")
return path
except Exception as e:
print(f"Could not fetch existing suggestions from HF: {e}")
return None
def merge_with_existing(suggestions, existing_json_path):
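    """Merge fresh DB suggestions into the previously published set.

    Existing entries keep their stored timestamps; entries seen for the
    first time are stamped with the current time. Returns the path of the
    merged JSON, or None when nothing new was found.
    """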
existing = {}
if existing_json_path and os.path.exists(existing_json_path):
try:
with open(existing_json_path, "r", encoding="utf-8") as f:
for item in json.load(f):
existing[item["id"]] = {
"source_text": item["source_text"],
"suggested_text": item["suggested_text"],
"source_lang": item["source_lang"],
"target_lang": item["target_lang"],
"timestamp": item.get("timestamp", datetime.now().isoformat())
}
except Exception as e:
print(f"Failed to read existing JSON: {e}")
changed = False
for s in suggestions:
s_clean = {
"source_text": s["source_text"],
"suggested_text": s["suggested_text"],
"source_lang": s["source_lang"],
"target_lang": s["target_lang"],
}
existing_entry = existing.get(s["id"])
if not existing_entry:
changed = True
existing[s["id"]] = {**s_clean, "timestamp": datetime.now().isoformat()}
if not changed:
print("No new suggestions β€” skipping write/upload.")
return None
    # Write merged JSON (flatten the id-keyed dict back into a list)
    final = [{**data, "id": id_} for id_, data in existing.items()]
with open(JSON_OUTPUT_PATH, "w", encoding="utf-8") as f:
json.dump(final, f, indent=2, ensure_ascii=False)
# Also write CSV
write_csv(final, CSV_OUTPUT_PATH)
return JSON_OUTPUT_PATH
def write_csv(suggestions, csv_path):
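    """Write the merged suggestions to csv_path with a fixed column order."""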
with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=[
"id", "source_text", "suggested_text", "source_lang", "target_lang", "timestamp"
])
writer.writeheader()
for item in suggestions:
writer.writerow(item)
def get_checksum(filepath):
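    """Return the MD5 hex digest of filepath, or None if it does not exist."""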
if not os.path.exists(filepath):
return None
with open(filepath, "rb") as f:
return hashlib.md5(f.read()).hexdigest()
def upload_if_updated(filepath, dest_path, checksum_file):
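    """Upload filepath to the dataset repo when its checksum has changed.

    The last uploaded checksum is cached in checksum_file so unchanged
    files are skipped on subsequent runs.
    """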
if not filepath or not os.path.exists(filepath):
return
new_checksum = get_checksum(filepath)
old_checksum = None
if os.path.exists(checksum_file):
with open(checksum_file, "r") as f:
old_checksum = f.read().strip()
if new_checksum != old_checksum:
print(f"Uploading updated {os.path.basename(dest_path)} to Hugging Face...")
try:
api = HfApi()
api.upload_file(
path_or_fileobj=filepath,
path_in_repo=dest_path,
repo_id=REPO_ID,
repo_type=REPO_TYPE,
token=HF_TOKEN
)
with open(checksum_file, "w") as f:
f.write(new_checksum)
print(f"Upload successful: {dest_path} at {datetime.now().isoformat()}")
except Exception as e:
print(f"Upload failed for {dest_path}:", e)
else:
print(f"No changes in {os.path.basename(dest_path)} β€” skipping upload.")
def main():
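    """Entry point: find the DB, extract, merge, and upload anything that changed."""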
print(f"===== Application Startup at {datetime.now().isoformat()} =====")
if not HF_TOKEN:
print("HF_TOKEN not set β€” skipping upload.")
return
db_path = find_db()
if not db_path:
return
suggestions = extract_suggestions(db_path)
if not suggestions:
print("No suggestions found β€” skipping.")
return
existing_path = download_existing_json()
merged_json = merge_with_existing(suggestions, existing_path)
if merged_json:
upload_if_updated(JSON_OUTPUT_PATH, DEST_JSON_PATH_IN_REPO, CHECKSUM_FILE_JSON)
upload_if_updated(CSV_OUTPUT_PATH, DEST_CSV_PATH_IN_REPO, CHECKSUM_FILE_CSV)
if __name__ == "__main__":
main()