import os
import sqlite3
import json
import csv
import hashlib
from datetime import datetime
from huggingface_hub import HfApi, hf_hub_download
# Settings
# Hugging Face access token; read from the environment so it is never
# committed. May be None — main() skips the upload in that case.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Dataset repository that receives the exported suggestions.
REPO_ID = "Imsidag-community/libretranslate-suggestions"
# Destination file names inside the repo (JSON is canonical, CSV is a mirror).
DEST_JSON_PATH_IN_REPO = "suggestions.json"
DEST_CSV_PATH_IN_REPO = "suggestions.csv"
REPO_TYPE = "dataset"
# Local scratch files written before upload.
JSON_OUTPUT_PATH = "/tmp/suggestions.json"
CSV_OUTPUT_PATH = "/tmp/suggestions.csv"
# Cached md5 of the last uploaded JSON/CSV, used to skip no-op uploads.
CHECKSUM_FILE_JSON = "/tmp/.last_suggestions_json_checksum"
CHECKSUM_FILE_CSV = "/tmp/.last_suggestions_csv_checksum"
# Known locations where a LibreTranslate install may keep suggestions.db.
possible_paths = [
"/app/db/suggestions.db",
"/app/suggestions.db",
"/root/.local/share/db/suggestions.db",
"/home/libretranslate/.local/share/db/suggestions.db"
]
def find_db():
    """Locate the LibreTranslate suggestions database.

    Walks the candidate locations in ``possible_paths`` and returns the
    first path that exists on disk, or ``None`` when no database is found.
    """
    print(f"Running in CWD: {os.getcwd()}")
    found = next((p for p in possible_paths if os.path.exists(p)), None)
    if found is not None:
        print(f"Found suggestions.db at {found}")
        return found
    print("suggestions.db not found in any known path.")
    return None
def extract_suggestions(db_path):
    """Read all translation suggestions from the LibreTranslate SQLite DB.

    Parameters
    ----------
    db_path : str
        Path to the suggestions.db file.

    Returns
    -------
    list[dict]
        One record per row with a deterministic md5 ``id``, the source and
        suggested texts, the language codes and an extraction timestamp.
        Returns an empty list on any SQLite error.
    """
    suggestions = []
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT q, s, source, target FROM suggestions")
        rows = cursor.fetchall()
        for q, s, source, target in rows:
            # NOTE: fields are concatenated without a separator, so distinct
            # rows could in theory collide; kept as-is because previously
            # published ids depend on this exact scheme.
            unique_id = hashlib.md5((q + s + source + target).encode()).hexdigest()
            suggestions.append({
                "id": unique_id,
                "source_text": q,
                "suggested_text": s,
                "source_lang": source,
                "target_lang": target,
                "timestamp": datetime.now().isoformat()
            })
    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
    finally:
        # Bug fix: the original closed the connection only on the success
        # path, leaking it whenever connect/execute raised.
        if conn is not None:
            conn.close()
    return suggestions
def download_existing_json():
    """Fetch the currently published suggestions.json from the HF dataset repo.

    Returns the local path of the downloaded file, or ``None`` when the
    download fails (file absent, network error, bad token, ...).
    """
    try:
        local_path = hf_hub_download(
            repo_id=REPO_ID,
            repo_type=REPO_TYPE,
            filename=DEST_JSON_PATH_IN_REPO,
            token=HF_TOKEN,
            local_dir="/tmp"
        )
    except Exception as e:
        print(f"Could not fetch existing suggestions from HF: {e}")
        return None
    print("Downloaded existing suggestions from Hugging Face.")
    return local_path
def merge_with_existing(suggestions, existing_json_path):
    """Merge freshly extracted suggestions into the previously published set.

    Records are keyed by their md5 ``id``; ids already present keep their
    stored payload and timestamp untouched.  When at least one new id
    appears, the merged result is written to ``JSON_OUTPUT_PATH`` (and
    mirrored to CSV) and that path is returned; otherwise nothing is
    written and ``None`` is returned.
    """
    merged = {}
    if existing_json_path and os.path.exists(existing_json_path):
        try:
            with open(existing_json_path, "r", encoding="utf-8") as fh:
                for entry in json.load(fh):
                    merged[entry["id"]] = {
                        "source_text": entry["source_text"],
                        "suggested_text": entry["suggested_text"],
                        "source_lang": entry["source_lang"],
                        "target_lang": entry["target_lang"],
                        # Older files may predate the timestamp field.
                        "timestamp": entry.get("timestamp", datetime.now().isoformat())
                    }
        except Exception as e:
            print(f"Failed to read existing JSON: {e}")
    has_new = False
    for item in suggestions:
        if item["id"] in merged:
            continue
        has_new = True
        merged[item["id"]] = {
            "source_text": item["source_text"],
            "suggested_text": item["suggested_text"],
            "source_lang": item["source_lang"],
            "target_lang": item["target_lang"],
            "timestamp": datetime.now().isoformat()
        }
    if not has_new:
        print("No new suggestions β skipping write/upload.")
        return None
    # Flatten back to a list of records with the id re-attached.
    records = [{**payload, "id": key} for key, payload in merged.items()]
    with open(JSON_OUTPUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)
    # Keep the CSV mirror in sync with the JSON.
    write_csv(records, CSV_OUTPUT_PATH)
    return JSON_OUTPUT_PATH
def write_csv(suggestions, csv_path):
    """Write suggestion records to *csv_path* with a fixed column order."""
    columns = [
        "id", "source_text", "suggested_text", "source_lang", "target_lang", "timestamp"
    ]
    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(suggestions)
def get_checksum(filepath):
    """Return the md5 hex digest of *filepath*, or ``None`` if it is absent."""
    if not os.path.exists(filepath):
        return None
    digest = hashlib.md5()
    with open(filepath, "rb") as fh:
        digest.update(fh.read())
    return digest.hexdigest()
def upload_if_updated(filepath, dest_path, checksum_file):
    """Upload *filepath* to the HF repo only when its content changed.

    The md5 of the last uploaded version is cached in *checksum_file*;
    the upload (and cache refresh) happens only when the digests differ.
    """
    if not filepath or not os.path.exists(filepath):
        return
    current = get_checksum(filepath)
    previous = None
    if os.path.exists(checksum_file):
        with open(checksum_file, "r") as fh:
            previous = fh.read().strip()
    if current == previous:
        print(f"No changes in {os.path.basename(dest_path)} β skipping upload.")
        return
    print(f"Uploading updated {os.path.basename(dest_path)} to Hugging Face...")
    try:
        HfApi().upload_file(
            path_or_fileobj=filepath,
            path_in_repo=dest_path,
            repo_id=REPO_ID,
            repo_type=REPO_TYPE,
            token=HF_TOKEN
        )
        # Remember what was uploaded so the next run can skip a no-op.
        with open(checksum_file, "w") as fh:
            fh.write(current)
        print(f"Upload successful: {dest_path} at {datetime.now().isoformat()}")
    except Exception as e:
        print(f"Upload failed for {dest_path}:", e)
def main():
    """Entry point: extract local suggestions and sync them to Hugging Face."""
    print(f"===== Application Startup at {datetime.now().isoformat()} =====")
    if not HF_TOKEN:
        print("HF_TOKEN not set β skipping upload.")
        return
    db_path = find_db()
    if db_path is None:
        return
    extracted = extract_suggestions(db_path)
    if not extracted:
        print("No suggestions found β skipping.")
        return
    merged = merge_with_existing(extracted, download_existing_json())
    if merged is None:
        # Nothing new was written, so there is nothing to upload.
        return
    upload_if_updated(JSON_OUTPUT_PATH, DEST_JSON_PATH_IN_REPO, CHECKSUM_FILE_JSON)
    upload_if_updated(CSV_OUTPUT_PATH, DEST_CSV_PATH_IN_REPO, CHECKSUM_FILE_CSV)
if __name__ == "__main__":
    main()