Imane Momayiz committed

Commit d9514f5 · 1 Parent(s): 3c66851

update

Browse files:
- app.py +15 -90
- src/components.py +48 -130
- src/layout.py +5 -0
- src/utils.py +31 -0
app.py
CHANGED
@@ -1,24 +1,17 @@
-import streamlit as st
-from datasets import load_dataset
-import datetime as dt
-import random
-import json
 import os
+import streamlit as st
 from huggingface_hub import HfApi, CommitScheduler
-import uuid
-
+from src.components import (
+    load_data, fetch_sentence, store_submission,
+    REPO_ID, submissions_folder)
+from src.layout import INTRO_TEXT
 
+# setup
 HF_API_KEY = os.environ.get("HF_TOKEN", None)
-
 api = HfApi(token=HF_API_KEY)
-
-REPO_ID = "imomayiz/darija-english"
-DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"
-
-submissions_folder = "submissions"
-submissions_file = os.path.join(submissions_folder, f"submissions_{uuid.uuid4()}.json")
 os.makedirs(submissions_folder, exist_ok=True)
 
+# Create a commit scheduler
 scheduler = CommitScheduler(
     token=HF_API_KEY,
     hf_api=api,
@@ -29,71 +22,10 @@ scheduler = CommitScheduler(
     every=1,
 )
 
-# Define the ParquetScheduler instance with your repo details
-# scheduler = ParquetScheduler(repo_id=REPO_ID,
-#                              token=HF_API_KEY, every=1,
-#                              path_in_repo=submissions_folder,
-#                              repo_type="dataset")
-
-
-def load_data(repo_id):
-    dataset = load_dataset(f'{repo_id}', name='sentences', split='sentences')
-    return dataset
-
-def fetch_sentence(dataset, column_name="darija_ar"):
-
-    # Get a random sentence
-    random_sentence_index = random.randint(0, len(dataset) - 1)
-    random_sentence = dataset[random_sentence_index][column_name]
-
-    st.session_state.sentence = random_sentence
-    st.session_state.translation_input = ""
-    st.session_state.translation_input_fr = ""
-
-    return random_sentence
-
-def store_submission(api: HfApi, sentence: str, translation: str, translation_fr: str):
-    """
-    Append input/outputs and user feedback to a JSON Lines file
-    using a thread lock to avoid concurrent writes from different users.
-    """
-    ts = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
-    # folder_path = "submissions"
-    # os.makedirs(folder_path, exist_ok=True)
-    # filename = os.path.join(folder_path, f"submissions_{ts}.txt")
-
-    # with open(filename, "w", encoding="utf-8") as f:
-    #     f.write(f"darija,eng,darija_ar\n{sentence},{translation},{translation_fr}")
-
-    # api.upload_file(
-    #     path_or_fileobj=filename,
-    #     path_in_repo=filename,
-    #     repo_id=REPO_ID,
-    #     repo_type="dataset",
-    # )
-
-    with scheduler.lock:
-        with open(submissions_file, "a") as f:
-            f.write(json.dumps({
-                "darija": translation_fr,
-                "eng": translation,
-                "darija_ar": sentence}))
-            f.write("\n")
-
-    # scheduler.append({"darija": translation_fr,
-    #                   "eng": translation,
-    #                   "darija_ar": sentence})
-
-    st.success(
-        f"""Translation submitted successfully to
-        {DATASET_REPO_URL}/tree/main/{submissions_folder}"""
-    )
-
-
 # Load the dataset
 dataset = load_data(REPO_ID)
 
-
+# Initialize session state
 if "sentence" not in st.session_state:
     st.session_state.sentence = fetch_sentence(dataset)
 if 'translation_input' not in st.session_state:
@@ -105,13 +37,7 @@ if 'display_new' not in st.session_state:
 
 st.title("Translate From Arabic to English")
 
-st.markdown(
-    """This mini-app allows you to contribute to the **darija-english** dataset
-    as part of [DODa](https://darija-open-dataset.github.io/)
-    project. To contribute, simply translate the given sentence from Arabic to English.
-    The translated sentence will be submitted to the dataset
-    [here](https://huggingface.co/datasets/imomayiz/darija-english)."""
-)
+st.markdown(INTRO_TEXT, unsafe_allow_html=True)
 
 st.divider()
 
@@ -132,21 +58,20 @@ st.session_state.display_new = st.button("New Sentence",
 
 
 # Input field for translation
-
-
-
-
+translation_input = st.text_input(
+    "Enter translation to english: ",
+    st.session_state.translation_input
+)
 st.session_state.translation_input = translation_input
 
-# Input field for translation
-translation_input_placeholder_fr = st.empty()
-
+# Input field for translation in latin characters
 translation_input_fr = st.text_input(
     "Enter translation to darija in latin characters: ",
     st.session_state.translation_input_fr
 )
 st.session_state.translation_input_fr = translation_input_fr
 
+# Submit button
 if st.button("Submit Translation"):
     if st.session_state.translation_input_fr or st.session_state.translation_input:
         store_submission(api,
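Note: the hunk header above skips lines 18–21 of the new app.py, so the middle of the `CommitScheduler(...)` call is not visible in this diff. As orientation only, here is a minimal sketch of how such a scheduler is typically configured with the public `huggingface_hub.CommitScheduler` API; the `repo_id`, `folder_path` and `path_in_repo` values below are illustrative assumptions, not the hidden lines of this commit.

```python
import os

from huggingface_hub import HfApi, CommitScheduler

HF_API_KEY = os.environ.get("HF_TOKEN", None)
api = HfApi(token=HF_API_KEY)

# Hypothetical configuration: repo_id, folder_path and path_in_repo are
# assumptions for illustration, not the arguments hidden by the hunk header.
scheduler = CommitScheduler(
    token=HF_API_KEY,
    hf_api=api,
    repo_id="imomayiz/darija-english",  # dataset repo the submissions go to
    repo_type="dataset",
    folder_path="submissions",          # local folder the scheduler watches
    path_in_repo="submissions",         # destination folder inside the repo
    every=1,                            # push pending files every minute
)
```

With `every=1` the scheduler pushes the watched folder roughly once a minute, which matches the "You will see your commit in 1 minute" message added in src/components.py below.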
src/components.py
CHANGED
@@ -1,140 +1,58 @@
-
-
+import streamlit as st
+import datetime as dt
+import random
+import json
+import os
+from huggingface_hub import CommitScheduler
+from datasets import load_dataset
 import uuid
-from pathlib import Path
-import json
-import tempfile
-import pyarrow as pa
-import pyarrow.parquet as pq
 
 
-
-
-    """
-    Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. 1 `.append`
-    call will result in 1 row in your final dataset.
-    ```py
-    # Start scheduler
-    >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")
-    # Append some data to be uploaded
-    >>> scheduler.append({...})
-    >>> scheduler.append({...})
-    >>> scheduler.append({...})
-    ```
-    The scheduler will automatically infer the schema from the data it pushes.
-    Optionally, you can manually set the schema yourself:
-    ```py
-    >>> scheduler = ParquetScheduler(
-    ...     repo_id="my-parquet-dataset",
-    ...     schema={
-    ...         "prompt": {"_type": "Value", "dtype": "string"},
-    ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
-    ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
-    ...         "image": {"_type": "Image"},
-    ...     },
-    ... )
-    See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
-    possible values.
-    """
-
-    def __init__(
-        self,
-        *,
-        repo_id: str,
-        schema: Optional[Dict[str, Dict[str, str]]] = None,
-        every: Union[int, float] = 5,
-        path_in_repo: Optional[str] = "data",
-        repo_type: Optional[str] = "dataset",
-        revision: Optional[str] = None,
-        private: bool = False,
-        token: Optional[str] = None,
-        allow_patterns: Union[List[str], str, None] = None,
-        ignore_patterns: Union[List[str], str, None] = None,
-        hf_api: Optional[HfApi] = None,
-    ) -> None:
-        super().__init__(
-            repo_id=repo_id,
-            folder_path="dummy",  # not used by the scheduler
-            every=every,
-            path_in_repo=path_in_repo,
-            repo_type=repo_type,
-            revision=revision,
-            private=private,
-            token=token,
-            allow_patterns=allow_patterns,
-            ignore_patterns=ignore_patterns,
-            hf_api=hf_api,
-        )
-
-        self._rows: List[Dict[str, Any]] = []
-        self._schema = schema
-
-    def append(self, row: Dict[str, Any]) -> None:
-        """Add a new item to be uploaded."""
-        with self.lock:
-            self._rows.append(row)
-
-    def push_to_hub(self):
-        # Check for new rows to push
-        with self.lock:
-            rows = self._rows
-            self._rows = []
-        if not rows:
-            return
-        print(f"Got {len(rows)} item(s) to commit.")
+REPO_ID = "imomayiz/darija-english"
+DATASET_REPO_URL = f"https://huggingface.co/datasets/{REPO_ID}"
 
-
-
-        path_to_cleanup: List[Path] = []
-        for row in rows:
-            for key, value in row.items():
-                # Infer schema (for `datasets` library)
-                if key not in schema:
-                    schema[key] = _infer_schema(key, value)
+submissions_folder = "submissions"
+submissions_file = os.path.join(submissions_folder, f"submissions_{uuid.uuid4()}.json")
 
-                # Load binary files if necessary
-                if schema[key]["_type"] in ("Image", "Audio"):
-                    # It's an image or audio: we load the bytes and remember to cleanup the file
-                    file_path = Path(value)
-                    if file_path.is_file():
-                        row[key] = {
-                            "path": file_path.name,
-                            "bytes": file_path.read_bytes(),
-                        }
-                        path_to_cleanup.append(file_path)
 
-
-
-
-            if feature not in row:
-                row[feature] = None
+def load_data(repo_id):
+    dataset = load_dataset(f'{repo_id}', name='sentences', split='sentences')
+    return dataset
 
-
-
-
-
-
-            {"huggingface": json.dumps({"info": {"features": schema}})}
-        )
-
-        # Write to parquet file
-        archive_file = tempfile.NamedTemporaryFile()
-        pq.write_table(table, archive_file.name)
-
-        # Upload
-        self.api.upload_file(
-            repo_id=self.repo_id,
-            repo_type=self.repo_type,
-            revision=self.revision,
-            path_in_repo=f"{uuid.uuid4()}.parquet",
-            path_or_fileobj=archive_file.name,
-        )
-        print("Commit completed.")
-
-        # Cleanup
-        archive_file.close()
-        for path in path_to_cleanup:
-            path.unlink(missing_ok=True)
+def fetch_sentence(dataset, column_name="darija_ar"):
+
+    # Get a random sentence
+    random_sentence_index = random.randint(0, len(dataset) - 1)
+    random_sentence = dataset[random_sentence_index][column_name]
 
+    st.session_state.sentence = random_sentence
+    st.session_state.translation_input = ""
+    st.session_state.translation_input_fr = ""
 
+    return random_sentence
 
+def store_submission(
+    scheduler: CommitScheduler, sentence: str, translation: str, translation_fr: str
+):
+    """
+    Append input/outputs and user feedback to a JSON Lines file
+    using a thread lock to avoid concurrent writes from different users.
+    """
+    ts = dt.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
+
+    with scheduler.lock:
+        with open(submissions_file, "a") as f:
+            f.write(json.dumps({
+                "darija": translation_fr,
+                "eng": translation,
+                "darija_ar": sentence,
+                "timestamp": ts}),
+                ensure_ascii=False)
+            f.write("\n")
+
+    st.success(
+        f"""Translation submitted successfully.
+        You will see your commit in 1 minute at
+        {DATASET_REPO_URL}/tree/main/{submissions_folder}.
+        You can submit another translation or check the dataset."""
+    )
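Note: the new `store_submission` appends one JSON object per line (JSON Lines) to a per-session file under `submissions/`, holding `scheduler.lock` so the `CommitScheduler` never uploads a half-written file. Below is a minimal sketch of that record format and of reading submissions back; the sample values are illustrative, and in this sketch `ensure_ascii=False` is passed to `json.dumps` so the Arabic text is stored unescaped.

```python
import json
import os
import uuid

submissions_folder = "submissions"
os.makedirs(submissions_folder, exist_ok=True)
# One JSONL file per app session, as in the commit; the UUID is whatever uuid4() returns.
submissions_file = os.path.join(submissions_folder, f"submissions_{uuid.uuid4()}.json")

# Illustrative sample record using the fields written by store_submission.
record = {
    "darija": "kanqra",          # Darija in latin characters
    "eng": "I am studying",      # English translation
    "darija_ar": "كنقرا",        # the Arabic-script sentence that was shown
    "timestamp": "2024-01-01_00-00-00-000000",
}

# Append one JSON object per line; ensure_ascii=False keeps the Arabic readable on disk.
with open(submissions_file, "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False))
    f.write("\n")

# Reading the submissions back is a plain line-by-line parse.
with open(submissions_file, encoding="utf-8") as f:
    rows = [json.loads(line) for line in f]
print(rows[0]["eng"])
```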
src/layout.py
ADDED
@@ -0,0 +1,5 @@
+INTRO_TEXT = """This mini-app allows you to contribute to the **darija-english** dataset
+as part of [DODa](https://darija-open-dataset.github.io/)
+project. To contribute, simply translate the given sentence from Arabic to English.
+The translated sentence will be submitted to the dataset
+[here](https://huggingface.co/datasets/imomayiz/darija-english)."""
src/utils.py
ADDED
@@ -0,0 +1,31 @@
+from huggingface_hub import HfApi
+
+def push_data_to_hf(repo_id, folder_path, path_in_repo, token=None):
+    """
+    Pushes data to a dataset on the Hugging Face Hub.
+
+    Parameters:
+    - repo_id (str): The ID of the repository on the Hugging Face Hub.
+    - folder_path (str): Local path to the folder containing the data.
+    - path_in_repo (str): Path within the repository where the data should be stored.
+    - token (str, optional): Your authentication token for the Hugging Face Hub.
+
+    Returns:
+    - str: URL of the uploaded data.
+    """
+
+    api = HfApi(token=token)
+
+    try:
+        api.upload_folder(
+            folder_path=folder_path,
+            repo_id=repo_id,
+            repo_type="dataset",
+            path_in_repo=path_in_repo,
+        )
+    except Exception as e:
+        return f"Error uploading data: {str(e)}"
+
+    url = f"https://huggingface.co/{repo_id}/raw/main/{path_in_repo}"
+
+    return f"Data successfully uploaded. Access it at: {url}"
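Note: a brief usage sketch of the new helper, assuming a write-enabled `HF_TOKEN` and a local `submissions/` folder to upload; `upload_folder` is the `HfApi` method the helper wraps.

```python
import os

from src.utils import push_data_to_hf

# Assumes a write-enabled HF_TOKEN and a local "submissions" folder to upload.
message = push_data_to_hf(
    repo_id="imomayiz/darija-english",
    folder_path="submissions",
    path_in_repo="submissions",
    token=os.environ.get("HF_TOKEN"),
)
print(message)  # success URL, or the "Error uploading data: ..." string
```

Because the helper catches all exceptions and returns the error as a string, callers should check the returned message rather than rely on an exception being raised.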