Spaces:

echung682
/

rlhf_app_for_emotionAI_model

Sleeping

App Files Files Community

echung682 committed on Feb 27

Commit

00e27e2

verified ·

1 Parent(s): 987f3af

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -46

app.py CHANGED Viewed

@@ -18,48 +18,64 @@ pipe = pipeline(model=model_ckpt)
 emotion_dataset = load_dataset("echung682/emotion-analysis-tweets")
 #in order to keep the data persistent on HuggingFace repo
 def save_to_repo():
-    # Add & commit the latest flagged.csv file to the Hugging Face Space repo
-    os.system("git pull origin main")  # Pull latest changes (to avoid conflicts)
-    os.system("git add feedback_data/flagged.csv")
-    os.system('git commit -m "Update flagged data"')
-    os.system("git push origin main")  # Push updated file to the repo
-STATE_FILE = "state.json"
 '''
 in order to keep track of what the last prompt was that was given human feedback
 '''
 def load_state():
-    if not os.path.exists(STATE_FILE):  # Handle missing file
-        return 0  # Default count value
     try:
-        with open(STATE_FILE, "r") as f:
-            content = f.read().strip()  # Remove leading/trailing spaces
-            if not content:  # Handle empty file
-                return 0  # Default count value
-            return json.loads(content).get("count", 0)  # Safely parse JSON
-    except (json.JSONDecodeError, OSError):  # Catch JSON errors or I/O issues
-        return 0  # Default count value
-        #if the file doesn't have count variable in it, then it will return 0, which is good - that's the first index
 # Save state to file
-def save_state(count):
     with open("state.json", "w") as f:
-        json.dump({"count": count}, f)
-def increment():
-    count = load_state()
-    count += 1
-    save_state(count)
     return count
 def save_state_to_repo():
-    os.system("git pull origin main")  # Pull latest changes (to avoid conflicts)
-    os.system("git add state.json")
-    os.system('git commit -m "Update state"')
-    os.system("git push origin main")  # Push updated file to the repo
 '''
@@ -80,7 +96,7 @@ def updateDataset(prompt, option1, option2, flagged_option):
         chosen = ""
         rejected = ""
-    index = increment()
     with open("feedback_data/flagged.csv", "a") as f:
         f.write(f"{prompt},{chosen},{rejected}\n")
@@ -98,28 +114,28 @@ extracting the top two scoring emotions
 returning these
 '''
 def emotion_analysis_data_collection():
-  index = load_state()
-  result = pipe(emotion_dataset["train"]["text"][index], top_k = None)
-  score_list = [] #empty list to hold the scores
-  emotion_list = [] #empty list to hold the emotions
-  for emotion in result:
-    emotion_list.append(emotion["label"]) #extracting the emotions from the results
-    score_list.append(emotion["score"]) #extracing the scores from the results
-  emotion_dict = {}
-  for index, value in enumerate(emotion_list):
-    emotion_dict[value] = score_list[index]
-  dictKeys_list = list(emotion_dict.keys())
-  emotion_highestScore = dictKeys_list[0]
-  emotion_secondHighestScore = dictKeys_list[1]
-  #print(emotion_highestScore)
-  #print(emotion_secondHighestScore)
-  #print(" ")
-  return emotion_dataset["train"]["text"][index], emotion_highestScore, emotion_secondHighestScore

 # Tweet dataset; the "text" column of its "train" split is what gets passed to the emotion pipeline.
 emotion_dataset = load_dataset("echung682/emotion-analysis-tweets")
 #in order to keep the data persistent on HuggingFace repo
+#they are saved as secrets in my HuggingFace space because they shouldn't be visible in the code
 def save_to_repo():
+    try:
+        subprocess.run(["git", "config", "--global", "user.email", os.environ["GIT_EMAIL"]], check=True)
+        subprocess.run(["git", "config", "--global", "user.name", os.environ["GIT_USER"]], check=True)
+        subprocess.run(["git", "pull", "origin", "main"], check=True)
+        subprocess.run(["git", "add", "feedback_data/flagged.csv"], check=True)
+        subprocess.run(["git", "commit", "-m", "Update flagged data"], check=True)
+        subprocess.run(["git", "push", "origin", "main"], check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Git operation failed: {e}")
 '''
 in order to keep track of what the last prompt was that was given human feedback
 '''
 def load_state():
     try:
+        with open("state.json", "r") as f:
+            state = json.load(f)
+            return state.get("count", 0), state.get("processed_indices", [])
+    except FileNotFoundError:
+        return 0, []
 # Save state to file
def save_state(count, processed_indices):
    """Overwrite state.json with the current count and processed indices."""
    snapshot = {
        "count": count,
        # indices of the prompts that have already been handed out
        "processed_indices": processed_indices,
    }
    with open("state.json", "w") as state_file:
        json.dump(snapshot, state_file)
def get_next_prompt():
    """Choose the next unprocessed dataset index, persist it, and return it.

    Loads the saved state, advances from the saved ``count`` (wrapping around
    the dataset) until it reaches an index that has not been processed yet,
    records that index, writes the state back with ``save_state`` and pushes
    it with ``save_state_to_repo``. Once every index has been processed the
    processed list is cleared and the cycle starts over.

    NOTE(review): every call consumes one index, and SOURCE shows calls from
    both ``updateDataset`` and ``emotion_analysis_data_collection`` - confirm
    that consuming two indices per feedback round is intended.

    Returns:
        The chosen index into ``emotion_dataset["train"]``.
    """
    count, processed = load_state()
    dataset_size = len(emotion_dataset["train"])

    # Keep only distinct, in-range indices (order preserved). Stale or
    # duplicated entries would otherwise inflate the length check below and
    # reset progress before the dataset has really been exhausted.
    seen = set()
    cleaned = []
    for i in processed:
        if isinstance(i, int) and 0 <= i < dataset_size and i not in seen:
            seen.add(i)
            cleaned.append(i)

    # If we've processed all prompts, start over
    if len(cleaned) >= dataset_size:
        seen.clear()
        cleaned = []

    if dataset_size:
        # A count saved against a larger dataset must not escape the range.
        count %= dataset_size
        # Set membership keeps this scan linear instead of quadratic.
        while count in seen:
            count = (count + 1) % dataset_size

    cleaned.append(count)
    save_state(count, cleaned)
    save_state_to_repo()
    return count
 def save_state_to_repo():
+    try:
+        subprocess.run(["git", "pull", "origin", "main"], check=True)
+        subprocess.run(["git", "add", "state.json"], check=True)
+        subprocess.run(["git", "commit", "-m", "Update state"], check=True)
+        subprocess.run(["git", "push", "origin", "main"], check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Git operation failed: {e}")
 '''
         chosen = ""
         rejected = ""
+    index = get_next_prompt()
     with open("feedback_data/flagged.csv", "a") as f:
         f.write(f"{prompt},{chosen},{rejected}\n")
 returning these
 '''
 def emotion_analysis_data_collection():
+    index = get_next_prompt()
+    result = pipe(emotion_dataset["train"]["text"][index], top_k = None)
+    score_list = [] #empty list to hold the scores
+    emotion_list = [] #empty list to hold the emotions
+    for emotion in result:
+        emotion_list.append(emotion["label"]) #extracting the emotions from the results
+        score_list.append(emotion["score"]) #extracing the scores from the results
+    emotion_dict = {}
+    for index, value in enumerate(emotion_list):
+        emotion_dict[value] = score_list[index]
+    dictKeys_list = list(emotion_dict.keys())
+    emotion_highestScore = dictKeys_list[0]
+    emotion_secondHighestScore = dictKeys_list[1]
+    #print(emotion_highestScore)
+    #print(emotion_secondHighestScore)
+    #print(" ")
+    return emotion_dataset["train"]["text"][index], emotion_highestScore, emotion_secondHighestScore