This-and-That / scripts /process_llama.py
HikariDawn777's picture
feat: initial push
59b2a81
raw
history blame
2.34 kB
'''
Process the llama output (a JSON-lines file) into one processed_text.txt per sample for the next step
'''
import os, shutil, sys
import json
import pandas as pd
import collections
def extract_keywords(outputs):
    """Return the value that follows the first colon on every line of *outputs*.

    Each line is expected to look like ``"<name>: <value>"``. The value is the
    text between the first and the second colon, with its first character (the
    separating space) dropped. A line without any colon (for example the empty
    string left behind by a trailing newline) yields ``""`` instead of raising
    ``IndexError``.

    Args:
        outputs: The raw multi-line output string of one record.

    Returns:
        A list with one string per line of *outputs*, in the original order.
    """
    keywords = []
    for line in outputs.split("\n"):
        fields = line.split(":")
        keywords.append(fields[1][1:] if len(fields) > 1 else "")
    return keywords


if __name__ == "__main__":

    # Define important path
    json_path = "../SVD1/v1.jsonl"
    folder_path = "/home/kiteret/Desktop/StableVideoDiffusion/full_text_tmp/"

    # Each "file_path" is sliced by the lengths of this prefix and suffix to
    # obtain the sub-folder relative to folder_path
    key_start = len("/home/chfeng/llama3/full_text_tmp/")
    key_end = len("lang.txt")

    # Read the json file (one JSON object per line)
    with open(json_path, 'r') as json_file:
        json_list = list(json_file)

    # Histogram: number of non-empty fields -> number of records
    length_stats = collections.defaultdict(int)

    # Iterate all the json records
    for json_line in json_list:
        if not json_line.strip():
            # A blank line is not a record and would make json.loads fail
            continue
        json_info = json.loads(json_line)

        # Define the path to write
        sub_path = json_info["file_path"][key_start:-key_end]
        new_text_path = os.path.join(folder_path, sub_path, "processed_text.txt")

        # Drop any existing result first, so skipped records leave no file behind
        if os.path.exists(new_text_path):
            os.remove(new_text_path)

        # Sanity check for the case where input is missed
        if json_info["input"] == "":
            print("It is weird for the input is empty in the LLM process for ", sub_path)
            continue

        # The output must begin with the "action:" field
        outputs = json_info["output"]
        if not outputs.startswith("action:"):
            print("It is weird for no actions: keyword in the outputs for ", sub_path, " with prompt ", outputs)
            continue

        # Parse first, write later: nothing touches the disk for rejected prompts
        key_words = extract_keywords(outputs)
        effective_length = sum(1 for key_word in key_words if key_word != "")

        # The second field being empty is reported but does not stop processing
        if len(key_words) > 1 and key_words[1] == "":
            print("It is abnormal for the this content to be empty ", sub_path, " with prompt ", outputs)

        if effective_length < 2:    # For those only 1 or zero, we won't consider them
            print("The prompt is too short for ", sub_path, " with prompt ", outputs)
        else:
            # The context manager guarantees the file is flushed and closed
            with open(new_text_path, "w") as f:
                f.writelines(key_word + "\n" for key_word in key_words)

        length_stats[effective_length] += 1

    print("length_stats is ", length_stats)