Spaces:
Sleeping
Sleeping
Update my_model/dataset/dataset_processor.py
Browse files
my_model/dataset/dataset_processor.py
CHANGED
|
@@ -35,6 +35,7 @@ class OKVQADatasetProcessor:
|
|
| 35 |
self.df_answers = pd.DataFrame(self.annotations)
|
| 36 |
self.merged_df = None
|
| 37 |
|
|
|
|
| 38 |
def load_data_files(self) -> Tuple[List[dict], List[dict]]:
|
| 39 |
"""
|
| 40 |
Loads the question and annotation data from JSON files.
|
|
@@ -52,6 +53,7 @@ class OKVQADatasetProcessor:
|
|
| 52 |
|
| 53 |
return questions, annotations
|
| 54 |
|
|
|
|
| 55 |
@staticmethod
|
| 56 |
def find_most_frequent(my_list: List[str]) -> Optional[str]:
|
| 57 |
"""
|
|
@@ -69,6 +71,7 @@ class OKVQADatasetProcessor:
|
|
| 69 |
most_common = counter.most_common(1)
|
| 70 |
return most_common[0][0]
|
| 71 |
|
|
|
|
| 72 |
def merge_data(self) -> None:
|
| 73 |
"""
|
| 74 |
Merges the question and answer DataFrames on a common key.
|
|
@@ -81,9 +84,10 @@ class OKVQADatasetProcessor:
|
|
| 81 |
self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])
|
| 82 |
|
| 83 |
def join_words_with_hyphen(self, sentence):
|
| 84 |
-
|
| 85 |
return '-'.join(sentence.split())
|
| 86 |
|
|
|
|
| 87 |
def process_answers(self) -> None:
|
| 88 |
"""
|
| 89 |
Processes answers from merged DataFrame by extracting and identifying the most frequent answers.
|
|
@@ -103,6 +107,7 @@ class OKVQADatasetProcessor:
|
|
| 103 |
self.merged_df['single_word_answers'] = self.merged_df['most_frequent_processed_answer'].apply(
|
| 104 |
self.join_words_with_hyphen)
|
| 105 |
|
|
|
|
| 106 |
def get_processed_data(self) -> Optional[pd.DataFrame]:
|
| 107 |
"""
|
| 108 |
Retrieves the processed DataFrame.
|
|
@@ -117,6 +122,7 @@ class OKVQADatasetProcessor:
|
|
| 117 |
print("DataFrame is empty or not processed yet.")
|
| 118 |
return None
|
| 119 |
|
|
|
|
| 120 |
def save_to_csv(self, df: pd.DataFrame, saved_file_name: Optional[str]) -> None:
|
| 121 |
"""
|
| 122 |
Saves the DataFrame to a CSV file.
|
|
@@ -134,6 +140,7 @@ class OKVQADatasetProcessor:
|
|
| 134 |
else:
|
| 135 |
df.to_csv("data.csv", index=None)
|
| 136 |
|
|
|
|
| 137 |
def display_dataframe(self) -> None:
|
| 138 |
"""
|
| 139 |
Displays the processed DataFrame.
|
|
|
|
| 35 |
self.df_answers = pd.DataFrame(self.annotations)
|
| 36 |
self.merged_df = None
|
| 37 |
|
| 38 |
+
|
| 39 |
def load_data_files(self) -> Tuple[List[dict], List[dict]]:
|
| 40 |
"""
|
| 41 |
Loads the question and annotation data from JSON files.
|
|
|
|
| 53 |
|
| 54 |
return questions, annotations
|
| 55 |
|
| 56 |
+
|
| 57 |
@staticmethod
|
| 58 |
def find_most_frequent(my_list: List[str]) -> Optional[str]:
|
| 59 |
"""
|
|
|
|
| 71 |
most_common = counter.most_common(1)
|
| 72 |
return most_common[0][0]
|
| 73 |
|
| 74 |
+
|
| 75 |
def merge_data(self) -> None:
|
| 76 |
"""
|
| 77 |
Merges the question and answer DataFrames on a common key.
|
|
|
|
| 84 |
self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])
|
| 85 |
|
| 86 |
def join_words_with_hyphen(self, sentence):
|
| 87 |
+
|
| 88 |
return '-'.join(sentence.split())
|
| 89 |
|
| 90 |
+
|
| 91 |
def process_answers(self) -> None:
|
| 92 |
"""
|
| 93 |
Processes answers from merged DataFrame by extracting and identifying the most frequent answers.
|
|
|
|
| 107 |
self.merged_df['single_word_answers'] = self.merged_df['most_frequent_processed_answer'].apply(
|
| 108 |
self.join_words_with_hyphen)
|
| 109 |
|
| 110 |
+
|
| 111 |
def get_processed_data(self) -> Optional[pd.DataFrame]:
|
| 112 |
"""
|
| 113 |
Retrieves the processed DataFrame.
|
|
|
|
| 122 |
print("DataFrame is empty or not processed yet.")
|
| 123 |
return None
|
| 124 |
|
| 125 |
+
|
| 126 |
def save_to_csv(self, df: pd.DataFrame, saved_file_name: Optional[str]) -> None:
|
| 127 |
"""
|
| 128 |
Saves the DataFrame to a CSV file.
|
|
|
|
| 140 |
else:
|
| 141 |
df.to_csv("data.csv", index=None)
|
| 142 |
|
| 143 |
+
|
| 144 |
def display_dataframe(self) -> None:
|
| 145 |
"""
|
| 146 |
Displays the processed DataFrame.
|