Merge pull request #12 from andreamalhera/11-debug-run-generation-for-f-triangle
Changed files:
- .conda.yml +151 -17
- .github/workflows/test_gedi.yml +8 -0
- data/test/grid_feat.csv +2 -0
- data/validation/genELexperiment3_04.json +1 -0
- data/validation/genELexperiment4_02.json +1 -0
- gedi/benchmark.py +1 -1
- gedi/features.py +1 -1
- gedi/generator.py +9 -7
- gedi/utils/io_helpers.py +7 -10
- utils/merge_jsons.py +5 -5
.conda.yml
CHANGED
@@ -1,22 +1,156 @@
 name: gedi
 channels:
   - conda-forge
+  - defaults
 dependencies:
-  …
+  - asttokens=2.4.1=pyhd8ed1ab_0
+  - backcall=0.2.0=pyh9f0ad1d_0
+  - certifi=2024.2.2=pyhd8ed1ab_0
+  - colorama=0.4.6=pyhd8ed1ab_0
+  - comm=0.2.2=pyhd8ed1ab_0
+  - cycler=0.12.1=pyhd8ed1ab_0
+  - decorator=5.1.1=pyhd8ed1ab_0
+  - executing=2.0.1=pyhd8ed1ab_0
+  - importlib-metadata=7.1.0=pyha770c72_0
+  - importlib-resources=6.4.0=pyhd8ed1ab_0
+  - importlib_metadata=7.1.0=hd8ed1ab_0
+  - importlib_resources=6.4.0=pyhd8ed1ab_0
+  - jedi=0.19.1=pyhd8ed1ab_0
+  - joblib=1.4.0=pyhd8ed1ab_0
+  - jupyter_client=8.6.1=pyhd8ed1ab_0
+  - matplotlib-inline=0.1.7=pyhd8ed1ab_0
+  - munkres=1.1.4=pyh9f0ad1d_0
+  - nest-asyncio=1.6.0=pyhd8ed1ab_0
+  - packaging=24.0=pyhd8ed1ab_0
+  - parso=0.8.4=pyhd8ed1ab_0
+  - pickleshare=0.7.5=py_1003
+  - pip=24.0=pyhd8ed1ab_0
+  - platformdirs=4.2.0=pyhd8ed1ab_0
+  - ply=3.11=pyhd8ed1ab_2
+  - prompt-toolkit=3.0.42=pyha770c72_0
+  - prompt_toolkit=3.0.42=hd8ed1ab_0
+  - pure_eval=0.2.2=pyhd8ed1ab_0
+  - pygments=2.17.2=pyhd8ed1ab_0
+  - pyparsing=3.1.2=pyhd8ed1ab_0
+  - python-dateutil=2.9.0=pyhd8ed1ab_0
+  - python-tzdata=2024.1=pyhd8ed1ab_0
+  - python_abi=3.9=4_cp39
+  - pytz=2024.1=pyhd8ed1ab_0
+  - setuptools=69.5.1=pyhd8ed1ab_0
+  - six=1.16.0=pyh6c4a22f_0
+  - stack_data=0.6.2=pyhd8ed1ab_0
+  - threadpoolctl=3.4.0=pyhc1e730c_0
+  - toml=0.10.2=pyhd8ed1ab_0
+  - tomli=2.0.1=pyhd8ed1ab_0
+  - traitlets=5.14.3=pyhd8ed1ab_0
+  - typing_extensions=4.11.0=pyha770c72_0
+  - tzdata=2024a=h0c530f3_0
+  - wcwidth=0.2.13=pyhd8ed1ab_0
+  - wheel=0.43.0=pyhd8ed1ab_1
+  - zipp=3.17.0=pyhd8ed1ab_0
   - pip:
-    …
+    - altair==5.3.0
+    - asttokens==2.4.1
+    - attrs==23.2.0
+    - backcall==0.2.0
+    - blinker==1.8.2
+    - brotli==1.1.0
+    - cachetools==5.4.0
+    - certifi==2024.2.2
+    - charset-normalizer==3.3.2
+    - click==8.1.7
+    - cloudpickle==3.0.0
+    - comm==0.2.2
+    - configspace==0.7.1
+    - contourpy==1.2.1
+    - cycler==0.12.1
+    - cvxopt==1.3.2
+    - dask==2024.4.1
+    - dask-jobqueue==0.8.5
+    - debugpy==1.8.1
+    - decorator==5.1.1
+    - deprecation==2.1.0
+    - distributed==2024.4.1
+    - emcee==3.1.4
+    - executing==2.0.1
+    - feeed==1.2.0
+    - fsspec==2024.3.1
+    - fonttools==4.51.0
+    - gitdb==4.0.11
+    - gitpython==3.1.43
+    - graphviz==0.20.3
+    - idna==3.7
+    - importlib-metadata==7.1.0
+    - importlib-resources==6.4.0
+    - imbalanced-learn==0.12.2
+    - imblearn==0.0
+    - intervaltree==3.1.0
+    - ipykernel==6.29.3
+    - ipython==8.12.0
+    - jedi==0.19.1
+    - jinja2==3.1.3
+    - joblib==1.4.0
+    - jsonschema==4.23.0
+    - jsonschema-specifications==2023.12.1
+    - jupyter_client==8.6.1
+    - jupyter_core==5.7.2
+    - kiwisolver==1.4.5
+    - levenshtein==0.23.0
+    - llvmlite==0.42.0
+    - locket==1.0.0
+    - lxml==5.2.1
+    - markdown-it-py==3.0.0
+    - markupsafe==2.1.5
+    - matplotlib==3.8.4
+    - matplotlib-inline==0.1.7
+    - mdurl==0.1.2
+    - more-itertools==10.2.0
+    - msgpack==1.0.8
+    - munkres==1.1.4
+    - networkx==3.2.1
+    - numba==0.59.1
+    - numpy==1.26.4
+    - opyenxes==0.3.0
+    - partd==1.4.1
+    - pandas==2.2.2
+    - pm4py==2.7.2
+    - protobuf==5.27.2
+    - pyarrow==17.0.0
+    - pydeck==0.9.1
+    - pydotplus==2.0.2
+    - pynisher==1.0.10
+    - pyrfr==0.9.0
+    - python-dateutil==2.9.0
+    - pyyaml==6.0.1
+    - rapidfuzz==3.8.1
+    - referencing==0.35.1
+    - regex==2023.12.25
+    - requests==2.32.3
+    - rich==13.7.1
+    - rpds-py==0.19.0
+    - seaborn==0.13.2
+    - scikit-learn==1.2.2
+    - scipy==1.13.0
+    - slicer==0.0.8
+    - smac==2.0.2
+    - smmap==5.0.1
+    - sortedcontainers==2.4.0
+    - stack_data==0.6.2
+    - streamlit==1.36.0
+    - stringdist==1.0.9
+    - tabulate==0.9.0
+    - tblib==3.0.0
+    - tenacity==8.5.0
+    - threadpoolctl==3.4.0
+    - toml==0.10.2
+    - tomli==2.0.1
+    - tornado==6.4
+    - tqdm==4.65.0
+    - toolz==0.12.1
+    - tzdata==2024.1
+    - urllib3==2.2.1
+    - watchdog==4.0.1
+    - xgboost==2.1.0
+    - zict==3.0.0
+    - zipp==3.17.0
+    - zstd==1.5.5.1
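With every conda and pip dependency now pinned to an exact version (and, for the conda entries, an exact build), the environment the CI resolves is reproducible. Assuming the standard conda CLI, it can be recreated locally with `conda env create -f .conda.yml` followed by `conda activate gedi`.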
.github/workflows/test_gedi.yml
CHANGED
@@ -69,6 +69,14 @@ jobs:
         run:
           diff data/validation/genELexperiment1_04_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment1_04_02.json

+      - name: Compare output 3
+        run:
+          diff data/validation/genELexperiment3_04.json output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json
+
+      - name: Compare output 4
+        run:
+          diff data/validation/genELexperiment4_02.json output/features/grid_feat/2_enself_rt20v/genELexperiment4_nan_02.json
+
   test_benchmark:
     runs-on: ubuntu-latest
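The two added steps follow the existing pattern: each diffs a checked-in validation JSON against the feature JSON the pipeline dumps, and a non-zero exit from diff fails the job. Note the _nan segment in the generated filenames; it marks the position of the objective that was left unspecified. A rough local equivalent of the checks, as a sketch (paths taken from the workflow above; this compares parsed JSON rather than bytes, which is slightly more lenient than diff):

    import json

    # Sketch: compare expected validation JSONs against dumped feature JSONs.
    checks = [
        ("data/validation/genELexperiment3_04.json",
         "output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json"),
        ("data/validation/genELexperiment4_02.json",
         "output/features/grid_feat/2_enself_rt20v/genELexperiment4_nan_02.json"),
    ]
    for expected_path, actual_path in checks:
        with open(expected_path) as f_exp, open(actual_path) as f_act:
            assert json.load(f_exp) == json.load(f_act), f"mismatch: {actual_path}"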
data/test/grid_feat.csv
CHANGED
@@ -1,3 +1,5 @@
 log,ratio_top_20_variants,epa_normalized_sequence_entropy_linear_forgetting
 experiment1,0.2,0.4
 experiment2,0.4,0.7
+experiment3,NaN,0.4
+experiment4,0.2,NaN
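Each added row leaves exactly one objective undefined, which is the partially specified case this PR debugs. A minimal standalone sketch of how such a row behaves once pandas reads it, mirroring the dropna() added in gedi/generator.py below:

    import io
    import pandas as pd

    csv_text = (
        "log,ratio_top_20_variants,epa_normalized_sequence_entropy_linear_forgetting\n"
        "experiment3,NaN,0.4\n"
        "experiment4,0.2,NaN\n"
    )
    tasks = pd.read_csv(io.StringIO(csv_text))
    # read_csv parses the literal "NaN" as float('nan'), so dropping missing
    # values leaves only the objective that was actually specified:
    row = tasks.iloc[0].drop("log")
    print(row.dropna().to_dict())
    # {'epa_normalized_sequence_entropy_linear_forgetting': 0.4}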
data/validation/genELexperiment3_04.json
ADDED
@@ -0,0 +1 @@
+{"epa_normalized_sequence_entropy_linear_forgetting": 0.052097205658647734, "log": "experiment3"}
data/validation/genELexperiment4_02.json
ADDED
@@ -0,0 +1 @@
+{"ratio_top_20_variants": 0.2, "log": "experiment4"}
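Both validation files contain a single feature key plus the log name: the objective that was NaN in data/test/grid_feat.csv does not reappear in the dumped features, which is exactly what the new CI comparisons assert.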
gedi/benchmark.py
CHANGED
@@ -109,7 +109,7 @@ class BenchmarkTest:
         results['log'] = log_name

         print(f"  SUCCESS: {miner} miner for {results} took {dt.now()-start_miner} sec.")
-        dump_features_json(results, dump_path, log_name, content_type="benchmark")
+        dump_features_json(results, os.path.join(dump_path, log_name), content_type="benchmark")
         return

     def split_miner_wrapper(self, log_path="data/real_event_logs/BPI_Challenges/BPI_Challenge_2012.xes"):
gedi/features.py
CHANGED
@@ -159,6 +159,6 @@ class EventLogFeatures(EventLogFile):

         identifier = file.rsplit(".", 1)[0]
         print(f"  DONE: {file_path}. FEEED computed {feature_set}")
-        dump_features_json(features, self.root_path, …
+        dump_features_json(features, os.path.join(self.root_path, identifier))
         return features

gedi/generator.py
CHANGED
@@ -19,7 +19,8 @@ from pm4py.sim import play_out
 from smac import HyperparameterOptimizationFacade, Scenario
 from utils.param_keys import OUTPUT_PATH, INPUT_PATH
 from utils.param_keys.generator import GENERATOR_PARAMS, EXPERIMENT, CONFIG_SPACE, N_TRIALS
-from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json
+from gedi.utils.io_helpers import get_output_key_value_location, dump_features_json
+from gedi.utils.io_helpers import read_csvs
 import xml.etree.ElementTree as ET
 import re
 from xml.dom import minidom

@@ -80,7 +81,7 @@ def removeextralines(elem):
         element.tail=""
         if not re.search(hasWords,str(element.text)):
             element.text = ""
-
+
 def add_extension_before_traces(xes_file):
     # Register the namespace
     ET.register_namespace('', "http://www.xes-standard.org/")

@@ -158,12 +159,13 @@ class GenerateEventLogs():
         tasks=tasks.rename(columns={"ratio_variants_per_number_of_traces": "ratio_unique_traces_per_trace"})

         if tasks is not None:
+            self.feature_keys = sorted([feature for feature in tasks.columns.tolist() if feature != "log"])
             num_cores = multiprocessing.cpu_count() if len(tasks) >= multiprocessing.cpu_count() else len(tasks)
             #self.generator_wrapper([*tasks.iterrows()][0])# For testing
             with multiprocessing.Pool(num_cores) as p:
                 print(f"INFO: Generator starting at {start.strftime('%H:%M:%S')} using {num_cores} cores for {len(tasks)} tasks...")
                 random.seed(RANDOM_SEED)
-                log_config = p.map(self.generator_wrapper, tasks.iterrows())
+                log_config = p.map(self.generator_wrapper, [(index, row) for index, row in tasks.iterrows()])
             self.log_config = log_config

         else:

@@ -192,7 +194,7 @@ class GenerateEventLogs():
         except IndexError:
             identifier = task[0]+1
         task = task[1].loc[lambda x, identifier=identifier: x!=identifier]
-        self.objectives = task.to_dict()
+        self.objectives = task.dropna().to_dict()
         random.seed(RANDOM_SEED)
         self.configs = self.optimize()

@@ -207,8 +209,8 @@ class GenerateEventLogs():
         if self.objectives.get('ratio_unique_traces_per_trace'):#HOTFIX
             self.objectives['ratio_variants_per_number_of_traces']=self.objectives.pop('ratio_unique_traces_per_trace')

-        save_path = get_output_key_value_location(
-            self.output_path, identifier)+".xes"
+        save_path = get_output_key_value_location(task.to_dict(),
+            self.output_path, identifier, self.feature_keys)+".xes"

         write_xes(log_config['log'], save_path)
         add_extension_before_traces(save_path)

@@ -219,7 +221,7 @@ class GenerateEventLogs():
         if features_to_dump.get('ratio_unique_traces_per_trace'):#HOTFIX
             features_to_dump['ratio_variants_per_number_of_traces']=features_to_dump.pop('ratio_unique_traces_per_trace')
         features_to_dump['log'] = identifier.replace('genEL', '')
-        dump_features_json(features_to_dump, …
+        dump_features_json(features_to_dump, save_path)
         return log_config

     def generate_optimized_log(self, config):
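Taken together: dropna() removes the NaN objective from what the optimizer targets, while the full task row and the sorted self.feature_keys still drive the output file name, so the missing value is encoded as a literal "nan" in a stable position. A standalone sketch of that naming behavior (simplified: sketch_filename is not a repo function, and the real path also includes an abbreviated-key directory such as 2_enself_rt20v):

    import math

    def sketch_filename(objectives: dict, identifier: str) -> str:
        # Values are sorted by feature key, rounded, and joined; NaN survives
        # round(), so a missing objective shows up as the literal "nan".
        parts = []
        for _, value in sorted(objectives.items()):
            rounded = round(value, 4)
            parts.append("nan" if math.isnan(rounded) else str(rounded).replace(".", ""))
        return identifier + "_" + "_".join(parts)

    print(sketch_filename(
        {"ratio_top_20_variants": float("nan"),
         "epa_normalized_sequence_entropy_linear_forgetting": 0.4},
        "genELexperiment3"))  # -> genELexperiment3_04_nan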
gedi/utils/io_helpers.py
CHANGED
@@ -52,9 +52,10 @@ def get_keys_abbreviation(obj_keys):
         abbreviated_keys.append(abbreviated_key)
     return '_'.join(abbreviated_keys)

-def get_output_key_value_location(obj, output_path, identifier):
+def get_output_key_value_location(obj, output_path, identifier, obj_keys=None):
     obj_sorted = dict(sorted(obj.items()))
-    obj_keys = [*obj_sorted.keys()]
+    if obj_keys is None:
+        obj_keys = [*obj_sorted.keys()]

     obj_values = [round(x, 4) for x in [*obj_sorted.values()]]

@@ -71,15 +72,11 @@ def get_output_key_value_location(obj, output_path, identifier):
     save_path = os.path.join(folder_path, generated_file_name)
     return save_path

-def dump_features_json(features: dict, output_path, …
-    output_parts = PurePath(output_path).parts
-    …
+def dump_features_json(features: dict, output_path, content_type="features"):
+    output_parts = PurePath(output_path.split(".xes")[0]).parts
+    features_path = os.path.join(output_parts[0], content_type,
                                  *output_parts[1:])
-
-        json_path = get_output_key_value_location(objectives,
-                                                  feature_dir, identifier)+".json"
-    else:
-        json_path = os.path.join(feature_dir, identifier)+".json"
+    json_path = features_path+'.json'

     os.makedirs(os.path.split(json_path)[0], exist_ok=True)
     with open(json_path, 'w') as fp:
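The reworked dump_features_json derives the JSON location purely from the output path: it strips a trailing .xes, splices content_type in after the first path component, and appends .json. A standalone sketch of that transformation (sketch_json_path is not a repo function, and the example path is inferred from the CI expectations above):

    import os
    from pathlib import PurePath

    def sketch_json_path(output_path: str, content_type: str = "features") -> str:
        # Same path logic as the new dump_features_json: drop ".xes", insert
        # content_type after the first component, append ".json".
        output_parts = PurePath(output_path.split(".xes")[0]).parts
        features_path = os.path.join(output_parts[0], content_type, *output_parts[1:])
        return features_path + ".json"

    print(sketch_json_path("output/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.xes"))
    # -> output/features/grid_feat/2_enself_rt20v/genELexperiment3_04_nan.json

This removes the old identifier/objectives branching: callers now build the full output path themselves (as benchmark.py and features.py do above with os.path.join), and the JSON lands next to the structure of the .xes output.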
utils/merge_jsons.py
CHANGED
@@ -4,12 +4,12 @@ import csv
 import os

 """
-Run using:
+Run using:
    python merge_jsons.py path_to_your_json_directory output.csv

 """
 def json_to_csv(json_dir, output_csv):
-
+
     json_files = [os.path.join(json_dir, file) for file in os.listdir(json_dir) if file.endswith('.json')]

     # Collect data from all JSON files

@@ -18,13 +18,13 @@ def json_to_csv(json_dir, output_csv):
         with open(json_file, 'r') as f:
             data = json.load(f)
         all_data.append(data)
-
+
     # Extract the headers from the first JSON object
     if all_data:
-        headers = …
+        headers = {elem for s in [set(i) for i in [d.keys() for d in all_data]] for elem in s}
     else:
         raise ValueError("No data found in JSON files")
-
+
     # Write data to CSV
     with open(output_csv, 'w', newline='') as f:
         writer = csv.DictWriter(f, fieldnames=headers)
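The nested set comprehension is simply the union of every record's key set (so the surviving context comment, "Extract the headers from the first JSON object", is now slightly stale). An equivalent, simpler form, and why the union matters for this PR:

    # Equivalent header computation: union of all records' keys. Feature JSONs
    # like the two validation files above carry different keys per record.
    all_data = [
        {"epa_normalized_sequence_entropy_linear_forgetting": 0.0521, "log": "experiment3"},
        {"ratio_top_20_variants": 0.2, "log": "experiment4"},
    ]
    headers = set().union(*(d.keys() for d in all_data))
    # {'epa_normalized_sequence_entropy_linear_forgetting', 'log', 'ratio_top_20_variants'}

With fieldnames taken from only the first record, csv.DictWriter would raise ValueError on any later record that has an extra key; with the union, a record that lacks a key just gets an empty cell (DictWriter's restval), so rows whose NaN objective was dropped merge cleanly into one CSV.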