Joshua Sundance Bailey committed on
Commit
11a4b10
·
1 Parent(s): 4442689
geospatial-data-converter/kml_tricks.py CHANGED
@@ -15,7 +15,6 @@ def parse_description_to_gdf(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
15
  html_df = pd.read_html(StringIO(desc), flavor="lxml")
16
  yield html_df[-1].T
17
  except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError) as e:
18
- print(desc)
19
  raise pd.errors.ParserError from e
20
 
21
  parsed_dataframes = list(_gen())
 
15
  html_df = pd.read_html(StringIO(desc), flavor="lxml")
16
  yield html_df[-1].T
17
  except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError) as e:
 
18
  raise pd.errors.ParserError from e
19
 
20
  parsed_dataframes = list(_gen())
geospatial-data-converter/kml_tricks_refactor.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zipfile
2
+ from io import StringIO
3
+
4
+ import bs4
5
+ import fiona
6
+ import geopandas as gpd
7
+ import lxml # nosec
8
+ import pandas as pd
9
+
10
# Set the "KML" entry in fiona's supported-driver table to "rw".
# NOTE(review): presumably this enables the gpd.read_file(..., driver="KML")
# calls made elsewhere in this module -- confirm against the fiona docs.
fiona.drvsupport.supported_drivers["KML"] = "rw"
11
+
12
+
13
def parse_descriptions_to_geodf(geodf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Parse the HTML tables stored in a "Description" column into attributes.

    Each description is read with ``pandas.read_html``. The second table found
    is used when there is one, otherwise the first. The chosen table is
    transposed, its first row becomes the column header, and the remaining
    rows are kept. The per-feature frames are concatenated and the geometries
    of ``geodf`` are attached.

    Args:
        geodf: GeoDataFrame providing "Description" and "geometry" columns.

    Returns:
        A GeoDataFrame holding the parsed attributes plus a "geometry" column,
        created with the CRS of ``geodf``.

    Raises:
        Any exception raised by ``pandas.read_html`` or ``pandas.concat``
        propagates unchanged.
    """
    frames = []

    for desc in geodf["Description"]:
        tables = pd.read_html(StringIO(desc))

        # Prefer the second table; fall back to the first when only one exists.
        table = tables[1] if len(tables) > 1 else tables[0]

        # After transposing, the first row carries the field names.
        transposed = table.T
        transposed.columns = transposed.iloc[0]
        frames.append(transposed.iloc[1:])

    combined_df = pd.concat(frames, ignore_index=True)

    # ``combined_df`` was re-indexed 0..n-1 above, so drop the source index as
    # well: geometries are then matched by position instead of by index label,
    # which would misalign for any input without a default RangeIndex.
    combined_df["geometry"] = geodf["geometry"].reset_index(drop=True)

    return gpd.GeoDataFrame(combined_df, crs=geodf.crs)
45
+
46
+
47
def load_kmz_as_geodf(file_path: str) -> gpd.GeoDataFrame:
    """Load the single KML file contained in a KMZ archive as a GeoDataFrame.

    Args:
        file_path: Path to the KMZ (zip) archive.

    Returns:
        The GeoDataFrame read from the archive's KML member.

    Raises:
        IndexError: If the archive does not contain exactly one KML member.
    """
    with zipfile.ZipFile(file_path, "r") as kmz:
        # Match the extension case-insensitively, as extract_kml_code_from_file
        # does, so both helpers agree on which archives are usable.
        kml_files = [
            name for name in kmz.namelist() if name.lower().endswith(".kml")
        ]

    # Report the two failure modes separately; the exception type is unchanged.
    if not kml_files:
        raise IndexError("KMZ contains no KML file.")
    if len(kml_files) > 1:
        raise IndexError(
            "KMZ contains more than one KML. Please extract or convert to multiple KMLs.",
        )

    return gpd.read_file(f"zip://{file_path}/{kml_files[0]}", driver="KML")
65
+
66
+
67
def load_ge_file(file_path: str) -> gpd.GeoDataFrame:
    """Load a KML or KMZ file and parse its descriptions into a GeoDataFrame.

    Args:
        file_path: Path ending in ".kml" or ".kmz" (any letter case).

    Returns:
        The result of ``parse_descriptions_to_geodf`` for the loaded file.

    Raises:
        ValueError: If the path has neither a .kml nor a .kmz extension.
    """
    # Compare case-insensitively so this agrees with
    # extract_kml_code_from_file, which lower-cases the extension.
    lowered = file_path.lower()

    if lowered.endswith(".kml"):
        return parse_descriptions_to_geodf(gpd.read_file(file_path, driver="KML"))
    if lowered.endswith(".kmz"):
        return parse_descriptions_to_geodf(load_kmz_as_geodf(file_path))

    raise ValueError("The file must have a .kml or .kmz extension.")
75
+
76
+
77
def extract_data_from_kml_code(kml_code: str) -> pd.DataFrame:
    """Build a DataFrame from the SimpleData fields found in KML source.

    The source is parsed with BeautifulSoup's "html.parser" and tags are looked
    up by their lower-cased names ("schemadata", "simpledata"). Tables embedded
    in feature descriptions are not read here.

    Args:
        kml_code: KML document text.

    Returns:
        One row per "schemadata" tag. Each row has a "Placemark_name" column
        plus one column per "simpledata" field, keyed by its "name" attribute.
        "Placemark_name" is None when no "name" tag can be found for a row.
    """
    soup = bs4.BeautifulSoup(kml_code, "html.parser")

    def _placemark_name(schema_tag):
        # The name is searched under the tag's grandparent. A missing
        # grandparent or name yields None instead of an AttributeError.
        parent = schema_tag.parent
        grandparent = parent.parent if parent is not None else None
        name_tag = grandparent.find("name") if grandparent is not None else None
        return name_tag.text if name_tag is not None else None

    rows = [
        {
            "Placemark_name": _placemark_name(tag),
            **{field.get("name"): field.text for field in tag.find_all("simpledata")},
        }
        for tag in soup.find_all("schemadata")
    ]

    return pd.DataFrame(rows)
99
+
100
+
101
def extract_kml_code_from_file(file_path: str) -> str:
    """Return the KML source text of a Google Earth file (KML or KMZ).

    Args:
        file_path: Path whose extension is "kml" or "kmz" (any letter case).

    Returns:
        The KML document decoded as UTF-8 text.

    Raises:
        IndexError: If a KMZ archive does not contain exactly one KML member.
        ValueError: If the path has neither a .kml nor a .kmz extension.
    """
    file_extension = file_path.lower().split(".")[-1]

    if file_extension == "kml":
        # Decode explicitly as UTF-8 so this branch matches the KMZ branch
        # instead of depending on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as kml_file:
            return kml_file.read()

    if file_extension == "kmz":
        with zipfile.ZipFile(file_path) as kmz_file:
            kml_files = [
                name for name in kmz_file.namelist() if name.lower().endswith(".kml")
            ]

            # Report the two failure modes separately; the exception type is
            # unchanged.
            if not kml_files:
                raise IndexError("KMZ file contains no KML file.")
            if len(kml_files) > 1:
                raise IndexError(
                    "KMZ file contains more than one KML. Please extract or convert to multiple KMLs.",
                )

            # Archive members are read as bytes and decoded to text.
            return kmz_file.read(kml_files[0]).decode("utf-8")

    raise ValueError("The input file must have a .kml or .kmz extension.")
128
+
129
+
130
def extract_data_from_ge_file(file_path: str) -> gpd.GeoDataFrame:
    """Combine SimpleData attributes and geometries from a KML or KMZ file.

    Attributes come from ``extract_data_from_kml_code`` (SimpleData tags only;
    tables embedded in feature descriptions are not used). Geometries and the
    CRS come from reading the same file as a GeoDataFrame.

    Args:
        file_path: Path to a KML or KMZ file.

    Returns:
        A GeoDataFrame of the extracted attributes with the file's geometries.
    """
    data_df = extract_data_from_kml_code(extract_kml_code_from_file(file_path))

    # Compare case-insensitively: extract_kml_code_from_file already accepts
    # ".KMZ", so the geometry must be loaded through the KMZ path as well.
    if file_path.lower().endswith(".kmz"):
        ge_file_gdf = load_kmz_as_geodf(file_path)
    else:
        ge_file_gdf = gpd.read_file(file_path, driver="KML")

    return gpd.GeoDataFrame(
        data_df,
        geometry=ge_file_gdf["geometry"],
        crs=ge_file_gdf.crs,
    )
147
+
148
+
149
def load_ge_data(file_path: str) -> pd.DataFrame:
    """Extract data from a KML or KMZ file, retrying with a second method.

    When the KML source contains "<simpledata" or "<simplefield"
    (case-insensitive), ``extract_data_from_ge_file`` is tried first and
    ``load_ge_file`` is the fallback. Otherwise the order is reversed.

    Args:
        file_path: Path to a KML or KMZ file.

    Returns:
        The frame produced by whichever extraction method succeeded.

    Raises:
        Any exception raised by the fallback method propagates unchanged, as
        does any exception from the first method that is not one of the
        parser errors or ValueError handled below.
    """
    # Importing the ``lxml`` package alone does not load its ``etree``
    # submodule. Import it explicitly so the ``except`` clause below cannot
    # fail with AttributeError while an error is being handled.
    import lxml.etree  # nosec

    # Lower-case the document once instead of once per tag checked.
    lowered_kml = extract_kml_code_from_file(file_path).lower()

    if any(tag in lowered_kml for tag in ("<simpledata", "<simplefield")):
        primary_func, fallback_func = extract_data_from_ge_file, load_ge_file
    else:
        primary_func, fallback_func = load_ge_file, extract_data_from_ge_file

    try:
        return primary_func(file_path)
    except (
        pd.errors.ParserError,
        lxml.etree.ParserError,
        lxml.etree.XMLSyntaxError,
        ValueError,
    ):
        return fallback_func(file_path)
geospatial-data-converter/utils.py CHANGED
@@ -7,7 +7,8 @@ from typing import BinaryIO
7
  import geopandas as gpd
8
 
9
  # from kml_tricks import read_ge_file
10
- from kml_tricks_before import readge as read_ge_file
 
11
 
12
  output_format_dict = {
13
  "ESRI Shapefile": ("shp", "zip", "application/zip"), # must be zipped
 
7
  import geopandas as gpd
8
 
9
  # from kml_tricks import read_ge_file
10
+ # from kml_tricks_before import readge as read_ge_file
11
+ from kml_tricks_refactor import load_ge_data as read_ge_file
12
 
13
  output_format_dict = {
14
  "ESRI Shapefile": ("shp", "zip", "application/zip"), # must be zipped